In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from offline import SparkSessionBase

class CtrLogisticRegression(SparkSessionBase):

    SPARK_APP_NAME = "ctrLogisticRegression"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):

        self.spark = self._create_spark_hbase()

ctr = CtrLogisticRegression()

In [2]:
ctr.spark.sql("use profile")
# +-------------------+----------+----------+-------+
# |            user_id|article_id|channel_id|clicked|
# +-------------------+----------+----------+-------+
# |1105045287866466304|     14225|         0|  false|
user_article_basic = ctr.spark.sql("select * from user_article_basic").select(
    ['user_id', 'article_id', 'channel_id', 'clicked'])

In [3]:
user_article_basic.show()

+-------------------+----------+----------+-------+
|            user_id|article_id|channel_id|clicked|
+-------------------+----------+----------+-------+
|1105045287866466304|     14225|         0|  false|
|1106476833370537984|     14208|         0|  false|
|1111189494544990208|     19322|         0|  false|
|1111524501104885760|     44161|         0|  false|
|1112727762809913344|     18172|        18|   true|
|                  1|     44386|        18|   true|
|                  1|     44696|         0|  false|
|                 10|     43907|         0|  false|
|1106473203766657024|     16005|         0|  false|
|1108264901190615040|     15196|         0|  false|
|                 23|     44739|        18|   true|
|                 33|     13570|         0|  false|
|                  1|     17632|         0|  false|
|1106473203766657024|     17665|         0|  false|
|1111189494544990208|     44368|         0|  false|
|                 10|     44368|         0|  false|
|11050938831

### 用户画像读取处理与日志数据合并

In [4]:
user_profile_hbase = ctr.spark.sql(
    "select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase")
user_profile_hbase = user_profile_hbase.drop('env')

# +--------------------+--------+------+--------------------+
# |             user_id|birthday|gender|     article_partial|
# +--------------------+--------+------+--------------------+
# |              user:1|     0.0|  null|Map(18:Animal -> ...|

_schema = StructType([
    StructField("user_id", LongType()),
    StructField("birthday", DoubleType()),
    StructField("gender", BooleanType()),
    StructField("weights", MapType(StringType(), DoubleType()))
])

def get_user_id(row):
    return int(row.user_id.split(":")[1]), row.birthday, row.gender, row.article_partial


In [5]:
user_profile_hbase_temp = user_profile_hbase.rdd.map(get_user_id)
user_profile_hbase_schema = ctr.spark.createDataFrame(user_profile_hbase_temp, schema=_schema)


In [6]:
train = user_article_basic.join(user_profile_hbase_schema, on=['user_id'], how='left').drop('channel_id')


In [8]:
train.show()

+-------------------+----------+-------+--------+------+--------------------+
|            user_id|article_id|clicked|birthday|gender|             weights|
+-------------------+----------+-------+--------+------+--------------------+
|1106473203766657024|     16005|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     17665|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     44664|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     44386|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     13778|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     13039|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     13648|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     17304|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     19233|  false|     0.0|  null|Map(18:text -> 0....|
|1106473203766657024|     44466|  false|     0.0|  null|Map(18:t

### 文章向量读取&合并

In [10]:
ctr.spark.sql("use article")
article_vector = ctr.spark.sql("select * from article_vector")
train = train.join(article_vector, on=['article_id'], how='left').drop('birthday').drop('gender')


In [11]:
train.show()

+-------------------+-------------------+-------+--------------------+----------+--------------------+
|         article_id|            user_id|clicked|             weights|channel_id|       articlevector|
+-------------------+-------------------+-------+--------------------+----------+--------------------+
|              13401|                 10|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|
|              13401|1106396183141548032|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|
|              14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1103195673450250240|  false|Map(18:Animal -> ...|        18|[0.11028526511434...|
|              14805|1105045287866466304|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1111524501104885760|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1105105185656537088|  false|Map(18:SHOldboySt...|   

In [12]:
### 合并文章画像权重特征
ctr.spark.sql("use article")
article_profile = ctr.spark.sql("select * from article_profile")


In [13]:

def article_profile_to_feature(row):

    try:
        weights = sorted(row.keywords.values())[:10]
    except Exception as e:
        weights = [0.0] * 10
    return row.article_id, weights
article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(['article_id', 'article_weights'])

train = train.join(article_profile, on=['article_id'], how='left')

In [14]:
train.show()

+-------------------+-------------------+-------+--------------------+----------+--------------------+--------------------+
|         article_id|            user_id|clicked|             weights|channel_id|       articlevector|     article_weights|
+-------------------+-------------------+-------+--------------------+----------+--------------------+--------------------+
|              13401|                 10|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|[0.08196639249252...|
|              13401|1106396183141548032|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|[0.08196639249252...|
|              14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|[0.15069781969741...|
|              14805|1103195673450250240|  false|Map(18:Animal -> ...|        18|[0.11028526511434...|[0.15069781969741...|
|              14805|1105045287866466304|  false|Map(18:text -> 0....|        18|[0.11028526511434...|[0.15069781969741...|
|       

In [15]:
train.rdd.take(1)

[Row(article_id=13401, user_id=10, clicked=False, weights={'18:有序': 0.14441985236498525, '18:symmetric': 0.1441433918957283, '18:prd': 0.1441433918957283, '18:address': 0.1441433918957283, '18:转义字符': 0.1441433918957283, '18:numb': 0.14441985236498525, '18:cquire': 0.1441433918957283, '18:lis': 0.1441433918957283, '18:ceng': 0.14441985236498525, '18:encoding': 0.1441433918957283, '18:chouti': 0.14441985236498525, '18:语音合成': 0.1441433918957283, '18:Requests': 0.1441433918957283, '18:wusir': 0.1441433918957283, '18:session': 0.1441433918957283, '18:host': 0.1441433918957283, '18:sort': 0.14441985236498525, '18:代码自动补全': 1.2972905270615547, '18:popitem': 0.1441433918957283, '18:鸡蛋': 0.1441433918957283, '18:ida': 0.145755889693766, '18:collections': 0.1441433918957283, '18:finally': 0.1441433918957283, '18:queue': 0.1441433918957283, '18:布尔值': 0.1441433918957283, '18:urlretrieve': 0.1441433918957283, '18:lambda': 0.1441433918957283, '18:implicit': 0.1441433918957283, '18:ImageFont': 0.144143

### 处理用户画像权重

In [16]:
columns = ['article_id', 'user_id', 'channel_id', 'articlevector', 'user_weights', 'article_weights', 'clicked']
def get_user_weights(row):

    from pyspark.ml.linalg import Vectors
    try:
        user_weights = sorted([row.article_partial[key] for key in row.article_partial.keys() if key.split(':')[0] == str(row.channel_id)])[
                  :10]
    except Exception:
        user_weights = [0.0] * 10

    return row.article_id, row.user_id, row.channel_id, Vectors.dense(row.articlevector), Vectors.dense(
        user_weights), Vectors.dense(row.article_weights), int(row.clicked)

train_vector = train.rdd.map(get_user_weights).toDF(columns)


In [25]:
train_vector = train_vector.dropna()

In [26]:
train_version_two = VectorAssembler().setInputCols(columns[2:6]).setOutputCol("features").transform(train_vector)


In [27]:

train_dataset, test_dataset = train_version_two.randomSplit([0.8, 0.2])

In [None]:
### 训练模型
lr = LogisticRegression()
model = lr.setLabelCol("clicked").setFeaturesCol("features").fit(train_dataset)


## 点击率预测结果

In [18]:
online_model = LogisticRegressionModel.load('/headlines/models/logistic_ctr_model.obj')

In [28]:
res_transfrom = online_model.transform(test_dataset)

In [29]:
res_transfrom.select(["clicked", "probability", "prediction"]).show()

+-------+--------------------+----------+
|clicked|         probability|prediction|
+-------+--------------------+----------+
|      0|[0.89423045678107...|       0.0|
|      0|[0.89423045678107...|       0.0|
|      0|[0.89426978371863...|       0.0|
|      0|[0.89426450318935...|       0.0|
|      0|[0.89426450318935...|       0.0|
|      0|[0.89436330452490...|       0.0|
|      0|[0.89437568953445...|       0.0|
|      0|[0.89441205607391...|       0.0|
|      0|[0.89440179073518...|       0.0|
|      0|[0.89440179073518...|       0.0|
|      0|[0.89431388412402...|       0.0|
|      0|[0.89431388412402...|       0.0|
|      0|[0.89431388412402...|       0.0|
|      0|[0.89431682922322...|       0.0|
|      0|[0.89438068510966...|       0.0|
|      0|[0.89433063939791...|       0.0|
|      0|[0.89436206744987...|       0.0|
|      0|[0.89433503039271...|       0.0|
|      0|[0.89433503039271...|       0.0|
|      0|[0.89436484089326...|       0.0|
+-------+--------------------+----

In [30]:
def vector_to_double(row):
    return float(row.clicked), float(row.probability[1]) 

score_label = res_transfrom.select(["clicked", "probability"]).rdd.map(vector_to_double)


In [32]:
score_label.collect()

[(0.0, 0.10576954321892673),
 (0.0, 0.10576954321892673),
 (0.0, 0.10573021628136077),
 (0.0, 0.10573549681064946),
 (0.0, 0.10573549681064946),
 (0.0, 0.10563669547509504),
 (0.0, 0.10562431046554033),
 (0.0, 0.10558794392608098),
 (0.0, 0.10559820926481729),
 (0.0, 0.10559820926481729),
 (0.0, 0.1056861158759755),
 (0.0, 0.1056861158759755),
 (0.0, 0.1056861158759755),
 (0.0, 0.10568317077677611),
 (0.0, 0.105619314890337),
 (0.0, 0.1056693606020893),
 (0.0, 0.10563793255012317),
 (0.0, 0.1056649696072864),
 (0.0, 0.1056649696072864),
 (0.0, 0.10563515910673836),
 (0.0, 0.10560590876054553),
 (0.0, 0.10568620505244941),
 (0.0, 0.10568620505244941),
 (0.0, 0.10568620505244941),
 (0.0, 0.10562974738174792),
 (0.0, 0.1054320752404753),
 (0.0, 0.1054320752404753),
 (0.0, 0.10585358794457175),
 (0.0, 0.10585358794457175),
 (0.0, 0.10585358794457175),
 (0.0, 0.10577854573252297),
 (1.0, 0.10563120791722667),
 (0.0, 0.10563120791722667),
 (0.0, 0.10563120791722667),
 (0.0, 0.105631207917226