In [1]:
import os
import sys
# When this notebook is run directly, the project root must be put on the import
# path first, otherwise the later project-local imports fail.
# BASE_DIR is the directory two levels above the current working directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# When several Python versions are installed, not pinning the interpreter is likely to cause errors.
# The same interpreter is exported for both PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON.
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from offline import SparkSessionBase

In [2]:
class CtrLogisticRegression(SparkSessionBase):
    """Holder of the Spark session used by the CTR feature-building cells.

    ``SparkSessionBase`` comes from the project's ``offline`` package and is not
    visible here; the class attributes below are presumably read by it when the
    session is built (TODO confirm against ``SparkSessionBase``).
    """

    # Presumably the Spark application name used for the session.
    SPARK_APP_NAME = "ctrLogisticRegression"
    # Presumably turns on Hive support; the later cells query Hive databases via spark.sql.
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        """Create the session with ``_create_spark_hbase`` and keep it on ``self.spark``."""

        self.spark = self._create_spark_hbase()

# Module-level instance; the following cells run their queries through `ctr.spark`.
ctr = CtrLogisticRegression()

In [3]:
ctr.spark

In [4]:
ctr.spark.sql("use profile")

DataFrame[]

In [5]:
# Load the user/article behaviour table and keep only the four columns
# that the training samples are built from.
user_article_basic = (
    ctr.spark
    .sql("select * from user_article_basic")
    .select('user_id', 'article_id', 'channel_id', 'clicked')
)

In [6]:
user_article_basic.show()

+-------------------+----------+----------+-------+
|            user_id|article_id|channel_id|clicked|
+-------------------+----------+----------+-------+
|1105045287866466304|     14225|         0|  false|
|1106476833370537984|     14208|         0|  false|
|1111189494544990208|     19322|         0|  false|
|1111524501104885760|     44161|         0|  false|
|1112727762809913344|     18172|        18|   true|
|                  1|     44386|        18|   true|
|                  1|     44696|         0|  false|
|                 10|     43907|         0|  false|
|1106473203766657024|     16005|         0|  false|
|1108264901190615040|     15196|         0|  false|
|                 23|     44739|        18|   true|
|                 33|     13570|         0|  false|
|                  1|     17632|         0|  false|
|1106473203766657024|     17665|         0|  false|
|1111189494544990208|     44368|         0|  false|
|                 10|     44368|         0|  false|
|11050938831

In [7]:
# Read the profile columns; `env` is part of the query but removed right away.
user_profile_hbase = ctr.spark.sql(
    "select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase"
).drop('env')

# Shape of the rows at this point:
# +--------------------+--------+------+--------------------+
# |             user_id|birthday|gender|     article_partial|
# +--------------------+--------+------+--------------------+
# |              user:1|     0.0|  null|Map(18:Animal -> ...|

# Schema for the re-typed profile frame: numeric user id, and the
# `article_partial` map stored under the column name `weights`.
_schema = (
    StructType()
    .add("user_id", LongType())
    .add("birthday", DoubleType())
    .add("gender", BooleanType())
    .add("weights", MapType(StringType(), DoubleType()))
)

def get_user_id(row):
    """Map a raw profile row to ``(user_id, birthday, gender, article_partial)``.

    ``row.user_id`` is a colon-separated string such as ``"user:1"``; its second
    field is converted to ``int``. The other three values are passed through.
    """
    id_fields = row.user_id.split(":")
    numeric_id = int(id_fields[1])
    return (numeric_id, row.birthday, row.gender, row.article_partial)

In [None]:
user_profile_hbase.show()

In [8]:
# Turn each profile row into a plain tuple (numeric user id first) and
# rebuild a typed DataFrame from those tuples using `_schema`.
user_profile_hbase_temp = user_profile_hbase.rdd.map(get_user_id)
user_profile_hbase_schema = ctr.spark.createDataFrame(
    user_profile_hbase_temp,
    schema=_schema,
)

# Attach the profile columns to every behaviour record (left join keeps
# records without a profile) and discard the behaviour table's channel_id.
train = (
    user_article_basic
    .join(user_profile_hbase_schema, on=['user_id'], how='left')
    .drop('channel_id')
)


In [10]:
train.show()

+-------------------+----------+-------+--------+------+-------+
|            user_id|article_id|clicked|birthday|gender|weights|
+-------------------+----------+-------+--------+------+-------+
|1106473203766657024|     16005|  false|    null|  null|   null|
|1106473203766657024|     17665|  false|    null|  null|   null|
|1106473203766657024|     44664|  false|    null|  null|   null|
|1106473203766657024|     44386|  false|    null|  null|   null|
|1106473203766657024|     13778|  false|    null|  null|   null|
|1106473203766657024|     13039|  false|    null|  null|   null|
|1106473203766657024|     13648|  false|    null|  null|   null|
|1106473203766657024|     17304|  false|    null|  null|   null|
|1106473203766657024|     19233|  false|    null|  null|   null|
|1106473203766657024|     44466|  false|    null|  null|   null|
|1106473203766657024|     18795|  false|    null|  null|   null|
|1106473203766657024|    134812|  false|    null|  null|   null|
|1106473203766657024|    

In [9]:
ctr.spark.sql("use article")
article_vector = ctr.spark.sql("select * from article_vector")

In [12]:
article_vector.show()

+----------+----------+--------------------+
|article_id|channel_id|       articlevector|
+----------+----------+--------------------+
|        26|        17|[0.02069368539384...|
|        29|        17|[-0.1446092289546...|
|       474|        17|[0.17293323921293...|
|      1677|        17|[-0.1303829028565...|
|      1806|        17|[0.02166337053188...|
|      1950|        17|[-0.3318378543653...|
|      2040|        17|[-0.0164312324191...|
|      2529|        17|[0.02575729180313...|
|      3506|        17|[0.08157531127196...|
|     38543|        17|[-0.3340523649251...|
|     39104|        17|[-0.1363798526910...|
|     40557|        17|[-0.1039882155372...|
|     41895|        17|[-0.0438782209959...|
|     74783|        17|[-0.0667113812378...|
|     75264|        17|[-0.0319393678308...|
|     75465|        17|[-0.0328539103164...|
|     76584|        17|[0.19926537834339...|
|     77605|        17|[0.12450708812808...|
|     78365|        17|[0.09693023461912...|
|     7859

In [10]:
train = train.join(article_vector, on=['article_id'], how='left').drop('birthday').drop('gender')


In [14]:
train.show()

+-------------------+-------------------+-------+-------+----------+--------------------+
|         article_id|            user_id|clicked|weights|channel_id|       articlevector|
+-------------------+-------------------+-------+-------+----------+--------------------+
|              13401|                 10|  false|   null|        18|[0.06157120217893...|
|              13401|1106396183141548032|  false|   null|        18|[0.06157120217893...|
|              14805|1106473203766657024|  false|   null|        18|[0.11028526511434...|
|              14805|1103195673450250240|  false|   null|        18|[0.11028526511434...|
|              14805|1105045287866466304|  false|   null|        18|[0.11028526511434...|
|              14805|1111524501104885760|  false|   null|        18|[0.11028526511434...|
|              14805|1105105185656537088|  false|   null|        18|[0.11028526511434...|
|              14805|                  1|  false|   null|        18|[0.11028526511434...|
|         

In [11]:
ctr.spark.sql("use article")
article_profile = ctr.spark.sql("select * from article_profile")

def article_profile_to_feature(row):
    """Build a fixed-length keyword-weight feature for one article profile row.

    Args:
        row: object exposing ``article_id`` and ``keywords`` (a mapping of
            keyword -> weight, possibly ``None``).

    Returns:
        ``(article_id, weights)`` where ``weights`` is always a list of exactly
        10 numbers: the first 10 keyword weights in ascending order, padded
        with ``0.0`` when the article has fewer than 10 keywords. When the
        keywords are missing or unusable the list is all zeros.
    """
    feature_size = 10

    try:
        # NOTE(review): ascending sort keeps the 10 *smallest* weights, as the
        # original did; confirm whether the largest weights were intended.
        weights = sorted(row.keywords.values())[:feature_size]
    except (AttributeError, TypeError):
        # keywords is None / not a mapping, or its values cannot be ordered.
        weights = []

    # Pad so every article yields a vector of the same length; downstream
    # vector assembly needs a consistent feature dimension.
    weights = weights + [0.0] * (feature_size - len(weights))
    return row.article_id, weights
article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(['article_id', 'article_weights'])

article_profile.show()

train = train.join(article_profile, on=['article_id'], how='left')

+----------+--------------------+
|article_id|     article_weights|
+----------+--------------------+
|        26|[0.19827163395829...|
|        29|[0.26031398249056...|
|       474|[0.49818598558926...|
|       964|[0.42194661121527...|
|      1677|[0.19827339246090...|
|      1697|[0.25105539265038...|
|      1806|[0.18449119772340...|
|      1950|[0.33331407122173...|
|      2040|[0.38583431341698...|
|      2214|[0.43761156267670...|
|      2250|[0.46477621366740...|
|      2453|[0.50514620188273...|
|      2509|[0.15138306650944...|
|      2529|[0.11634963900866...|
|      2927|[0.28513034617795...|
|      3091|[0.23478830492918...|
|      3506|[0.22844780420769...|
|      3764|[0.27265314149033...|
|      4590|[0.40296288036812...|
|      4823|[0.21729897161021...|
+----------+--------------------+
only showing top 20 rows



In [20]:
train.show()

+-------------------+-------------------+-------+-------+----------+--------------------+--------------------+
|         article_id|            user_id|clicked|weights|channel_id|       articlevector|     article_weights|
+-------------------+-------------------+-------+-------+----------+--------------------+--------------------+
|              13401|                 10|  false|   null|        18|[0.06157120217893...|[0.08196639249252...|
|              13401|1106396183141548032|  false|   null|        18|[0.06157120217893...|[0.08196639249252...|
|              14805|1106473203766657024|  false|   null|        18|[0.11028526511434...|[0.15069781969741...|
|              14805|1103195673450250240|  false|   null|        18|[0.11028526511434...|[0.15069781969741...|
|              14805|1105045287866466304|  false|   null|        18|[0.11028526511434...|[0.15069781969741...|
|              14805|1111524501104885760|  false|   null|        18|[0.11028526511434...|[0.15069781969741...|
|

In [13]:
columns = ['article_id', 'user_id', 'channel_id', 'articlevector', 'user_weights', 'article_weights', 'clicked']
def get_user_weights(row):
    """Convert one joined training row into the tuple of model input columns.

    The user's profile map (column ``weights``, keys of the form
    ``"<channel_id>:<keyword>"``) is reduced to the weights whose key belongs
    to the row's channel. The result is sorted ascending, cut to 10 entries and
    zero-padded to exactly 10 so that every row yields vectors of equal length.

    Args:
        row: joined row exposing ``article_id``, ``user_id``, ``channel_id``,
            ``articlevector``, ``weights``, ``article_weights`` and ``clicked``.

    Returns:
        ``(article_id, user_id, channel_id, articlevector, user_weights,
        article_weights, clicked)`` with the three vector fields as dense
        vectors and ``clicked`` as ``int``.
    """
    from pyspark.ml.linalg import Vectors

    feature_size = 10
    channel_prefix = str(row.channel_id)

    # The profile map lives in the `weights` column of the joined frame. The
    # previous code read `row.article_partial`, which does not exist there, and
    # the resulting AttributeError was swallowed, so every user vector was zeros.
    profile = row.weights
    if profile:
        matched = [
            weight for key, weight in profile.items()
            if weight is not None and key.split(':')[0] == channel_prefix
        ]
        # NOTE(review): ascending sort keeps the smallest weights, as before;
        # confirm whether the largest weights were intended.
        user_weights = sorted(matched)[:feature_size]
    else:
        # No profile for this user (left join produced null): all-zero feature.
        user_weights = []

    # Pad to a fixed length; fewer than 10 matches must not shorten the vector.
    user_weights = user_weights + [0.0] * (feature_size - len(user_weights))

    return (
        row.article_id,
        row.user_id,
        row.channel_id,
        Vectors.dense(row.articlevector),
        Vectors.dense(user_weights),
        Vectors.dense(row.article_weights),
        int(row.clicked),
    )

train_vector = train.rdd.map(get_user_weights).toDF(columns)

In [14]:
train_vector1 = train_vector.dropna()

In [None]:
columns[2:6]

In [15]:
train_version_two = VectorAssembler().setInputCols(columns[2:6]).setOutputCol("features").transform(train_vector1)

In [16]:
train_version_two.show()

+----------+-------------------+----------+--------------------+--------------------+--------------------+-------+--------------------+
|article_id|            user_id|channel_id|       articlevector|        user_weights|     article_weights|clicked|            features|
+----------+-------------------+----------+--------------------+--------------------+--------------------+-------+--------------------+
|     13401|                 10|        18|[0.06157120217893...|[0.0,0.0,0.0,0.0,...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     13401|1106396183141548032|        18|[0.06157120217893...|[0.0,0.0,0.0,0.0,...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     14805|1106473203766657024|        18|[0.11028526511434...|[0.0,0.0,0.0,0.0,...|[0.15069781969741...|      0|[18.0,0.110285265...|
|     14805|1103195673450250240|        18|[0.11028526511434...|[0.0,0.0,0.0,0.0,...|[0.15069781969741...|      0|[18.0,0.110285265...|
|     14805|1105045287866466304|        18|[0.11

In [17]:
# Save to a TFRecords file
# Keep only the identifiers, the label and the assembled feature vector.
df = train_version_two.select(['user_id', 'article_id', 'clicked', 'features'])
# collect() returns all selected rows to the driver as a Python list.
df_array = df.collect()
import pandas as pd
# NOTE: `df` is rebound here from the Spark DataFrame to a pandas DataFrame built
# from the collected rows; later cells address its columns by position.
df = pd.DataFrame(df_array)

In [18]:
df

Unnamed: 0,0,1,2,3
0,10,13401,0,"[18.0, 0.061571202178931625, 0.035721198358704..."
1,1106396183141548032,13401,0,"[18.0, 0.061571202178931625, 0.035721198358704..."
2,1106473203766657024,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
3,1103195673450250240,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
4,1105045287866466304,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
5,1111524501104885760,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
6,1105105185656537088,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
7,1,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
8,10,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
9,1112727762809913344,14805,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."


In [19]:
import tensorflow as tf

def write_to_tfrecords(click_batch, feature_batch, path="./train_ctr_20200519.tfrecords"):
    """Write the samples built from the user/article click log to a TFRecords file.

    Each sample becomes one ``tf.train.Example`` with two features:
    ``"label"`` (int64 click flag) and ``"feature"`` (the raw bytes of the
    feature vector, which a reader has to decode with the vector's dtype).

    Args:
        click_batch: iterable of click labels, one per sample.
        feature_batch: iterable of feature vectors aligned with ``click_batch``;
            each item must provide ``tobytes()`` (NumPy array interface).
        path: output file; defaults to the location that was previously
            hard-coded, so existing two-argument calls behave as before.
    """
    # 1. Open the TFRecords writer; the context manager closes the file even
    #    when building or writing a sample raises.
    with tf.python_io.TFRecordWriter(path) as writer:
        # 2. Wrap every sample in an Example and append it to the file.
        #    Iterating with zip is positional, so a pandas Series with a
        #    non-default index works as well as a list.
        for click, feature in zip(click_batch, feature_batch):
            # tobytes() replaces the deprecated ndarray.tostring(); same bytes.
            feature_bytes = feature.tobytes()

            example = tf.train.Example(features=tf.train.Features(feature={
                "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[int(click)])),
                "feature": tf.train.Feature(bytes_list=tf.train.BytesList(value=[feature_bytes]))
            }))

            # Serialize the Example and write it to the file.
            writer.write(example.SerializeToString())

# Write all samples: by position, column 2 of `df` is the click label and
# column 3 the assembled feature vector.
# The former tf.Session / Coordinator / start_queue_runners wrapper is dropped:
# write_to_tfrecords writes the file directly from Python, no queue-based input
# pipeline is defined before this point, and start_queue_runners only produced
# a deprecation warning.
write_to_tfrecords(df.iloc[:, 2], df.iloc[:, 3])

Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [35]:
import tensorflow as tf

In [36]:
# Dataset sliced from one [4, 10] float tensor: every element has shape (10,).
dataset1 = tf.data.Dataset.from_tensor_slices(
    tf.random_uniform(shape=[4, 10])
)
print(dataset1.output_types)
print(dataset1.output_shapes)

# Dataset sliced from a pair of tensors: every element is a
# (float scalar, int32 vector of length 100) tuple.
dataset2 = tf.data.Dataset.from_tensor_slices((
    tf.random_uniform(shape=[4]),
    tf.random_uniform(shape=[4, 100], maxval=100, dtype=tf.int32),
))
print(dataset2.output_types)
print(dataset2.output_shapes)

<dtype: 'float32'>
(10,)
(tf.float32, tf.int32)
(TensorShape([]), TensorShape([Dimension(100)]))
