## Tensorflow2实现推荐系统双塔DNN排序

演示步骤：
1. 读取电影数据集，包括（用户信息、电影信息、评分行为信息）
2. 搭建双塔模型并训练，用到了keras函数Embedding、点积等技术
3. 模型应用1：保存模型用于在线predict预估
4. 模型应用2：导出embedding用于召回

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [2]:
tf.__version__

'2.3.0'

### 1. 读取和处理数据

In [23]:
csv_file = "./my_datas/questions_86.csv"
csv_data = pd.read_csv(csv_file, low_memory = False)#防止弹出警告
df = pd.DataFrame(csv_data)

In [24]:
df = df.drop_duplicates(subset=['userName', 'p_id'], keep='last')

In [25]:
df.head()

Unnamed: 0,mark,userName,p_id
1,Compile Error,codejesus_young,P1000
2,Compile Error,zhijieS,P1000
3,Accepted,wanghj25,P1000
4,Accepted,zhouyongqiang,P1000
6,Unaccepted,SJY001,P1000


In [26]:
df.loc[df['mark']=='Compile Error','mark'] ='Unaccepted'

In [27]:
df

Unnamed: 0,mark,userName,p_id
1,Unaccepted,codejesus_young,P1000
2,Unaccepted,zhijieS,P1000
3,Accepted,wanghj25,P1000
4,Accepted,zhouyongqiang,P1000
6,Unaccepted,SJY001,P1000
...,...,...,...
17943,Unaccepted,YMC920106840337,P1086
17944,Unaccepted,MKS阿克曼,P1086
17945,Accepted,CYC920106840413,P1086
17946,Unaccepted,SZY920106840108,P1086


In [28]:
# 读入questions详细数据
csv_file = "./my_datas/questions_data_finished.csv"
csv_data = pd.read_csv(csv_file, low_memory = False)#防止弹出警告
df_questions = pd.DataFrame(csv_data)

In [29]:
df_questions

Unnamed: 0,title,content,tag,submited,passed,passing_rate,level,p_id
0,超级玛丽游戏,超级玛丽是一个非常经典的游戏。请你用字符画的形式输出超级玛丽中的一个场景。\n ...,,503.06,193.88,0.39,入门,P1000
1,A+BProblem,\n 题目背景 强烈推荐新用户必读贴\n不熟悉算法竞赛的选手请看这里：\n...,,576.31,338.79,0.59,入门,P1001
2,[NOIP2002普及组]过河卒,棋盘上 AAA 点有一个过河卒，需要走到目标 BBB 点。卒行走的规则：可以向下、或者向右。...,NOIp普及组\n2002,211.77,64.11,0.30,普及-,P1002
3,[NOIP2011提高组]铺地毯,\n 为了准备一个独特的颁奖典礼，组织者在会场的一片矩形区域（可看做...,NOIp提高组\n2011,219.80,77.64,0.35,普及-,P1003
4,[NOIP2000提高组]方格取数,\n 设有 N×NN \times NN×N 的方格图 (N≤9)(...,NOIp提高组\n2000,62.95,30.97,0.49,普及+/提高,P1004
...,...,...,...,...,...,...,...,...
150,Peter的烟,\n Peter 有 nnn 根烟，他每吸完一根烟就把烟蒂保存起来，...,,62.90,30.09,0.48,入门,P1150
151,子数整数,\n 对于一个五位数a1a2a3a4a5a_1a_2a_3a_4a_...,,62.65,25.70,0.41,入门,P1151
152,欢乐的跳,\n 一个nnn个元素的整数数组，如果数组两个连续元素之间差的绝对值...,,49.46,17.81,0.36,入门,P1152
153,点和线,\n 平面上有一些点，你可以用直线将两点连接起来。那么有多少种方法可...,高性能,1.05,372.00,354.29,提高+/省选-,P1153


In [38]:
# 合并成一个df
df = pd.merge(df,df_questions)
df.drop(columns=["title", "content", "passed"], inplace=True)

In [39]:
df.sample(frac=1)

Unnamed: 0,mark,userName,p_id,tag,submited,passing_rate,level,mark_idx,userName_idx,p_id_idx,level_idx,submited_idx,passing_rate_idx
11312,Accepted,yangying1212,P1077,NOIp普及组\n2012,45.13,0.43,普及/提高-,1,4066,72,5,72,20
7369,Accepted,♂♂♂,P1050,NOIp普及组\n高性能\n2005,7.72,0.30,提高+/省选-,1,4922,45,4,45,2
11710,Accepted,汉关秦月,P1080,NOIp提高组\n2012,65.47,0.22,普及+/提高,1,2018,75,2,75,39
5077,Accepted,zenglanxuan,P1035,,312.17,0.40,?,1,3584,30,3,30,24
10401,Unaccepted,铁锅炖阿奈,P1070,NOIp普及组\n2009,12.00,0.36,提高+/省选-,0,5142,65,4,65,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11485,Unaccepted,miaohongxuan,P1079,NOIp提高组\n2012,28.99,0.53,普及-,0,6760,74,1,74,17
12100,Accepted,idontcal,P1083,NOIp提高组\n高性能\n2012,54.04,0.32,普及+/提高,1,7228,78,2,78,40
4702,Accepted,郭梁,P1033,NOIp提高组\n2002,14.36,0.43,普及/提高-,1,1537,28,5,28,20
7985,Accepted,gaozihan,P1055,NOIp普及组\n2008,345.62,0.25,普及-,1,5233,50,1,50,26


In [40]:
df.head(10)

Unnamed: 0,mark,userName,p_id,tag,submited,passing_rate,level,mark_idx,userName_idx,p_id_idx,level_idx,submited_idx,passing_rate_idx
0,Unaccepted,codejesus_young,P1000,,503.06,0.39,入门,0,0,0,0,0,0
1,Unaccepted,zhijieS,P1000,,503.06,0.39,入门,0,1,0,0,0,0
2,Accepted,wanghj25,P1000,,503.06,0.39,入门,1,2,0,0,0,0
3,Accepted,zhouyongqiang,P1000,,503.06,0.39,入门,1,3,0,0,0,0
4,Unaccepted,SJY001,P1000,,503.06,0.39,入门,0,4,0,0,0,0
5,Unaccepted,qwxxx,P1000,,503.06,0.39,入门,0,5,0,0,0,0
6,Accepted,suoh123,P1000,,503.06,0.39,入门,1,6,0,0,0,0
7,Accepted,一起吃饭,P1000,,503.06,0.39,入门,1,7,0,0,0,0
8,Accepted,jynf,P1000,,503.06,0.39,入门,1,8,0,0,0,0
9,Unaccepted,xccmy,P1000,,503.06,0.39,入门,0,9,0,0,0,0


#### 给列新增数字索引列

目的是：防止embedding过大

In [41]:
def add_index_column(param_df, column_name):
    values = list(param_df[column_name].unique())
    value_index_dict = {value:idx for idx,value in enumerate(values)}
    param_df[f"{column_name}_idx"] = param_df[column_name].map(value_index_dict)

In [42]:
add_index_column(df, "mark")
add_index_column(df, "userName")
add_index_column(df, "p_id")
add_index_column(df, "level")
add_index_column(df, "submited")
add_index_column(df, "passing_rate")

In [43]:
df.to_csv("./my_datas/tensorflow_question_user_markwithindex.csv", index=False)

In [46]:
df

Unnamed: 0,mark,userName,p_id,tag,submited,passing_rate,level,mark_idx,userName_idx,p_id_idx,level_idx,submited_idx,passing_rate_idx
0,Unaccepted,codejesus_young,P1000,,503.06,0.39,入门,0,0,0,0,0,0
1,Unaccepted,zhijieS,P1000,,503.06,0.39,入门,0,1,0,0,0,0
2,Accepted,wanghj25,P1000,,503.06,0.39,入门,1,2,0,0,0,0
3,Accepted,zhouyongqiang,P1000,,503.06,0.39,入门,1,3,0,0,0,0
4,Unaccepted,SJY001,P1000,,503.06,0.39,入门,0,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12657,Unaccepted,YMC920106840337,P1086,NOIp普及组\n2004,52.65,0.40,普及-,0,7454,81,1,81,24
12658,Unaccepted,MKS阿克曼,P1086,NOIp普及组\n2004,52.65,0.40,普及-,0,7455,81,1,81,24
12659,Accepted,CYC920106840413,P1086,NOIp普及组\n2004,52.65,0.40,普及-,1,5200,81,1,81,24
12660,Unaccepted,SZY920106840108,P1086,NOIp普及组\n2004,52.65,0.40,普及-,0,7456,81,1,81,24


In [47]:
num_users = df["userName_idx"].max() + 1
num_questions = df["p_id_idx"].max() + 1
num_level = df["level_idx"].max() + 1
num_submited = df["submited_idx"].max() + 1
num_passing_rate = df["passing_rate_idx"].max() + 1
num_users, num_questions,num_level,num_submited,num_passing_rate

(7457, 82, 7, 82, 41)

#### 评分的归一化

#### 构建训练数据集

In [48]:
df_sample = df.sample(frac=0.1)
X = df_sample[["userName_idx","p_id_idx","level_idx","submited_idx","passing_rate_idx"]]
y = df_sample.pop("mark_idx")

### 2. 搭建双塔模型并训练

In [49]:
num_users, num_questions,num_level,num_submited,num_passing_rate

(7457, 82, 7, 82, 41)

In [50]:
def get_model():
    """函数式API搭建双塔DNN模型"""
    
    # 输入
    user_id = keras.layers.Input(shape=(1,), name="userName_idx")
    question_id = keras.layers.Input(shape=(1,), name="p_id_idx")
    level = keras.layers.Input(shape=(1,), name="level_idx")
#     submited = keras.layers.Input(shape=(1,), name="submited_idx")
#     passing_rate = keras.layers.Input(shape=(1,), name="passing_rate_idx")
    submited = keras.layers.Input(shape=(1,), name="submited_idx")
    passing_rate = keras.layers.Input(shape=(1,), name="passing_rate_idx")
    # user 塔
    user_vector = layers.Embedding(num_users, 100)(user_id)
    user_vector = layers.Dense(32, activation='relu')(user_vector)
    user_vector = layers.Dense(8, activation='relu', 
                               name="user_embedding", kernel_regularizer='l2')(user_vector)
       # movie塔
#     question_vector = layers.Embedding(num_questions, 100)(question_id)
    question_vector = tf.keras.layers.concatenate([
        layers.Embedding(num_questions, 100)(question_id),
        layers.Embedding(num_level, 2)(level),
        layers.Embedding(num_submited, 2)(submited),
        layers.Embedding(num_passing_rate, 2)(passing_rate)
    ])
    question_vector = layers.Dense(32, activation='relu')(question_vector)
    question_vector = layers.Dense(8, activation='relu', 
                                name="question_embedding", kernel_regularizer='l2')(question_vector)

    dot_user_question = tf.reduce_sum(user_vector*question_vector, axis = 1)
    dot_user_question = tf.expand_dims(dot_user_question, 1)

    output = layers.Dense(1, activation='sigmoid')(dot_user_question)
    
    return keras.models.Model(inputs=[user_id, question_id,level,submited,passing_rate], outputs=[output]) 

In [51]:
model = get_model()
model.compile(loss=tf.keras.losses.MeanSquaredError(), 
              optimizer=keras.optimizers.RMSprop(),
              metrics=["accuracy"]
             )

In [52]:
tensorboard_cb = tf.keras.callbacks.TensorBoard("./logs_board1")

In [53]:
fit_x_train = [
        X["userName_idx"], 
        X["p_id_idx"],
        X["level_idx"],
        X["submited_idx"],
        X["passing_rate_idx"],
    ]

from datetime import datetime
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs/logs_"+TIMESTAMP)

history = model.fit(
    x=fit_x_train,
    y=y,
    batch_size=32,
    epochs=20,
    verbose=1,
    callbacks=[tensorboard_cb]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### 3. 模型的预估-predict

In [54]:
inputs = df.sample(frac=1.0)[
    ["userName_idx","p_id_idx","level_idx","submited_idx","passing_rate_idx"]].head(10)

In [55]:
# 对于（用户ID，召回的电影ID列表），计算分数
model.predict([
        inputs["userName_idx"], 
        inputs["p_id_idx"],
        inputs["level_idx"],
        inputs["submited_idx"],
        inputs["passing_rate_idx"]
    ])


array([[[0.9985496 ]],

       [[0.32629177]],

       [[0.58081704]],

       [[0.98424023]],

       [[0.60675454]],

       [[0.444464  ]],

       [[0.89549685]],

       [[0.44944432]],

       [[0.6037979 ]],

       [[0.6780196 ]]], dtype=float32)

#### 模型的保存

In [56]:
model.save("./my_datas/model_tensorflow_two_tower.h5")

In [57]:
new_model = tf.keras.models.load_model("./my_datas/model_tensorflow_two_tower.h5")

In [58]:
new_model.predict([
        inputs["userName_idx"], 
        inputs["p_id_idx"],
        inputs["level_idx"],
        inputs["submited_idx"],
        inputs["passing_rate_idx"]
    ])

array([[[0.9985496 ]],

       [[0.32629177]],

       [[0.58081704]],

       [[0.98424023]],

       [[0.60675454]],

       [[0.444464  ]],

       [[0.89549685]],

       [[0.44944432]],

       [[0.6037979 ]],

       [[0.6780196 ]]], dtype=float32)

### 4. 保存模型的embedding可用于召回

#### 得到user embedding

In [59]:
model.input

[<tf.Tensor 'userName_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'p_id_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'level_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'submited_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'passing_rate_idx_1:0' shape=(None, 1) dtype=float32>]

In [60]:
user_layer_model = keras.models.Model(
    inputs=[model.input[0]],
    outputs=model.get_layer("user_embedding").output
)

In [61]:
user_embeddings = []
for index, row in df.iterrows():
    user_id = row["userName_idx"]
    user_input = [
        np.reshape(row["userName_idx"], [1,1])
    ]
    user_embedding = np.array(user_layer_model(user_input))
#     embedding_str = ",".join([str(x) for x in user_embedding.numpy().flatten()])
    embedding_str = str(user_embedding[0][0].tolist())
    user_embeddings.append([user_id, embedding_str])

In [62]:
df_user_embedding = pd.DataFrame(user_embeddings, columns = ["user_id", "user_embedding"])
df_user_embedding

Unnamed: 0,user_id,user_embedding
0,0,"[0.017622776329517365, 0.0, 0.4034920930862427..."
1,1,"[0.017622744664549828, 0.0, 0.3997224867343902..."
2,2,"[0.01762237958610058, 0.0, 0.4941332936286926,..."
3,3,"[0.01762225851416588, 0.0, 0.484690397977829, ..."
4,4,"[0.017619172111153603, 0.0, 1.207775592803955,..."
...,...,...
12657,7454,"[0.017622843384742737, 0.0, 0.3752875924110412..."
12658,7455,"[0.01762285642325878, 0.0, 0.3630298376083374,..."
12659,5200,"[0.017623621970415115, 0.0, 0.0399009883403778..."
12660,7456,"[0.01762268878519535, 0.0, 0.4080047309398651,..."


In [63]:
output = "./my_datas/tensorflow_user_embedding.csv"
df_user_embedding.to_csv(output, index=False)

#### 得到movie embedding

In [64]:
model.input

[<tf.Tensor 'userName_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'p_id_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'level_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'submited_idx_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'passing_rate_idx_1:0' shape=(None, 1) dtype=float32>]

In [65]:
question_layer_model = keras.models.Model(
    inputs=[model.input[1],model.input[2],model.input[3],model.input[4],],
    outputs=model.get_layer("question_embedding").output
)

In [66]:
df.head()

Unnamed: 0,mark,userName,p_id,tag,submited,passing_rate,level,mark_idx,userName_idx,p_id_idx,level_idx,submited_idx,passing_rate_idx
0,Unaccepted,codejesus_young,P1000,,503.06,0.39,入门,0,0,0,0,0,0
1,Unaccepted,zhijieS,P1000,,503.06,0.39,入门,0,1,0,0,0,0
2,Accepted,wanghj25,P1000,,503.06,0.39,入门,1,2,0,0,0,0
3,Accepted,zhouyongqiang,P1000,,503.06,0.39,入门,1,3,0,0,0,0
4,Unaccepted,SJY001,P1000,,503.06,0.39,入门,0,4,0,0,0,0


In [67]:
question_embeddings = []
for index, row in df.iterrows():
    question_id = row["p_id_idx"]
    question_input = [
        np.reshape(row["p_id_idx"], [1,1]),
        np.reshape(row["level_idx"], [1,1]),
        np.reshape(row["submited_idx"], [1,1]),
        np.reshape(row["passing_rate_idx"], [1,1])
    ]
    question_embedding = np.array(question_layer_model(question_input))
#     embedding_str = ",".join([str(x) for x in question_embedding.numpy().flatten()])
    embedding_str = str(question_embedding[0][0].tolist())
    question_embeddings.append([question_id, embedding_str])

In [68]:
df_question_embedding = pd.DataFrame(question_embeddings, columns = ["question_id", "question_embedding"])
df_question_embedding = df_question_embedding.drop_duplicates()
df_question_embedding

Unnamed: 0,question_id,question_embedding
0,0,"[0.0, 0.12929199635982513, 8.098901748657227, ..."
199,1,"[0.0, 0.13007792830467224, 5.729770183563232, ..."
431,2,"[0.0, 0.12835854291915894, 4.0134711265563965,..."
545,3,"[0.0, 0.12861864268779755, 5.334175109863281, ..."
673,4,"[0.0, 0.1301136612892151, 3.415855646133423, 0..."
...,...,...
11852,77,"[0.0, 0.13020765781402588, 3.5948965549468994,..."
12062,78,"[0.0, 0.12887223064899445, 5.227313041687012, ..."
12186,79,"[0.0, 0.1308649182319641, 3.5959696769714355, ..."
12300,80,"[0.0, 0.1296006590127945, 7.632290363311768, 0..."


In [118]:
output = "./my_datas/tensorflow_question_embedding.csv"
df_question_embedding.to_csv(output, index=False)