In [1]:
# default_exp algo.rs.match.deepmatch

%reload_ext autoreload
%autoreload 2

# DeepMatch
https://zhuanlan.zhihu.com/p/126282487

随着深度学习技术的普及，越来越多的深度学习算法被应用到了工业界中。笔者自去年毕业进入企业后，有幸参与了某新业务的推荐系统搭建以及用户体验和业务指标的优化当中，其中在召回部分也进行过一些基于向量召回的探索并取得了一些收益。

之前在读研期间出于个人兴趣开发过一个基于深度学习的点击率预测算法库[DeepCTR](https://github.com/shenweichen/DeepCTR)，随着时间的迭代得到了一些同学的支持和认可，自己也亲身使用到了里面的算法应用到了自己的业务当中并取得了显著的收益。

相比于排序中各种点击率预估模型，自己对于召回模块的了解还有很多欠缺，借着这个机会，抱着学习的心态，和几位热心的优秀小伙伴一起做了DeepMatch这个项目，希望它能够帮助到大家！

https://github.com/shenweichen/DeepMatch


下面简单介绍一下如何安装和使用 DeepMatch。
## 安装和使用

In [2]:
# !pip install -U deepmatch
!pip freeze | grep deepmatch

deepmatch==0.1.3


# 示例1: YoutubeDNN-ml_1m
https://github.com/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_YoutubeDNN.ipynb
    
下面以大家比较熟悉的YoutubeDNN为例子，给大家介绍如何使用deepmatch进行召回模型的训练，用户和物品向量的导出，以及使用faiss进行近似最近邻搜索。

整段代码不到100行，可以说是非常方便进行学习和使用了～

__运行环境 tf = 1.14.0, tf2会报错!!!__

## 任务
通过用户的历史点击，来预测用户下一次点击的item_id

X = [(user_id, hist_items_list, next_item, items_list_length),]

Y = [1, 0...]  二分类

## 导入需要的库

In [3]:
import pandas as pd
from deepctr.inputs import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## preprocess

In [4]:
import random
import numpy as np
from tqdm import tqdm
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

### gen_data_set

In [5]:
def gen_data_set(data, negsample: int=0):
    """Build next-item-prediction samples from a user/item interaction log.

    Interactions are ordered by ``timestamp``. For every user, each non-empty
    proper prefix of the ordered history yields one positive sample whose
    target is the item following that prefix. The sample targeting the user's
    last interaction goes to the test set; all earlier ones go to the
    training set.

    Args:
        data: DataFrame with the columns ``user_id``, ``movie_id``, ``rating``
            and ``timestamp``. The frame is not modified.
        negsample: number of negative samples drawn for each positive
            training sample (0 disables negative sampling).

    Returns:
        ``(train_set, test_set)``: two shuffled lists of tuples.
        Positive samples are laid out as
        ``(user_id, history_most_recent_first, next_item, 1, history_length, rating)``.
        Negative samples (training set only) are laid out as
        ``(user_id, history_most_recent_first, sampled_item, 0, history_length)``
        and carry no rating field.
    """
    # Sort into a new frame so the caller's DataFrame keeps its row order.
    data = data.sort_values("timestamp")
    item_ids = data['movie_id'].unique()
    # The full item set is identical for every user, so build it only once.
    all_items = set(item_ids)

    train_set = []
    test_set = []
    for reviewerID, user_rows in tqdm(data.groupby('user_id')):
        pos_list = user_rows['movie_id'].tolist()  # items the user interacted with, oldest first
        rating_list = user_rows['rating'].tolist()  # ratings aligned with pos_list

        if negsample > 0:
            # Negatives are drawn with replacement from the items the user never rated.
            candidate_set = list(all_items - set(pos_list))
            neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True)
        for i in range(1, len(pos_list)):
            # History preceding item i, most recent interaction first.
            recent_first = pos_list[:i][::-1]
            if i != len(pos_list) - 1:
                train_set.append((reviewerID, recent_first, pos_list[i], 1, len(recent_first), rating_list[i]))
                for negi in range(negsample):
                    train_set.append((reviewerID, recent_first, neg_list[i*negsample+negi], 0, len(recent_first)))
            else:
                # The last interaction of every user is held out for testing.
                test_set.append((reviewerID, recent_first, pos_list[i], 1, len(recent_first), rating_list[i]))

    random.shuffle(train_set)
    random.shuffle(test_set)

    # Report the tuple width of one sample per split; 0 when a split is empty
    # (indexing [0] unconditionally would raise IndexError in that case).
    print(len(train_set[0]) if train_set else 0, len(test_set[0]) if test_set else 0)

    return train_set,test_set

### gen_model_input

In [6]:
def gen_model_input(train_set,user_profile,seq_max_len):
    """Convert sample tuples into the model's feature dict and label array.

    Args:
        train_set: list of tuples laid out as
            ``(user_id, history, item_id, label, history_length, ...)``;
            only the first five fields are read.
        user_profile: DataFrame indexed by user id that provides the
            ``gender``, ``age``, ``occupation`` and ``zip`` columns.
        seq_max_len: length every history is padded / truncated to.

    Returns:
        ``(model_input, labels)`` where ``model_input`` maps feature names to
        arrays aligned with ``train_set`` and ``labels`` is the label array.
    """
    uids, histories, iids, labels, hist_lens = [], [], [], [], []
    for sample in train_set:
        uid, history, iid, label, hist_len = sample[:5]
        uids.append(uid)
        histories.append(history)
        iids.append(iid)
        labels.append(label)
        hist_lens.append(hist_len)

    uid_array = np.array(uids)

    # Histories are padded with 0 and cut at the end ('post').
    padded_histories = pad_sequences(histories, maxlen=seq_max_len, padding='post', truncating='post', value=0)

    model_input = {
        "user_id": uid_array,
        "movie_id": np.array(iids),
        "hist_movie_id": padded_histories,
        "hist_len": np.array(hist_lens),
    }

    # Attach the profile columns of every sample's user.
    profile_rows = user_profile.loc[uid_array]
    model_input.update({key: profile_rows[key].values
                        for key in ["gender", "age", "occupation", "zip"]})

    return model_input, np.array(labels)

## 读取数据

In [28]:
data_path = "../data/rs/"

# The three .dat files are '::'-delimited and have no header row. A
# multi-character separator is only handled by pandas' python parser, so it is
# requested explicitly rather than relying on the automatic fallback, which
# emits a ParserWarning on every read.
unames = ['user_id','gender','age','occupation','zip']
user = pd.read_csv(data_path+'ml_1m/users.dat',sep='::',header=None,names=unames,engine='python')
rnames = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(data_path+'ml_1m/ratings.dat',sep='::',header=None,names=rnames,engine='python')
mnames = ['movie_id','title','genres']
movies = pd.read_csv(data_path+'ml_1m/movies.dat',sep='::',header=None,names=mnames,engine='python')

# One row per rating, enriched with the movie's and the user's attributes.
# The join keys are spelled out; they are the only columns the frames share.
data = pd.merge(pd.merge(ratings,movies,on='movie_id'),user,on='user_id')#.iloc[:10000]


  after removing the cwd from sys.path.
  
  


In [29]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


## 构建特征列，训练模型，导出embedding

In [30]:
# Categorical columns of the merged frame, plus the run configuration:
# SEQ_LEN is the padded history length, negsample the number of negatives
# drawn per positive sample (0 = no negative sampling).
sparse_features = ["movie_id", "user_id",
                    "gender", "age", "occupation", "zip", ]
SEQ_LEN = 50
negsample = 0

# 1. Label-encode the sparse features; the sequence features are produced
#    afterwards with `gen_data_set` and `gen_model_input`.

features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
feature_max_idx = {}
for feature in features:
    lbe = LabelEncoder()
    # Shift the encoded ids to start at 1; gen_model_input pads histories with 0.
    data[feature] = lbe.fit_transform(data[feature]) + 1
    # Largest encoded id + 1, used later as the vocabulary size of the feature.
    feature_max_idx[feature] = data[feature].max() + 1

# One profile row per user, indexed by the encoded user id.
user_profile = (
    data[["user_id", "gender", "age", "occupation", "zip"]]
    .drop_duplicates('user_id')
    .set_index("user_id")
)

# One row per distinct (encoded) movie id.
item_profile = data[["movie_id"]].drop_duplicates('movie_id')

# Movies of every user, in the row order of `data`.
user_item_list = data.groupby("user_id")['movie_id'].apply(list)

In [31]:
user_item_list.head(2)

user_id
1    [1105, 640, 854, 3178, 2163, 1108, 1196, 2600,...
2    [1105, 2890, 2129, 1783, 1118, 1849, 1155, 126...
Name: movie_id, dtype: object

In [32]:
data.head(2)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1105,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1,1,11,1589
1,1,640,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,1,1,11,1589


In [33]:
# Build the shuffled train / test sample lists from the encoded interactions.
train_set, test_set = gen_data_set(data, negsample)

100%|██████████| 6040/6040 [00:11<00:00, 512.09it/s]


6 6


In [34]:
train_set[:1]

[(678,
  [48,
   1458,
   1103,
   822,
   3522,
   3205,
   1902,
   180,
   1293,
   1702,
   429,
   779,
   2768,
   2363,
   473,
   2415,
   529,
   2416,
   1315,
   1609,
   24,
   1738,
   1985,
   468,
   746,
   2254,
   247,
   31,
   1473,
   706,
   213,
   692,
   2128,
   2148,
   2520,
   2634,
   1646,
   2125,
   2675,
   2506,
   183,
   1395,
   537,
   1431,
   3041,
   741,
   1463,
   1279,
   2240,
   2669,
   703,
   276,
   198,
   3000,
   191,
   344,
   230,
   524,
   2347,
   504,
   539,
   1739,
   2201,
   1845,
   1926,
   1429,
   1540,
   2060,
   2808,
   2,
   633,
   1481,
   3051,
   3024,
   308,
   2905,
   3031,
   3026,
   922,
   2255,
   56,
   1554,
   1476,
   3296,
   1571,
   173,
   1224,
   1847,
   1287,
   1844,
   160,
   1988,
   805,
   397,
   2832,
   963,
   218,
   2495,
   1559,
   395,
   744,
   172,
   1903,
   1568,
   1603,
   611,
   1306,
   1308,
   712,
   1594,
   10,
   3029,
   413,
   1932,
   1576,
   314,
  

In [35]:
# Keep only the history field (index 1) of every training sample.
train_seq = [line[1] for line in train_set]

In [36]:
train_seq[:2]

[[48,
  1458,
  1103,
  822,
  3522,
  3205,
  1902,
  180,
  1293,
  1702,
  429,
  779,
  2768,
  2363,
  473,
  2415,
  529,
  2416,
  1315,
  1609,
  24,
  1738,
  1985,
  468,
  746,
  2254,
  247,
  31,
  1473,
  706,
  213,
  692,
  2128,
  2148,
  2520,
  2634,
  1646,
  2125,
  2675,
  2506,
  183,
  1395,
  537,
  1431,
  3041,
  741,
  1463,
  1279,
  2240,
  2669,
  703,
  276,
  198,
  3000,
  191,
  344,
  230,
  524,
  2347,
  504,
  539,
  1739,
  2201,
  1845,
  1926,
  1429,
  1540,
  2060,
  2808,
  2,
  633,
  1481,
  3051,
  3024,
  308,
  2905,
  3031,
  3026,
  922,
  2255,
  56,
  1554,
  1476,
  3296,
  1571,
  173,
  1224,
  1847,
  1287,
  1844,
  160,
  1988,
  805,
  397,
  2832,
  963,
  218,
  2495,
  1559,
  395,
  744,
  172,
  1903,
  1568,
  1603,
  611,
  1306,
  1308,
  712,
  1594,
  10,
  3029,
  413,
  1932,
  1576,
  314,
  757,
  992,
  1298,
  540,
  1626,
  2309,
  1733,
  484,
  495,
  993,
  2406,
  2345,
  309,
  1242,
  1944,
  342,
  208

In [37]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [38]:
# Preview the padding on two histories: each row comes out with exactly 50
# entries, using the same 'post' padding / truncation and fill value 0 that
# gen_model_input applies.
pad_sequences(train_seq[:2], maxlen=50, padding='post', truncating='post', value=0)

array([[  48, 1458, 1103,  822, 3522, 3205, 1902,  180, 1293, 1702,  429,
         779, 2768, 2363,  473, 2415,  529, 2416, 1315, 1609,   24, 1738,
        1985,  468,  746, 2254,  247,   31, 1473,  706,  213,  692, 2128,
        2148, 2520, 2634, 1646, 2125, 2675, 2506,  183, 1395,  537, 1431,
        3041,  741, 1463, 1279, 2240, 2669],
       [1102, 2684, 2286, 1197,  536, 2799,   80,  890, 2235,  978, 1744,
        3187, 1354, 1149, 1779,  576, 1890,  108, 2960, 1784, 2914, 1926,
        2050,  993,  314, 1573, 2467, 2440,  862, 2439, 2441, 1631, 1527,
        2942,  725, 1100, 1033, 1144, 2653, 2751, 2154, 1540,  320, 2099,
        1273, 1276, 3458, 1277, 1168, 2977]], dtype=int32)

In [39]:
# Turn both sample lists into (feature dict, label array) pairs, padding every
# history to SEQ_LEN.
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

In [40]:
train_model_input

{'user_id': array([ 678, 3705,  186, ..., 2459,  245, 4523]),
 'movie_id': array([2311,  800, 2496, ..., 1849, 2711,  860]),
 'hist_movie_id': array([[  48, 1458, 1103, ..., 1279, 2240, 2669],
        [1102, 2684, 2286, ..., 1277, 1168, 2977],
        [2789,  705, 3494, ..., 1121, 2172,    0],
        ...,
        [1486,   50,  310, ...,    0,    0,    0],
        [1166, 3592, 2657, ..., 2182, 2735, 2485],
        [2100,  853, 3239, ...,   75, 3560, 2221]], dtype=int32),
 'hist_len': array([736, 714,  49, ...,  27, 375, 385]),
 'gender': array([2, 2, 2, ..., 2, 2, 2]),
 'age': array([3, 5, 2, ..., 6, 4, 2]),
 'occupation': array([ 1,  8,  6, ...,  8, 17,  3]),
 'zip': array([1258, 1072, 2875, ..., 1453, 2204, 1878])}

In [41]:
SparseFeat('user_id', feature_max_idx['user_id'], 16)

SparseFeat(name='user_id', vocabulary_size=6041, embedding_dim=16, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group')

In [42]:
# 2. Declare the feature columns of the user side and of the item side, using
#    the per-feature vocabulary sizes collected in `feature_max_idx`.

embedding_dim = 32

# User profile features, each declared with an embedding size of 16.
user_feature_columns = [
    SparseFeat(name, feature_max_idx[name], 16)
    for name in ('user_id', "gender", "age", "occupation", "zip")
]
# Watch history of at most SEQ_LEN movies.
# NOTE(review): embedding_name="movie_id" presumably makes the history share
# the item embedding table, and 'mean' / 'hist_len' look like the pooling mode
# and the name of the length input -- confirm against the deepctr API.
user_feature_columns.append(
    VarLenSparseFeat(
        SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                   embedding_name="movie_id"),
        SEQ_LEN, 'mean', 'hist_len',
    )
)

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

## Define Model and train
召回模型的网络结构
![](img/rma01.png)

In [43]:
# 3. Define the model and compile it for training.

# Set the Keras backend learning-phase flag to True before the model is built.
K.set_learning_phase(True)

# The last hidden layer has `embedding_dim` units, matching the item embedding
# size declared in item_feature_columns.
# NOTE(review): num_sampled is presumably the number of classes sampled by the
# sampled-softmax loss -- confirm against the deepmatch documentation.
model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=200, user_dnn_hidden_units=(128,64, embedding_dim))
# Alternative model (MIND), kept for reference:
# model = MIND(user_feature_columns,item_feature_columns,dynamic_k=True,p=1,k_max=2,num_sampled=5,user_dnn_hidden_units=(64,16),init_std=0.001)

model.compile(optimizer="adam", loss=sampledsoftmaxloss)  # "binary_crossentropy")





In [50]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_index_1 (EmbeddingInd (3707,)              0           movie_id[0][0]                   
__________________________________________________________________________________________________
hist_movie_id (InputLayer)      [(None, 50)]         0                                            
__________________________________________________________________________________________________
sparse_seq_emb_hist_movie_id (E multiple             118624      hist_movie_id[0][0]              
                                                                 embedding_index_1[0][0]    

In [45]:
# Train for 24 epochs on the whole training set (validation_split=0.0 holds
# nothing out); the value returned by fit() is kept in `history`.
history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=512, epochs=24, verbose=1, validation_split=0.0, )

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


训练完成后，在实际使用时，我们需要根据当前的用户特征实时产生用户侧向量，并对物品侧向量构建索引进行近似最近邻查找。这里由于是离线模拟，所以我们导出所有待测试用户的表示向量和所有物品的表示向量。


## Generate user features for testing and full item features for retrieval

In [46]:
# Inputs for the two embedding models: the test-set features on the user side
# and every distinct movie id on the item side.
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values,}

# The next two lines are the general deepmatch usage pattern: they build the
# user-embedding model and the item-embedding model from the trained model.
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

# Feed the corresponding data to obtain the corresponding vectors.
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

# Sanity check of the exported shapes.
print(user_embs.shape)
print(item_embs.shape)

(6040, 32)
(3706, 32)


## 使用faiss进行ANN查找并评估结果
[可选的] 已安装 faiss 库的同学，可以体验一下：将上一步导出的物品向量构建索引，然后用用户向量进行 ANN（Approximate Nearest Neighbor，近似最近邻）查找并评估效果。

In [47]:
# ! pip install faiss-cpu
!pip freeze | grep faiss

faiss-cpu==1.6.3


In [48]:
import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

# Ground truth of the evaluation: user_id -> [held-out next item id].
test_true_label = {line[0]:[line[2]] for line in test_set}

# Inner-product index over all item embeddings; the 50 best-scoring items are
# retrieved for every test user.
index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(user_embs, 50)

# Row i of `I` holds positions into the item matrix; map them back to movie ids.
movie_ids = item_profile['movie_id'].values
test_user_ids = test_user_model_input['user_id']

s = []
hit = 0
for i, uid in tqdm(enumerate(test_user_ids), total=len(test_user_ids)):
    try:
        pred = [movie_ids[x] for x in I[i]]
        recall_score = recall_N(test_true_label[uid], pred, N=50)
        s.append(recall_score)
        # test_true_label[uid] is a list, so test its items for membership:
        # `list in pred` compares the whole list against each id and is always False.
        if any(item in pred for item in test_true_label[uid]):
            hit += 1
    except Exception as err:
        # Still best-effort per user, but report the cause and let
        # KeyboardInterrupt / SystemExit propagate.
        print(i, repr(err))
print("")
print("recall", np.mean(s))
print("hit rate", hit / len(test_user_model_input['user_id']))

## result
训练了24个epoch，top50召回达到了0.29。

# nb_export

In [4]:
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted engineering_nbdev.ipynb.
Converted index.ipynb.


In [7]:
!nbdev_build_docs

No notebooks were modified
converting /Users/luoyonggui/PycharmProjects/nbdevlib/index.ipynb to README.md
