参考文档 https://deepmatch.readthedocs.io/en/latest/Features.html
参考代码 https://github.com/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_DSSM_InBatchSoftmax.ipynb
说明：本笔记的目的是熟悉 TensorFlow 环境，因此暂不考虑代码冗余，相同的写法会重复出现几遍；公共函数的抽象留待后续再做。

# 1、处理数据

In [1]:
import pandas as pd

# Directory holding the raw `.dat` files. Each file is '::'-separated, has no
# header row and is read as ISO-8859-1; the multi-character separator requires
# pandas' python parsing engine.
data_path = "../推荐算法-双塔/ml-1m"

# users.dat -> one row per user
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
user_df = pd.read_csv(
    data_path + '/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine="python",
    encoding='iso-8859-1',
)

# ratings.dat -> one row per (user, movie) rating event
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_df = pd.read_csv(
    data_path + '/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
    encoding='iso-8859-1',
)

# movies.dat -> one row per movie
mnames = ['movie_id', 'title', 'genres']
movie_df = pd.read_csv(
    data_path + '/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine="python",
    encoding='iso-8859-1',
)

# Join users with their ratings, then attach the movie columns; both merges use
# pandas' default behaviour of joining on the columns the two frames share.
data = user_df.merge(rating_df).merge(movie_df)
data.tail(10)

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,rating,timestamp,title,genres
1000199,5334,F,56,13,46140,3382,5,960796159,Song of Freedom (1936),Drama
1000200,5420,F,1,19,14850,1843,3,960156505,Slappy and the Stinkers (1998),Children's|Comedy
1000201,5433,F,35,17,45014,286,3,960240881,Nemesis 2: Nebula (1995),Action|Sci-Fi|Thriller
1000202,5494,F,35,17,94306,3530,4,959816296,Smoking/No Smoking (1993),Comedy
1000203,5556,M,45,6,92103,2198,3,959445515,Modulations (1998),Documentary
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western
1000208,5938,M,25,1,35401,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary


In [2]:
data.shape

(1000209, 10)

# 2、构建特征序列

In [3]:
sparse_features = ['user_id','gender','age','occupation','zip','movie_id','genres']

In [4]:
from sklearn.preprocessing import LabelEncoder
# Background on LabelEncoder:
# https://zhuanlan.zhihu.com/p/33569866
# Note: `genres` is encoded as one opaque label per distinct string; the
# individual genres inside a multi-genre value are not split apart.
# feature_max_idx maps each sparse feature to (largest encoded id + 1); later
# cells pass it as the vocabulary size of the feature columns.
feature_max_idx = {}
for feature in sparse_features:
    lbe = LabelEncoder()
    # Shift encoded ids by one so they start at 1 (sequence padding later uses value 0).
    data[feature] = lbe.fit_transform(data[feature]) + 1
    feature_max_idx[feature] = data[feature].max() + 1
data.tail(3)

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,rating,timestamp,title,genres
1000206,5780,2,2,18,2999,2639,1,958153068,White Boys (1999),240
1000207,5851,1,2,21,1906,3368,5,957756608,One Little Indian (1973),192
1000208,5938,2,3,2,1266,2703,4,957273353,"Five Wives, Three Secretaries and Me (1998)",236


In [None]:
#zip() 函数是 Python 内置函数之一，它可以将多个序列（列表、元组、字典、集合、字符串以及 range() 区间
#构成的列表）“压缩”成一个 zip 对象。所谓“压缩”，其实就是将这些序列中对应位置的元素重新组合，生成一个个新的元组。
#http://c.biancheng.net/view/2237.html

from tqdm import tqdm,trange
import numpy as np
import random
def gen_data_set(data, seq_max_len=50, negsample=0):
    """Build per-user training and test samples from the interaction table.

    The frame is sorted by ``timestamp`` **in place** (the caller sees the
    sorted frame). Each user's interactions are then walked in that order: for
    every position ``i >= 1`` one positive sample is emitted whose history is
    the (at most ``seq_max_len``) most recent items before ``i``, newest first.
    The sample at the user's last position goes to the test set, every other
    one to the train set.

    Args:
        data: DataFrame with columns ``user_id``, ``movie_id``, ``genres``,
            ``rating`` and ``timestamp``.
        seq_max_len: maximum number of history items kept per sample.
        negsample: number of negatives added per positive training sample;
            ``0`` disables explicit negative sampling.

    Returns:
        ``(train_set, test_set)``: two shuffled lists of tuples
        ``(user_id, movie_id, label, hist_movie_ids, hist_len, hist_genres,
        genres, rating)``. Negative samples carry label ``0`` and have no
        trailing ``rating`` element (7 fields instead of 8).
    """
    train_set = []
    test_set = []

    # Chronological order is required so that "history" means "earlier interactions".
    data.sort_values('timestamp', inplace=True)
    item_ids = data['movie_id'].unique()
    # Built once instead of once per user; only needed when negatives are sampled.
    all_item_ids = set(item_ids) if negsample > 0 else None
    # movie_id -> genres id, used to attach a genres value to sampled negatives.
    item_id_genres_map = dict(zip(data['movie_id'].values, data['genres'].values))

    for reviewerID, user_rows in data.groupby('user_id'):
        pos_list = user_rows['movie_id'].tolist()
        genres_list = user_rows['genres'].tolist()
        rating_list = user_rows['rating'].tolist()

        # Negatives are drawn from the items this user never interacted with.
        if negsample > 0:
            candidate_set = list(all_item_ids - set(pos_list))
            neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample)

        last_index = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            seq_len = min(i, seq_max_len)
            # seq[:i] is everything before position i, [::-1] reverses it (newest
            # first) and [:seq_len] keeps only the seq_len most recent entries.
            recent_items = pos_list[:i][::-1][:seq_len]
            recent_genres = genres_list[:i][::-1][:seq_len]

            if i != last_index:
                train_set.append((reviewerID, pos_list[i], 1,
                                  recent_items, seq_len, recent_genres,
                                  genres_list[i], rating_list[i]))
                for negi in range(negsample):
                    neg_item = neg_list[i * negsample + negi]
                    train_set.append((reviewerID, neg_item, 0,
                                      recent_items, seq_len, recent_genres,
                                      item_id_genres_map[neg_item]))
            else:
                # The last interaction of every user is held out for testing.
                test_set.append((reviewerID, pos_list[i], 1,
                                 recent_items, seq_len, recent_genres,
                                 genres_list[i], rating_list[i]))

    random.shuffle(train_set)
    random.shuffle(test_set)
    # Debug output: number of fields per sample tuple. Guarded so that an empty
    # split (e.g. every user has at most two interactions) no longer raises IndexError.
    print(len(train_set[0]) if train_set else 0,
          len(test_set[0]) if test_set else 0)

    return train_set, test_set
    
# Maximum history length per sample; also used later as the padded sequence length.
SEQ_LEN = 50
# Set to 0 here: no explicit negatives are generated, in-batch negative sampling is used instead.
negsample = 0
train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)

In [None]:
len(train_set)

In [None]:
# One row per user with the user-side features, and one row per movie with its genres id.
user_profile = data[['user_id','gender','age','occupation','zip']].drop_duplicates('user_id')
item_profile  = data[['movie_id','genres']].drop_duplicates('movie_id')
# inplace=True modifies user_profile directly instead of returning a new object.
user_profile.set_index('user_id', inplace=True)
#user_profile.index
# Series indexed by user_id whose values are the lists of that user's movie ids.
user_item_list = data.groupby('user_id')['movie_id'].apply(list)
user_item_list.head()

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def gen_model_input(train_set, user_profile, seq_max_len):
    """Turn sample tuples into the feature dict and label array fed to the model.

    Args:
        train_set: list of sample tuples as produced by ``gen_data_set``; only
            the first seven fields (user id, item id, label, item history,
            history length, genres history, item genres) are read.
        user_profile: DataFrame indexed by ``user_id`` with the columns
            ``gender``, ``age``, ``occupation`` and ``zip``.
        seq_max_len: length every history sequence is padded/truncated to.

    Returns:
        ``(model_input, labels)`` where ``model_input`` maps feature name to a
        numpy array and ``labels`` is the array of sample labels.
    """
    def field(position):
        # Collect the entry at `position` from every sample tuple.
        return [sample[position] for sample in train_set]

    user_ids = np.array(field(0))
    # Target movie id of each sample.
    item_ids = np.array(field(1))
    labels = np.array(field(2))
    item_history = field(3)
    history_lengths = np.array(field(4))
    genres_history = np.array(field(5), dtype=list)
    item_genres = np.array(field(6))

    # Histories are stored newest-first, so padding and truncation both happen
    # at the tail ('post'); 0 is the padding id.
    pad_options = dict(maxlen=seq_max_len, padding='post', truncating='post', value=0)
    item_history_padded = pad_sequences(item_history, **pad_options)
    genres_history_padded = pad_sequences(genres_history, **pad_options)

    model_input = {
        "user_id": user_ids,
        "movie_id": item_ids,
        "hist_movie_id": item_history_padded,
        "hist_genres": genres_history_padded,
        "hist_len": history_lengths,
        "genres": item_genres,
    }

    # Attach the per-user profile features, looked up by the sample's user id.
    profile_rows = user_profile.loc[model_input['user_id']]
    for column in ("gender", "age", "occupation", "zip"):
        model_input[column] = profile_rows[column].values

    return model_input, labels
    
# Feature dicts and label arrays for the train and test splits.
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)


In [None]:
type(train_set[0][5])

In [None]:
train_model_input['user_id'][:4]

# 3、count #unique features for each sparse field and generate feature config for sequence feature

In [None]:
#https://blog.csdn.net/liboshi123/article/details/110550681
#namedtuple()是产生具有命名字段的元组的工厂函数，namedtuple 比普通tuple具有更好的可读性，可以使代码更易于维护。同时与字典相比，又更加的轻量和高效。
#namedtuple(typename, field_names, *, rename=False, defaults=None, module=None)
#返回一个新类，名为typename

#https://blog.csdn.net/Florine113/article/details/120988102
#cls在python中表示类本身，self为类的一个实例。
#cls可以返回类的一个实例。
from collections import namedtuple
import tensorflow as tf

DEFAULT_GROUP_NAME = "default_group"


class SparseFeat(namedtuple(
        'SparseFeat',
        ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype',
         'embeddings_initializer', 'embedding_name', 'group_name', 'trainable'])):
    """Immutable description of a single-valued categorical feature.

    Args:
        name: feature name.
        vocabulary_size: number of distinct ids.
        embedding_dim: embedding width, or ``"auto"`` to derive it as
            ``6 * int(vocabulary_size ** 0.25)``.
        use_hash: stored as given.
        vocabulary_path: stored as given.
        dtype: dtype string of the feature.
        embeddings_initializer: initializer for the embedding; a
            ``RandomNormal(mean=0.0, stddev=0.0001, seed=2020)`` is used when
            ``None``.
        embedding_name: name of the embedding to use; defaults to ``name``.
        group_name: stored as given, defaults to ``DEFAULT_GROUP_NAME``.
        trainable: stored as given.
    """
    __slots__ = ()

    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None,
                dtype="int32", embeddings_initializer=None, embedding_name=None,
                group_name=DEFAULT_GROUP_NAME, trainable=True):
        # Resolve the three "fill in a default" arguments before building the tuple.
        if embedding_dim == "auto":
            embedding_dim = 6 * int(vocabulary_size ** 0.25)

        if embeddings_initializer is None:
            embeddings_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.0001, seed=2020)

        if embedding_name is None:
            # A feature uses an embedding named after itself unless told otherwise.
            embedding_name = name

        return super().__new__(
            cls,
            name=name,
            vocabulary_size=vocabulary_size,
            embedding_dim=embedding_dim,
            use_hash=use_hash,
            vocabulary_path=vocabulary_path,
            dtype=dtype,
            embeddings_initializer=embeddings_initializer,
            embedding_name=embedding_name,
            group_name=group_name,
            trainable=trainable,
        )
SparseFeat('user_id', feature_max_idx['user_id'], 16)


In [None]:
feature_max_idx

In [None]:
class VarLenSparseFeat(namedtuple(
        'VarLenSparseFeat',
        ['sparsefeat', 'maxlen', 'combiner', 'length_name', 'weight_name', 'weight_norm'])):
    """Immutable description of a variable-length (sequence / multi-value) feature.

    Wraps a ``sparsefeat`` describing the ids in the sequence and adds the
    sequence-specific settings. Every attribute of the wrapped feature is
    exposed read-only under the same name.

    Args:
        sparsefeat: the wrapped feature description.
        maxlen: sequence length.
        combiner: pooling mode name, defaults to ``"mean"``.
        length_name: name of the input carrying the valid length, or ``None``.
        weight_name: name of the input carrying per-position weights, or ``None``.
        weight_norm: whether the weights are normalized, defaults to ``True``.
    """
    __slots__ = ()

    def __new__(cls, sparsefeat, maxlen, combiner="mean", length_name=None, weight_name=None, weight_norm=True):
        return super().__new__(
            cls,
            sparsefeat=sparsefeat,
            maxlen=maxlen,
            combiner=combiner,
            length_name=length_name,
            weight_name=weight_name,
            weight_norm=weight_norm,
        )

    # --- read-only views onto the wrapped sparse feature ---

    @property
    def name(self):
        """Name of the wrapped feature."""
        return self.sparsefeat.name

    @property
    def vocabulary_size(self):
        """Vocabulary size of the wrapped feature."""
        return self.sparsefeat.vocabulary_size

    @property
    def embedding_dim(self):
        """Embedding width of the wrapped feature."""
        return self.sparsefeat.embedding_dim

    @property
    def use_hash(self):
        """``use_hash`` flag of the wrapped feature."""
        return self.sparsefeat.use_hash

    @property
    def vocabulary_path(self):
        """``vocabulary_path`` of the wrapped feature."""
        return self.sparsefeat.vocabulary_path

    @property
    def dtype(self):
        """dtype of the wrapped feature."""
        return self.sparsefeat.dtype

    @property
    def embeddings_initializer(self):
        """Embedding initializer of the wrapped feature."""
        return self.sparsefeat.embeddings_initializer

    @property
    def embedding_name(self):
        """Embedding name of the wrapped feature."""
        return self.sparsefeat.embedding_name

    @property
    def group_name(self):
        """Group name of the wrapped feature."""
        return self.sparsefeat.group_name

    @property
    def trainable(self):
        """``trainable`` flag of the wrapped feature."""
        return self.sparsefeat.trainable

    def __hash__(self):
        # Hash on the feature name only.
        return hash(self.name)
    
VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], 32,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len')

In [None]:
class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype', 'transform_fn'])):
    """Immutable description of a dense (numeric) feature.

    Args:
        name: feature name.
        dimension: dimension of the feature, default = 1.
        dtype: dtype of the feature, default="float32".
        transform_fn: if not ``None``, a function applied to the feature's
            input tensor; it takes the tensor as its only argument and returns
            the transformed tensor (e.g. ``lambda x: (x - 3.0) / 4.2``).
    """
    __slots__ = ()

    def __new__(cls, name, dimension=1, dtype="float32", transform_fn=None):
        return super().__new__(cls, name=name, dimension=dimension, dtype=dtype, transform_fn=transform_fn)

    def __hash__(self):
        # Hash on the feature name only.
        return hash(self.name)

In [None]:
# Embedding width used for the movie_id / genres embeddings (item features and
# the matching history sequences); the user profile features use width 16.
embedding_dim = 32

# User-side inputs: profile features plus the two history sequences. The
# sequence features name the "movie_id" / "genres" embeddings via embedding_name
# and use 'hist_len' as their length input with mean pooling.
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                        SparseFeat("gender", feature_max_idx['gender'], 16),
                        SparseFeat("age", feature_max_idx['age'], 16),
                        SparseFeat("occupation", feature_max_idx['occupation'], 16),
                        SparseFeat("zip", feature_max_idx['zip'], 16),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]

# Item-side inputs: the target movie id and its genres id.
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),
                        SparseFeat('genres', feature_max_idx['genres'], embedding_dim)
                       ]
user_feature_columns

In [None]:
from collections import Counter 
# Counter is a dict subclass used for counting.
# https://blog.csdn.net/qq_29678299/article/details/89975667
train_counter = Counter(train_model_input['movie_id'])
# train_counter maps each target movie_id to its number of training samples,
# e.g. {525: 570, 427: 1135, 1505: 1363, ...}
# train_counter.get(525, 0) -> 570
# item_count[i] is the sample count for movie id i (0 for ids that never occur),
# for every i in range(vocabulary_size) of the movie_id feature column.
item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]


In [None]:
class NegativeSampler(
        namedtuple('NegativeSampler', ['sampler', 'num_sampled', 'item_name', 'item_count', 'distortion'])):
    """Immutable negative-sampling configuration.

    Args:
        sampler: sampler name, one of 'inbatch', 'uniform', 'frequency', 'adaptive'.
        num_sampled: number of negative samples per positive sample.
        item_name: pkey of item features.
        item_count: global frequency of item; required for the 'inbatch' and
            'frequency' samplers.
        distortion: skew factor of the unigram probability distribution.

    Raises:
        ValueError: if ``sampler`` is not supported, or if ``item_count`` is
            ``None`` for a sampler that requires it.
    """
    __slots__ = ()

    def __new__(cls, sampler, num_sampled, item_name, item_count=None, distortion=1.0, ):
        supported_samplers = ('inbatch', 'uniform', 'frequency', 'adaptive')
        frequency_based_samplers = ('inbatch', 'frequency')

        if sampler not in supported_samplers:
            raise ValueError(' `%s` sampler is not supported ' % sampler)
        if item_count is None and sampler in frequency_based_samplers:
            raise ValueError(' `item_count` must not be `None` when using `inbatch` or `frequency` sampler')

        return super().__new__(
            cls,
            sampler=sampler,
            num_sampled=num_sampled,
            item_name=item_name,
            item_count=item_count,
            distortion=distortion,
        )

In [None]:
# Note: item_count here is the per-movie-id frequency list built in the cell above.
sampler_config = NegativeSampler('inbatch',num_sampled=255,item_name="movie_id",item_count=item_count)

# 4、搭建模型

Embedding 使用方法, tf.keras.layers.Embedding
tf.keras.layers.Embedding(
    input_dim,
    output_dim,
    embeddings_initializer='uniform',
    embeddings_regularizer=None,
    activity_regularizer=None,
    embeddings_constraint=None,
    mask_zero=False,
    input_length=None,
    **kwargs
)

In [None]:
#python中OrderedDict用法， 有序的字典
import collections
def build_input_features(feature_columns, prefix=''):
    """Create the Keras ``Input`` placeholders implied by the feature columns.

    Args:
        feature_columns: iterable of ``SparseFeat`` / ``DenseFeat`` /
            ``VarLenSparseFeat`` descriptions.
        prefix: string prepended to every input's layer name (the dict keys
            are not prefixed).

    Returns:
        An ``OrderedDict`` mapping input name to ``tf.keras.Input``, filled in
        feature-column order. A ``VarLenSparseFeat`` additionally contributes
        its weight input and its length input when those names are set.

    Raises:
        TypeError: if a column is none of the three supported types.
    """
    placeholders = collections.OrderedDict()

    for column in feature_columns:
        if isinstance(column, SparseFeat):
            # A single id per sample.
            placeholders[column.name] = tf.keras.Input(
                shape=(1,), name=prefix + column.name, dtype=column.dtype)
            continue

        if isinstance(column, DenseFeat):
            placeholders[column.name] = tf.keras.Input(
                shape=(column.dimension,), name=prefix + column.name, dtype=column.dtype)
            continue

        if not isinstance(column, VarLenSparseFeat):
            raise TypeError("Invalid feature column type,got", type(column))

        # Sequence of ids, plus the optional per-position weights and valid length.
        placeholders[column.name] = tf.keras.Input(
            shape=(column.maxlen,), name=prefix + column.name, dtype=column.dtype)
        if column.weight_name is not None:
            placeholders[column.weight_name] = tf.keras.Input(
                shape=(column.maxlen, 1), name=prefix + column.weight_name, dtype="float32")
        if column.length_name is not None:
            placeholders[column.length_name] = tf.keras.Input(
                (1,), name=prefix + column.length_name, dtype='int32')

    return placeholders
#测试
build_input_features(user_feature_columns)

In [None]:
#表达式中使用if else,意思是if成立执行左边，否则执行右边
def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="sparse", seq_mask_zero=True):
    """Create one Keras ``Embedding`` layer per sparse / sequence feature.

    Args:
        feature_columns: feature descriptions; only ``SparseFeat`` and
            ``VarLenSparseFeat`` entries are used. May be empty or ``None``.
        l2_reg: L2 regularization factor applied to every embedding.
        seed: accepted but not used by this function.
        prefix: prefix of the generated layer names.
        seq_mask_zero: ``mask_zero`` setting of the sequence-feature embeddings.

    Returns:
        Dict mapping ``embedding_name`` to its ``Embedding`` layer. Sequence
        features are processed last, so for an ``embedding_name`` present in
        both kinds of column the sequence-feature layer is the one kept.
    """
    def new_embedding(feat, layer_name, **extra):
        # Shared construction; `extra` carries mask_zero for sequence features only.
        layer = tf.keras.layers.Embedding(
            feat.vocabulary_size,
            feat.embedding_dim,
            embeddings_initializer=feat.embeddings_initializer,
            embeddings_regularizer=tf.keras.regularizers.L2(l2_reg),
            name=layer_name,
            **extra)
        layer.trainable = feat.trainable
        return layer

    if feature_columns:
        plain_columns = [fc for fc in feature_columns if isinstance(fc, SparseFeat)]
        sequence_columns = [fc for fc in feature_columns if isinstance(fc, VarLenSparseFeat)]
    else:
        plain_columns, sequence_columns = [], []

    embeddings = {}
    for feat in plain_columns:
        embeddings[feat.embedding_name] = new_embedding(
            feat, prefix + '_emb_' + feat.embedding_name)

    # Compared with the plain features, sequence features differ in two ways:
    # the layer is named after the feature and zero ids can be masked.
    for feat in sequence_columns:
        embeddings[feat.embedding_name] = new_embedding(
            feat, prefix + '_seq_emb_' + feat.name, mask_zero=seq_mask_zero)

    return embeddings

In [None]:
# Scratch test of the padding constant used by WeightedSequenceLayer.
# NOTE(review): `tensor` is built from Python ints (presumably int32), and
# -2 ** 32 + 1 (= -4294967295) does not fit in int32, so this multiplication
# presumably fails — confirm. The layer applies the constant to its weight
# input instead.
tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
tensor = tf.ones_like(tensor) * (-2 ** 32 + 1)
tensor

In [None]:
#WeightedSequenceLayer(weight_normalization=fc.weight_norm)(
#                   [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]])
#其中
#embedding_dict,varlen_embedding_lookup 中查找的经过变换embedding变换的output{'user_id': XXX, .......}
#features, #OrderedDict([('user_id',KerasTensor),('gender',KerasTensor)...] INPUT的张量
#fc的属性 length_name='hist_len', weight_name=None, weight_norm=True

#功能
#如果fc.weight_name is  None:则直接用原来embedding的输出。
#如果fc.weight_name is  not None，WeightedSequenceLayer加权，如果有feature_length_name, 则传入，否则不传入。三个，见例子
#如果有feature_length_name，SequencePoolingLayer的supports_masking=False
#如果没有feature_length_name，SequencePoolingLayer的supports_masking=True
class WeightedSequenceLayer(tf.keras.layers.Layer):
    """The WeightedSequenceLayer is used to apply weight score on variable-length sequence feature/multi-value feature.

      Input shape
        - A list of three tensors [seq_value,seq_len,seq_weight], or the two
          tensors [seq_value,seq_weight] when ``supports_masking`` is True.

        - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``

        - seq_len is a 2D tensor with shape : ``(batch_size, 1)``,indicate valid length of each sequence.

        - seq_weight is a 3D tensor with shape: ``(batch_size, T, 1)``

      Output shape
        - 3D tensor with shape: ``(batch_size, T, embedding_size)``.

      Arguments
        - **weight_normalization**: bool.Whether normalize the weight score before applying to sequence.

        - **supports_masking**:If True,the input need to support masking.
    """

    def __init__(self, weight_normalization=True, supports_masking=False, **kwargs):
        super(WeightedSequenceLayer, self).__init__(**kwargs)
        self.weight_normalization = weight_normalization
        self.supports_masking = supports_masking

    def build(self, input_shape):
        super(WeightedSequenceLayer, self).build(input_shape)  # Be sure to call this somewhere!
        if not self.supports_masking:
            # T, taken from seq_value's shape; needed to build the mask from seq_len.
            self.seq_len_max = int(input_shape[0][1])

    def call(self, input_list, mask=None, **kwargs):
        if self.supports_masking:
            if mask is None:
                raise ValueError("When supports_masking is true, please input mask")
            key_input, value_input = input_list
            # Use the mask of the first input and append a trailing axis.
            mask = tf.expand_dims(mask[0], axis=2)
        else:
            # Inputs are the sequence embedding, the length input and the weight input.
            key_input, key_length_input, value_input = input_list
            mask = tf.sequence_mask(key_length_input,
                                    self.seq_len_max, dtype=tf.bool)
            # Swap the last two axes, e.g. (batch_size, 1, T) -> (batch_size, T, 1).
            mask = tf.transpose(mask, (0, 2, 1))

        embedding_size = key_input.shape[-1]

        if self.weight_normalization:
            # A very large negative weight on padded positions makes their
            # softmax output effectively zero.
            paddings = tf.ones_like(value_input) * (-2 ** 32 + 1)
        else:
            paddings = tf.zeros_like(value_input)

        # Keep the real weights where the mask is set, the padding value elsewhere.
        value_input = tf.where(mask, value_input, paddings)

        if self.weight_normalization:
            # Normalize the weights over the sequence axis. The previous code
            # called tf.keras.layers.Softmax(value_input), which only constructs
            # a layer object (with the tensor as its `axis`) and never applies softmax.
            value_input = tf.nn.softmax(value_input, axis=1)

        if len(value_input.shape) == 2:
            # Give 2D weights a trailing axis and repeat them across the embedding axis.
            value_input = tf.expand_dims(value_input, axis=2)
            value_input = tf.tile(value_input, [1, 1, embedding_size])

        # Element-wise product of the embeddings and their weights.
        return tf.multiply(key_input, value_input)

    def compute_output_shape(self, input_shape):
        # The output has the shape of the sequence embedding input.
        return input_shape[0]

    def compute_mask(self, inputs, mask):
        # Forward the sequence mask only when masking is in use; returning None
        # otherwise lets the layer accept masked inputs in length-driven mode.
        if self.supports_masking:
            return mask[0]
        return None

    def get_config(self, ):
        config = {'weight_normalization': self.weight_normalization, 'supports_masking': self.supports_masking}
        base_config = super(WeightedSequenceLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
        
    

In [None]:
a = np.array(np.random.randint(30,size=(5,3,2)))
a

In [None]:
tf.math.reduce_max(a, axis=0, keepdims=True)

In [None]:
tf.math.reduce_max(a, axis=0, keepdims=False)

In [None]:
tf.math.reduce_max(a, axis=1, keepdims=True)

In [None]:
tf.math.reduce_max(a, axis=1, keepdims=False)

In [None]:
tf.math.reduce_max(a, axis=2, keepdims=True)

In [None]:
tf.math.reduce_max(a, axis=2, keepdims=False)

In [None]:
tf.math.reduce_sum(a, axis=0, keepdims=True)

In [None]:
tf.math.reduce_sum(a, axis=0, keepdims=False)

In [None]:
tf.math.reduce_sum(a, axis=1, keepdims=True)

In [None]:
tf.math.reduce_sum(a, axis=1, keepdims=False)

In [None]:
b = tf.math.reduce_sum(a, axis=1, keepdims=False)
b = tf.expand_dims(b, axis=1)
b

In [None]:
tf.math.reduce_sum(a, axis=2, keepdims=True)

In [None]:
tf.math.reduce_sum(a, axis=2, keepdims=False)

In [None]:

#如果有weight，就对embedding进行了加权。 如果没有就是原来embedding的输出
#如果原来有seq的len , SequencePoolingLayer(combiner, supports_masking=False)([seq_input, features[feature_length_name]])
#否则supports_masking设置为true。 #SequencePoolingLayer(combiner, supports_masking=True)(seq_input)
class SequencePoolingLayer(tf.keras.layers.Layer):
    """The SequencePoolingLayer is used to apply pooling operation(sum,mean,max) on variable-length sequence feature/multi-value feature.

      Input shape
        - A list of two  tensor [seq_value,seq_len], or seq_value alone when
          ``supports_masking`` is True.

        - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``

        - seq_len is a 2D tensor with shape : ``(batch_size, 1)``,indicate valid length of each sequence.

      Output shape
        - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.

      Arguments
        - **mode**:str.Pooling operation to be used,can be sum,mean or max.

        - **supports_masking**:If True,the input need to support masking.
    """

    def __init__(self, mode='mean', supports_masking=False, **kwargs):
        if mode not in ['sum', 'mean', 'max']:
            raise ValueError("mode must be sum, mean or max")
        # Initialize the base layer before assigning any attribute on it.
        super(SequencePoolingLayer, self).__init__(**kwargs)
        self.mode = mode
        # Added to the length in mean mode so a zero length does not divide by zero.
        self.eps = tf.constant(1e-8, tf.float32)
        self.supports_masking = supports_masking

    def build(self, input_shape):
        if not self.supports_masking:
            # T, taken from seq_value's shape; needed to build the mask from seq_len.
            self.seq_len_max = int(input_shape[0][1])
        super(SequencePoolingLayer, self).build(
            input_shape)  # Be sure to call this somewhere!

    def call(self, seq_value_len_list, mask=None, **kwargs):
        if self.supports_masking:
            if mask is None:
                raise ValueError(
                    "When supports_masking=True,input must support masking")
            uiseq_embed_list = seq_value_len_list
            mask = tf.cast(mask, tf.float32)
            # The valid length is the number of unmasked positions.
            user_behavior_length = tf.math.reduce_sum(mask, axis=-1, keepdims=True)
            mask = tf.expand_dims(mask, axis=2)
        else:
            # The input may not have gone through the weighted layer, so the
            # mask is rebuilt here from the explicit length input.
            uiseq_embed_list, user_behavior_length = seq_value_len_list
            mask = tf.sequence_mask(user_behavior_length,
                                    self.seq_len_max, dtype=tf.float32)
            # Swap the last two axes, e.g. (batch_size, 1, T) -> (batch_size, T, 1).
            mask = tf.transpose(mask, (0, 2, 1))

        embedding_size = uiseq_embed_list.shape[-1]

        # Repeat the mask across the embedding axis so it matches seq_value.
        mask = tf.tile(mask, [1, 1, embedding_size])

        if self.mode == "max":
            # Push padded positions far below any real value so they never win the max.
            hist = uiseq_embed_list - (1 - mask) * 1e9
            return tf.math.reduce_max(hist, 1, keepdims=True)

        # Sum of the valid positions over the sequence axis.
        hist = tf.math.reduce_sum(uiseq_embed_list * mask, 1, keepdims=False)

        if self.mode == "mean":
            # Mean over valid positions only: masked sum divided by the valid length.
            hist = tf.math.divide(hist, tf.cast(user_behavior_length, tf.float32) + self.eps)

        # Restore the length-1 sequence axis.
        return tf.expand_dims(hist, axis=1)

    def compute_output_shape(self, input_shape):
        if self.supports_masking:
            return (None, 1, input_shape[-1])
        else:
            return (None, 1, input_shape[0][-1])

    def compute_mask(self, inputs, mask):
        # The pooled output carries no mask.
        return None

    def get_config(self, ):
        config = {'mode': self.mode, 'supports_masking': self.supports_masking}
        base_config = super(SequencePoolingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
#输入
#sparse_embedding_dict{'user_id': <keras.layers.core.embedding.Embedding>, .......}
#sparse_input_dict, #OrderedDict([('user_id',KerasTensor),('gender',KerasTensor)...] INPUT的张量
#sparse_feature_columns： feature的各种配置
def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
                     mask_feat_list=(), to_list=False):
    """Apply each sparse feature's embedding to its input, grouped by ``group_name``.

    Args:
        sparse_embedding_dict: maps ``embedding_name`` to a callable embedding layer.
        sparse_input_dict: maps feature name to that feature's input.
        sparse_feature_columns: feature descriptions to look up.
        return_feat_list: if non-empty, only features named here are processed.
        mask_feat_list: features whose hashed lookup uses ``mask_zero=True``
            (only consulted when ``use_hash`` is set).
        to_list: if True, return one flat list instead of the grouped dict.

    Returns:
        A ``defaultdict(list)`` mapping group name to the list of embedding
        outputs, or (``to_list=True``) the concatenation of all those lists.
    """
    # Kept local so the function does not rely on a file-level import of `chain`.
    from itertools import chain

    group_embedding_dict = collections.defaultdict(list)
    for fc in sparse_feature_columns:
        feature_name = fc.name
        embedding_name = fc.embedding_name
        if len(return_feat_list) != 0 and feature_name not in return_feat_list:
            continue
        if fc.use_hash:
            # NOTE(review): `Hash` is not defined in the visible part of this
            # file; this branch is not exercised while use_hash is False.
            lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)(
                sparse_input_dict[feature_name])
        else:
            # Use the feature's input directly as the lookup index.
            lookup_idx = sparse_input_dict[feature_name]
        # Run the input through its embedding layer and collect the output under its group.
        group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx))
    if to_list:
        return list(chain.from_iterable(group_embedding_dict.values()))
    return group_embedding_dict


In [None]:
#输入
#sparse_embedding_dict{'user_id': <keras.layers.core.embedding.Embedding>, .......}
#sparse_input_dict, #OrderedDict([('user_id',KerasTensor),('gender',KerasTensor)...] INPUT的张量
#sparse_feature_columns： feature的各种配置
#和embedding_lookup最大的区别在于， 一个feature作为了一个group 
def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
    """Apply each sequence feature's embedding to its input.

    Unlike ``embedding_lookup``, results are keyed by feature name rather than
    collected per group.

    Args:
        embedding_dict: maps ``embedding_name`` to a callable embedding layer.
        sequence_input_dict: maps feature name to that feature's input.
        varlen_sparse_feature_columns: sequence feature descriptions.

    Returns:
        Dict mapping feature name to the embedding layer's output.
    """
    embedded_by_feature = {}
    for column in varlen_sparse_feature_columns:
        if not column.use_hash:
            indices = sequence_input_dict[column.name]
        else:
            hasher = Hash(column.vocabulary_size, mask_zero=True, vocabulary_path=column.vocabulary_path)
            indices = hasher(sequence_input_dict[column.name])
        embedding_layer = embedding_dict[column.embedding_name]
        embedded_by_feature[column.name] = embedding_layer(indices)
    return embedded_by_feature

In [None]:
#输入
#embedding_dict,varlen_embedding_lookup 中查找的经过变换embedding变换的output{'user_id': XXX, .......}
#features, #OrderedDict([('user_id',KerasTensor),('gender',KerasTensor)...] INPUT的张量
#varlen_sparse_feature_columns： feature的各种配置
#函数作用
def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False):
    """Pool every sequence feature's embedding into one vector per sample.

    For each feature:
      * if ``weight_name`` is set, the embedding is first weighted by
        ``WeightedSequenceLayer``; otherwise the embedding output is used as is;
      * if ``length_name`` is set, the length input is passed to the layers
        (``supports_masking=False``); otherwise they run with
        ``supports_masking=True``.

    Args:
        embedding_dict: maps feature name to its sequence embedding output
            (as returned by ``varlen_embedding_lookup``).
        features: maps input name to the input tensor.
        varlen_sparse_feature_columns: sequence feature descriptions.
        to_list: if True, return one flat list instead of the grouped dict.

    Returns:
        A ``defaultdict(list)`` mapping group name to pooled vectors, or
        (``to_list=True``) the concatenation of all those lists.
    """
    # Kept local so the function does not rely on a file-level import of `chain`.
    from itertools import chain

    pooling_vec_list = collections.defaultdict(list)
    for fc in varlen_sparse_feature_columns:
        feature_name = fc.name
        # Pooling mode of this feature.
        combiner = fc.combiner
        feature_length_name = fc.length_name
        if feature_length_name is not None:
            if fc.weight_name is not None:
                seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)(
                    [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]])
            else:
                seq_input = embedding_dict[feature_name]
            vec = SequencePoolingLayer(combiner, supports_masking=False)(
                [seq_input, features[feature_length_name]])
        else:
            if fc.weight_name is not None:
                seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)(
                    [embedding_dict[feature_name], features[fc.weight_name]])
            else:
                seq_input = embedding_dict[feature_name]
            vec = SequencePoolingLayer(combiner, supports_masking=True)(
                seq_input)
        pooling_vec_list[fc.group_name].append(vec)
    if to_list:
        # Materialized as a list (like embedding_lookup) so it can be iterated more than once.
        return list(chain.from_iterable(pooling_vec_list.values()))
    return pooling_vec_list

In [None]:
def get_dense_input(features, feature_columns):
    """Collect the input tensors of all dense features.

    Dense features skip the embedding layer. When a column defines ``transform_fn``,
    the input is wrapped in a ``Lambda`` layer applying it; otherwise the raw input
    tensor is used.

    :param features: mapping from input name to the model's input tensor, e.g.
        ``OrderedDict([('user_id', KerasTensor), ('gender', KerasTensor), ...])``.
    :param feature_columns: feature configs; only ``DenseFeat`` instances are used.
        May be empty or None.
    :return: list with one tensor per dense feature, in column order.
    """
    if not feature_columns:
        return []

    dense_tensors = []
    for column in feature_columns:
        if not isinstance(column, DenseFeat):
            continue
        transform = column.transform_fn
        if transform is None:
            dense_tensors.append(features[column.name])
        else:
            dense_tensors.append(Lambda(transform)(features[column.name]))
    return dense_tensors

In [None]:
def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True,
                               support_dense=True, support_group=False, embedding_matrix_dict=None):
    """Turn the input tensors into embedding outputs and dense values.

    :param features: mapping from input name to the model's input tensor, e.g.
        ``OrderedDict([('user_id', KerasTensor), ('gender', KerasTensor), ...])``.
    :param feature_columns: feature configs (``SparseFeat``, ``VarLenSparseFeat``,
        ``DenseFeat``); may be empty or None.
    :param l2_reg: forwarded to ``create_embedding_matrix`` (1e-6 in this notebook).
    :param seed: forwarded to ``create_embedding_matrix``.
    :param prefix: forwarded to ``create_embedding_matrix``.
    :param seq_mask_zero: forwarded to ``create_embedding_matrix``.
    :param support_dense: if False, a ValueError is raised when any dense feature exists.
    :param support_group: if True, return the merged per-group dict; otherwise flatten
        its values into one list.
    :param embedding_matrix_dict: pre-built embedding layers keyed by feature name, e.g.
        ``{'user_id': <keras Embedding>, ...}``; created here when None.
    :return: tuple ``(embeddings, dense_value_list)``.
    :raises ValueError: dense features are present while ``support_dense`` is False.
    """
    columns = feature_columns if feature_columns else []
    sparse_columns = [fc for fc in columns if isinstance(fc, SparseFeat)]
    varlen_columns = [fc for fc in columns if isinstance(fc, VarLenSparseFeat)]

    if embedding_matrix_dict is None:
        embedding_matrix_dict = create_embedding_matrix(
            feature_columns, l2_reg, seed, prefix=prefix, seq_mask_zero=seq_mask_zero)

    # Single-valued sparse features: plain embedding lookup.
    sparse_by_group = embedding_lookup(embedding_matrix_dict, features, sparse_columns)

    # Dense features bypass the embedding layers.
    dense_value_list = get_dense_input(features, feature_columns)
    if dense_value_list and not support_dense:
        raise ValueError("DenseFeat is not supported in dnn_feature_columns")

    # Variable-length sparse features: embedding lookup followed by pooling.
    sequence_embeddings = varlen_embedding_lookup(embedding_matrix_dict, features, varlen_columns)
    varlen_by_group = get_varlen_pooling_list(sequence_embeddings, features, varlen_columns)

    merged = mergeDict(sparse_by_group, varlen_by_group)
    if support_group:
        return merged, dense_value_list
    return list(chain.from_iterable(merged.values())), dense_value_list


In [None]:
def DSSM(user_feature_columns,
         item_feature_colums,
         user_dnn_hidden_units=(64,42),
         item_dnn_hidden_units=(64,32),
         dnn_activation='relu',
         dnn_use_bn=False,
         l2_reg_dnn=0,
         l2_reg_embedding=1e-6,
         dnn_dropout=0,
         loss_type='softmax',
         temperature=0.05,
         sampler_config=None,
         seed=1024
        ):
    """Instantiates the Deep Structured Semantic Model architecture (work in progress).

    In its current state the function only builds the user-side inputs and the
    embedding layers for the user and item feature columns, prints the intermediate
    results, and returns None. The DNN towers and the loss are not built yet, so the
    tower, loss and sampler parameters below are accepted but not used.

    :param user_feature_columns: An iterable containing user's features used by  the model.
    :param item_feature_colums: An iterable containing item's features used by  the model.
        (The parameter name is misspelled; it is kept so existing keyword callers keep working.)
    :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower
    :param item_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of item tower
    :param dnn_activation: Activation function to use in deep net
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param loss_type: string. Loss type.
    :param temperature: float. Scaling factor.
    :param sampler_config: negative sample config.
    :param seed: integer ,to use as random seed.
    :return: None for now; intended to return a Keras model instance once finished.

    """
    # Define the model inputs for the user features.
    user_features = build_input_features(user_feature_columns)
    # e.g. OrderedDict([('user_id', KerasTensor), ('gender', KerasTensor), ...])
    user_inputs_list = list(user_features.values())  # not consumed yet

    # Create the embedding layers for the user and item columns together.
    # Use the function's own argument here: the previous code read a global named
    # ``item_feature_columns`` and silently ignored what the caller passed in.
    embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_colums, l2_reg_embedding,
                                                    seed=seed,
                                                    seq_mask_zero=True)
    print(embedding_matrix_dict)
    # e.g. {'user_id': <keras.layers.core.embedding.Embedding>, ...}

    # input_from_feature_columns handles the three kinds of user features:
    #   - single-valued sparse: embedding lookup
    #   - multi-valued sparse: embedding lookup, then get_varlen_pooling_list
    #     (with an optional WeightedSequenceLayer)
    #   - dense: no embedding; fc.transform_fn is applied when configured
    user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
                                                                                   user_feature_columns,
                                                                                   l2_reg_embedding, seed=seed,
                                                                                   embedding_matrix_dict=embedding_matrix_dict)
    print(user_sparse_embedding_list)
    print(user_dense_value_list)
    
    
# Call DSSM with hidden-unit tuples that both end in embedding_dim.
model = DSSM(
    user_feature_columns,
    item_feature_columns,
    user_dnn_hidden_units=(128, 64, embedding_dim),
    item_dnn_hidden_units=(64, embedding_dim),
    loss_type='softmax',
    sampler_config=sampler_config,
)