**背景:**

承接demo_v1，完善上一篇被简化的部分。逐步代码模块化。

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("version of tensorflow:", tf.__version__)

version of tensorflow: 2.4.1


# 预备

## mask & padding 

对不等长的序列需要padding，一般是末尾用0补齐。  
可以参考[官方guide](https://www.tensorflow.org/guide/keras/understanding_masking_and_padding#passing_mask_tensors_directly_to_layers).

In [4]:
# Three toy sequences of integer ids with different lengths (3, 5 and 6).
raw_inputs = [
    [711, 632, 71],
    [73, 8, 3215, 55, 927],
    [83, 91, 1, 645, 1253, 927],
]

# On TensorFlow 2.12 use the following call instead:
# padded_inputs = tf.keras.utils.pad_sequences(
#     raw_inputs, padding="post"
# )
# padding="post" appends zeros at the end of the shorter sequences so that
# every row has the length of the longest one.
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    raw_inputs, padding="post"
)
print(padded_inputs)

[[ 711  632   71    0    0    0]
 [  73    8 3215   55  927    0]
 [  83   91    1  645 1253  927]]


对序列做embedding，设置mask_zero=True，查看mask的效果。  

In [5]:
# mask_zero=True makes the embedding layer treat id 0 as padding and attach a
# boolean mask to its output.
embedding = layers.Embedding(input_dim=5000, output_dim=16, mask_zero=True)
masked_output = embedding(padded_inputs)

# The attached mask is True for real ids and False for the padded zeros.
print(masked_output._keras_mask)

tf.Tensor(
[[ True  True  True False False False]
 [ True  True  True  True  True False]
 [ True  True  True  True  True  True]], shape=(3, 6), dtype=bool)


2023-05-29 14:08:21.315221: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-05-29 14:08:21.316286: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


**备注：**  
1. 在使用tf.keras的Sequential或者Functional API时，Masking layer或者设置了mask_zero=True的embedding layer的下游，只要支持mask都会自动使用这个信息。
2. 可以认为embedding是mask的生产者，它实现了compute_mask方法供调用；rnn或者lstm等是mask的消费者，它们的__call__方法里支持mask参数，可以手动传进去。 
3. 在自定义layer的时候，需要重新实现compute_mask方法，参考[官方文档](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer#compute_mask)和 [源代码](https://github.com/keras-team/keras/blob/v2.12.0/keras/engine/base_layer.py#L976-L998).

  
**回到DIN的实现上，sequence特征涉及embedding之后的pooling操作，以及attention部分，都需要处理mask的问题。**

## 规范feature column参数

### dense feature 
这类特征可以直接作为模型的输入，或者先[分桶](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Discretization)然后再考虑是否embedding。

In [12]:
# 如果不做处理
# Dense feature fed to the model as-is: one float value, no bucketing.
age = dict(name="age", dtype="float32", dim=1)

# Same feature, but bucketized first and then embedded. The keys below
# "use_bucket" only matter when use_bucket is True.
age = {
    **age,
    "use_bucket": True,
    "bins": [20, 30, 40, 50, 60],
    "emb_name": "age_emb",
    "emb_dim": 8,
}

# Multi-dimensional dense feature: view counts per cid, 100 values.
view_cid_cnt = dict(name="view_cid_cnt", dtype="float32", dim=100)

### sparse feature
这类特征可以直接embedding，但是如果取值过多也可以先hash到有限个取值上。参考[这里](https://www.tensorflow.org/guide/keras/preprocessing_layers#applying_the_hashing_trick_to_an_integer_categorical_feature)。

In [7]:
# Sparse (categorical) feature: the raw id is hashed into `hash_size` buckets
# and then embedded. `hash_size` may be omitted when use_hash is False.
iid = dict(
    name="iid",
    dtype="bytes",
    vocab_size=100000,
    use_hash=True,
    hash_size=10000,
    emb_name="iid_emb",
    emb_dim=1000,
)

### seq/multi sparse feature
这类特征dim>1，可能是sequence，也可以不是，eg：tag list。和sparse feature相比，需要多定义max_len, combiner(pooling参数).

In [8]:
# Sequence / multi-valued sparse feature: the list of item ids a user viewed.
view_iid = {
    # The sequence feature needs its own name: model_params["din"] refers to
    # it as "view_iid" and pairs it with the item feature "iid".
    "name": "view_iid",
    "dtype": "bytes",
    "max_len": 100,

    "vocab_size": 100000,
    "use_hash": True,
    "hash_size": 10000,  # may be omitted when use_hash is False
    # Same embedding name as the item feature `iid`.
    "emb_name": "iid_emb",
    "emb_dim": 1000,

    "need_padding": True,  # set to True when the input has variable length
    "combiner": "mean",
}

## 定义模型结构参数

In [9]:
# Hyper-parameters describing the model structure.
model_params = dict(
    user_feat=[],
    item_feat=[],
    mlp_hidden_size=[128, 64, 64],
    use_bn=True,
    # Each entry must be written as "<sequence feature>|<item feature>".
    din=["view_iid|iid", "view_cid|cid"],
    din_hidden_size=[64, 32],
)

# 代码模块化

## 定义feature column
1. tensorflow 2.0以上，不再使用feature column，转而用tf.keras.layers。两者的对应关系可以查看[官方文档](https://www.tensorflow.org/guide/migrate/migrating_feature_columns#feature_column_equivalence_table). 
2. 三种特征类型这里使用namedtuple类来定义，可以参考namedtuple的[教程](https://realpython.com/python-namedtuple/#subclassing-namedtuple-classes). 这样做可以增加代码可读性，适用于定义属性多，方法少的类。

In [10]:
from collections import namedtuple

In [11]:
class DenseFeat(
    namedtuple(
        "DenseFeat",
        "name dtype dim use_bucket bins emb_name emb_dim",
    )
):
    """Immutable description of a dense (numeric) feature.

    Fields:
        name: feature name.
        dtype: value dtype, ``"float32"`` by default.
        dim: number of values the feature carries, 1 by default.
        use_bucket: whether the feature is bucketized, False by default.
        bins: bucket boundaries, None by default.
        emb_name: embedding name, None by default.
        emb_dim: embedding dimension, None by default.
    """

    # An empty __slots__ keeps instances free of a per-instance __dict__.
    __slots__ = ()

    def __new__(cls, name, dtype="float32", dim=1, use_bucket=False, bins=None, emb_name=None, emb_dim=None):
        """Build the tuple; only ``name`` is mandatory."""
        return super().__new__(
            cls,
            name=name,
            dtype=dtype,
            dim=dim,
            use_bucket=use_bucket,
            bins=bins,
            emb_name=emb_name,
            emb_dim=emb_dim,
        )
    

In [15]:
class SparseFeat(
    namedtuple(
        "SparseFeat",
        "name vocab_size emb_dim dtype use_hash hash_size emb_name",
    )
):
    """Immutable description of a sparse (categorical) feature.

    Fields:
        name: feature name.
        vocab_size: vocabulary size.
        emb_dim: embedding dimension.
        dtype: input dtype, ``"int32"`` by default.
        use_hash: whether the ids are hashed, False by default.
        hash_size: hash bucket count, None by default.
        emb_name: embedding name; falls back to ``name`` when not given.
    """

    # An empty __slots__ keeps instances free of a per-instance __dict__.
    __slots__ = ()

    def __new__(cls, name, vocab_size, emb_dim, dtype="int32", use_hash=False, hash_size=None, emb_name=None):
        """Build the tuple, defaulting ``emb_name`` to the feature name."""
        resolved_emb_name = name if emb_name is None else emb_name
        return super().__new__(
            cls,
            name=name,
            vocab_size=vocab_size,
            emb_dim=emb_dim,
            dtype=dtype,
            use_hash=use_hash,
            hash_size=hash_size,
            emb_name=resolved_emb_name,
        )
    

SeqSparseFeat内部嵌套一个SparseFeat，并通过property把SparseFeat的属性（name、vocab_size、emb_dim等）暴露出来。

In [19]:
class SeqSparseFeat(namedtuple('SeqSparseFeat', ['sparsefeat', 'max_len', 'need_padding', 'combiner'])):
    """Immutable description of a sequence / multi-valued sparse feature.

    It wraps one sparse-feature description and adds the sequence-specific
    settings. The wrapped object's attributes are re-exposed as read-only
    properties so that a ``SeqSparseFeat`` can be read like a sparse feature.

    Fields:
        sparsefeat: the wrapped sparse-feature description; it must provide
            ``name``, ``vocab_size``, ``emb_dim``, ``dtype``, ``use_hash``,
            ``hash_size`` and ``emb_name``.
        max_len: maximum sequence length.
        need_padding: whether the input needs padding, True by default.
        combiner: pooling mode, ``'mean'`` by default.
    """

    # An empty __slots__ keeps instances free of a per-instance __dict__.
    __slots__ = ()

    def __new__(cls, sparsefeat, max_len, need_padding=True, combiner='mean'):
        """Build the tuple; ``sparsefeat`` and ``max_len`` are mandatory."""
        return super(SeqSparseFeat, cls).__new__(cls, sparsefeat, max_len, need_padding, combiner)

    @property
    def name(self):
        """Name of the wrapped sparse feature."""
        return self.sparsefeat.name

    @property
    def vocab_size(self):
        """Vocabulary size of the wrapped sparse feature."""
        return self.sparsefeat.vocab_size

    @property
    def emb_dim(self):
        """Embedding dimension of the wrapped sparse feature."""
        return self.sparsefeat.emb_dim

    @property
    def dtype(self):
        """Input dtype of the wrapped sparse feature."""
        return self.sparsefeat.dtype

    @property
    def use_hash(self):
        """Whether the wrapped sparse feature is hashed."""
        return self.sparsefeat.use_hash

    @property
    def hash_size(self):
        """Hash bucket count of the wrapped sparse feature."""
        return self.sparsefeat.hash_size

    @property
    def emb_name(self):
        """Embedding name of the wrapped sparse feature."""
        return self.sparsefeat.emb_name

## 自定义mlp结构

这部分是继承keras.layers，实现自定义的layer，可以参考[官方文档](https://www.tensorflow.org/guide/keras/making_new_layers_and_models_via_subclassing)，主要是希望代码更结构化更清晰。  
自定义layer里关于get_config、from_config的说明可以查看[这里](https://www.tensorflow.org/guide/keras/serialization_and_saving#savedmodel_format)。

In [20]:
from tensorflow.python.keras.layers import Layer

In [23]:
class MLP(Layer):
    """The Multi Layer Perceptron.

    Every hidden layer is Dense -> (optional BatchNormalization) ->
    Activation -> Dropout.

      Input shape
        - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``.

      Output shape
        - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``.

      Arguments
        - hidden_units: sequence of positive integers, the width of each hidden layer.
        - activation: activation applied after every hidden layer.
        - l2_reg: L2 regularization strength for the Dense kernels; no regularizer is attached unless it is > 0.
        - dropout_rate: dropout rate applied after every activation.
        - use_bn: whether to insert BatchNormalization before the activation.
        - seed: base random seed; hidden layer ``i`` uses ``seed + i`` for its dropout.
    """

    def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.hidden_units = hidden_units
        self.activation = activation
        self.l2_reg = l2_reg
        self.dropout_rate = dropout_rate
        self.use_bn = use_bn
        self.seed = seed

    def build(self, input_shape):
        """Create the sub-layers, one group per entry of ``hidden_units``."""
        n_layers = len(self.hidden_units)

        # The regularization strength lives on the instance; each Dense layer
        # gets its own regularizer object.
        self.mlp_layers = [
            layers.Dense(
                units=unit,
                activation=None,
                kernel_regularizer=(
                    tf.keras.regularizers.l2(self.l2_reg) if self.l2_reg > 0 else None
                ),
            )
            for unit in self.hidden_units
        ]

        if self.use_bn:
            self.bn_layers = [layers.BatchNormalization() for _ in range(n_layers)]

        self.activation_layers = [layers.Activation(self.activation) for _ in range(n_layers)]

        # A distinct seed per layer so that the dropout patterns differ.
        self.dropout_layers = [
            layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in range(n_layers)
        ]

        super(MLP, self).build(input_shape)

    def call(self, inputs, training=None, **kwargs):
        """Run ``inputs`` through every hidden layer and return the last output."""
        deep_input = inputs

        for i, dense in enumerate(self.mlp_layers):
            # Apply the Dense layer to the running tensor.
            fc = dense(deep_input)

            if self.use_bn:
                fc = self.bn_layers[i](fc, training=training)

            fc = self.activation_layers[i](fc)

            fc = self.dropout_layers[i](fc, training=training)

            deep_input = fc

        return deep_input

    def compute_output_shape(self, input_shape):
        """Replace the last input dimension with the last hidden width."""
        # tuple() lets the shape be a list, a tuple or a TensorShape.
        shape = tuple(input_shape)
        if len(self.hidden_units) > 0:
            shape = shape[:-1] + (self.hidden_units[-1],)

        return shape

    def get_config(self, ):
        """Return the constructor arguments merged into the base layer config."""
        config = {'activation': self.activation, 'hidden_units': self.hidden_units,
                  'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed}
        base_config = super(MLP, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    

## 自定义din结构

最小激活单元，mlp结构，输出注意力得分。

In [34]:
class LocalActivationUnit(Layer):
    """Scores every key of a sequence against a single query.

    The query is repeated along the sequence axis, concatenated with the keys
    as ``[q, k, q - k, q * k]``, passed through an MLP and finally through a
    one-unit sigmoid Dense layer.

      Input shape
        - A list of two 3D tensor with shape:  ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)``

      Output shape : attention scores
        - 3D tensor with shape: ``(batch_size, T, 1)``

      Arguments
        - hidden_units: widths of the hidden layers of the scoring MLP.
        - activation: activation used inside the scoring MLP.
        - l2_reg, dropout_rate, use_bn, seed: forwarded to the scoring MLP.
    """

    def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024,
                 **kwargs):
        super(LocalActivationUnit, self).__init__(**kwargs)
        self.hidden_units = hidden_units
        self.activation = activation
        self.l2_reg = l2_reg
        self.dropout_rate = dropout_rate
        self.use_bn = use_bn
        self.seed = seed
        self.supports_masking = True

    def build(self, input_shape):
        """Create the scoring MLP and the one-unit output layer."""
        self.mlp = MLP(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed)
        self.out_layer = layers.Dense(units=1, activation='sigmoid')

        super(LocalActivationUnit, self).build(input_shape)

    def call(self, inputs, training=None, **kwargs):
        """Return one attention score per key position."""
        query, keys = inputs

        # Prefer the static sequence length; fall back to the dynamic one
        # when the time dimension is unknown at graph-construction time.
        keys_len = keys.shape[1]
        if keys_len is None:
            keys_len = tf.shape(keys)[1]

        # The query has length 1 on axis 1, so tiling it keys_len times
        # aligns it with every key position.
        queries = tf.tile(query, [1, keys_len, 1])

        att_input = tf.concat(
            [queries, keys, queries - keys, queries * keys], axis=-1)

        att_out = self.mlp(att_input, training=training)

        attention_score = self.out_layer(att_out)

        return attention_score

    def compute_output_shape(self, input_shape):
        """Keep (batch, T) of the keys and use a final dimension of 1."""
        return tuple(input_shape[1])[:2] + (1,)

    def compute_mask(self, inputs, mask):
        """Pass the incoming mask through unchanged."""
        return mask

    def get_config(self, ):
        """Return the constructor arguments merged into the base layer config."""
        config = {'activation': self.activation, 'hidden_units': self.hidden_units,
                  'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed}
        base_config = super(LocalActivationUnit, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    

下面写完整的layer，这里supports_masking设置为True，注意input（这里input是embedding）一定要支持mask。

In [35]:
class AttentionSequencePoolingLayer(Layer):
    """The Attentional sequence pooling operation used in DIN.

    Each key is scored against the query by a ``LocalActivationUnit``; the
    keys are then summed with those scores as weights. Scores at masked
    (padded) positions are set to zero before the sum.

      Input shape
        - A list of two tensor: [query,keys]

        - query is a 3D tensor with shape:  ``(batch_size, 1, embedding_size)``

        - keys is a 3D tensor with shape:   ``(batch_size, T, embedding_size)``

      Output shape
        - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.

      Arguments
        - att_hidden_units: widths of the hidden layers of the attention MLP.
        - att_activation: activation used inside the attention MLP.
    """

    def __init__(self, att_hidden_units=(80, 40), att_activation='sigmoid', **kwargs):

        super(AttentionSequencePoolingLayer, self).__init__(**kwargs)
        self.att_hidden_units = att_hidden_units
        self.att_activation = att_activation
        self.supports_masking = True

    def build(self, input_shape):
        """Create the attention scoring unit."""
        self.local_att = LocalActivationUnit(
            self.att_hidden_units, self.att_activation, l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, )

        super(AttentionSequencePoolingLayer, self).build(input_shape)

    def call(self, inputs, mask=None, training=None, **kwargs):
        """Pool ``keys`` into one vector per sample, weighted by attention."""
        queries, keys = inputs

        # (batch, T, 1) -> (batch, 1, T)
        attention_score = self.local_att([queries, keys], training=training)
        outputs = tf.transpose(attention_score, (0, 2, 1))

        # For a list input the mask is a list with one entry per input; the
        # last entry belongs to the keys. A bare tensor is taken to be the
        # keys mask. Without any mask all positions are kept.
        key_masks = mask[-1] if isinstance(mask, (list, tuple)) else mask
        if key_masks is not None:
            key_masks = tf.expand_dims(tf.cast(key_masks, tf.bool), axis=1)  # (batch, 1, T)
            # Zero the scores of padded positions so they add nothing below.
            outputs = tf.where(key_masks, outputs, tf.zeros_like(outputs))

        # (batch, 1, T) x (batch, T, embedding_size) -> (batch, 1, embedding_size)
        outputs = tf.matmul(outputs, keys)

        return outputs

    def compute_output_shape(self, input_shape):
        """Output is ``(None, 1, embedding_size)`` taken from the query shape."""
        return (None, 1, input_shape[0][-1])

    def compute_mask(self, inputs, mask):
        """The pooled output has no sequence axis, so no mask is propagated."""
        return None

    def get_config(self, ):
        """Return the constructor arguments merged into the base layer config."""
        config = {'att_hidden_units': self.att_hidden_units, 'att_activation': self.att_activation}
        base_config = super(AttentionSequencePoolingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

遗留flatten和pooling，之后再看是否需要自定义。

# 模拟数据

**和demo_v1相比，有以下变化：**
1. 已经定义了feature column类，这里直接用这些类生成模拟数据。
2. 保证每类特征至少有两个，以便更好的测试代码。
3. 考虑序列特征的mask。

# 构建模型