In [1]:
import torch 
from torch import nn
import metaspore as ma

In [2]:
S3_ROOT_DIR = 's3://dmetasoul-bucket/'

In [3]:
class Dice(nn.Module):
    """
    Dice, the data-adaptive activation used by DIN (a generalisation of PReLU).

    A BatchNorm + sigmoid gate ``p`` blends the identity branch with a learned
    per-channel slope ``alpha``::

        out = alpha * (1 - p) * x + p * x

    Parameters
    ----------
    emb_size: number of channels normalised by ``nn.BatchNorm1d``.
    dim: rank of the expected input, 2 for ``[B, emb_size]`` or 3 for
        ``[B, T, emb_size]``.
    epsilon: ``eps`` forwarded to ``nn.BatchNorm1d``.
    """
    def __init__(self, emb_size, dim=2, epsilon=1e-8):
        super().__init__()
        assert dim in (2, 3)

        self.bn = nn.BatchNorm1d(emb_size, eps=epsilon)
        self.sigmoid = nn.Sigmoid()
        self.dim = dim

        # alpha is trainable (initialised to zero). For 3-D inputs it carries a
        # trailing singleton axis so it broadcasts over the time axis once the
        # input has been transposed to [B, emb_size, T].
        alpha_shape = (emb_size,) if dim == 2 else (emb_size, 1)
        self.alpha = nn.Parameter(torch.zeros(alpha_shape))

    def forward(self, x):
        """Apply Dice to ``x``; the output has the same shape as the input."""
        assert x.dim() == self.dim
        channels_last = self.dim == 3
        if channels_last:
            # BatchNorm1d normalises axis 1, so move the feature axis there.
            x = x.transpose(1, 2)
        gate = self.sigmoid(self.bn(x))
        out = self.alpha * (1 - gate) * x + gate * x
        if channels_last:
            out = out.transpose(1, 2)
        return out

def init_weights(model):
    """
    Initialise a single module in place; intended for ``nn.Module.apply``.

    * ``nn.Linear``: Kaiming-uniform weight, standard-normal bias.
    * ``nn.BatchNorm1d/2d/3d``: weight ~ N(1, 0.02), bias set to 0.
    * Any other module type is left untouched.

    Parameters that are ``None`` (e.g. ``bias=False`` or ``affine=False``)
    are skipped. Returns ``None``.
    """
    batchnorm_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
    is_linear = isinstance(model, nn.Linear)
    if not is_linear and not isinstance(model, batchnorm_types):
        return

    weight, bias = model.weight, model.bias
    if is_linear:
        if weight is not None:
            nn.init.kaiming_uniform_(weight.data)
        if bias is not None:
            nn.init.normal_(bias.data)
        return

    # Batch-norm layers: scale close to one, zero shift.
    if weight is not None:
        nn.init.normal_(weight.data, mean=1, std=0.02)
    if bias is not None:
        nn.init.constant_(bias.data, 0)

In [4]:
class MLP(nn.Module):
    """
    Feed-forward stack: for every entry of ``hidden_layers`` a Linear layer,
    optionally followed by BatchNorm1d, a Dice activation and Dropout.

    Parameters
    ----------
    input_size: width of the input features.
    hidden_layers: iterable of layer widths; the last one is the output width.
    dropout: dropout probability; a falsy value adds no Dropout layers.
    batchnorm: when truthy, add ``nn.BatchNorm1d`` after each Linear layer.
    activation: only tested for truthiness. Any truthy value adds a ``Dice``
        layer, whatever name is given; a falsy value adds no activation.

    Sub-layers are registered as ``dense{i}``, ``batchnorm{i}``,
    ``activation{i}`` and ``dropout{i}`` inside ``self.mlp``.
    """
    def __init__(self, input_size, hidden_layers,
                 dropout=0.0, batchnorm=True, activation='dice'):
        super().__init__()
        stack = nn.Sequential()
        in_features = input_size
        for idx, out_features in enumerate(hidden_layers):
            stack.add_module(f"dense{idx}", nn.Linear(in_features, out_features))
            if batchnorm:
                stack.add_module(f"batchnorm{idx}", nn.BatchNorm1d(out_features))
            if activation:
                # Dice is used directly rather than going through a
                # get_activation_layer lookup.
                stack.add_module(f"activation{idx}", Dice(out_features))
            if dropout:
                stack.add_module(f"dropout{idx}", nn.Dropout(dropout))
            in_features = out_features
        self.mlp = stack

    def forward(self, x):
        """Run ``x`` through the layer stack."""
        return self.mlp(x)
    
    
# def get_activation_layer(name, hidden_size=None, dice_dim=2):
#     name = name.lower()
#     name_dict = {x.lower():x for x in dir(nn) if '__' not in x and 'Z'>=x[0]>='A'}
#     if name=="linear":
#         return Identity()
#     elif name=="dice":
#         assert dice_dim
#         return Dice(hidden_size, dice_dim)
#     else:
#         assert name in name_dict, f'activation type {name} not supported!'
#         return getattr(nn,name_dict[name])()


In [5]:
import torch
from torch import nn 
import numpy as np
class Attention(nn.Module):
    """
    DIN attention unit: scores every key against the query with an MLP and
    returns either the scores or the score-weighted sum of the keys.

    Parameters
    ----------
    input_size: hidden size H of query and keys (the MLP input is 4 * H).
    hidden_layers: widths of the scoring MLP; the last width feeds a Linear(., 1).
    dropout: dropout probability forwarded to the MLP.
    batchnorm: whether the MLP uses batch normalisation.
    return_scores: when True ``forward`` returns the [B, T] scores instead of
        the pooled [B, H] vector.
    """
    def __init__(
            self,
            input_size,
            hidden_layers,
            dropout=0.0,
            batchnorm=True,
            return_scores=False):

        super().__init__()
        self.return_scores = return_scores

        self.mlp = MLP(
            input_size=input_size * 4,
            hidden_layers=hidden_layers,
            dropout=dropout,
            batchnorm=batchnorm,
            activation='dice')
        self.fc = nn.Linear(hidden_layers[-1], 1)

    def forward(self, query, keys, keys_length):
        """
        Parameters
        ----------
        query: 2D tensor, [Batch, Hidden]
        keys: 3D tensor, [Batch, Time, Hidden]
        keys_length: 1D tensor, [Batch]; number of valid (non-padded) keys
        Returns
        -------
        outputs: 2D tensor, [Batch, Hidden] (weighted sum of the keys), or
            [Batch, Time] (the scores) when ``return_scores`` is True
        """
        batch_size, max_length, dim = keys.size()

        # Repeat the query along the time axis so it pairs with every key.
        query = query.unsqueeze(1).expand(-1, max_length, -1)

        din_all = torch.cat(
            [query, keys, query - keys, query * keys], dim=-1)
        din_all = din_all.view(batch_size * max_length, -1)  # [B*T, 4*H]

        scores = self.fc(self.mlp(din_all)).view(batch_size, max_length)  # [B, T]

        # Scale
        scores = scores / (dim ** 0.5)

        # Mask: positions at or beyond keys_length are padding. The mask is
        # built on the keys' device so it always matches `scores`, and applied
        # out of place. sigmoid(-inf) == 0, so padded keys get zero weight.
        positions = torch.arange(max_length, device=keys.device).unsqueeze(0)
        valid = positions < keys_length.to(keys.device).view(-1, 1)  # [B, T]
        scores = scores.masked_fill(~valid, float("-inf"))

        # Activation
        scores = torch.sigmoid(scores)  # [B, T]

        if self.return_scores:
            return scores

        # Weighted sum: [B, 1, T] @ [B, T, H] -> [B, 1, H]. Squeeze only the
        # singleton axis 1; a bare squeeze() would also drop the batch axis
        # when B == 1 (or the hidden axis when H == 1).
        return torch.matmul(scores.unsqueeze(1), keys).squeeze(1)  # [B, H]

In [6]:
# model = Attention(16,
#             [16, 8],
#             dropout=0.0,
#             batchnorm=True,
#             return_scores=False)
# query = torch.ones(100, 2, 16)
# keys = torch.ones(100, 10, 16)
# keys_length = torch.ones(100)*10
# print(query.shape)
# print(keys.shape)
# print(keys_length.shape)
# model(query, keys, keys_length)
# Scratch check: show how zip pairs each sequence-feature column index with
# its target-feature column index (prints "1 2" then "3 4"). DIN.forward
# hard-codes the same two lists.
seq_column_index_list = [1, 3]
target_column_index_list = [2, 4]
for seq_column_index, target_column_index in zip(seq_column_index_list, target_column_index_list):
    print(seq_column_index, target_column_index)

1 2
3 4


In [7]:
import torch
import metaspore as ma
import copy
class DIN(torch.nn.Module):
    """
    Deep Interest Network on top of MetaSpore embeddings, with two attention
    groups.

    The embedding lookup yields ``feature_num`` (5) feature slots per sample.
    ``forward`` treats slot 0 as the user feature, slots 1 and 3 as
    variable-length sequence features, and slots 2 and 4 as the target features
    attended against sequences 1 and 3 respectively. The concatenation
    ``[user, pooled_1, target_1, pooled_3, target_4]`` (5 * embedding_size wide)
    feeds an MLP and a final Linear + sigmoid.
    """
    def __init__(self):
        super().__init__()
        self._embedding_size = 16
        # NOTE(review): _schema_dir is never read in this class; the schema
        # file names below are passed without it. Confirm this is intended.
        self._schema_dir = S3_ROOT_DIR + 'demo/movielens/schema/'

        self._column_name_path = 'column_schema_group_attention.txt'
        self._combine_schema_path = 'combine_column_schema_group_attention.txt'
        self._embedding_table = ma.EmbeddingLookup(self._embedding_size, self._column_name_path, self._combine_schema_path)
        self._embedding_table.updater = ma.FTRLTensorUpdater()
        self._embedding_table.initializer = ma.NormalTensorInitializer(var=0.01)

        # Attention settings: stored once and passed on, so the attributes and
        # the layer cannot drift apart.
        self.attention_hidden_layers = [16, 8]
        self.batchnorm = True
        self.return_score = False
        self._attention = Attention(self._embedding_size,
                                    hidden_layers=self.attention_hidden_layers,
                                    dropout=0.1,
                                    batchnorm=self.batchnorm,
                                    return_scores=self.return_score)

        # user_embedding + 2 * (sum_pooling + target_embedding)
        self.feature_num = 5
        total_input_size = self._embedding_size * self.feature_num
        mlp_hidden_layers = [32, 16]
        self.mlp = MLP(input_size=total_input_size,
                       hidden_layers=mlp_hidden_layers,
                       dropout=0.25, batchnorm=True, activation="dice")
        self.final_layer = torch.nn.Linear(mlp_hidden_layers[-1], 1)
        self.apply(init_weights)

        # NOTE(review): forward() never calls _sparse or _dense. They are kept
        # (created after apply(init_weights), as before) so the module's
        # registered tensors stay unchanged; confirm whether they can go.
        self._sparse = ma.EmbeddingSumConcat(self._embedding_size, self._column_name_path, self._combine_schema_path)
        self._sparse.updater = ma.FTRLTensorUpdater()
        self._sparse.initializer = ma.NormalTensorInitializer(var=0.01)
        self._dense = torch.nn.Sequential(
            ma.nn.Normalization(self._sparse.feature_count * self._embedding_size),
            torch.nn.Linear(self._sparse.feature_count * self._embedding_size, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 1),
        )

    def forward(self, x):
        """
        Score one batch.

        Parameters
        ----------
        x: batch input, passed unchanged to the embedding lookup.

        Returns
        -------
        [B, 1] tensor of sigmoid outputs.
        """
        x, offset = self._embedding_table(x)
        feature_num = self.feature_num
        pad_sequence = torch.nn.utils.rnn.pad_sequence

        # offset[i] is the first row in x of the i-th (sample, feature) slot;
        # the last slot runs to the end of x. Slots are laid out sample by
        # sample, feature_num per sample. Converting the offsets to Python
        # ints once avoids indexing with 0-d tensors in the loop.
        bounds = offset.tolist()
        bounds.append(x.shape[0])
        slots = [x[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

        # Slot 0 of every sample -> [B, 1, H] -> [B, H]. Only axis 1 is
        # squeezed so a batch of one sample keeps its batch axis.
        user_embedding = pad_sequence(slots[0::feature_num], batch_first=True).squeeze(1)
        batch_size = user_embedding.shape[0]

        # attention groups: (sequence slot, target slot)
        seq_column_index_list = [1, 3]
        target_column_index_list = [2, 4]
        parts = [user_embedding]
        for seq_column_index, target_column_index in zip(seq_column_index_list, target_column_index_list):
            seq_rows = slots[seq_column_index::feature_num]
            # True length of each sample's sequence, taken from the slices
            # themselves and created on the embeddings' device. [B]
            item_seq_length = torch.tensor(
                [rows.shape[0] for rows in seq_rows], device=x.device)
            item_seq_embedding = pad_sequence(seq_rows, batch_first=True)  # [B, T, H]

            target_item_embedding = pad_sequence(
                slots[target_column_index::feature_num], batch_first=True).squeeze(1)  # [B, H]

            sum_pooling = self._attention(target_item_embedding, item_seq_embedding, item_seq_length)
            # Keep the batch axis even if the attention layer collapsed it for
            # a single-sample batch.
            sum_pooling = sum_pooling.reshape(batch_size, -1)  # [B, H]
            parts.append(sum_pooling)
            parts.append(target_item_embedding)

        emb_concat = torch.cat(parts, dim=1)  # [B, feature_num * H]
        final_layer_inputs = self.mlp(emb_concat)  # [B, mlp_hidden_layers[-1]]
        output = self.final_layer(final_layer_inputs)  # [B, 1]
        return torch.sigmoid(output)

In [8]:
# Smoke test: build the model once in the notebook kernel. train() does not use
# this instance; it constructs its own from module_class.
module= DIN()

[32mloaded combine schema from[m [32mcolumn name file [m'column_schema_group_attention.txt' [32mand combine schema file [m'combine_column_schema_group_attention.txt'
[32mloaded combine schema from[m [32mcolumn name file [m'column_schema_group_attention.txt' [32mand combine schema file [m'combine_column_schema_group_attention.txt'


In [9]:
def train(local=True,
          batch_size=1000,
          worker_count=1,
          server_count=1,
          worker_cpu=1,
          server_cpu=1,
          worker_memory='5G',
          server_memory='5G',
          coordinator_memory='5G',
          module_class=None,
          model_in_path=None,
          model_out_path=None,
          model_export_path=None,
          model_version=None,
          experiment_name=None,
          input_label_column_index=0,
          delimiter='\001',
          train_dataset_path=None,
          test_dataset_path=None,
          is_catchup=True,
          consul_host=None,
          consul_port=None,
          consul_endpoint_prefix=None,
          max_sparse_feature_age=15,
          metric_update_interval=10,
         ):
    """
    Train ``module_class`` (default: ``DIN``) with a MetaSpore
    ``PyTorchEstimator`` on a parquet dataset, optionally evaluate AUC on a
    test parquet dataset, and optionally publish the model.

    Every setting is echoed to stdout before training starts.

    Parameters
    ----------
    train_dataset_path: required; location of the training parquet dataset.
    test_dataset_path: when not None, the fitted model is applied to this
        parquet dataset and ``test_auc`` is printed.
    is_catchup, consul_endpoint_prefix: the model is published only when
        ``is_catchup`` is falsy and the fitted model has a consul endpoint prefix.
    delimiter: only echoed; the datasets are read as parquet.
    Remaining arguments are forwarded to ``ma.PyTorchEstimator`` and
    ``ma.spark.get_session``.

    Raises
    ------
    ValueError: if ``train_dataset_path`` is missing or empty. Raised before
        any model or Spark session is created.
    """
    # Fail fast: without this check the missing path only surfaces inside
    # read.parquet, after the estimator and the Spark session exist.
    if not train_dataset_path:
        raise ValueError('train_dataset_path is required: '
                         'pass the location of the training parquet dataset')

    # Import the submodule explicitly: a bare `import pyspark` does not
    # guarantee that `pyspark.ml.evaluation` is loaded.
    import pyspark.ml.evaluation
    import metaspore as ma
    if module_class is None:
        module_class = DIN

    # (name, format, value): the format specifiers match the echo format this
    # function has always used.
    settings = (
        ('local', '%s', local),
        ('batch_size', '%d', batch_size),
        ('worker_count', '%d', worker_count),
        ('server_count', '%d', server_count),
        ('worker_cpu', '%d', worker_cpu),
        ('server_cpu', '%d', server_cpu),
        ('worker_memory', '%s', worker_memory),
        ('server_memory', '%s', server_memory),
        ('coordinator_memory', '%s', coordinator_memory),
        ('module_class', '%s', module_class),
        ('model_in_path', '%s', model_in_path),
        ('model_out_path', '%s', model_out_path),
        ('model_export_path', '%s', model_export_path),
        ('model_version', '%s', model_version),
        ('experiment_name', '%s', experiment_name),
        ('input_label_column_index', '%d', input_label_column_index),
        ('delimiter', '%r', delimiter),
        ('train_dataset_path', '%s', train_dataset_path),
        ('test_dataset_path', '%s', test_dataset_path),
        ('is_catchup', '%s', is_catchup),
        ('consul_host', '%s', consul_host),
        ('consul_port', '%s', consul_port),
        ('consul_endpoint_prefix', '%s', consul_endpoint_prefix),
        ('max_sparse_feature_age', '%d', max_sparse_feature_age),
        ('metric_update_interval', '%d', metric_update_interval),
    )
    for name, fmt, value in settings:
        print(('%s: ' + fmt) % (name, value))

    module = module_class()
    estimator = ma.PyTorchEstimator(module=module,
                                    worker_count=worker_count,
                                    server_count=server_count,
                                    model_in_path=model_in_path,
                                    model_out_path=model_out_path,
                                    model_export_path=model_export_path,
                                    model_version=model_version,
                                    experiment_name=experiment_name,
                                    input_label_column_index=input_label_column_index,
                                    consul_host=consul_host,
                                    consul_port=consul_port,
                                    consul_endpoint_prefix=consul_endpoint_prefix,
                                    max_sparse_feature_age=max_sparse_feature_age,
                                    metric_update_interval=metric_update_interval,
                                   )
    # Worker/server counts are read back from the estimator rather than from
    # the arguments.
    spark_session = ma.spark.get_session(local=local,
                                         batch_size=batch_size,
                                         worker_count=estimator.worker_count,
                                         server_count=estimator.server_count,
                                         worker_cpu=worker_cpu,
                                         server_cpu=server_cpu,
                                         worker_memory=worker_memory,
                                         server_memory=server_memory,
                                         coordinator_memory=coordinator_memory,
                                        )
    with spark_session:
        train_dataset = spark_session.read.parquet(train_dataset_path)
        model = estimator.fit(train_dataset)
        if test_dataset_path is not None:
            test_dataset = spark_session.read.parquet(test_dataset_path)
            result = model.transform(test_dataset)
            evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
            test_auc = evaluator.evaluate(result)
            print('test_auc: %g' % test_auc)
        if not is_catchup and model.consul_endpoint_prefix is not None:
            model.publish()

In [None]:
# Driver cell: train DIN on the MovieLens-1M group-attention parquet data under
# S3_ROOT_DIR and evaluate on the matching test split. All other train()
# arguments keep their defaults.
model_out_path = S3_ROOT_DIR + 'demo/output/dev/model_out/'
train_dataset_path = S3_ROOT_DIR + 'demo/movielens/1m/rank/train_DIN_group_attention.parquet'
test_dataset_path = S3_ROOT_DIR + 'demo/movielens/1m/rank/test_DIN_group_attention.parquet'
# Uncomment the next line to skip the test-set evaluation.
# test_dataset_path = None
train(model_out_path=model_out_path,
      train_dataset_path=train_dataset_path,
      test_dataset_path=test_dataset_path)

local: True
batch_size: 1000
worker_count: 1
server_count: 1
worker_cpu: 1
server_cpu: 1
worker_memory: 5G
server_memory: 5G
coordinator_memory: 5G
module_class: <class '__main__.DIN'>
model_in_path: None
model_out_path: s3://dmetasoul-bucket/demo/output/dev/model_out/
model_export_path: None
model_version: None
experiment_name: None
input_label_column_index: 0
delimiter: '\x01'
train_dataset_path: s3://dmetasoul-bucket/demo/movielens/1m/rank/train_DIN_group_attention.parquet
test_dataset_path: s3://dmetasoul-bucket/demo/movielens/1m/rank/test_DIN_group_attention.parquet
is_catchup: True
consul_host: None
consul_port: None
consul_endpoint_prefix: None
max_sparse_feature_age: 15
metric_update_interval: 10
[32mloaded combine schema from[m [32mcolumn name file [m'column_schema_group_attention.txt' [32mand combine schema file [m'combine_column_schema_group_attention.txt'
[32mloaded combine schema from[m [32mcolumn name file [m'column_schema_group_attention.txt' [32mand combine s

22/08/24 08:02:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/24 08:02:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                