In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as scs
import matplotlib.pyplot as plt
import sqlite3
import json
import joblib
import pickle
import warnings
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import collections
import numpy as np
import os
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from utils.data import MatchDataGenerator, df_to_dict
from utils.basic_layers import MLP, EmbeddingLayer
from utils.features import SparseFeature, SequenceFeature
from utils.match import Annoy, generate_seq_feature_match, gen_model_input
from utils.metrics import topk_metrics
from utils.trainer import MatchTrainer

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
torch.manual_seed(42);

In [2]:
data = pd.read_csv('train_10k.csv')


In [3]:
data = data.drop(columns=['release', 'artist_name', 'artist_country', 'artist_city', 'year', 'title', 'genre'])

In [4]:
data.head()

Unnamed: 0,user_id,song_id,play_count
0,6fbf6970611d01e10aebeab374f461116155867e,SOPVPCY12A81C23555,1
1,fa8a8753518e6c2d3713990dc2a172ea17000b80,SOBSMEQ12AB018282F,1
2,c9fdf63587a7a963e383ea2f1b58d1014377caab,SONQEYS12AF72AABC9,1
3,e329cc2012d31242297d294fa0279b79a1bd5cc7,SOHTAXD12A8C141E75,1
4,2a9178398fa6377a340d5b9b6be87de32b4059a2,SOAWWJW12AB01814F5,2


In [6]:
# data["cat_id"] = data["genres"].apply(lambda x: x.split("|")[0])
user_col, item_col = "user_id", "song_id"
sparse_features = ['user_id', 'song_id']
# sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', "cat_id"]

In [7]:
save_dir = './saved/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
# print(f'Before encoding: \n {data[sparse_features].tail()}')

feature_max_idx = {}
for feature in sparse_features:
    encoder = LabelEncoder()
    data[feature] = encoder.fit_transform(data[feature]) + 1 # лучше энкодить не с 0, особенно в sequential NN
    feature_max_idx[feature] = data[feature].max() + 1
    if feature == user_col:
        user_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(encoder.classes_)}
    if feature == item_col:
        item_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(encoder.classes_)}
np.save(save_dir + "raw_id_maps.npy", (user_map, item_map))

# print(f'After encoding: \n {data[sparse_features].tail()}')

In [8]:
# user_cols = ["user_id", "gender", "age", "occupation", "zip"]
user_cols = ["user_id"]

item_cols = ['song_id']
user_profile = data[user_cols].drop_duplicates('user_id')
item_profile = data[item_cols].drop_duplicates('song_id')

In [9]:
df_train, df_test = generate_seq_feature_match(data,
                                               user_col,
                                               item_col,
                                               item_attribute_cols=[],
                                               sample_method=1,
                                               mode=0,
                                               neg_ratio=3,
                                               min_item=3)

generate sequence features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 1107.96it/s]


n_train: 39440, n_test: 70
0 cold start users droped 


In [10]:
x_train = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)
x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)
y_train = x_train["label"]
y_test = x_test["label"]

In [11]:
user_features = [
    SparseFeature(feature_name, vocab_size=feature_max_idx[feature_name], embed_dim=16) for feature_name in user_cols
]

user_features += [
    SequenceFeature("hist_song_id",
                    vocab_size=feature_max_idx["song_id"],
                    embed_dim=16,
                    pooling="mean",
                    shared_with="song_id")
]

item_features = [
    SparseFeature(feature_name, vocab_size=feature_max_idx[feature_name], embed_dim=16) for feature_name in item_cols
]

In [12]:
all_item = df_to_dict(item_profile)
test_user = x_test
data_generator = MatchDataGenerator(x=x_train, y=y_train)
train_dl, test_dl, item_dl = data_generator.generate_dataloader(test_user, all_item, batch_size=128)

In [14]:

class DSSM(torch.nn.Module):
    """Deep Structured Semantic Model
    Args:
        user_features (list[Feature Class]): training by the user tower module.
        item_features (list[Feature Class]): training by the item tower module.
        temperature (float): temperature factor for similarity score, default to 1.0.
        user_params (dict): the params of the User Tower module, 
        keys include:`{"dims":list, "activation":str, "dropout":float, "output_layer":bool`}.
        item_params (dict): the params of the Item Tower module, keys include:`{"dims":list, "activation":str, "dropout":float, "output_layer":bool`}.
    """

    def __init__(self, user_features, item_features, user_params, item_params, temperature=1.0):
        super().__init__()
        self.user_features = user_features
        self.item_features = item_features
        self.temperature = temperature
        self.user_dims = sum([f.embed_dim for f in user_features])
        self.item_dims = sum([f.embed_dim for f in item_features])

        self.embedding = EmbeddingLayer(user_features + item_features)
        self.user_mlp = MLP(self.user_dims, output_layer=False, **user_params)
        self.item_mlp = MLP(self.item_dims, output_layer=False, **item_params)
        self.mode = None

    def forward(self, x):
        item_embedding = self.item_tower(x)
        user_embedding = self.user_tower(x)
        # item_embedding = self.item_tower(x)
        if self.mode == "user":
            return user_embedding
        if self.mode == "item":
            return item_embedding
        # y = torch.mul(user_embedding, item_embedding).sum(dim=2)
        # y = y / self.temperature
        y = F.cosine_similarity(item_embedding, user_embedding, dim=1)
        return y

    def item_tower(self, x):
        if self.mode == "user":
            return None
        # Какая тут размерность? 
        # print(x['song_id'].shape)
        # print(self.item_features)
        input_item = self.embedding(x, self.item_features, squeeze_dim=True)
        item_embedding = self.item_mlp(input_item)
        item_embedding = F.normalize(item_embedding, p=2, dim=1)
        return item_embedding
    
    def user_tower(self, x):
        if self.mode == "item":
            return None
        # print(x['user_id'].shape)
        input_user = self.embedding(x, self.user_features, squeeze_dim=True)
        user_embedding = self.user_mlp(input_user)
        user_embedding = F.normalize(user_embedding, p=2, dim=1)
        return user_embedding

In [15]:
model = DSSM(user_features,
             item_features,
             temperature=0.02,
             user_params={
                 "dims": [256, 128, 64],
                 "activation": 'prelu',
             },
             item_params={
                 "dims": [256, 128, 64],
                 "activation": 'prelu',
             })

trainer = MatchTrainer(model,
                       mode=0,
                       optimizer_params={
                           "lr": 1e-2,
                           "weight_decay": 1e-5
                       },
                       n_epoch=3,
                       device='cpu')


trainer.fit(train_dl)

epoch: 0


train:   0%|                                                                                                                                                                       | 0/309 [00:00<?, ?it/s]


RuntimeError: all elements of input should be between 0 and 1