In [4]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [5]:
ratings_df = pd.read_csv('subsample.csv')

In [6]:
user_map = {id: i for i, id in enumerate(ratings_df.user_id.unique())}
track_map = {id: i for i, id in enumerate(ratings_df.track_id.unique())}

ratings_df['user_id'] = ratings_df['user_id'].replace(user_map)
ratings_df['track_id'] = ratings_df['track_id'].replace(track_map)

In [7]:
ratings_df_train, ratings_df_test = train_test_split(ratings_df,
                                   stratify=ratings_df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(ratings_df_train))
print('# interactions on Test set: %d' % len(ratings_df_test))

# interactions on Train set: 499252
# interactions on Test set: 124813


In [6]:
train, val = train_test_split(ratings_df_train,
                                   stratify=ratings_df_train['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(train))
print('# interactions on Val set: %d' % len(val))

# interactions on Train set: 399401
# interactions on Val set: 99851


In [7]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # initializing our matrices with a positive number generally will yield better results
        self.user_emb.weight.data.uniform_(0, 0.5)
        self.item_emb.weight.data.uniform_(0, 0.5)
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)  # taking the dot product

## Подбор лучшей функции потерь

In [11]:
def validation_score(model, val_data, criterion, verbose):
    model.eval()
    usernames = torch.LongTensor(val_data.user_id.values)
    tracks = torch.LongTensor(val_data.track_id.values)
    ratings = torch.FloatTensor(val_data['rating'].values)
    y_hat = model(usernames, tracks)
    loss = criterion(y_hat, ratings)
    if verbose:
      print("val loss %.3f " % loss.item())
    return loss.item()

def train_model(model, train_data, val_data, optimizer,
                criterion=F.mse_loss, epochs=10, verbose=True):
    model.train()
    for i in range(epochs):
        usernames = torch.LongTensor(train_data.user_id.values)
        tracks = torch.LongTensor(train_data.track_id.values)
        ratings = torch.FloatTensor(train_data['rating'].values)
        y_hat = model(usernames, tracks)
        loss = criterion(y_hat, ratings)
        optimizer.zero_grad()  # reset gradient
        loss.backward()
        optimizer.step()
        if verbose:
          print("train loss %.3f " % loss.item())
        val_score = validation_score(model, val_data, criterion, verbose)
    return val_score

In [15]:
def validation_score_bce(model, val_data, criterion, verbose):
    model.eval()
    usernames = torch.LongTensor(val_data.user_id.values)
    tracks = torch.LongTensor(val_data.track_id.values)
    output_pos = model(usernames, tracks)
    output_neg = model(usernames, tracks[torch.randperm(tracks.shape[0])])
    accuracy_batch = torch.cat([output_pos.sigmoid() > 0.5, output_neg.sigmoid() < 0.5]).float().mean()
    if verbose:
      print("val accuracy %.3f " % accuracy_batch)
      print("pos accuracy %.3f " % (output_pos.sigmoid() > 0.5).float().mean())
      print("neg accuracy %.3f " % (output_neg.sigmoid() < 0.5).float().mean())
    return accuracy_batch

def train_model_bce(model, train_data, val_data, optimizer,
                criterion=nn.BCEWithLogitsLoss(), epochs=10, verbose=True):
    model.train()
    for i in range(epochs):
        usernames = torch.LongTensor(train_data.user_id.values)
        tracks = torch.LongTensor(train_data.track_id.values)
        
        output_pos = model(usernames, tracks)
        output_neg = model(usernames, tracks[torch.randperm(tracks.shape[0])])
        
        output = torch.cat([output_pos, output_neg])
        targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_pos)])

        loss = criterion(output, targets)
        optimizer.zero_grad()  # reset gradient
        loss.backward()
        optimizer.step()
        if verbose:
          print("train loss %.3f " % loss.item())
        val_score = validation_score_bce(model, val_data, criterion, verbose)
    return val_score

In [10]:
n, m = len(ratings_df.user_id.unique()), len(ratings_df.track_id.unique())

In [12]:
model = MF(n, m, emb_size=100)
optimizer = torch.optim.Adam(model.parameters())
train_model(model, train, val, optimizer)

train loss 38.578 
val loss 37.956 
train loss 37.961 
val loss 37.347 
train loss 37.352 
val loss 36.745 
train loss 36.750 
val loss 36.151 
train loss 36.156 
val loss 35.564 
train loss 35.569 
val loss 34.986 
train loss 34.991 
val loss 34.415 
train loss 34.421 
val loss 33.853 
train loss 33.858 
val loss 33.298 
train loss 33.303 
val loss 32.751 


32.751495361328125

In [16]:
model = MF(n, m, emb_size=100)
optimizer = torch.optim.Adam(model.parameters())
train_model_bce(model, train, val, optimizer)

train loss 3.131 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 3.106 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 3.081 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 3.056 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 3.032 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 3.007 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 2.983 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 2.959 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 2.935 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 
train loss 2.911 
val accuracy 0.500 
pos accuracy 1.000 
neg accuracy 0.000 


tensor(0.5000)

BCELoss вероятно нам не подходит, потому что сигмоида от перемешанных эмбеддингов меньше 0.5, а для не перемешанных больше 0.5. Будем использовать MSELoss.

## Подбор оптимизатора

In [None]:
model = MF(n, m, emb_size=100)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
train_model(model, train, val, optimizer)

train loss 38.451 
val loss 38.419 
train loss 38.438 
val loss 38.395 
train loss 38.414 
val loss 38.361 
train loss 38.379 
val loss 38.317 
train loss 38.336 
val loss 38.265 
train loss 38.283 
val loss 38.206 
train loss 38.224 
val loss 38.140 
train loss 38.158 
val loss 38.068 
train loss 38.085 
val loss 37.990 
train loss 38.008 
val loss 37.909 


In [None]:
model = MF(n, m, emb_size=100)
optimizer = torch.optim.RMSprop(model.parameters())
train_model(model, train, val, optimizer)

train loss 38.410 
val loss 4.877 
train loss 4.880 
val loss 2.656 
train loss 2.654 
val loss 1.720 
train loss 1.715 
val loss 1.220 
train loss 1.214 
val loss 0.919 
train loss 0.911 
val loss 0.721 
train loss 0.713 
val loss 0.584 
train loss 0.575 
val loss 0.485 
train loss 0.476 
val loss 0.411 
train loss 0.401 
val loss 0.354 


0.35381805896759033

In [None]:
model = MF(n, m, emb_size=100)
optimizer = torch.optim.Adadelta(model.parameters())
train_model(model, train, val, optimizer)

train loss 38.665 
val loss 37.744 
train loss 37.742 
val loss 36.841 
train loss 36.838 
val loss 35.961 
train loss 35.954 
val loss 35.103 
train loss 35.094 
val loss 34.268 
train loss 34.257 
val loss 33.458 
train loss 33.444 
val loss 32.671 
train loss 32.655 
val loss 31.907 
train loss 31.890 
val loss 31.166 
train loss 31.147 
val loss 30.447 


Лучший оптимизатор - RMSprop

## Подбор размера эмбеддинга, lr, weight_decay и количества эпох

In [None]:
for emb in range(10, 110, 10):
  model = MF(n, m, emb_size=emb)
  optimizer = torch.optim.RMSprop(model.parameters())
  mse = train_model(model, train, val, optimizer, verbose=False)
  print(f'emb_size={emb}, mse={mse}')

emb_size=10, mse=0.013544056564569473
emb_size=20, mse=0.024569591507315636
emb_size=30, mse=0.04286257177591324
emb_size=40, mse=0.06701702624559402
emb_size=50, mse=0.09746062010526657
emb_size=60, mse=0.13457538187503815
emb_size=70, mse=0.17908911406993866
emb_size=80, mse=0.23461976647377014
emb_size=90, mse=0.2910347282886505
emb_size=100, mse=0.35464444756507874


In [None]:
for lr in (0.1, 0.01, 0.001, 0.0001, 0.00001):
  model = MF(n, m, emb_size=100)
  optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
  mse = train_model(model, train, val, optimizer, verbose=False)
  print(f'lr={lr}, mse={mse}')

lr=0.1, mse=2.7183213233947754
lr=0.01, mse=0.35556358098983765
lr=0.001, mse=17.180782318115234
lr=0.0001, mse=35.4277458190918
lr=1e-05, mse=38.11247634887695


In [None]:
for wd in (0.1, 0.01, 0.001, 0.0001, 0.00001, 0):
  model = MF(n, m, emb_size=100)
  optimizer = torch.optim.RMSprop(model.parameters(), weight_decay=wd)
  mse = train_model(model, train, val, optimizer, verbose=False)
  print(f'wd={wd}, mse={mse}')

wd=0.1, mse=0.011228643357753754
wd=0.01, mse=0.016448549926280975
wd=0.001, mse=0.11904768645763397
wd=0.0001, mse=0.3018614947795868
wd=1e-05, mse=0.3485008776187897
wd=0, mse=0.35651886463165283


In [None]:
for num_epochs in (5, 10, 20, 30):
  model = MF(n, m, emb_size=100)
  optimizer = torch.optim.RMSprop(model.parameters(), weight_decay=0.01)
  mse = train_model(model, train, val, optimizer, epochs=num_epochs, verbose=False)
  print(f'num_epochs={num_epochs}, mse={mse}')

num_epochs=5, mse=0.2205890268087387
num_epochs=10, mse=0.016256045550107956
num_epochs=20, mse=0.014443607069551945
num_epochs=30, mse=0.01558822114020586


При подборе гиперпараметров возник парадокс - лучшие с точки зрения mse параметры показывали не самые лучшие метрики recall@10 и precision@10 на тестовых данных. По этой причине было решено делать выводы на основе recall@10 и precision@10 на тестовых данных, для улучшения которых нужно изменить weight_decay и количества эпох. Ниже приведены лучшие результаты на тестовых данных.

## Результаты на тестовых данных

In [None]:
model = MF(len(ratings_df.user_id.unique()), len(ratings_df.track_id.unique()), emb_size=100)
optimizer = torch.optim.RMSprop(model.parameters(), weight_decay=0.01)
train_model(model, ratings_df_train, ratings_df_test, optimizer, epochs=20)

train loss 38.634 
val loss 4.927 
train loss 4.928 
val loss 1.904 
train loss 1.904 
val loss 0.863 
train loss 0.863 
val loss 0.424 
train loss 0.425 
val loss 0.219 
train loss 0.219 
val loss 0.117 
train loss 0.117 
val loss 0.064 
train loss 0.065 
val loss 0.037 
train loss 0.037 
val loss 0.023 
train loss 0.023 
val loss 0.016 
train loss 0.016 
val loss 0.013 
train loss 0.013 
val loss 0.012 
train loss 0.012 
val loss 0.012 
train loss 0.012 
val loss 0.012 
train loss 0.012 
val loss 0.012 
train loss 0.012 
val loss 0.013 
train loss 0.013 
val loss 0.013 
train loss 0.013 
val loss 0.014 
train loss 0.014 
val loss 0.014 
train loss 0.014 
val loss 0.014 


0.014441419392824173

In [None]:
interactions_df = (
    ratings_df_train.groupby("user_id")["track_id"]
    .agg(lambda x: list(x))
    .reset_index()
    .rename(columns={"track_id": "true_train"})
    .set_index("user_id")
)
interactions_df["true_test"] = ratings_df_test.groupby("user_id")["track_id"].agg(
    lambda x: list(x)
)
interactions_df.loc[pd.isnull(interactions_df.true_test), "true_test"] = [
    [""]
    for x in range(
        len(interactions_df.loc[pd.isnull(interactions_df.true_test), "true_test"])
    )
]


In [None]:
user_list = []
track_list = []
tracks = sorted(ratings_df_test['track_id'].unique().tolist())
for user_id in ratings_df_test['user_id'].unique():
  user = torch.tensor(user_id)
  tracks_tensor = torch.tensor(tracks)
  predictions = model(user, tracks_tensor).detach().numpy()
  user_list.extend([user_id] * 10)
  track_list.extend(predictions.argsort()[-10:][::-1])

prediction_df = pd.DataFrame({'user_id': user_list, 'track_id': track_list})
interactions_df["prediction_nn"] = prediction_df.groupby("user_id")["track_id"].agg(
    lambda x: list(x)
)

In [None]:
(
interactions_df.apply(
        lambda row: len(
            set(row["true_test"]).intersection(set(row["prediction_nn"][:10]))
        )
        / len(row["prediction_nn"][:10]),
        axis=1,
    )
).mean()

0.042599277978339345

In [None]:
(
interactions_df.apply(
        lambda row: len(
            set(row["true_test"]).intersection(set(row["prediction_nn"][:10]))
        )
        / len(row["true_test"])
        + 0.001,
        axis=1,
    )
).mean()

0.007707242107268697

recall@10 и precision@10 для нейронной сети меньше бейзлайна почти в два раза, в качестве основных моделей сервиса решено использовать lightfm и svd