In [None]:
# !pip install pytorch-lightning==1.1.0rc1

In [None]:
# !wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip ml-1m.zip

In [4]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import multiprocessing
from collections import namedtuple
from lightfm import LightFM
from IPython.display import display

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

Данные из 1ого дз: movie lens 1m. Из explicit делаем implicit данные, считая что 4-5 explicit = 1 implicit. 
<br>
Из https://arxiv.org/abs/1708.05031 :
* В качестве метрик используем метрики: Hit Ratio (HR) и Normalized Discounted Cumulative Gain (NDCG) все @k. HR@k дает скор в зависимости от того, есть ли positive айтем в top-k. Чем ближе positive айтем к 1-ому месту в полученном ранжировании, тем больший скор дает NDCG@k. Один семпл: 1 pos + 99 neg айтемов, метрики считаем по отранжированному списку длины 10.
* В test set выделяем 1 предпоследний(не последний, иначе в train вообще не попадет айтем с самым большим индексом) positive айтем для каждого пользователя.

####  Reading data, explicit -> implicit

In [5]:
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')

movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')

ratings = ratings.loc[(ratings['rating'] >= 4)]

#### Sparse matrix

In [6]:
def to_csr(data, rows, cols):
    user_item = sp.coo_matrix((data, (rows, cols)))
    user_item_csr = user_item.tocsr()
    return user_item_csr

In [7]:
users = ratings['user_id']
movies = ratings['movie_id']
ratings_sparse = to_csr(np.ones_like(users), users, movies)

#### Dict {user_id: (pos_train_ids, pos_test_id, neg_ids)}

In [8]:
def pos_neg_dataset(ratings):
    user_ids = np.unique(users.values)
    movie_ids = np.unique(movies.values)
    User = namedtuple('User', 'pos_train, pos_test, neg')
    Ds = dict()
    
    for u in user_ids:
        user_rating = ratings[u]
        rated = list(user_rating.indices)
        if len(rated) > 1:
            not_rated = list(np.setdiff1d(movie_ids, rated))
            user = User([*rated[:-2], rated[-1]], rated[-2], not_rated)
            Ds[u] = user
    return Ds

In [9]:
Ds = pos_neg_dataset(ratings_sparse)

#### Sparce train matrix, test set 

In [10]:
user_train = np.hstack([np.full(len(Ds[user_id].pos_train), user_id) for user_id in Ds.keys()])
user_test = np.array(list(Ds.keys()))

item_train = np.hstack([np.array(Ds[user_id].pos_train) for user_id in Ds.keys()])
item_test = np.array([Ds[user_id].pos_test for user_id in Ds.keys()])

ratings_train_sparse = to_csr(np.ones_like(user_train), user_train, item_train)

#### Recommender + metrics

In [11]:
def title_by_id(id):
    return movie_info[movie_info['movie_id'] == id]['name'].item()

def category_by_id(id):
    return movie_info[movie_info['movie_id'] == id]['category'].item()

def rec_print(title_list, category_list):
    display(pd.DataFrame(zip(title_list, category_list), columns=['Title', 'Category']))
    
    
class Recommender:
    def __init__(self, model, ratings, pos_neg_ds, item_embs, preprocess=None):
        self.model = model
        self.Ds = pos_neg_ds
        self.ratings = ratings
        self.item_embs = item_embs
        self.preprocess = preprocess
             
    def similar_movies(self, movie_id=1, k=10):
        title = title_by_id(movie_id)
        category = category_by_id(movie_id)
        print('For movie')
        rec_print([title], [category])
        print('similars are')

        score = cosine_similarity(np.expand_dims(self.item_embs[movie_id], axis=0), self.item_embs)[0]
        ranked_similars = np.argsort(score)[::-1]
        top_k_ids = [s for s in ranked_similars if s != movie_id and s in movie_info['movie_id'].values][:k]
        top_k_titles = [title_by_id(r) for r in top_k_ids]
        top_k_categories = [category_by_id(r) for r in top_k_ids]
        rec_print(top_k_titles, top_k_categories)
        
    def prediction_for_user(self, user_id=4):
        real_movie_id = self.ratings[user_id].indices   
        print('--- User\'s choice ---')
        real_titles = [title_by_id(r) for r in real_movie_id[:10]]
        real_category = [category_by_id(r) for r in real_movie_id[:10]]
        rec_print(real_titles, real_category)
        print()

        print('--- Our recommendations ---')
        user_id = np.full((len(self.item_embs),), user_id)
        item_id = np.arange(len(self.item_embs))
        
        if self.preprocess:
            user_id = self.preprocess(user_id)
            item_id = self.preprocess(item_id)
            
        user_pred = self.model.predict(user_id, item_id)
        ranked_movie_id = np.argsort(user_pred)[::-1]
        not_rated_movie_id = [i for i in ranked_movie_id if i not in real_movie_id and i in movie_info['movie_id'].values]
        pred_titles = [title_by_id(r) for r in not_rated_movie_id[:10]]
        pred_category = [category_by_id(r) for r in not_rated_movie_id[:10]]
        rec_print(pred_titles, pred_category)
    
    def one_sample_metric(self, user_id, pos_item, k=10):
        np.random.shuffle(self.Ds[user_id].neg)
        neg_items = self.Ds[user_id].neg[:99]
        items = np.array([pos_item, *neg_items])
        y_target = np.array([1, *[0] * 99])
        perm = np.random.permutation(len(items))
        items = items[perm]
        y_target = y_target[perm]
        user_id = np.full((items.shape), user_id)
        
        if self.preprocess:
            user_id = self.preprocess(user_id)
            items = self.preprocess(items)
        
        y_pred = self.model.predict(user_id, items)

        if self.preprocess:
            items = items.detach().cpu().numpy()
            
        top_k_items = items[np.argsort(y_pred)[-k:]]
        hr = int(pos_item in top_k_items)
        ndcg = ndcg_score([y_target], [y_pred], k=k)
        return hr, ndcg

    def compute_metrics(self, user_test, item_test):
        hr_list = []
        ndcg_list = []
        for u, i in zip(user_test, item_test):
            hr, ndcg = self.one_sample_metric(u, i)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
        print('Avg hr: {0:.3f}'.format(np.mean(hr_list)))
        print('Avg ndcg: {0:.3f}'.format(np.mean(ndcg_list)))

## 1. WARP

In [None]:
warp = LightFM(no_components=64, loss='warp', max_sampled=200)
warp.fit(ratings_train_sparse, epochs=30)

U, V = warp.user_embeddings, warp.item_embeddings
warp_recommender = Recommender(warp, ratings_sparse, Ds, V)
warp_recommender.compute_metrics(user_test, item_test)

Avg hr: 0.695
Avg ndcg: 0.414


#### похожие фильмы

In [None]:
warp_recommender.similar_movies()

For movie


Unnamed: 0,Title,Category
0,Toy Story (1995),Animation|Children's|Comedy


similars are


Unnamed: 0,Title,Category
0,Toy Story 2 (1999),Animation|Children's|Comedy
1,Babe (1995),Children's|Comedy|Drama
2,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical
3,Beauty and the Beast (1991),Animation|Children's|Musical
4,"Bug's Life, A (1998)",Animation|Children's|Comedy
5,Antz (1998),Animation|Children's
6,Babe: Pig in the City (1998),Children's|Comedy
7,"Hunchback of Notre Dame, The (1996)",Animation|Children's|Musical
8,Mulan (1998),Animation|Children's
9,Aladdin (1992),Animation|Children's|Comedy|Musical


#### предсказания для пользователя

In [None]:
warp_recommender.prediction_for_user()

--- User's choice ---


Unnamed: 0,Title,Category
0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
1,Jurassic Park (1993),Action|Adventure|Sci-Fi
2,Die Hard (1988),Action|Thriller
3,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
4,Raiders of the Lost Ark (1981),Action|Adventure
5,"Good, The Bad and The Ugly, The (1966)",Action|Western
6,Alien (1979),Action|Horror|Sci-Fi|Thriller
7,"Terminator, The (1984)",Action|Sci-Fi|Thriller
8,Jaws (1975),Action|Horror
9,Rocky (1976),Action|Drama



--- Our recommendations ---


Unnamed: 0,Title,Category
0,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
1,"Godfather, The (1972)",Action|Crime|Drama
2,Braveheart (1995),Action|Drama|War
3,Aliens (1986),Action|Sci-Fi|Thriller|War
4,"French Connection, The (1971)",Action|Crime|Drama|Thriller
5,"Fugitive, The (1993)",Action|Thriller
6,"Matrix, The (1999)",Action|Sci-Fi|Thriller
7,"Thing, The (1982)",Action|Horror|Sci-Fi|Thriller
8,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
9,Indiana Jones and the Last Crusade (1989),Action|Adventure


Рекомендации и симилары хорошие. Метрики сравним с другми моделями.

## 2.  NCF

NCF = Generalized Matrix Factorization(GMF) + MLP. В GMF elementwise перемножаются эмбеддинги юзера и айтема.
По полученному эмбеддингу делают предсказание. В MLP конкатенируются эмбеддинги юзера и айтема и пропускаются
через нейросеть. GMF и MLP предобучаются отдельно и на разных эмбеддингах. В NCF эмбеддинги пропускаются через соответствующие модели 
и общее предсказание делается по сконкатенированному выходу из этих моделей. Слой перед сигмоидой берет веса GMF и MLP с весами. В статье веса =0.5. Результаты лучше с весом=1.
<br>
<br>
Train, val датасеты составляем, так, чтобы на каждый positive айтем приходилось 3 negative(в статье рекомендуют 3-6). Val используем, чтобы вовремя останавливать обучение.

In [None]:
class GMF(nn.Module):
    def __init__(self, n_users, n_items, embed_dim):
        super().__init__()
        self.user_embedder = nn.Embedding(n_users, embed_dim)
        self.item_embedder = nn.Embedding(n_items, embed_dim)
        self.out = nn.Sequential(nn.Linear(embed_dim, 1),
                                 nn.Sigmoid())
        
    def forward(self, user_idx, item_idx, return_embs=False):
        user_embs = self.user_embedder(user_idx)
        item_embs = self.item_embedder(item_idx)
        el_prod = torch.mul(user_embs, item_embs)
        if return_embs:
            return el_prod
        y = self.out(el_prod)
        return y
    
        
class MLP(nn.Module):
    def __init__(self, n_users, n_items, embed_dim, hidden_dims):
        super().__init__()
        assert 2 * embed_dim == hidden_dims[0]
        self.user_embedder = nn.Embedding(n_users, embed_dim)
        self.item_embedder = nn.Embedding(n_items, embed_dim)
        
        self.net = []
        all_dims = [2 * embed_dim, *hidden_dims] 
        for i, dim in enumerate(all_dims[:-1]):
            self.net.extend([nn.Linear(all_dims[i], all_dims[i + 1]), nn.ReLU()])
        self.net = nn.Sequential(*self.net)
        
        self.out = nn.Sequential(nn.Linear(all_dims[-1], 1),
                                 nn.Sigmoid())
        
    def forward(self, user_idx, item_idx, return_embs=False):
        user_embs = self.user_embedder(user_idx)
        item_embs = self.item_embedder(item_idx)
        embs = torch.cat([user_embs, item_embs], dim=-1) 
        embs = self.net(embs)
        if return_embs:
            return embs
        y = self.out(embs)
        return y
            
        
class NCF(nn.Module):
    def __init__(self, gmf_params, mlp_params, alpha, gmf_ckpt=None, mlp_ckpt=None):
        super().__init__()
        self.gmf = GMF(**gmf_params)
        self.mlp = MLP(**mlp_params)
        hidden_dim = gmf_params['embed_dim'] + mlp_params['hidden_dims'][-1]
        self.out = nn.Sequential(nn.Linear(hidden_dim, 1),
                                 nn.Sigmoid())
        self.alpha = alpha
        
        if gmf_ckpt and mlp_ckpt:
            self.load_weights(gmf_ckpt, mlp_ckpt) 
            
    def forward(self, user_idx, item_idx):
        emb_gmf = self.gmf(user_idx, item_idx, return_embs=True).squeeze(1)
        emb_mlp = self.mlp(user_idx, item_idx, return_embs=True)
        emb = torch.cat([emb_gmf, emb_mlp], dim=-1)   
        y = self.out(emb)
        return y  

    def load_weights(self, gmf_ckpt, mlp_ckpt):
        gmf_state_dict = torch.load(gmf_ckpt)['state_dict']
        mlp_state_dict = torch.load(mlp_ckpt)['state_dict']

        for key in list(gmf_state_dict.keys()):
            new_key = key[key.index('.') + 1:]
            self.gmf.state_dict()[new_key] = gmf_state_dict.pop(key) 

        for key in list(mlp_state_dict.keys()):
            new_key = key[key.index('.') + 1:]
            self.mlp.state_dict()[new_key] = mlp_state_dict.pop(key) 

        last_layer_weight = torch.cat([self.alpha * self.gmf.out[0].weight.data, 
                                    (1 - self.alpha) * self.mlp.out[0].weight.data], dim=-1)
        last_layer_bias = self.alpha * self.gmf.out[0].bias.data + (1 - self.alpha) * self.mlp.out[0].bias.data
        with torch.no_grad():
            self.out[0].weight.copy_(last_layer_weight)
            self.out[0].bias.copy_(last_layer_bias) 
        print('Weights for GMF and MLP loaded')
    
    def predict(self, user_idx, item_idx):
        y_pred = self(user_idx, item_idx)
        return y_pred.detach().cpu().numpy().reshape(-1,)

#### Data preprocessing

In [None]:
user_pos_train = torch.LongTensor(user_train)
item_pos_train = torch.LongTensor(item_train)

# Train & val: 1 pos + 3 neg
neg_len = [min(len(Ds[user_id].neg), 3 * len(Ds[user_id].pos_train)) for user_id in Ds.keys()]
user_neg_train = [torch.full((neg_len[i], ), user_id) for i, user_id in enumerate(Ds.keys())]
item_neg_train = []
for i, user_id in enumerate(Ds.keys()):
    np.random.shuffle(Ds[user_id].neg)
    item_neg_train.append(torch.tensor(Ds[user_id].neg)[:neg_len[i]])
    
# Dataloaders for train & val: 90% of left data - train, 10% of left data - val
user_neg_train = torch.cat(user_neg_train, dim=0)
item_neg_train = torch.cat(item_neg_train, dim=0)
y_pos = torch.ones_like(user_pos_train)
y_neg = torch.zeros_like(user_neg_train)

user_train = torch.cat([user_pos_train, user_neg_train], dim=0)
item_train = torch.cat([item_pos_train, item_neg_train], dim=0)
y = torch.cat([y_pos, y_neg], dim=0)

random_sample = torch.randperm(len(user_train))
user_train = user_train[random_sample]
item_train = item_train[random_sample]
y = y[random_sample]

user_train, user_val, item_train, item_val, y_train, y_val = train_test_split(user_train, 
                                                                              item_train, y,
                                                                              test_size=0.1)
train_dataset = TensorDataset(user_train, item_train, y_train)
val_dataset = TensorDataset(user_val, item_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, 
                          num_workers=multiprocessing.cpu_count())
val_loader = DataLoader(val_dataset, batch_size=1024, num_workers=multiprocessing.cpu_count())

In [None]:
class Learner(pl.LightningModule):
    def __init__(self, model, params, lr=1e-3, weight_decay=0):
        super().__init__()
        self.model = model(**params)
        self.lr = lr 
        self.weight_decay = weight_decay
        self.loss_fn = nn.BCELoss()  
        
    def forward(self, user_idx, item_idx):
        return self.model(user_idx, item_idx)
    
    def training_step(self, batch, *args):
        user_idx, item_idx, y_target = batch
        y_pred = self(user_idx, item_idx)
        loss = self.loss_fn(y_pred, y_target.unsqueeze(1).to(torch.float32))
        return {'loss': loss}      
    
    def validation_step(self, batch, *args):
        user_idx, item_idx, y_target = batch
        y_pred = self(user_idx, item_idx)
        loss = self.loss_fn(y_pred, y_target.unsqueeze(1).to(torch.float32))
        return {'val_loss': loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'val_loss': avg_loss}
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay)

Гипераметры, которые в статье указаны, как наилучишие. Размеры эмбеддингов - 64, в MLP 4 слоя, каждый следующий меньше предыдущего в 2 раза.

In [None]:
n_users = np.max(users) + 1
n_items = np.max(movies) + 1

gmf_params = {'n_users': n_users,
              'n_items': n_items,
              'embed_dim': 64}

mlp_params = {'n_users': n_users,
              'n_items': n_items,
              'embed_dim': 64,
              'hidden_dims': [128, 64, 32, 16]}

#### Pre-train GMF

In [None]:
early_stop_callback = EarlyStopping(monitor='val_loss')
saving_callback = ModelCheckpoint(filepath='gmf_checkpoints/gmf_{epoch}',
                                  save_top_k=3, 
                                  monitor='val_loss', 
                                  save_weights_only=True, 
                                  verbose=True)


gmf = Learner(GMF, gmf_params)
trainer = pl.Trainer(gpus=torch.cuda.device_count(), 
                     max_epochs=20, 
                     checkpoint_callback=saving_callback, 
                     early_stop_callback=early_stop_callback, 
                     progress_bar_refresh_rate=50)
trainer.fit(gmf, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name    | Type    | Params
------------------------------------
0 | model   | GMF     | 639 K 
1 | loss_fn | BCELoss | 0     


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 0.56199 (best 0.56199), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=0.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 0.56148 (best 0.56148), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=1.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00002: val_loss reached 0.56164 (best 0.56148), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=2.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00003: val_loss reached 0.55556 (best 0.55556), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=3.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00004: val_loss reached 0.49445 (best 0.49445), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=4.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00005: val_loss reached 0.43798 (best 0.43798), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=5.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00006: val_loss reached 0.41296 (best 0.41296), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=6.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00007: val_loss reached 0.40180 (best 0.40180), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=7.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00008: val_loss reached 0.39529 (best 0.39529), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=8.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00009: val_loss reached 0.39124 (best 0.39124), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=9.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00010: val_loss reached 0.38851 (best 0.38851), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=10.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00011: val_loss reached 0.38711 (best 0.38711), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=11.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00012: val_loss reached 0.38563 (best 0.38563), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=12.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00013: val_loss reached 0.38539 (best 0.38539), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=13.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00014: val_loss reached 0.38567 (best 0.38539), saving model to /Users/sofia/Desktop/recsys/gmf_checkpoints/gmf_epoch=14.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00015: val_loss  was not in top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00016: val_loss  was not in top 3
Saving latest checkpoint..





1

#### Pre-train MLP

In [None]:
early_stop_callback = EarlyStopping(monitor='val_loss')
saving_callback = ModelCheckpoint(filepath='mlp_checkpoints/mlp_{epoch}',
                                  save_top_k=3, 
                                  monitor='val_loss', 
                                  save_weights_only=True, 
                                  verbose=True)
 
mlp = Learner(MLP, mlp_params)
trainer = pl.Trainer(gpus=torch.cuda.device_count(), 
                     max_epochs=20,
                     checkpoint_callback=saving_callback,
                     early_stop_callback=early_stop_callback,
                     progress_bar_refresh_rate=50)
trainer.fit(mlp, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name    | Type    | Params
------------------------------------
0 | model   | MLP     | 667 K 
1 | loss_fn | BCELoss | 0     


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 0.38086 (best 0.38086), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=0.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 0.37664 (best 0.37664), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=1.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00002: val_loss reached 0.37547 (best 0.37547), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=2.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00003: val_loss reached 0.37470 (best 0.37470), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=3.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00004: val_loss reached 0.37422 (best 0.37422), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=4.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00005: val_loss reached 0.37151 (best 0.37151), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=5.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00006: val_loss reached 0.36471 (best 0.36471), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=6.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00007: val_loss reached 0.35888 (best 0.35888), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=7.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00008: val_loss reached 0.35548 (best 0.35548), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=8.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00009: val_loss reached 0.35380 (best 0.35380), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=9.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00010: val_loss reached 0.35386 (best 0.35380), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=10.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00011: val_loss reached 0.35338 (best 0.35338), saving model to /Users/sofia/Desktop/recsys/mlp_checkpoints/gmf_epoch=11.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00012: val_loss  was not in top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00013: val_loss  was not in top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00014: val_loss  was not in top 3
Saving latest checkpoint..





1

#### Train NCF

In [None]:
ncf_params = {'gmf_params': gmf_params,
              'mlp_params': mlp_params,
              'alpha': 1, 
              'gmf_ckpt': 'gmf_checkpoints/gmf_epoch=13.ckpt',
              'mlp_ckpt': 'mlp_checkpoints/mlp_epoch=11.ckpt'}


early_stop_callback = EarlyStopping(monitor='val_loss')
saving_callback = ModelCheckpoint(filepath='ncf_checkpoints/ncf_{epoch}',
                                  save_top_k=3, 
                                  monitor='val_loss', 
                                  save_weights_only=True, 
                                  verbose=True)

ncf = Learner(NCF, ncf_params)
trainer = pl.Trainer(gpus=torch.cuda.device_count(),
                     max_epochs=10, 
                     checkpoint_callback=saving_callback,
                     early_stop_callback=early_stop_callback, 
                     progress_bar_refresh_rate=50)
trainer.fit(ncf, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name    | Type    | Params
------------------------------------
0 | model   | NCF     | 1 M   
1 | loss_fn | BCELoss | 0     


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 0.38023 (best 0.38023), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=0.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 0.37656 (best 0.37656), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=1.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00002: val_loss reached 0.37539 (best 0.37539), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=2.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00003: val_loss reached 0.37489 (best 0.37489), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=3.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00004: val_loss reached 0.37609 (best 0.37489), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=4.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00005: val_loss  was not in top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00006: val_loss reached 0.37418 (best 0.37418), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=6.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00007: val_loss reached 0.37197 (best 0.37197), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=7.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00008: val_loss reached 0.37131 (best 0.37131), saving model to /Users/sofia/Desktop/recsys/ncf_checkpoints/ncf_epoch=8.ckpt as top 3


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…


Epoch 00009: val_loss  was not in top 3
Saving latest checkpoint..





1

In [None]:
ncf.load_state_dict(torch.load('ncf_checkpoints/ncf_epoch=8.ckpt')['state_dict'])
pretrained_ncf = ncf.model

# item embeddings for similars
item_embs_gmf = pretrained_ncf.gmf.item_embedder.weight.detach()
item_embs_mlp = pretrained_ncf.mlp.item_embedder.weight.detach()
item_embs = torch.cat([item_embs_gmf, item_embs_mlp], dim=-1).numpy()

preprocess_fn = lambda x: torch.LongTensor(x).view(len(x))
ncf_recommender = Recommender(pretrained_ncf, ratings_sparse, Ds, item_embs, preprocess_fn)
ncf_recommender.compute_metrics(user_test, item_test)

Avg hr: 0.437
Avg ndcg: 0.214


In [None]:
ncf_recommender.similar_movies()

For movie


Unnamed: 0,Title,Category
0,Toy Story (1995),Animation|Children's|Comedy


similars are


Unnamed: 0,Title,Category
0,"Bear, The (1988)",Adventure
1,Things to Do in Denver when You're Dead (1995),Crime|Drama|Romance
2,Mad Max (1979),Action|Sci-Fi
3,"Mark of Zorro, The (1940)",Adventure
4,"Addams Family, The (1991)",Comedy
5,"French Connection, The (1971)",Action|Crime|Drama|Thriller
6,Glengarry Glen Ross (1992),Drama
7,"Producers, The (1968)",Comedy|Musical
8,Stand by Me (1986),Adventure|Comedy|Drama
9,Fantasia (1940),Animation|Children's|Musical


In [None]:
ncf_recommender.prediction_for_user()

--- User's choice ---


Unnamed: 0,Title,Category
0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
1,Jurassic Park (1993),Action|Adventure|Sci-Fi
2,Die Hard (1988),Action|Thriller
3,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
4,Raiders of the Lost Ark (1981),Action|Adventure
5,"Good, The Bad and The Ugly, The (1966)",Action|Western
6,Alien (1979),Action|Horror|Sci-Fi|Thriller
7,"Terminator, The (1984)",Action|Sci-Fi|Thriller
8,Jaws (1975),Action|Horror
9,Rocky (1976),Action|Drama



--- Our recommendations ---


Unnamed: 0,Title,Category
0,American Beauty (1999),Comedy|Drama
1,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
2,Fargo (1996),Crime|Drama|Thriller
3,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
4,"Godfather, The (1972)",Action|Crime|Drama
5,Braveheart (1995),Action|Drama|War
6,Casablanca (1942),Drama|Romance|War
7,"Matrix, The (1999)",Action|Sci-Fi|Thriller
8,"Shawshank Redemption, The (1994)",Drama
9,Being John Malkovich (1999),Comedy


Метрики упали, симилары плохие. Рекомендации нормальные.


## 3. Attention

Для каждого юзера надо составить датасет из (последовательность предыдущих просмотренных фильм, следующий фильм). Будем поочередно в качестве следующего фильма брать positive айтем(т.е. фильм который был реально просмотрен после последовательности фильмов) и negative айтем(т.е фильм, кторый не был просмотрен вообще). Получится negative sampling как в NCF. Но здесь возьмем меньшее количество negative pairs. Из всех возможных последовательностей примерно половина - positive пары, другая часть - negative. В тест отправим одну пару (последовательность, positive фильм) для каждого юзера. 

#### New dataset

In [12]:
def attention_dataset(ratings, train_seq_len, test_seq_len=10):
    user_ids = np.unique(users.values)
    movie_ids = np.unique(movies.values)
    Sample = namedtuple('Sample', 'seq, next_item, target')
    Ds_train = []
    Ds_test = []
    
    for u in user_ids:
        user_rating = ratings[u]
        rated = list(user_rating.indices)
        if len(rated) > train_seq_len + 1:
            not_rated = list(np.setdiff1d(movie_ids, rated))
            
            user_sequence = [rated[i:i + train_seq_len] for i in range(len(rated) - train_seq_len)]
            next_pos = [rated[i] for i in range(train_seq_len, len(rated))]
            next_neg = list(np.random.choice(not_rated, len(next_pos), replace=False))
            next_items = [next_neg, next_pos]
            
            for i, seq in enumerate(user_sequence[:-1]):
                # if 0 - then for sequence take negative next item & target = 0
                # if 1 - then for sequence take positive next item & target = 1
                target = np.random.choice([0, 1])
                Ds_train.append(Sample(seq, next_items[target][i], target))
            Ds_test.append((rated[-test_seq_len - 1:-1], rated[-1]))
    return Ds_train, Ds_test

In [13]:
Ds_attn_train, Ds_attn_test = attention_dataset(ratings_sparse, train_seq_len=15)

In [14]:
sequences_train, next_items_train, targets_train = map(torch.LongTensor, zip(*Ds_attn_train))
train_dataset = TensorDataset(sequences_train, next_items_train, targets_train)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, 
                          num_workers=multiprocessing.cpu_count())

sequences_test, next_items_test = map(torch.LongTensor, zip(*Ds_attn_test))

#### Attention model

In [16]:
class SelfAttention(nn.Module):
    def __init__(self, n_items, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.item_embedder = nn.Embedding(n_items, embed_dim)
        self.attention_layer = nn.MultiheadAttention(embed_dim * num_heads, num_heads, kdim=embed_dim,
                                                    vdim=embed_dim) 
        
        self.out = nn.Sequential(nn.Linear(embed_dim, 1), 
                                 nn.Sigmoid())
        
    def forward(self, sequence, next_item):
        seq_embs = self.item_embedder(sequence)  # key, value 
        next_emb = self.item_embedder(next_item) # query 
            
        # sizes: seq_len, bs, emd_dim
        if len(seq_embs.shape) < 3:
            seq_embs = seq_embs.unsqueeze(0)
            next_emb = next_emb.unsqueeze(0)
            seq_embs_inp = seq_embs.permute(1, 0, 2)
            next_emb_inp = next_emb.view(1, *next_emb.shape[1:]).repeat(1, 1, self.num_heads).permute(1, 0, 2)

        else:
            seq_embs_inp = seq_embs.permute(1, 0, 2)
            next_emb_inp = next_emb.view(1, *next_emb.shape).repeat(1, 1, self.num_heads)
        
        _, attn_output_weights = self.attention_layer(next_emb_inp, seq_embs_inp, seq_embs_inp)
        
        # weights * values
        attn_next = (attn_output_weights * seq_embs_inp.permute(1, 2, 0)).sum(dim=2)
        return self.out(attn_next)    
    
    def predict(self, seq):
        scores = []
        for item_id in range(len(self.item_embedder.weight)):
            score = self(seq, torch.LongTensor([item_id])).detach().cpu().item()
            scores.append(score)
        return scores

#### Train

In [18]:
class AttentionLearner(pl.LightningModule):
    def __init__(self, model, params, lr=1e-3):
        super().__init__()
        self.model = model(**params)
        self.lr = lr 
        self.loss_fn = nn.BCELoss()  
        
    def forward(self, seq, next_item):
        return self.model(seq, next_item)
    
    def training_step(self, batch, *args):
        seq, next_item, y_target = batch
        y_pred = self(seq, next_item)
        loss = self.loss_fn(y_pred, y_target.unsqueeze(1).to(torch.float32))
        return {'loss': loss}      
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr)

In [20]:
attn_params = {'n_items': ratings_sparse.shape[1],
               'embed_dim': 64 * 5,
               'num_heads': 5}

attn = AttentionLearner(SelfAttention, attn_params)
trainer = pl.Trainer(gpus=torch.cuda.device_count(), 
                     max_epochs=20, 
                     progress_bar_refresh_rate=50)
trainer.fit(attn, train_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type          | Params
------------------------------------------
0 | model   | SelfAttention | 7.4 M 
1 | loss_fn | BCELoss       | 0     
------------------------------------------
7.4 M     Trainable params
0         Non-trainable params
7.4 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

Немного переопределим класс Recommender

In [21]:
class AttentionRecommender(Recommender):    
    def prediction_for_user(self, user_id=4, seq_len=15):
        real_movie_id = self.ratings[user_id].indices   
        print('--- User\'s choice ---')
        real_titles = [title_by_id(r) for r in real_movie_id]
        real_category = [category_by_id(r) for r in real_movie_id]
        rec_print(real_titles, real_category)
        print()

        print('--- Our recommendations ---')
        # Keys, values = whole user's history; key = each movie in dataset
        user_seq = torch.LongTensor([*self.Ds[user_id].pos_train, self.Ds[user_id].pos_test])
        user_pred = self.model.predict(user_seq)    
        ranked_movie_id = np.argsort(user_pred)[::-1]
        not_rated_movie_id = [i for i in ranked_movie_id if i not in real_movie_id and i in movie_info['movie_id'].values]
        pred_titles = [title_by_id(r) for r in not_rated_movie_id[:10]]
        pred_category = [category_by_id(r) for r in not_rated_movie_id[:10]]
        rec_print(pred_titles, pred_category)
    
    def one_sample_metric(self, seq, next_item_pos, k=10):
        all_movies = np.unique(movies.values)
        next_item_neg = torch.LongTensor(np.random.choice(all_movies, 99, replace=False))
        y_target_pos = torch.ones_like(next_item_pos)
        y_target_neg = torch.zeros_like(next_item_neg)
        seq = seq.repeat(100, 1)
        next_items = torch.cat([next_item_pos.unsqueeze(0), next_item_neg], dim=-1)
        y_target = torch.cat([y_target_pos.unsqueeze(0), y_target_neg], dim=-1).numpy()
        perm = np.random.permutation(len(next_items))
        next_items = next_items[perm]
        seq = seq[perm]
        y_target = y_target[perm]
        
        y_pred = self.model(seq, next_items).detach().numpy().squeeze()
        items = next_items.numpy()
        top_k_items = items[np.argsort(y_pred)[-k:]]
        hr = int(next_item_pos.item() in top_k_items)
        ndcg = ndcg_score([y_target], [y_pred], k=k)
        return hr, ndcg

In [22]:
items_embs = attn.model.item_embedder.weight.detach()
attn_rec = AttentionRecommender(attn.model, ratings_sparse, Ds, items_embs)
attn_rec.compute_metrics(sequences_test, next_items_test)

Avg hr: 0.716
Avg ndcg: 0.433


In [23]:
attn_rec.similar_movies()

For movie


Unnamed: 0,Title,Category
0,Toy Story (1995),Animation|Children's|Comedy


similars are


Unnamed: 0,Title,Category
0,"Dog's Life, A (1920)",Comedy
1,Alien Escape (1995),Horror|Sci-Fi
2,"Allnighter, The (1987)",Comedy|Romance
3,Stanley & Iris (1990),Drama|Romance
4,Farewell My Concubine (1993),Drama|Romance
5,Suture (1993),Film-Noir|Thriller
6,"Enfer, L' (1994)",Drama
7,Mary Reilly (1996),Drama|Thriller
8,Little City (1998),Comedy|Romance
9,Deceiver (1997),Crime


In [24]:
attn_rec.prediction_for_user()

--- User's choice ---


Unnamed: 0,Title,Category
0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
1,Jurassic Park (1993),Action|Adventure|Sci-Fi
2,Die Hard (1988),Action|Thriller
3,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
4,Raiders of the Lost Ark (1981),Action|Adventure
5,"Good, The Bad and The Ugly, The (1966)",Action|Western
6,Alien (1979),Action|Horror|Sci-Fi|Thriller
7,"Terminator, The (1984)",Action|Sci-Fi|Thriller
8,Jaws (1975),Action|Horror
9,Rocky (1976),Action|Drama



--- Our recommendations ---


Unnamed: 0,Title,Category
0,Strangers on a Train (1951),Film-Noir|Thriller
1,Planet of the Apes (1968),Action|Sci-Fi
2,Election (1999),Comedy
3,Double Indemnity (1944),Crime|Film-Noir
4,"Bug's Life, A (1998)",Animation|Children's|Comedy
5,Pleasantville (1998),Comedy
6,Hud (1963),Drama|Western
7,American History X (1998),Drama
8,Eyes of Laura Mars (1978),Mystery|Thriller
9,Time Bandits (1981),Adventure|Fantasy|Sci-Fi


Метрики лучше, чем в WARP и NCF. Симилары и рекомендации не очень. 