In [15]:
#!L
%pip install implicit lightfm

Collecting implicit
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 2.8 MB/s eta 0:00:01
[?25hCollecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 4.4 MB/s eta 0:00:01
[?25hCollecting numpy
  Downloading numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
[K     |████████████████████████████████| 14.5 MB 4.5 MB/s eta 0:00:01
[?25hCollecting requests
  Downloading requests-2.25.0-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 6.1 MB/s  eta 0:00:01
[?25hCollecting certifi>=2017.4.17
  Downloading certifi-2020.12.5-py2.py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 8.7 MB/s eta 0:00:01
[?25hCollecting chardet<4,>=3.0.2
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 6.5 MB/s eta 0:00:01
[?25hCollecting idna<3,>=2.5
  Downloading idna-2.10-py2.py3-none-a

In [127]:
#!L
import os
import shutil
import zipfile
import urllib.request


import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse import coo_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F

import implicit
from lightfm import LightFM

## Считаем данные

Датасет -- movie lens, так как использовался в первой домашке.

На нём так же обучались в статьях про NCF и NAIS.

In [105]:
#!L
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
ratings['user_id']  -= 1
ratings['movie_id'] -= 1
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
1,0,660,3,978302109
2,0,913,3,978301968
3,0,3407,4,978300275
4,0,2354,5,978824291


In [106]:
#!L
movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')
movie_info['movie_id'] -= 1
movie_info.head(5)

Unnamed: 0,movie_id,name,category
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [107]:
#!L
N_USERS  = max(ratings['user_id'])  + 1
N_MOVIES = max(ratings['movie_id']) + 1

In [125]:
#!L
implicit_ratings = ratings[ratings['rating'] >= 4]
implicit_ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
3,0,3407,4,978300275
4,0,2354,5,978824291
6,0,1286,5,978302039
7,0,2803,5,978300719
...,...,...,...,...
1000202,6039,1088,4,956704996
1000205,6039,1093,5,956704887
1000206,6039,561,5,956704746
1000207,6039,1095,4,956715648


## Разобъём на train / test 

Разбиваем:
убираем у каждого пользователя последний просмотр в test.

Так делалось в статье про NCF.

In [128]:
#!L
def leave_one_out_split(ratings):
    ratings = ratings.sort_values(by=['user_id', 'timestamp'])
    
    N_USERS = max(ratings['user_id'])
    ratings_loo  = []
    ratings_test = []
    
    for user_id in range(N_USERS):
        user_ratings = implicit_ratings[implicit_ratings['user_id'] == user_id]
        ratings_loo.append(user_ratings.iloc[:-1])
        ratings_test.append(user_ratings.iloc[-1:])
        
    ratings_train = pd.concat(ratings_loo)
    ratings_test  = pd.concat(ratings_test)
    return ratings_train, ratings_test

ratings_train, ratings_test = leave_one_out_split(implicit_ratings)
ratings_train

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
3,0,3407,4,978300275
4,0,2354,5,978824291
6,0,1286,5,978302039
7,0,2803,5,978300719
...,...,...,...,...
999861,6038,2018,4,956706538
999862,6038,1077,4,956705545
999863,6038,1080,4,956705989
999865,6038,1085,4,956706182


In [110]:
#!L
ratings_test

Unnamed: 0,user_id,movie_id,rating,timestamp
52,0,1245,4,978302091
181,1,1246,5,978298652
232,2,2080,4,978298504
253,3,1239,5,978294260
451,4,713,4,978244493
...,...,...,...,...
998634,6034,2045,4,956711064
999521,6035,1098,4,956752131
999724,6036,1096,5,956709587
999744,6037,1078,5,956707547


## Обучим BPR модель

In [111]:
#!L
class MfRecommender:
    def __init__(self, user_vecs, item_vecs, user_bias=None, item_bias=None, n_rec=10):
        self.user_vecs = user_vecs
        self.item_vecs = item_vecs
        
        self.user_bias = user_bias if (user_bias is not None) else np.zeros(len(user_vecs))
        self.item_bias = item_bias if (item_bias is not None) else np.zeros(len(item_vecs))
        
        self.n_rec = n_rec
    
    def recommend(self, user_id, user_item_csr):
        seen = (user_item_csr[user_id] != 0).toarray()[0]
        similarity = np.dot(self.item_vecs, self.user_vecs[user_id]) + self.item_bias
        similarity[seen] = 0
        recommended = similarity.argsort()[::-1][:self.n_rec]
        return recommended[:,np.newaxis]
    
    def similar_items(self, item_id):
        similarity = np.dot(self.item_vecs, self.item_vecs[item_id]) / np.linalg.norm(self.item_vecs)
        similars = similarity.argsort()[::-1][:self.n_rec]
        return similars[:,np.newaxis]

In [112]:
#!L
num_components = 32

In [113]:
#!L
def to_coomatrix(ratings_df, shape):
    l = len(ratings_df)
    data = np.ones(l)
    rows = ratings_df['user_id'] .to_numpy()
    cols = ratings_df['movie_id'].to_numpy()
    
    return coo_matrix((data, (rows, cols)), shape=shape)

In [114]:
bpr_model = LightFM(no_components=num_components,
                    loss='bpr',
                    learning_schedule='adagrad',
                    user_alpha=1e-05,
                    item_alpha=1e-05)

train_matrix = to_coomatrix(ratings_train, (N_USERS, N_MOVIES))

In [115]:
bpr_model.fit_partial(train_matrix, epochs=20)

<lightfm.lightfm.LightFM at 0x7f31241720d0>

In [116]:
bpr_recommender = MfRecommender(
    bpr_model.user_embeddings,
    bpr_model.item_embeddings,
    bpr_model.user_biases,
    bpr_model.item_biases
)

In [117]:
#!L
USER_ID = 3

In [118]:
#!L
get_recommendations = lambda user_id, recommender : [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() 
                                                        for x in recommender.recommend(user_id, train_matrix.tocsr())]

In [119]:
#!L
get_recommendations(USER_ID, bpr_recommender)

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1182    Aliens (1986)',
 '847    Godfather, The (1972)',
 '1220    Terminator, The (1984)',
 '585    Terminator 2: Judgment Day (1991)',
 '537    Blade Runner (1982)',
 '1203    Godfather: Part II, The (1974)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '2502    Matrix, The (1999)',
 '1353    Star Trek: The Wrath of Khan (1982)']

In [120]:
#!L
get_similars = lambda item_id, recommender: [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() 
                                             for x in recommender.similar_items(item_id)]

In [121]:
#!L
get_similars(0, bpr_recommender)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '584    Aladdin (1992)',
 '591    Beauty and the Beast (1991)',
 "2286    Bug's Life, A (1998)",
 '360    Lion King, The (1994)',
 '1132    Wrong Trousers, The (1993)',
 '2012    Little Mermaid, The (1989)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '735    Close Shave, A (1995)']

In [122]:
#!L
def evaluate(recommender, ratings_test, limit=None):
    precision_at_5  = []
    precision_at_10 = []
    
    if limit == None:
        limit = len(ratings_test)
    ratings_test = ratings_test[:limit]
    
    for user_id, user_rec in zip(ratings_test['user_id'], ratings_test['movie_id']):
        recs = recommender.recommend(user_id, train_matrix.tocsr())
        precision_at_5.append( user_rec in recs[:5])
        precision_at_10.append(user_rec in recs[:10])
        
    precision_at_5  = np.average(precision_at_5)
    precision_at_10 = np.average(precision_at_10)
    
    return {'precision@5' : precision_at_5,
            'precision@10': precision_at_10}

In [129]:
#!L
%%time
evaluate(bpr_recommender, ratings_test)

CPU times: user 7min 24s, sys: 18min 19s, total: 25min 44s
Wall time: 3min 15s


{'precision@5': 0.06029484843465297, 'precision@10': 0.09392082160013251}

In [20]:
#!L
def evaluate_hits(recommender, ratings_test):
    # TODO
    pass

## NCF

In [130]:
#!L
device = torch.device('cuda')
device

device(type='cuda')

In [131]:
#!L
class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, embed_dims, dropout=0.1):
        super().__init__()
        layers = []
        for embed_dim in embed_dims:
            layers.extend([
                nn.Linear(input_dim, embed_dim),
                nn.ReLU(),
            #    nn.Dropout(p=dropout),
            ])
            input_dim = embed_dim
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

In [132]:
#!L
def Embedding(n, dim):
    scale = 1. / np.sqrt(dim)
    embedding = nn.Embedding(n, dim)
    embedding.weight.data.uniform_(-scale, scale)
    return embedding
    
class NCF(nn.Module):
    def __init__(self, n_users, n_items, dim=32, hidden_sizes=[16, 8]):
        super().__init__()
        self.dim = dim
        self.mf_user_emb = Embedding(n_users, dim)
        self.nn_user_emb = Embedding(n_users, dim)
        
        self.mf_item_emb = Embedding(n_items, dim)
        self.nn_item_emb = Embedding(n_items, dim)
        
        self.mlp = MultiLayerPerceptron(dim * 2, hidden_sizes)
        self.linear = nn.Linear(hidden_sizes[-1] + dim, 1)
    
    #2d indices: batch * sample_i
    def forward(self, user_idx, item_idx):
        gft_out = self.mf_user_emb(user_idx) * self.mf_item_emb(item_idx)
        
        x_nn = torch.cat((self.nn_user_emb(user_idx), self.nn_item_emb(item_idx)), dim=-1)
        mlp_out = self.mlp(x_nn)
        
        x = torch.cat((mlp_out, gft_out), dim=-1)
        x = self.linear(x)
        return x

In [133]:
#!L
model = NCF(N_USERS, N_MOVIES).to(device)

model.forward(
    torch.LongTensor([[1,2,3,4,5],
                      [1,2,3,4,5]]).to(device), 
    torch.LongTensor([[1,2,3,4,5],
                      [1,2,3,4,5]]).to(device)
).size()

torch.Size([2, 5, 1])

In [149]:
#!L
def train_model(model, users, items, epochs=100, lr=0.1, batch_size=128):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    N = len(users)
    n_samples = 4
    
    for epoch in range(epochs):
        perm = torch.randperm(N).to(device)
        losses = []
        accs = []
        
        for l in range(0, N - batch_size, batch_size):
            optimizer.zero_grad()
        
            idx = perm[l:l + batch_size]
            
            sample_items = torch.randint(N_MOVIES, size=(batch_size, n_samples)).to(device)
            sample_items[:,0] = items[idx]
            
            cur_users = users[idx].unsqueeze(1).repeat(1, n_samples)           
            y      = torch.zeros(batch_size, 1, dtype=torch.long).to(device)
            
            output = model.forward(cur_users, sample_items)
            acc = (output.argmax(dim=1) == 0).float().mean().cpu().detach()
            accs.append(acc)
            loss = criterion(output, y)
            loss.backward()
            losses.append(loss.cpu().detach())
            optimizer.step()
            
        if epoch % 10 == 0:
            print(f"epoch {epoch:3d} average_loss {np.average(losses)} acc {np.average(accs)}")

train_users = torch.LongTensor(ratings_train['user_id'].to_numpy() ).to(device)
train_items = torch.LongTensor(ratings_train['movie_id'].to_numpy()).to(device)

In [26]:
#!L
model = NCF(N_USERS, N_MOVIES).to(device)
train_model(model, train_users, train_items, epochs=200, lr=0.1)

epoch   0 average_loss 1.386310338973999 acc 0.24832108616828918
epoch  10 average_loss 1.386296272277832 acc 0.2500919997692108
epoch  20 average_loss 1.386278510093689 acc 0.25113579630851746
epoch  30 average_loss 1.3861908912658691 acc 0.255620539188385
epoch  40 average_loss 1.3854225873947144 acc 0.282709538936615
epoch  50 average_loss 1.1119334697723389 acc 0.5356038212776184
epoch  60 average_loss 0.7525672912597656 acc 0.69557785987854
epoch  70 average_loss 0.7318239212036133 acc 0.6973841786384583
epoch  80 average_loss 0.7282417416572571 acc 0.6972107887268066
epoch  90 average_loss 0.7259624004364014 acc 0.6976512670516968
epoch 100 average_loss 0.724484920501709 acc 0.6980510950088501
epoch 110 average_loss 0.7242345213890076 acc 0.6985287666320801
epoch 120 average_loss 0.7247249484062195 acc 0.69724440574646
epoch 130 average_loss 0.7234907150268555 acc 0.698093593120575
epoch 140 average_loss 0.7225040197372437 acc 0.6984898447990417
epoch 150 average_loss 0.721281051

In [153]:
#!L
class NeuralRecommender:
    def __init__(self, model, n_items, n_rec=10):
        self.model = model
        self.n_items = n_items
        self.n_rec = n_rec
    
    def recommend(self, user_id, user_item_csr):
        seen  = (user_item_csr[user_id] != 0).toarray()[0]
        curr_items = torch.arange(self.n_items, dtype=torch.long).to(device)
        curr_users = (user_id * torch.ones(self.n_items, dtype=torch.long)).to(device)
        
        score = model.forward(curr_users, curr_items).cpu().detach().numpy()[:,0]
        score[seen] = -1e15
        recommended = score.argsort()[::-1][:self.n_rec]
        return recommended[:,np.newaxis]

In [28]:
#!L
ncf_recommender = NeuralRecommender(model, N_MOVIES)
get_recommendations(USER_ID, ncf_recommender)

['2789    American Beauty (1999)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '3633    Mad Max (1979)',
 '1023    Die Hard (1988)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '2623    Run Lola Run (Lola rennt) (1998)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1180    Raiders of the Lost Ark (1981)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '1196    Alien (1979)']

In [29]:
#!L
%%time
evaluate(ncf_recommender, ratings_test)

CPU times: user 2min 20s, sys: 0 ns, total: 2min 20s
Wall time: 2min 20s


{'precision@5': 0.0009938711280437303, 'precision@10': 0.0014908066920655955}

## Bpr torch

In [137]:
#!L
def Embedding(n, dim):
    scale = 1. / np.sqrt(dim)
    embedding = nn.Embedding(n, dim)
    embedding.weight.data.uniform_(0, scale)
    return embedding

class BprModel(nn.Module):
    def __init__(self, n_users, n_items, dim=32):
        super().__init__()
        self.dim = dim
        self.user_emb  = Embedding(n_users, dim)
        self.user_bias = Embedding(n_users, 1)
        
        self.item_emb  = Embedding(n_items, dim)
        self.item_bias = Embedding(n_items, 1)
    
    def forward(self, user_idx, item_idx):
        output_prod = (self.user_emb(user_idx) * self.item_emb(item_idx)).sum(dim=-1).unsqueeze(-1)
        output_bias = self.user_bias(user_idx) + self.item_bias(item_idx)
        return output_prod + output_bias
        
model = BprModel(N_USERS, N_MOVIES).to(device)

In [138]:
#!L
output = model.forward(
    torch.LongTensor([[1,2,3,4,5],
                      [1,2,3,4,5]]).to(device), 
    torch.LongTensor([[1,2,3,4,5],
                      [1,2,3,4,5]]).to(device)
)
output.size()

torch.Size([2, 5, 1])

In [151]:
#!L

model = BprModel(N_USERS, N_MOVIES).to(device)
train_model(model, train_users, train_items, epochs=20)

epoch   0 average_loss 1.306992530822754 acc 0.4204724431037903
epoch  10 average_loss 1.1142301559448242 acc 0.6998365521430969


In [154]:
#!L
ncf_recommender = NeuralRecommender(model, N_MOVIES)
get_recommendations(USER_ID, ncf_recommender)

['2789    American Beauty (1999)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '589    Silence of the Lambs, The (1991)',
 '2502    Matrix, The (1999)',
 '2693    Sixth Sense, The (1999)',
 '604    Fargo (1996)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '585    Terminator 2: Judgment Day (1991)',
 '315    Shawshank Redemption, The (1994)',
 "523    Schindler's List (1993)"]

In [155]:
#!L
evaluate(ncf_recommender, ratings_test)

{'precision@5': 0.005631936392247805, 'precision@10': 0.04555242670200431}