In [18]:
import os
import shutil
import zipfile
import urllib.request

import implicit
import pandas as pd
import numpy as np
from lightfm import LightFM
import scipy.sparse as sp
from scipy.sparse import coo_matrix

## Считаем данные

Датасет -- movie lens, так как использовался в первой домашке.

На нём так же обучались в статьях про NCF и NAIS.

In [2]:
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
ratings['user_id']  -= 1
ratings['movie_id'] -= 1
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
1,0,660,3,978302109
2,0,913,3,978301968
3,0,3407,4,978300275
4,0,2354,5,978824291


In [37]:
movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')
movie_info['movie_id'] -= 1
movie_info.head(5)

Unnamed: 0,movie_id,name,category
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [26]:
N_MOVIES = max(ratings['movie_id']) + 1
N_USERS  = max(ratings['user_id'])  + 1

In [74]:
implicit_ratings = ratings[ratings['rating'] >= 4]
implicit_ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
3,0,3407,4,978300275
4,0,2354,5,978824291
6,0,1286,5,978302039
7,0,2803,5,978300719
...,...,...,...,...
1000202,6039,1088,4,956704996
1000205,6039,1093,5,956704887
1000206,6039,561,5,956704746
1000207,6039,1095,4,956715648


## Разобъём на train / test 

Разбиваем:
убираем у каждого пользователя последний просмотр в test.

Так делалось в статье про NCF.

In [75]:
def leave_one_out_split(ratings):
    ratings = ratings.sort_values(by=['user_id', 'timestamp'])
    
    N_USERS = max(ratings['user_id'])
    ratings_loo  = []
    ratings_test = []
    
    for user_id in range(N_USERS):
        user_ratings = implicit_ratings[implicit_ratings['user_id'] == user_id]
        ratings_loo.append(user_ratings.iloc[:-1])
        ratings_test.append(user_ratings.iloc[-1:])
        
    ratings_train = pd.concat(ratings_loo)
    ratings_test  = pd.concat(ratings_test)
    return ratings_train, ratings_test

ratings_train, ratings_test = leave_one_out_split(implicit_ratings)
ratings_train

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
3,0,3407,4,978300275
4,0,2354,5,978824291
6,0,1286,5,978302039
7,0,2803,5,978300719
...,...,...,...,...
999861,6038,2018,4,956706538
999862,6038,1077,4,956705545
999863,6038,1080,4,956705989
999865,6038,1085,4,956706182


In [76]:
ratings_test

Unnamed: 0,user_id,movie_id,rating,timestamp
52,0,1245,4,978302091
181,1,1246,5,978298652
232,2,2080,4,978298504
253,3,1239,5,978294260
451,4,713,4,978244493
...,...,...,...,...
998634,6034,2045,4,956711064
999521,6035,1098,4,956752131
999724,6036,1096,5,956709587
999744,6037,1078,5,956707547


## Обучим BPR модель

In [30]:
class MfRecommender:
    def __init__(self, user_vecs, item_vecs, user_bias=None, item_bias=None, n_rec=10):
        self.user_vecs = user_vecs
        self.item_vecs = item_vecs
        
        self.user_bias = user_bias if (user_bias is not None) else np.zeros(len(user_vecs))
        self.item_bias = item_bias if (item_bias is not None) else np.zeros(len(item_vecs))
        
        self.n_rec = n_rec
    
    def recommend(self, user_id, user_item_csr):
        seen = (user_item_csr[user_id] != 0).toarray()[0]
        similarity = np.dot(self.item_vecs, self.user_vecs[user_id]) + self.item_bias
        similarity[seen] = 0
        recommended = similarity.argsort()[::-1][:self.n_rec]
        return recommended[:,np.newaxis]
    
    def similar_items(self, item_id):
        similarity = np.dot(self.item_vecs, self.item_vecs[item_id]) / np.linalg.norm(self.item_vecs)
        similars = similarity.argsort()[::-1][:self.n_rec]
        return similars[:,np.newaxis]

In [10]:
num_components = 32

In [28]:
def to_coomatrix(ratings_df, shape):
    l = len(ratings_df)
    data = np.ones(l)
    rows = ratings_df['user_id'] .to_numpy()
    cols = ratings_df['movie_id'].to_numpy()
    
    return coo_matrix((data, (rows, cols)), shape=shape)

In [29]:
bpr_model = LightFM(no_components=num_components,
                    loss='bpr',
                    learning_schedule='adagrad',
                    user_alpha=1e-05,
                    item_alpha=1e-05)

train_matrix = to_coomatrix(ratings_train, (N_USERS, N_MOVIES))
bpr_model.fit_partial(train_matrix, epochs=20)

<lightfm.lightfm.LightFM at 0x7f4aba89e250>

In [43]:
bpr_recommender = MfRecommender(
    bpr_model.user_embeddings,
    bpr_model.item_embeddings,
    bpr_model.user_biases,
    bpr_model.item_biases
)

In [32]:
USER_ID = 3

In [45]:
get_recommendations = lambda user_id, recommender : [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() 
                                                        for x in recommender.recommend(user_id, train_matrix.tocsr())]

In [46]:
get_recommendations(USER_ID, bpr_recommender)

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '847    Godfather, The (1972)',
 '1182    Aliens (1986)',
 '1203    Godfather: Part II, The (1974)',
 '1220    Terminator, The (1984)',
 '585    Terminator 2: Judgment Day (1991)',
 '537    Blade Runner (1982)',
 '2502    Matrix, The (1999)',
 '1204    Full Metal Jacket (1987)',
 '108    Braveheart (1995)']

In [47]:
get_similars = lambda item_id, recommender: [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() 
                                             for x in recommender.similar_items(item_id)]

In [48]:
get_similars(0, bpr_recommender)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '584    Aladdin (1992)',
 '591    Beauty and the Beast (1991)',
 '360    Lion King, The (1994)',
 "2286    Bug's Life, A (1998)",
 '2012    Little Mermaid, The (1989)',
 '1132    Wrong Trousers, The (1993)',
 '33    Babe (1995)',
 '1245    Groundhog Day (1993)']

In [60]:
%%time

def evaluate(recommender, ratings_test):
    precision_at_5  = []
    precision_at_10 = []
    
    for user_id, user_rec in zip(ratings_test['user_id'], ratings_test['movie_id']):
        recs = recommender.recommend(user_id, train_matrix.tocsr())
        precision_at_5.append( user_rec in recs[:5])
        precision_at_10.append(user_rec in recs[:10])
        
    precision_at_5  = np.average(precision_at_5)
    precision_at_10 = np.average(precision_at_10)
    
    return {'precision@5': precision_at_5,
            'precision@10': precision_at_10
           }

evaluate(bpr_recommender, ratings_test)

CPU times: user 4min 16s, sys: 5min 59s, total: 10min 16s
Wall time: 2min 35s


{'precision@5': 0.05615371873447076, 'precision@10': 0.08928275633592844}

In [62]:
def evaluate_hits(recommender, ratings_test):
    # TODO
    pass

## NCF

In [64]:
#!L
import torch
import torch.nn as nn
import torch.nn.functional as F

In [65]:
#!L
device = torch.device('cuda')
device

device(type='cuda')

In [66]:
#!L
class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, embed_dims, dropout=0.1):
        super().__init__()
        layers = []
        for embed_dim in embed_dims:
            layers.extend([
                nn.Linear(input_dim, embed_dim),
                nn.ReLU(),
            #    nn.Dropout(p=dropout),
            ])
            input_dim = embed_dim
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

In [67]:
#!L
def Embedding(n, dim):
    scale = 1. / np.sqrt(dim)
    embedding = nn.Embedding(n, dim)
    embedding = embedding.weight.data.uniform_(-scale, scale)
    return embedding
    
class NCF(nn.Module):
    def __init__(self, n_users, n_items, dim=32, hidden_sizes=[16, 8]):
        super().__init__()
        self.dim = dim
        self.mf_user_emb = Embedding(n_users, dim).to(device)
        self.nn_user_emb = Embedding(n_users, dim).to(device)
        
        self.mf_item_emb = Embedding(n_items, dim).to(device)
        self.nn_item_emb = Embedding(n_items, dim).to(device)
        
        self.mlp = MultiLayerPerceptron(dim * 2, hidden_sizes)
        self.linear = nn.Linear(hidden_sizes[-1] + dim, 1)
    
    #2d indices: batch * sample_i
    def forward(self, user_idx, item_idx):
        gft_out = self.mf_user_emb[user_idx] * self.mf_item_emb[item_idx]
        
        x_nn = torch.cat((self.nn_user_emb[user_idx], self.nn_item_emb[item_idx]), dim=-1)
        mlp_out = self.mlp(x_nn)
        
        x = torch.cat((mlp_out, gft_out), dim=-1)
        x = self.linear(x)
        return x

In [77]:
#!L
%whos

Variable               Type             Data/Info
-------------------------------------------------
Embedding              function         <function Embedding at 0x7f7f7cd8fcb0>
F                      module           <module 'torch.nn.functio<...>/torch/nn/functional.py'>
LightFM                type             <class 'lightfm.lightfm.LightFM'>
MFRecommender          type             <class '__main__.MFRecommender'>
MfRecommender          type             <class '__main__.MfRecommender'>
MultiLayerPerceptron   type             <class '__main__.MultiLayerPerceptron'>
NCF                    type             <class '__main__.NCF'>
N_MOVIES               int              3952
N_USERS                int              6040
USER_ID                int              3
bpr_model              LightFM          <lightfm.lightfm.LightFM <...>object at 0x7f7f7cd24050>
bpr_recomender         MfRecommender    <__main__.MfRecommender object at 0x7f7f7cdb2f10>
bpr_recommender        MfRecommender    <__m

In [81]:
#!L

def train_model(model, users, items, epochs=100, lr=0.2):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    batch_size = 4096
    N = len(users)
    n_samples = 4
    
    for epoch in range(epochs):
        perm = torch.randperm(N).to(device)
        losses = []
        accs = []
        
        for l in range(0, N - batch_size, batch_size):
            optimizer.zero_grad()
        
            idx = perm[l:l + batch_size]
            
            sample_items = torch.randint(N_MOVIES, size=(batch_size, n_samples)).to(device)
            sample_items[:,0] = items[idx]
            
            cur_users = users[idx].unsqueeze(1).repeat(1, n_samples)           
            y      = torch.zeros(batch_size, 1, dtype=torch.long).to(device)
            
            output = model.forward(cur_users, sample_items)
            acc = (output.argmax(dim=1) == 0).float().mean().cpu().detach()
            accs.append(acc)
            loss = criterion(output, y)
            loss.backward()
            losses.append(loss.cpu().detach())
            
            optimizer.step()
        if epoch % 10 == 0:
            print(f"epoch {epoch:3d} average_loss {np.average(losses)} acc {np.average(accs)}")

train_users = torch.LongTensor(ratings_train['user_id'].to_numpy() ).to(device)
train_items = torch.LongTensor(ratings_train['movie_id'].to_numpy()).to(device)

In [83]:
#!L
model = NCF(N_USERS, N_MOVIES).to(device)
train_model(model, train_users, train_items, epochs=1000)

epoch   0 average_loss 1.386231780052185 acc 0.2556276321411133
epoch  10 average_loss 1.3748334646224976 acc 0.29665032029151917
epoch  20 average_loss 1.3526051044464111 acc 0.3350900709629059
epoch  30 average_loss 1.2296507358551025 acc 0.4556247889995575
epoch  40 average_loss 1.189273715019226 acc 0.4848455786705017
epoch  50 average_loss 1.1672065258026123 acc 0.501125156879425
epoch  60 average_loss 1.1589010953903198 acc 0.50672447681427
epoch  70 average_loss 1.1499584913253784 acc 0.5127643346786499
epoch  80 average_loss 1.1422096490859985 acc 0.5172632336616516
epoch  90 average_loss 1.1344717741012573 acc 0.5213304758071899
epoch 100 average_loss 1.1293681859970093 acc 0.5259037017822266
epoch 110 average_loss 1.1264634132385254 acc 0.5275348424911499
epoch 120 average_loss 1.1205317974090576 acc 0.5310907959938049
epoch 130 average_loss 1.1168993711471558 acc 0.53391432762146
epoch 140 average_loss 1.1127678155899048 acc 0.5354623198509216
epoch 150 average_loss 1.109126

In [86]:
#!L
curr_items = torch.arange(N_MOVIES, dtype=torch.long).to(device)
curr_users = (USER * torch.ones(N_MOVIES, dtype=torch.long)).to(device)
curr_users.size(), curr_items.size()

(torch.Size([3952]), torch.Size([3952]))

In [87]:
#!L
pref = model.forward(curr_users, curr_items).cpu().detach().numpy()
pref

array([[4.438849 ],
       [3.0431933],
       [3.1024418],
       ...,
       [3.4360943],
       [3.0028226],
       [2.4713576]], dtype=float32)

In [91]:
#!L
pref_movies = np.argsort(pref[:,0])[::-1]
for movie_id in pref_movies[:10]:
    print(movie_info[movie_info['movie_id'] == movie_id]['name'].to_string())

1192    Star Wars: Episode VI - Return of the Jedi (1983)
1575    L.A. Confidential (1997)
1178    Star Wars: Episode V - The Empire Strikes Back...
3474    Diner (1982)
49    Usual Suspects, The (1995)
1284    Butch Cassidy and the Sundance Kid (1969)
1196    Alien (1979)
3408    Empire Records (1995)
1271    Indiana Jones and the Last Crusade (1989)
315    Shawshank Redemption, The (1994)


In [110]:
#!L
class NcfRecommender:
    def __init__(self, model, n_items, n_rec=10):
        self.model = model
        self.n_items = n_items
        self.n_rec = n_rec
    
    def recommend(self, user_id, user_item_csr):
        seen  = (user_item_csr[user_id] != 0).toarray()[0]
        curr_items = torch.arange(self.n_items, dtype=torch.long).to(device)
        curr_users = (user_id * torch.ones(self.n_items, dtype=torch.long)).to(device)
        
        score = model.forward(curr_users, curr_items).cpu().detach().numpy()[:,0]
        score[seen] = 0
        recommended = score.argsort()[::-1][:self.n_rec]
        return recommended[:,np.newaxis]

In [111]:
#!L
ncf_recommender = NcfRecommender(model, N_MOVIES)
ncf_recommender.recommend(USER_ID, train_matrix.tocsr())

array([[1209],
       [1616],
       [1195],
       [3542],
       [  49],
       [1303],
       [3476],
       [1290],
       [ 317],
       [2376]])

In [113]:
#!L

def evaluate(recommender, ratings_test):
    precision_at_5  = []
    precision_at_10 = []
    
    for user_id, user_rec in list(zip(ratings_test['user_id'], ratings_test['movie_id'])):
        recs = recommender.recommend(user_id, train_matrix.tocsr())
        precision_at_5.append( user_rec in recs[:5])
        precision_at_10.append(user_rec in recs[:10])
        
    precision_at_5  = np.average(precision_at_5)
    precision_at_10 = np.average(precision_at_10)
    
    return {'precision@5': precision_at_5,
            'precision@10': precision_at_10
           }

evaluate(ncf_recommender, ratings_test)

{'precision@5': 0.005963226768262382, 'precision@10': 0.008282259400364419}