In [465]:
import pandas as pd
import sys
sys.path.append('../src/')
from varka.ml_varka import MovieLenseVarka
from utils.metric import RecallK

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Готовим данные
В качестве теста отщипляем последние 2 месяца.

In [459]:
# 956703954 + 60 * 60 * 24 * 7 * 4 * 10 = 980895954
varka = MovieLenseVarka('../code/data/ml-1m/movies.dat', '../code/data/ml-1m/users.dat', '../code/data/ml-1m/ratings.dat', 
                        980895954, '../code/data/ml-1m/train.csv', '../code/data/ml-1m/val.csv', 128) # 10 months to train
varka.do_varka()

number of rows in train: 916429
number of rows in val: 77740


In [460]:
train = pd.read_pickle('../code/data/ml-1m/train.csv')
val = pd.read_pickle('../code/data/ml-1m/val.csv')

In [461]:
train.head()

Unnamed: 0,UserID,MovieID,Rating,history,candidate,timestamp
0,3022,3578,4,"[[198, Strange Days, [Action, Crime, Sci-Fi], ...","[3578, Gladiator, [Action, Drama], M, 25, 17, ...",970507113
1,3112,1099,3,"[[2422, Karate Kid III, The, [Action, Adventur...","[1099, Christmas Carol, A, [Drama], M, 18, 12,...",969476859
2,1680,2597,2,"[[2048, Great Mouse Detective, The, [Animation...","[2597, Lost & Found, [Comedy, Romance], M, 25,...",974836897
3,3705,3089,4,"[[3015, Coma, [Thriller], M, 45, 7, 30076, 197...","[3089, Bicycle Thief, The (Ladri di biciclette...",966281485
4,1845,852,3,"[[1198, Raiders of the Lost Ark, [Action, Adve...","[852, Tin Cup, [Comedy, Romance], M, 25, 2, 10...",974708054


In [462]:
train_light = train[['MovieID', 'UserID', 'Rating', 'Genre']]
val_light = val[['MovieID', 'UserID', 'Rating', 'Genre']]
train_light.head()

KeyError: "['Genre'] not in index"

In [None]:
from surprise import Dataset, Reader

In [None]:
reader = Reader(rating_scale=(1, 5))
train_surprise = Dataset.load_from_df(train_light[['UserID', 'MovieID', 'Rating']], reader).build_full_trainset()
val_surprise = Dataset.load_from_df(val_light[['UserID', 'MovieID', 'Rating']], reader)
testset_surprise = train_surprise.build_anti_testset()
print(len(testset_surprise))

21282657


## 2. Обучение и тестирование бейзлайнов
В качестве бейзлайнов возьмем:
- Top-K самых популярных товаров с обучения
- KNNBasic из surprise library
- SVD из surprise library

Будем считать следующие user-wise метрики:
- User-wise recall@50.
- User-wise diversity@50 (среднее количество различных жанров среди первых 50 кандидатов).
- User-wise median count@50: для каждого пользователя считаем медианную популярность (по числу покупок на обучении), а дальше усредняем по всем пользователям. <- для замера влияния на long-tail.

### Вспомогательный код для подсчета метрик

In [363]:
positive_pairs = val_light.groupby('UserID').apply(lambda x: [it for it in x['MovieID']]).reset_index()
positive_pairs['positive'] = positive_pairs[0]
positive_pairs = positive_pairs[['UserID', 'positive']]
users = positive_pairs.UserID.values

Unnamed: 0,UserID,positive
0,19,"[1655, 2990, 1193, 1262, 2918, 2600, 2004, 265..."
1,20,"[1371, 3527, 1240, 1527, 3753, 1468, 2858, 169..."
2,22,"[1034, 65, 1097, 3527, 2161, 2376, 2193, 678, ..."
3,23,"[2973, 1388, 3740, 915, 1222, 2411, 3396, 2410..."
4,24,"[1959, 2657, 425, 2757, 1635]"


In [451]:
metric_recall50 = RecallK(50)
metric_diversity50 = DiversityK(50)
metric_long_tail50 = LongTailK(50)

### Top-K

In [365]:
top50_popular = train_light.groupby('MovieID')['UserID'].count().sort_values().values[-50:]

In [366]:
preds = []
for user in users:
    preds.append(top50_popular)
df_preds = pd.DataFrame({'UserID': users, 'preds': preds})
df_preds.head()

Unnamed: 0,UserID,preds
0,19,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163..."
1,20,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163..."
2,22,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163..."
3,23,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163..."
4,24,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163..."


In [367]:
df_preds = df_preds.set_index('UserID').join(positive_pairs.set_index('UserID'), how='left', on='UserID').reset_index()
df_preds.head()

Unnamed: 0,UserID,preds,positive
0,19,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163...","[1655, 2990, 1193, 1262, 2918, 2600, 2004, 265..."
1,20,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163...","[1371, 3527, 1240, 1527, 3753, 1468, 2858, 169..."
2,22,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163...","[1034, 65, 1097, 3527, 2161, 2376, 2193, 678, ..."
3,23,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163...","[2973, 1388, 3740, 915, 1222, 2411, 3396, 2410..."
4,24,"[1571, 1578, 1588, 1600, 1610, 1611, 1624, 163...","[1959, 2657, 425, 2757, 1635]"


In [368]:
print('User-wise recall@50 for top-k most popular:', round(metric_recall50(df_preds) * 100, 2), '%')

Recall@50 for top-k most popular: 3.1 %


### KNNBasic

In [None]:
def get_top_50(events):
    tmp = []
    for movie, rating in zip(events['MovieID'], events['Rating']):
        tmp.append((movie, rating))
    tmp.sort(key=lambda x: x[1])
    return [it[0] for it in tmp[-50:]]

def build_preds_datafrmae(predictions):
    tmp_u = []
    tmp_i = []
    tmp_r = []
    for pred in predictions:
        tmp_u.append(pred[0])
        tmp_i.append(pred[1])
        tmp_r.append(pred[3])
    tmp = pd.DataFrame({'UserID': tmp_u, 'MovieID': tmp_i, 'Rating': tmp_r})
    tmp = tmp.groupby('UserID').apply(get_top_50).reset_index()
    tmp['preds'] = tmp[0]
    tmp = tmp[['UserID', 'preds']]
    df_preds = positive_pairs.copy()
    df_preds = df_preds.set_index('UserID').join(tmp.set_index('UserID'), how='left', on='UserID').reset_index()
    return df_preds

In [453]:
from surprise import KNNBasic

In [455]:
knn = KNNBasic(k=10)
knn.fit(train_surprise)
predictions_knnbasic = knn.test(testset_surprise)

Computing the msd similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

### SVD

In [369]:
from surprise import SVD

In [371]:
svd = SVD()
svd.fit(train_surprise)
predictions_svd = svd.test(testset_surprise)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2f2e29ad0>

In [452]:
print('Recall@50 for SVD:', round(metric_recall50(df_preds) * 100, 2), '%')

Recall@50 for SVD: 12.12 %
