In [5]:
import pandas as pd
import sys
sys.path.append('../src/')
from varka.ml_varka import MovieLenseVarka
from utils.metric import RecallK, DiversityK, LongTailK

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'varka.ml_varka'

## 1. Готовим данные
В качестве теста отщепляем последние 2 месяца.

In [467]:
# The split cutoff is 956703954 plus ten 4-week "months" expressed in seconds (= 980895954).
# NOTE(review): 956703954 is presumably the earliest rating timestamp in the data — confirm.
# NOTE(review): the meaning of the trailing 128 is defined inside MovieLenseVarka and is not visible here.
varka = MovieLenseVarka(
    '../code/data/ml-1m/movies.dat',
    '../code/data/ml-1m/users.dat',
    '../code/data/ml-1m/ratings.dat',
    956703954 + 60 * 60 * 24 * 7 * 4 * 10,  # 10 months to train
    '../code/data/ml-1m/train.csv',
    '../code/data/ml-1m/val.csv',
    128,
)
varka.do_varka()

number of rows in train: 916429
number of rows in val: 77740


In [3]:
# The split files carry a .csv extension but are read back as pickles.
train, val = (
    pd.read_pickle(path)
    for path in ('../code/data/ml-1m/train.csv', '../code/data/ml-1m/val.csv')
)

In [4]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,UserID,MovieID,Rating,history,candidate,timestamp,Genre
0,3403,6,4,"[[413, Airheads, [Comedy], M, 35, 5, 48342, 19...","[6, Heat, [Action, Crime, Thriller], M, 35, 5,...",967429703,"[Action, Crime, Thriller]"
1,4630,1883,3,"[[593, Silence of the Lambs, The, [Drama, Thri...","[1883, Bulworth, [Comedy], F, 25, 4, 94610, 19...",964040034,[Comedy]
2,2882,27,3,"[[2683, Austin Powers: The Spy Who Shagged Me,...","[27, Now and Then, [Drama], M, 18, 20, 78759, ...",972243969,[Drama]
3,3513,593,5,"[[908, North by Northwest, [Drama, Thriller], ...","[593, Silence of the Lambs, The, [Drama, Thril...",966976389,"[Drama, Thriller]"
4,2010,2759,2,"[[2021, Dune, [Fantasy, Sci-Fi], M, 18, 4, 815...","[2759, Dick, [Comedy], M, 18, 4, 81520, 1999, ...",974680399,[Comedy]


In [4]:
# Keep only the four columns that the rest of the notebook works with.
train_light, val_light = (
    frame[['MovieID', 'UserID', 'Rating', 'Genre']] for frame in (train, val)
)
train_light.head()

Unnamed: 0,MovieID,UserID,Rating,Genre
0,6,3403,4,"[Action, Crime, Thriller]"
1,1883,4630,3,[Comedy]
2,27,2882,3,[Drama]
3,593,3513,5,"[Drama, Thriller]"
4,2759,2010,2,[Comedy]


In [5]:
from surprise import Dataset, Reader

In [6]:
# Ratings are declared to lie on a 1..5 scale.
reader = Reader(rating_scale=(1, 5))
# Columns are passed in (user, item, rating) order.
train_surprise = Dataset.load_from_df(
    train_light.loc[:, ['UserID', 'MovieID', 'Rating']], reader
).build_full_trainset()
val_surprise = Dataset.load_from_df(
    val_light.loc[:, ['UserID', 'MovieID', 'Rating']], reader
)
# The anti-testset is what every model below is asked to score.
testset_surprise = train_surprise.build_anti_testset()
print(len(testset_surprise))

21282657


## 2. Обучение и тестирование бейзлайнов
В качестве бейзлайнов возьмем:
- Top-K самых популярных товаров с обучения
- KNNBasic из surprise library
- SVD из surprise library

Будем считать следующие user-wise метрики:
- User-wise recall@50.
- User-wise diversity@50 (среднее количество различных жанров среди первых 50 кандидатов).
- User-wise median count@50 (в коде — long_tail@50): для каждого пользователя считаем медианную популярность первых 50 кандидатов (по числу оценок фильма в обучающей выборке), а затем усредняем по всем пользователям. Метрика нужна для замера влияния на long-tail.

### Вспомогательный код для подсчета метрик

In [7]:
# Ground truth: for every user present in validation, the list of movies they rated there.
positive_pairs = (
    val_light.groupby('UserID')['MovieID']
    .apply(list)
    .reset_index(name='positive')
)
# Users to evaluate, in the same order as the rows of positive_pairs.
users = positive_pairs['UserID'].values

In [53]:
# One metric object per reported quantity, each constructed with 50.
metric_recall50, metric_diversity50, metric_long_tail50 = (
    metric_cls(50) for metric_cls in (RecallK, DiversityK, LongTailK)
)

In [9]:
# movie_to_cnt: number of training rows per movie.
# movie_to_genre: genre value from the first training row seen for that movie.
movie_to_cnt = {}
movie_to_genre = {}
for movieID, genre in zip(train_light['MovieID'].values, train_light['Genre'].values):
    movie_to_cnt[movieID] = movie_to_cnt.get(movieID, 0) + 1
    # setdefault keeps the first genre encountered and ignores later rows.
    movie_to_genre.setdefault(movieID, genre)

### Top-K

In [10]:
# The 50 movies with the most training rows, ordered from least to most frequent.
top50_popular_only_ids = list(
    train_light.groupby('MovieID')['UserID'].count().sort_values().index[-50:]
)
# Each recommendation is a (movie id, train count, genres) triple.
top50_popular = [
    (movieID, movie_to_cnt[movieID], movie_to_genre[movieID])
    for movieID in top50_popular_only_ids
]

In [11]:
# Every user receives the same popularity list (one shared list object).
preds = [top50_popular for _ in users]
df_preds = pd.DataFrame({'UserID': users, 'preds': preds})
df_preds.head()

Unnamed: 0,UserID,preds
0,19,"[(1610, 1571, [Action, Thriller]), (1213, 1578..."
1,20,"[(1610, 1571, [Action, Thriller]), (1213, 1578..."
2,22,"[(1610, 1571, [Action, Thriller]), (1213, 1578..."
3,23,"[(1610, 1571, [Action, Thriller]), (1213, 1578..."
4,24,"[(1610, 1571, [Action, Thriller]), (1213, 1578..."


In [12]:
# Attach each user's validation positives to their predictions.
# NOTE(review): df_preds is overwritten in place, so this cell is not meant to be run twice in a row.
df_preds = (
    df_preds.set_index('UserID')
    .join(positive_pairs.set_index('UserID'), how='left', on='UserID')
    .reset_index()
)
df_preds.head()

Unnamed: 0,UserID,preds,positive
0,19,"[(1610, 1571, [Action, Thriller]), (1213, 1578...","[1527, 223, 3409, 480, 2918, 405, 2653, 2004, ..."
1,20,"[(1610, 1571, [Action, Thriller]), (1213, 1578...","[1694, 2641, 3717, 1468, 1371, 1375, 1527, 352..."
2,22,"[(1610, 1571, [Action, Thriller]), (1213, 1578...","[1198, 2193, 1097, 3176, 2161, 2005, 1127, 542..."
3,23,"[(1610, 1571, [Action, Thriller]), (1213, 1578...","[2259, 2411, 1258, 2712, 2148, 3052, 2641, 264..."
4,24,"[(1610, 1571, [Action, Thriller]), (1213, 1578...","[2657, 1635, 425, 1959, 2757]"


In [43]:
# Popularity baseline: recall is shown as a percentage, the other two metrics as raw values.
print(
    'User-wise recall@50 for top-k most popular:',
    round(metric_recall50(df_preds) * 100, 2),
    '%',
)
print(
    'User-wise diversity@50 for top-k most popular:',
    metric_diversity50(df_preds),
)
print(
    'User-wise long_tail@50 for top-k most popular:',
    metric_long_tail50(df_preds),
)

User-wise recall@50 for top-k most popular: 13.25 %
User-wise diversity@50 for top-k most popular: 16.0
User-wise long_tail@50 for top-k most popular: 2042.0


### KNNBasic

In [52]:
def get_top_50(events):
    """Return the items with the 50 highest ratings among ``events``.

    ``events`` must support ``events['MovieID']`` and ``events['Rating']`` and
    yield equally long iterables.  The result is ordered by ascending rating
    (best item last); ties keep their original relative order because the
    sort is stable.  Fewer than 50 events yield all of them.
    """
    ranked = sorted(zip(events['MovieID'], events['Rating']), key=lambda pair: pair[1])
    return [movie for movie, _ in ranked[-50:]]

def build_preds_dataframe(predictions):
    """Turn raw model predictions into a per-user frame of top-50 recommendations.

    Each prediction is read positionally: index 0 is the user id, index 1 the
    item id and index 3 the score used for ranking (presumably Surprise's
    estimated rating — confirm).  Items are stored as (movie id, train count,
    genres) triples looked up in the notebook-level ``movie_to_cnt`` and
    ``movie_to_genre`` dictionaries.

    Returns ``positive_pairs`` left-joined with a ``preds`` column.  Missing
    values (e.g. users without any prediction) are replaced by -1.
    """
    user_ids, items, scores = [], [], []
    for pred in predictions:
        item_id = pred[1]
        user_ids.append(pred[0])
        items.append((item_id, movie_to_cnt[item_id], movie_to_genre[item_id]))
        scores.append(pred[3])

    # The 'MovieID' column holds the enriched triples, not bare ids.
    scored = pd.DataFrame({'UserID': user_ids, 'MovieID': items, 'Rating': scores})
    top_per_user = (
        scored.groupby('UserID')
        .apply(get_top_50)
        .reset_index()
        .rename(columns={0: 'preds'})[['UserID', 'preds']]
    )

    joined = (
        positive_pairs.copy()
        .set_index('UserID')
        .join(top_per_user.set_index('UserID'), how='left', on='UserID')
        .reset_index()
    )
    return joined.fillna(-1)

In [44]:
from surprise import KNNBasic

In [45]:
# KNNBasic baseline constructed with k=10 and fitted on the full training set.
knn = KNNBasic(k=10)
knn.fit(train_surprise)
predictions_knnbasic = knn.test(testset_surprise) # WARNING: this call took about 64 minutes

Computing the msd similarity matrix...
Done computing similarity matrix.


In [54]:
# Build the per-user top-50 frame from the KNNBasic predictions (replaces the previous df_preds).
df_preds = build_preds_dataframe(predictions_knnbasic)

In [55]:
# Display the KNNBasic prediction frame.
df_preds

Unnamed: 0,UserID,positive,preds
0,19,"[1527, 223, 3409, 480, 2918, 405, 2653, 2004, ...","[(1234, 974, [Comedy, Crime]), (557, 2, [Drama..."
1,20,"[1694, 2641, 3717, 1468, 1371, 1375, 1527, 352...","[(1242, 1048, [Action, Drama, War]), (3468, 40..."
2,22,"[1198, 2193, 1097, 3176, 2161, 2005, 1127, 542...","[(1947, 681, [Musical, Romance]), (3951, 28, [..."
3,23,"[2259, 2411, 1258, 2712, 2148, 3052, 2641, 264...","[(2908, 785, [Drama]), (1664, 4, [Drama]), (29..."
4,24,"[2657, 1635, 425, 1959, 2757]","[(1208, 1094, [Drama, War]), (1148, 832, [Anim..."
...,...,...,...
1049,6001,"[1358, 2672, 454, 1687, 1653, 3947, 1658, 965,...","[(3675, 246, [Musical]), (1131, 192, [Drama]),..."
1050,6002,"[2013, 3451, 144, 1161, 1219, 2946, 2520, 3147...","[(1963, 232, [Comedy]), (3135, 182, [Drama]), ..."
1051,6016,"[2062, 339, 1835, 252, 3894, 920, 1339, 2941, ...","[(1069, 101, [Film-Noir, Thriller]), (3090, 16..."
1052,6028,[3000],"[(2324, 1084, [Comedy, Drama]), (898, 548, [Co..."


In [56]:
# Metrics for the KNNBasic predictions held in df_preds.
# The labels previously read "top-k most popular" (copied from the popularity
# baseline cell), which mislabelled these numbers.
print('User-wise recall@50 for KNNBasic:', round(metric_recall50(df_preds) * 100, 2), '%')
print('User-wise diversity@50 for KNNBasic:', metric_diversity50(df_preds))
print('User-wise long_tail@50 for KNNBasic:', metric_long_tail50(df_preds))

User-wise recall@50 for top-k most popular: 5.72 %
User-wise diversity@50 for top-k most popular: 15.12618595825427
User-wise long_tail@50 for top-k most popular: 194.62452290076337


### SVD

In [57]:
from surprise import SVD

In [58]:
# SVD baseline with Surprise's default hyper-parameters.
# A fixed random_state seeds the random initialisation so that the reported
# metrics are repeatable across runs; without it every run gives different numbers.
svd = SVD(random_state=42)
svd.fit(train_surprise)
# Score the full anti-testset, as for the KNNBasic baseline.
predictions_svd = svd.test(testset_surprise)


In [60]:
# Build the per-user top-50 frame from the SVD predictions (replaces the previous df_preds).
df_preds = build_preds_dataframe(predictions_svd)

In [61]:
# SVD baseline: recall is shown as a percentage, the other two metrics as raw values.
print(
    'User-wise recall@50 for svd:',
    round(metric_recall50(df_preds) * 100, 2),
    '%',
)
print(
    'User-wise diversity@50 for svd:',
    metric_diversity50(df_preds),
)
print(
    'User-wise long_tail@50 for svd:',
    metric_long_tail50(df_preds),
)

User-wise recall@50 for svd: 12.58 %
User-wise diversity@50 for svd: 14.770398481973434
User-wise long_tail@50 for svd: 479.5706106870229
