In [1]:
# pip install rectools

In [2]:
# pip install -U Jinja2

In [3]:
from copy import deepcopy
import pandas as pd
import numpy as np

import requests
from tqdm.auto import tqdm
import time

from rectools import Columns
from rectools.models import RandomModel, PopularModel
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import (
    Precision,
    Recall,
    MRR,
    MAP,
    Serendipity,
    MeanInvUserFreq,
    calc_metrics,
)

In [4]:
K_RECOS = 10

# Load data

In [5]:
data_path = 'data/kion_train'

In [6]:
interactions = pd.read_csv(f'{data_path}/interactions.csv', parse_dates=["last_watch_dt"])

interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 

users = pd.read_csv(f'{data_path}/users.csv')
items = pd.read_csv(f'{data_path}/items.csv')

# Functions

In [7]:
def train_models(interactions, models, metrics, k, cv):
    """
    Calculate metrics based on cross-validation
    
    Parameters
    -----------
    interactions: pd.DataFrame with User-Item interactions
    models: dict with initialized models
    metrics: dict with initialized metrics
    k: number of recommendations to generate
    cv: initialized Splitter for cross validation
    """
    results = []
    fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=cv.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        print(fold_info)

        # 1. Create Dataset
        df_train = interactions.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            # 2-3. Fit model and log the training time
            start_time = time.time()
            model.fit(dataset)
            end_time = time.time()
            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=k,
                filter_viewed=True,
            )
            # 4. Calculate and save metrics
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            models[model_name] = deepcopy(model)
            res = {"fold": fold_info["i_split"], "model": model_name, "time": end_time - start_time}
            res.update(metric_values)
            results.append(res)
            
    return pd.DataFrame(results)

In [8]:
def visualize(model, dataset, user_ids, item_data, k):
    """
    Visual analysis of recommendations
    
    Parameters
    -----------
    model: fitted model
    metrics: rectools Dataset
    user_ids: list of selected user_ids to view
    item_data: information about items that is important to reflect for visual analysis (e.g. name)
    k: number of recommendations to generate
    """
    cols_ext = [Columns.User, Columns.Item, 'is_watched']
    cols = cols_ext[:-1]
    
    # 1. Get recos
    recos = model.recommend(
                users=user_ids,
                dataset=dataset,
                k=k,
                filter_viewed=True,
    )
    recos['is_watched'] = False
    
    # 2. Get watched items
    watched = dataset.interactions.df
    watched = watched.loc[watched[Columns.User].isin(user_ids), cols]
    watched['is_watched'] = True
    
    # 3. Merge items from previous steps and add item's features
    un = pd.concat([recos[cols_ext], watched[cols_ext]], axis=0)
    
    cnt = dataset.interactions.df[cols].groupby([Columns.Item]).count()
    cnt.reset_index(inplace=True)
    cnt.columns = [Columns.Item, 'views_count']
    
    res = un.merge(item_data, how='left', on=Columns.Item,) \
            .merge(cnt, on=Columns.Item,) \
            .sort_values([Columns.User, 'is_watched', 'views_count'], ascending=False)
    return res

# Test our fuctions

## Cross validation

In [9]:
n_splits = 3

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Take few simple models to compare
models = {
    "random": RandomModel(random_state=32),
    "popular": PopularModel(),
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "MRR@1": MRR(k=1),
    "MRR@5": MRR(k=5),
    "MRR@10": MRR(k=10),
    "MAP@1": MAP(k=1),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
    "Serendipity@1": Serendipity(k=1),
    "Serendipity@5": Serendipity(k=5),
    "Serendipity@10": Serendipity(k=10),
    "MeanInvUserFreq@1": MeanInvUserFreq(k=1),
    "MeanInvUserFreq@5": MeanInvUserFreq(k=5),
    "MeanInvUserFreq@10": MeanInvUserFreq(k=10),
}

In [10]:
res = train_models(interactions, models, metrics, K_RECOS, cv)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}

{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}

{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


In [11]:
res

Unnamed: 0,fold,model,time,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MRR@1,...,MRR@10,MAP@1,MAP@5,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
0,0,random,3.8e-05,0.000244,8.7e-05,0.000228,0.000445,0.000214,0.000771,0.000244,...,0.000665,8.7e-05,0.0002,0.000243,15.599409,15.593514,15.593936,8e-06,7e-06,7e-06
1,0,popular,1.263931,0.084026,0.047435,0.054225,0.143095,0.035483,0.181957,0.084026,...,0.146332,0.047435,0.08319,0.089605,2.401723,3.079837,3.711584,2e-06,3e-06,2e-06
2,1,random,0.000579,0.000184,5.1e-05,0.000172,0.000278,0.000177,0.00062,0.000184,...,0.000543,5.1e-05,0.000134,0.000179,15.602863,15.6128,15.611655,4e-06,7e-06,7e-06
3,1,popular,1.512366,0.074466,0.04191,0.051848,0.136661,0.033572,0.172428,0.074466,...,0.13542,0.04191,0.076906,0.082607,2.373249,3.065811,3.712928,2e-06,3e-06,2e-06
4,2,random,5.6e-05,0.000236,7.7e-05,0.000207,0.000373,0.000188,0.000688,0.000236,...,0.000603,7.7e-05,0.000172,0.000212,15.64014,15.632653,15.633438,7e-06,8e-06,7e-06
5,2,popular,1.320657,0.070806,0.038816,0.051134,0.132482,0.032655,0.166089,0.070806,...,0.134058,0.038816,0.074789,0.080114,2.356194,3.055289,3.715659,2e-06,2e-06,2e-06


In [12]:
# Aggregate metrics by folds and compare models
pivot_results = res.drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,time,time,prec@1,prec@1,recall@1,recall@1,prec@5,prec@5,recall@5,recall@5,prec@10,prec@10,recall@10,recall@10,MRR@1,MRR@1,MRR@5,MRR@5,MRR@10,MRR@10,MAP@1,MAP@1,MAP@5,MAP@5,MAP@10,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@5,MeanInvUserFreq@10,MeanInvUserFreq@10,Serendipity@1,Serendipity@1,Serendipity@5,Serendipity@5,Serendipity@10,Serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2
random,0.000225,0.000307,0.000221,3.3e-05,7.2e-05,1.9e-05,0.000202,2.8e-05,0.000365,8.4e-05,0.000193,1.9e-05,0.000693,7.6e-05,0.000221,3.3e-05,0.000485,5.5e-05,0.000604,6.1e-05,7.2e-05,1.9e-05,0.000169,3.3e-05,0.000211,3.2e-05,15.614137,0.022585,15.612989,0.01957,15.613009,0.019786,6e-06,2e-06,7e-06,1e-06,7e-06,0.0
popular,1.365651,0.130186,0.076432,0.006826,0.04272,0.004366,0.052402,0.001618,0.137413,0.005346,0.033903,0.001443,0.173492,0.007987,0.076432,0.006826,0.131669,0.006167,0.138603,0.006728,0.04272,0.004366,0.078295,0.00437,0.084109,0.004921,2.377055,0.023002,3.066979,0.012316,3.71339,0.002076,2e-06,0.0,3e-06,0.0,2e-06,0.0


## Visualization

### Initialize params for tests

In [16]:
item_features_frames = []
user_ids = [666262, 672861, 955527]
cols = ['item_id', 'title', 'genres']

for feature in cols[1:]:
    feature_frame = items.reindex(columns=[Columns.Item, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    item_features_frames.append(feature_frame)

item_features = pd.concat(item_features_frames)
item_features = item_features[item_features["id"].isin(interactions[Columns.Item])]

dataset = Dataset.construct(interactions, item_features_df=item_features, cat_item_features=cols[1:])

model_rand = models['random'].fit(dataset)
model_pop = models['popular'].fit(dataset)

### Visualize random model results

In [17]:
vis_res_rand = visualize(model_rand, dataset, user_ids, items[cols], k=K_RECOS)
vis_res_rand

Unnamed: 0,user_id,item_id,is_watched,title,genres,views_count
31,955527,21,True,Признание 5,для взрослых,193123
19,955527,496,False,Воскресший Эртугрул,"боевики, драмы, приключения",3108
20,955527,4205,False,Дело гастронома №1 (Операция Беркут),"драмы, русские",283
26,955527,3407,False,Черный капитан,"боевики, русские, военные",31
21,955527,10822,False,Она защищает Родину,"драмы, советские, военные",13
22,955527,10914,False,Великолепная,"зарубежные, комедии, мелодрамы",2
23,955527,3999,False,Джиперс криперс,"ужасы, триллеры",2
24,955527,14961,False,Битва за Землю,"боевики, ужасы, фантастика, триллеры",2
25,955527,13734,False,Сексуальный массаж и Фантазии,для взрослых,1
27,955527,14614,False,Настя,"мелодрамы, комедии",1


### Visualize popular model results

In [18]:
vis_res_pop = visualize(model_pop, dataset, user_ids, items[cols], k=K_RECOS)
vis_res_pop

Unnamed: 0,user_id,item_id,is_watched,title,genres,views_count
33,955527,21,True,Признание 5,для взрослых,193123
26,955527,142,False,Маша,"драмы, триллеры",55043
14,955527,4151,False,Секреты семейной жизни,комедии,325
20,955527,2657,False,Подслушано,"драмы, триллеры",285
29,955527,6809,False,Дуров,документальное,96
23,955527,4880,False,Афера,комедии,39
17,955527,3734,False,Прабабушка легкого поведения,комедии,23
2,955527,10440,False,Хрустальный,"триллеры, детективы",7
8,955527,9728,False,Гнев человеческий,"боевики, триллеры",7
11,955527,13865,False,Девятаев,"драмы, военные, приключения",3
