In [19]:
from typing import Dict, List

from copy import deepcopy
import pandas as pd
import numpy as np
from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.models import PopularModel, RandomModel
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models.base import ModelBase
from rectools.metrics.base import MetricAtK
from rectools.metrics import calc_metrics, Precision, Recall, Serendipity, MeanInvUserFreq, MAP, NDCG
import time
import os
from pprint import pprint

In [18]:

# Input files are looked up in the `data` directory located in the parent of
# the current working directory.
interactions_url = os.path.join(os.path.dirname(os.getcwd()), 'data', 'interactions.csv')
items_url = os.path.join(os.path.dirname(os.getcwd()), 'data', 'items.csv')


# Parse the date column while reading, so `interactions_df` holds real
# timestamps from the start instead of depending on a later implicit
# conversion. The rename maps the raw column names onto the names rectools
# expects (`Columns.Datetime`, `Columns.Weight`).
interactions_df = pd.read_csv(
    interactions_url,
    parse_dates=['last_watch_dt'],
).rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)


items_df = pd.read_csv(items_url)

interactions = Interactions(interactions_df)

In [14]:
def cross_val_score(
        models: Dict[str, ModelBase],
        metrics: Dict[str, MetricAtK],
        cv: Splitter,
        rec_num: int = 10
) -> pd.DataFrame:
    """
    Calculate cross-validation metrics for several models.

    For every fold produced by ``cv`` each model is deep-copied, fitted on the
    train part of the fold and asked for ``rec_num`` recommendations for the
    users present in the test part (already viewed items are filtered out).
    The metrics and the fit duration are then averaged over folds per model.

    NOTE: the interactions that are split are taken from the notebook-level
    ``interactions`` variable, not from an argument.

    Parameters
    ----------
    models : Dict[str, ModelBase]
        Unfitted models keyed by the name to show in the result table.
        The passed objects are not modified; a deep copy is fitted per fold.
    metrics : Dict[str, MetricAtK]
        Metrics keyed by name; passed as is to ``calc_metrics``.
    cv : Splitter
        Splitter whose ``split`` yields ``(train_ids, test_ids, fold_info)``.
    rec_num : int, default 10
        Number of recommendations requested per user.

    Returns
    -------
    pd.DataFrame
        One row per model with the columns ``model``, one column per metric
        name and ``learning_time`` (mean fit duration in seconds).
        Empty (columns only) when no fold/model combination was evaluated.
    """
    metric_names = list(metrics.keys()) + ['learning_time']
    results = []
    for train_ids, test_ids, fold_info in cv.split(interactions):
        print(f"\n==================== Fold {fold_info['i_split']}")
        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)
        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])
        catalog = df_train[Columns.Item].unique()
        for model_name, model in models.items():
            # Fit a copy so the caller's model object stays unfitted and
            # every fold starts from the same initial state.
            curr_model = deepcopy(model)
            # perf_counter is monotonic and high-resolution, unlike the wall
            # clock, so very short fits are still measured reliably.
            start_time = time.perf_counter()
            curr_model.fit(dataset)
            learning_time = time.perf_counter() - start_time
            recos = curr_model.recommend(
                users=test_users,
                dataset=dataset,
                k=rec_num,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name, 'learning_time': learning_time}
            res.update(metric_values)
            results.append(res)

    # Nothing was evaluated (no folds or no models): return an empty table
    # instead of failing on the missing 'model' column in groupby.
    if not results:
        return pd.DataFrame(columns=['model', *metric_names])

    # Average every metric (and the fit time) over folds for each model.
    result_df = pd.DataFrame(results)
    grouped_df = result_df.groupby('model')[metric_names].mean().reset_index()
    return grouped_df


In [15]:
# Number of folds requested from the splitter.
n_splits = 3

# Models to compare, keyed by the name shown in the result table.
models = {"random": RandomModel(random_state=42), "popular": PopularModel()}

# Metrics to compute, keyed by the column name used in the result table.
metrics = {
    "precision": Precision(k=10),
    "recall": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
    # MAP at several cut-offs: "MAP@1", "MAP@5", "MAP@10".
    **{f"MAP@{cutoff}": MAP(k=cutoff) for cutoff in (1, 5, 10)},
    "NDCG@10": NDCG(k=10),
}

# Time-range splitter with a "7D" test size per fold and all three
# filtering flags enabled.
cv = TimeRangeSplitter(
    n_splits=n_splits,
    test_size="7D",
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)

result = cross_val_score(models=models, metrics=metrics, cv=cv)
result






Unnamed: 0,model,precision,recall,novelty,serendipity,MAP@1,MAP@5,MAP@10,NDCG@10,learning_time
0,popular,0.033903,0.173492,3.71339,2e-06,0.04272,0.078295,0.084109,0.043084,4.346222
1,random,0.000176,0.000687,15.610409,6e-06,6.9e-05,0.000155,0.0002,0.000175,5e-05


In [21]:
def visual_analize(
        fitted_model: ModelBase,
        interactions_dataset: pd.DataFrame,
        user_ids: List[int],
        item_data: pd.DataFrame,
        k: int = 10
) -> Dict[int, Dict[str, List[dict]]]:
    """
    Collect watch history and recommendations for the given users.

    Parameters
    ----------
    fitted_model : ModelBase
        Already fitted model used to produce the recommendations.
    interactions_dataset : pd.DataFrame
        Interactions table; a ``Dataset`` is constructed from it and passed
        to ``recommend``. The user, item and datetime columns are read from it.
    user_ids : List[int]
        Users to build the report for.
    item_data : pd.DataFrame
        Item table joined to interactions and recommendations on the item id;
        its ``title`` and ``genres`` columns are included in the report.
    k : int, default 10
        Number of recommendations requested per user.

    Returns
    -------
    dict
        ``{user_id: {'history': [...], 'recommendations': [...]}}`` where
        ``history`` is a list of ``datetime``/``title``/``genres`` records
        ordered by datetime and ``recommendations`` is a list of
        ``title``/``genres``/``rank`` records. Rows whose item id is absent
        from ``item_data`` are dropped by the (inner) merge.
    """
    dataset = Dataset.construct(interactions_dataset)
    recommendations = fitted_model.recommend(
        users=user_ids,
        dataset=dataset,
        k=k,
        filter_viewed=True,
    )

    # Only the requested users are reported, so narrow the history down
    # before the merge and sort instead of processing every interaction.
    requested_mask = interactions_dataset[Columns.User].isin(user_ids)
    history_df = (
        interactions_dataset[requested_mask]
        .merge(item_data, on=Columns.Item)
        # Stable sort keeps same-timestamp rows in a deterministic order.
        .sort_values(by=Columns.Datetime, kind='stable')
    )
    recommendations_with_items = recommendations.merge(item_data, on=Columns.Item)

    history_columns = [Columns.Datetime, 'title', 'genres']
    reco_columns = ['title', 'genres', Columns.Rank]
    result_dict = {}
    for user in user_ids:
        user_history = history_df[history_df[Columns.User] == user]
        user_recos = recommendations_with_items[recommendations_with_items[Columns.User] == user]
        result_dict[user] = {
            'history': user_history[history_columns].to_dict('records'),
            'recommendations': user_recos[reco_columns].to_dict('records'),
        }
    return result_dict
        

In [22]:
# Usage example: fit a popularity model on all interactions and compare
# the watch history with the recommendations for a few users.

dataset = Dataset.construct(interactions_df)
model = PopularModel()
model.fit(dataset)

result = visual_analize(
    fitted_model=model,
    interactions_dataset=interactions_df,
    user_ids=[666262, 672861, 955527],
    item_data=items_df,
)

pprint(result)

{666262: {'history': [{'datetime': Timestamp('2021-05-12 00:00:00'),
                       'genres': 'боевики, историческое, приключения',
                       'title': 'Последний викинг'},
                      {'datetime': Timestamp('2021-05-12 00:00:00'),
                       'genres': 'боевики, триллеры, приключения',
                       'title': 'Робин Гуд: Начало'},
                      {'datetime': Timestamp('2021-05-14 00:00:00'),
                       'genres': 'боевики, драмы, историческое, военные',
                       'title': 'Томирис'}],
          'recommendations': [{'genres': 'триллеры, детективы',
                               'rank': 1,
                               'title': 'Хрустальный'},
                              {'genres': 'драмы, мелодрамы',
                               'rank': 2,
                               'title': 'Клиника счастья'},
                              {'genres': 'боевики, триллеры',
                               'rank': 3,
