In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import requests
from copy import deepcopy
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.models.popular import PopularModel, Popularity
from rectools.models.popular_in_category import PopularInCategoryModel, RatioStrategy, MixingStrategy
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, Recall, NDCG, calc_metrics

from scipy.stats import mode
import scipy.sparse as sps
from scipy.sparse import csr_matrix

from sklearn.linear_model import ElasticNet

from itertools import product

import sys
import time

In [3]:
# setup MLflow tracking

import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5001")
mlflow.set_experiment("popular_models")

<Experiment: artifact_location='file:///D:/PycharmProjects/itmo_mts_recsys/recsys_service/mlflow_runs/1', creation_time=1681596047094, experiment_id='1', last_update_time=1681596047094, lifecycle_stage='active', name='popular_models', tags={}>

In [4]:
pd.set_option('max_colwidth', 400)

In [53]:
n_folds = 3
unit = "D"
n_units = 7
K_RECOS = 10
metrics = {
    "Recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "NDCG@10": NDCG(k=10),
}

## Get data

In [54]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

In [55]:
# rename columns, convert timestamp
interactions.rename(
    columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [56]:
def headtail(df, n=5):
    return pd.concat([df.head(n), df.tail(n)])

## Validation setup

In [57]:
interactions[Columns.Datetime].min(), interactions[Columns.Datetime].max()

In [58]:
daily_inters = interactions.groupby(Columns.Datetime)[Columns.User].agg('count')
daily_inters.name = 'interactions'
headtail(daily_inters)

In [59]:
#last_date = interactions[Columns.Datetime].max().normalize()
last_date = pd.to_datetime('15-08-2021', format='%d-%m-%Y')


start_date = last_date - pd.Timedelta(n_folds * n_units - 1, unit=unit)
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(f"""
    start_date: {start_date}
    last_date: {last_date}
    periods: {periods}
    freq: {freq}
""")

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
date_range

In [60]:
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
cv

## Prepare data

### Binarize years in release_year feature

In [61]:
_, bins = pd.qcut(items["release_year"], 10, retbins=True)
labels = bins[:-1]

year_feature = pd.DataFrame(
    {
        Columns.Item: items[Columns.Item],
        "value": pd.cut(items["release_year"], bins=bins, labels=bins[:-1]),
        "feature": "release_year",
    }
)
year_feature.head()

### Split genres into list

In [62]:
items["genre"] = items["genres"].str.split(",")
items[["genre", "genres"]].head(3)

In [63]:
genre_feature = items[[Columns.Item, "genre"]].explode("genre")
genre_feature.columns = [Columns.Item, "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

In [64]:
item_feat = pd.concat([genre_feature, year_feature])
item_feat = item_feat[item_feat[Columns.Item].isin(interactions[Columns.Item])]
item_feat.shape

### Construct dataset object to fit models

In [65]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=item_feat,
    cat_item_features=['genre', 'release_year']
)

## Tuning hyperparams utils

In [66]:
def tune_hyperparams(cv, params, model_type, inters_obj, verbose=False, category_feature=None):
    params_grid = [item for item in product(*params.values())]
    fold_iterator = cv.split(inters_obj)
    results = []

    for i_fold, (train_ids, test_ids, _) in enumerate(fold_iterator):     
        df_train = inters_obj.df.iloc[train_ids]
        item_feat_train = item_feat[item_feat[Columns.Item].isin(df_train[Columns.Item])]
      
    dataset = Dataset.construct(
        interactions_df=df_train,
        user_features_df=None,
        item_features_df=item_feat_train,
        cat_item_features=['genre', 'release_year']
    )

    df_test = inters_obj.df.iloc[test_ids][Columns.UserItem]
    test_users = np.unique(df_test[Columns.User])

    for params_setup in params_grid:
        with mlflow.start_run():
            start_time = time.time()
            
            if category_feature:
                model = model_type(category_feature=category_feature)
            else:
                model = model_type()
            for i, param_name in enumerate(params.keys()): 
                setattr(model, param_name, params_setup[i])
                
                # Log current models params
                mlflow.log_param(param_name, params_setup[i])

            model.fit(dataset)
            elapsed = time.time() - start_time
            
            recos = model.recommend(
                  users=test_users,
                  dataset=dataset,
                  k=K_RECOS,
                  filter_viewed=True,
            )
            metric_values = calc_metrics(
                  metrics,
                  reco=recos,
                  interactions=df_test,
                  prev_interactions=df_train
            )
            res = {"fold": i_fold, "model": f'{model_type.__name__} {str(params_setup)}'}
            res.update(metric_values)
            if verbose:
                print(res)
            results.append(res)
            
            # log metrics NDCG, MAP@10, Recall@10
            metrics_to_log = {k.replace('@', '_at_'): v for k, v in metric_values.items()}
            mlflow.log_metrics(metrics_to_log)
            
            # log model size in bytes and fit time
            mlflow.log_metric('model_size_bytes', sys.getsizeof(model))
            mlflow.log_metric('fit_time', elapsed)
           
    return results

## Popular model

### Simple run

In [67]:
%%time

popular_model = PopularModel()
popular_model.fit(dataset);

In [68]:
popular_model.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=10, 
    filter_viewed=False,  # True - throw away some items for each user
).merge(
    items[['item_id', 'title']], 
    on='item_id',
    how='left',
)

### Tune hyperparams

In [69]:
params_grid = {
    'period': [
        pd.Timedelta(7, unit='d'),
        pd.Timedelta(1, unit='w'),
        pd.Timedelta(14, unit='d'),
        pd.Timedelta(28, unit='d'),
     ],
     'popularity': [Popularity.N_USERS, Popularity.N_INTERACTIONS, Popularity.MEAN_WEIGHT, Popularity.SUM_WEIGHT]
}

In [70]:
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
)

In [71]:
inters = Interactions(interactions)

In [80]:
%%time

val_results = tune_hyperparams(cv, params=params_grid, model_type=PopularModel, inters_obj=inters)

### Get metrics from MLflow and the best model 

In [13]:
best_run = mlflow.search_runs(experiment_names=["popular_models"], order_by=["metrics.MAP_at_10 DESC"], max_results=1)
best_run

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.MAP_at_10,metrics.NDCG_at_10,metrics.model_size_bytes,metrics.fit_time,metrics.Recall_at_10,params.period,params.popularity,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name
0,d0d0e4b774424d43b45a14f59bd95387,1,FINISHED,file:///D:/PycharmProjects/itmo_mts_recsys/recsys_service/mlflow_runs/1/d0d0e4b774424d43b45a14f59bd95387/artifacts,2023-04-15 22:38:00.591000+00:00,2023-04-15 22:38:12.629000+00:00,0.090447,0.047749,48.0,0.296674,0.205985,7 days 00:00:00,Popularity.N_INTERACTIONS,wistful-carp-209,LOCAL,maste,D:\PycharmProjects\itmo_mts_recsys\venv\lib\site-packages\ipykernel_launcher.py
