In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
interactions = pd.read_csv('../data_original/interactions.csv')
users = pd.read_csv('../data_original/users.csv')
items = pd.read_csv('../data_original/items.csv')

In [4]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [5]:
# train test split 
# test = last 1 week 
from rectools.model_selection import TimeRangeSplit

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [6]:

(train_ids, test_ids, fold_info) = cv.split(interactions, collect_fold_stats=True).__next__()

## Prepare train matrix 

In [18]:
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

In [19]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

In [20]:
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [21]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [22]:
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

## Selection of models and parameters

In [33]:
from rectools.dataset import Dataset

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)

In [35]:
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel

In [45]:
from rectools.metrics import Precision, Recall, MAP, calc_metrics

metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "map@10": MAP(k=10)
}

catalog = train['item_id'].unique()

In [73]:
res = dict()


for n, recommender in (('CosineRecommender', CosineRecommender), ('BM25Recommender', BM25Recommender)):
    for k in [10, 20, 30, 50]:
        model = ImplicitItemKNNWrapperModel(model=recommender(K=k))
        model.fit(dataset);
        
        rec = model.recommend(
            test['user_id'].unique(), 
            dataset=dataset, 
            k=10, 
            filter_viewed=False
        )
        
        metric_values_itemknn = calc_metrics(
            metrics,
            reco=rec,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )
        
        res[f'{n}_{k}'] = metric_values_itemknn


In [74]:
pd.DataFrame(res)

Unnamed: 0,CosineRecommender_10,CosineRecommender_20,CosineRecommender_30,CosineRecommender_50,BM25Recommender_10,BM25Recommender_20,BM25Recommender_30,BM25Recommender_50
prec@10,0.016868,0.017216,0.017312,0.017445,0.032703,0.032569,0.032522,0.032494
recall@10,0.093154,0.094826,0.095209,0.095812,0.169153,0.168536,0.16834,0.168239
map@10,0.023014,0.023095,0.023146,0.023214,0.048648,0.048371,0.048277,0.0482


In [75]:
res_filter_viewed = dict()


for n, recommender in (('CosineRecommender', CosineRecommender), ('BM25Recommender', BM25Recommender)):
    for k in [10, 20, 30, 50]:
        model = ImplicitItemKNNWrapperModel(model=recommender(K=k))
        model.fit(dataset);
        
        rec = model.recommend(
            test['user_id'].unique(), 
            dataset=dataset, 
            k=10, 
            filter_viewed=True
        )
        
        metric_values_itemknn = calc_metrics(
            metrics,
            reco=rec,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )
        
        res_filter_viewed[f'{n}_{k}'] = metric_values_itemknn

In [76]:
pd.DataFrame(res_filter_viewed)

Unnamed: 0,CosineRecommender_10,CosineRecommender_20,CosineRecommender_30,CosineRecommender_50,BM25Recommender_10,BM25Recommender_20,BM25Recommender_30,BM25Recommender_50
prec@10,0.022915,0.025375,0.025878,0.026411,0.039367,0.038933,0.038977,0.038951
recall@10,0.119112,0.131688,0.133881,0.135827,0.19889,0.198348,0.198759,0.198882
map@10,0.058292,0.060869,0.061515,0.062209,0.095622,0.095578,0.095633,0.095661


## top model + popular 

In [78]:
model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=10))
model.fit(dataset);

rec = model.recommend(
            test['user_id'].unique(), 
            dataset=dataset, 
            k=10, 
            filter_viewed=True
        )

In [79]:
rec.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,15297,231095800000.0,1
1,1016458,9728,138320400000.0,2
2,1016458,4151,111853100000.0,3
3,1016458,3734,96772320000.0,4
4,1016458,2657,94959740000.0,5


In [80]:
from rectools.models.popular import PopularModel 

pop = PopularModel()
pop.fit(dataset);

pop_rec = pop.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=10, 
    filter_viewed=True
)

In [81]:
com_recs = pd.concat((rec, pop_rec)).drop_duplicates()
com_recs.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,15297,231095800000.0,1
1,1016458,9728,138320400000.0,2
2,1016458,4151,111853100000.0,3
3,1016458,3734,96772320000.0,4
4,1016458,2657,94959740000.0,5


In [82]:
calc_metrics(
        metrics,
        reco=com_recs,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

{'prec@10': 0.039367345537981824,
 'recall@10': 0.19889028267943354,
 'map@10': 0.09562182706507105}

 For cold users

In [69]:
# popular for last mounth
import datetime as DT

interactions_month = interactions[interactions['datetime'] >
                                  (interactions['datetime'].max() - DT.timedelta(days=30))]

popular_recs_month = list(interactions_month.item_id.value_counts()[:10])
popular_recs_month

[59226, 54373, 54115, 27752, 23812, 23253, 20967, 18909, 18878, 13305]