In [156]:
from copy import deepcopy
import dill
import pandas as pd
import numpy as np

import requests
from tqdm.auto import tqdm
from typing import Dict
import time

from userknn import UserKnn

from rectools import Columns
from rectools.models import RandomModel, PopularModel
from implicit.nearest_neighbours import BM25Recommender, CosineRecommender, TFIDFRecommender, ItemItemRecommender
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import (
    Precision,
    Recall,
    MRR,
    MAP,
    Serendipity,
    MeanInvUserFreq,
    calc_metrics,
)


In [157]:
# Number of recommendations requested per user (passed as `k` to train_models).
K_RECOS = 10
# Dataset directory, given relative to the notebook's working directory.
DATA_PATH = 'data/kion_train'

# Data

In [159]:
# Load the interaction log; `last_watch_dt` is parsed as a datetime column.
# The directory comes from the DATA_PATH constant: the lowercase `data_path`
# name is not defined anywhere, so using it breaks Restart & Run All.
interactions = (
    pd.read_csv(f'{DATA_PATH}/interactions.csv', parse_dates=["last_watch_dt"])
    # Rename to the datetime / weight column names used by rectools.
    .rename(
        columns={
            'last_watch_dt': Columns.Datetime,
            'total_dur': Columns.Weight,
        }
    )
)

# User and item tables that accompany the interaction log.
users = pd.read_csv(f'{DATA_PATH}/users.csv')
items = pd.read_csv(f'{DATA_PATH}/items.csv')

# Utils

In [160]:
def train_models(interactions, models, metrics, k, cv):
    """
    Calculate metrics based on cross-validation

    For every fold each model is fitted on the train rows, asked for ``k``
    recommendations for the test rows and scored with ``calc_metrics``.

    Parameters
    -----------
    interactions: pd.DataFrame with User-Item interactions
    models: dict with initialized models; every model must expose
        ``fit(train_df)`` and ``predict(test_df, k)``.
        The dict is updated in place: after each fit the value is replaced
        with a deep copy of the fitted model.
    metrics: dict with initialized metrics
    k: number of recommendations to generate
    cv: initialized Splitter for cross validation

    Returns
    -----------
    (pd.DataFrame, dict): a frame with one row per (fold, model) holding the
    fit time in seconds and the metric values, and the same ``models`` dict,
    which ends up holding the models fitted on the last fold.
    """
    results = []
    fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm(fold_iterator, total=cv.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        print(fold_info)

        # 1. Slice the fold. The models consume plain DataFrames, so no
        #    rectools Dataset is built here (it used to be constructed on
        #    every fold and then never used).
        df_train = interactions.iloc[train_ids]
        df_test = interactions.iloc[test_ids][Columns.UserItem]

        # Items seen in train; passed to calc_metrics as the catalog.
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            # 2-3. Fit model and log the training time (prediction is not timed)
            start_time = time.time()
            model.fit(df_train)
            end_time = time.time()
            recos = model.predict(df_test, k)

            # 4. Calculate and save metrics
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            # Replacing the value of an existing key is safe while iterating.
            models[model_name] = deepcopy(model)
            results.append({
                "fold": fold_info["i_split"],
                "model": model_name,
                "time": end_time - start_time,
                **metric_values,
            })

    return pd.DataFrame(results), models

In [161]:
def visualize(model, dataset, user_ids, item_data, k):
    """
    Build a table for eyeballing recommendations next to the watch history

    Parameters
    -----------
    model: fitted model exposing ``recommend(users, dataset, k, filter_viewed)``
    dataset: rectools Dataset; its ``interactions.df`` is used as the history
    user_ids: list of selected user_ids to view
    item_data: pd.DataFrame with item attributes worth showing (e.g. name),
        joined on the item id column
    k: number of recommendations to generate

    Returns
    -----------
    pd.DataFrame with the recommended (``is_watched`` False) and the watched
    (``is_watched`` True) items of the selected users, enriched with
    ``item_data`` and a ``views_count`` column, sorted in descending order by
    user, ``is_watched`` and ``views_count``.
    """
    key_cols = [Columns.User, Columns.Item]
    out_cols = key_cols + ['is_watched']

    # 1. Recommendations (already viewed items are filtered out by the model)
    recommended = model.recommend(
        users=user_ids,
        dataset=dataset,
        k=k,
        filter_viewed=True,
    )
    recommended['is_watched'] = False

    # 2. Watch history of the selected users
    all_interactions = dataset.interactions.df
    of_selected_users = all_interactions[Columns.User].isin(user_ids)
    history = all_interactions.loc[of_selected_users, key_cols].assign(is_watched=True)

    # 3. Stack both parts
    stacked = pd.concat([recommended[out_cols], history[out_cols]], axis=0)

    # Popularity of every item: number of interaction rows it appears in
    views = (
        all_interactions[key_cols]
        .groupby([Columns.Item])[Columns.User]
        .count()
        .rename('views_count')
        .reset_index()
    )

    # 4. Attach item attributes (left join) and popularity (inner join)
    with_features = stacked.merge(item_data, how='left', on=Columns.Item)
    with_views = with_features.merge(views, on=Columns.Item)
    return with_views.sort_values(
        [Columns.User, 'is_watched', 'views_count'], ascending=False
    )

# Implicit recommenders

In [164]:
# Full interaction log as a rectools Dataset (no user / item features).
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

n_splits = 3

# Time-based cross-validation with `n_splits` folds and a "7D" test window.
# NOTE(review): the three filter flags are expected to drop already seen pairs
# and cold users / items from the test part - confirm against rectools docs.
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Ranking quality (MAP) and novelty (MeanInvUserFreq). The cutoff follows
# K_RECOS instead of a repeated literal, so metrics and the number of
# generated recommendations cannot drift apart.
metrics = {
    f'map@{K_RECOS}': MAP(k=K_RECOS),
    'novelty': MeanInvUserFreq(k=K_RECOS),
}

In [66]:
# UserKnn on top of cosine and TF-IDF similarity, with N_users of 10 and 50
models_1 = {
    model_name: UserKnn(similarity(num_threads=4), N_users=n_users)
    for model_name, similarity, n_users in (
        ('userknn_cosine_10', CosineRecommender, 10),
        ('userknn_cosine_50', CosineRecommender, 50),
        ('userknn_tfidf_10', TFIDFRecommender, 10),
        ('userknn_tfidf_50', TFIDFRecommender, 50),
    )
}

res_1, models_1 = train_models(
    interactions=interactions,
    models=models_1,
    metrics=metrics,
    k=K_RECOS,
    cv=cv,
)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}




  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}




  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]

In [67]:
# Average every metric over the folds (mean and std) to compare the models
pivot_results = (
    res_1
    .drop(columns="fold")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
# Only the "mean" columns take part in the min / max highlighting
mean_metric_subset = [column for column in pivot_results.columns if column[1] == 'mean']
pivot_results.style.highlight_min(
    subset=mean_metric_subset, color='lightcoral', axis=0
).highlight_max(
    subset=mean_metric_subset, color='lightgreen', axis=0
)

Unnamed: 0_level_0,time,time,map@10,map@10,novelty,novelty
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
userknn_cosine_10,1157.869836,168.133591,0.003929,0.000294,6.975452,0.06028
userknn_cosine_50,1156.467637,167.49465,0.003905,0.000299,7.540823,0.069372
userknn_tfidf_10,1154.286115,174.755831,0.005659,0.000313,7.43363,0.062186
userknn_tfidf_50,1155.07059,184.893021,0.006334,0.000414,7.638598,0.068806


In [71]:
# UserKnn on top of BM25 similarity, with N_users of 25 and 50
models_2 = {
    model_name: UserKnn(BM25Recommender(num_threads=2), N_users=n_users)
    for model_name, n_users in (
        ('userknn_bm25_25', 25),
        ('userknn_bm25_50', 50),
    )
}

res_2, models_2 = train_models(
    interactions=interactions,
    models=models_2,
    metrics=metrics,
    k=K_RECOS,
    cv=cv,
)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}




  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}




  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]

In [73]:
# Average every metric over the folds (mean and std) to compare the models
pivot_results = (
    res_2
    .drop(columns="fold")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
# Only the "mean" columns take part in the min / max highlighting
mean_metric_subset = [column for column in pivot_results.columns if column[1] == 'mean']
pivot_results.style.highlight_min(
    subset=mean_metric_subset, color='lightcoral', axis=0
).highlight_max(
    subset=mean_metric_subset, color='lightgreen', axis=0
)

Unnamed: 0_level_0,time,time,map@10,map@10,novelty,novelty
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
userknn_bm25_25,1176.985729,200.088945,0.002699,7.7e-05,9.283169,0.082654
userknn_bm25_50,1166.071831,184.654958,0.002699,7.7e-05,9.283169,0.082654


# Modified UserKnn from seminar

In [166]:
class UserKnnModified(UserKnn):
    """
    UserKnn with a popularity fallback.

    A synthetic "threshold" user (id ``-1``) is appended, with a tiny
    similarity, to the similar-user list of every user. Its watched list is
    filled with popular items, so users that are unknown to the model or have
    too few neighbours still receive recommendations.

    NOTE(review): relies on attributes of the base ``UserKnn`` that is not
    visible in this notebook (``user_knn``, ``N_users``, ``is_fitted``,
    ``watched``, ``item_idf``, ``users_mapping``, ``users_inv_mapping``) -
    confirm against the ``userknn`` module.
    """

    # Popular-items model, created in `fit`. The misspelled attribute name is
    # kept for backward compatibility.
    treshold_model = None

    # Id of the synthetic fallback user (used both as external and internal id).
    THRESHOLD_USER_ID = -1
    # Similarity assigned to the fallback user; small so that, presumably, real
    # neighbours outrank it.
    THRESHOLD_SIMILARITY = 0.0001
    # How many popular items are stored as the fallback user's watched list.
    N_POPULAR_ITEMS = 100

    def _generate_recs_mapper(self, model: ItemItemRecommender, user_mapping: Dict[int, int],
                              user_inv_mapping: Dict[int, int], N: int):
        """
        Build a function that maps a user id to its similar users.

        Parameters
        -----------
        model: fitted similarity model; ``similar_items(idx, N=N)`` must return
            a pair (neighbour indices, similarities)
        user_mapping: external user id -> internal index
        user_inv_mapping: internal index -> external user id
        N: number of neighbours to request

        Returns
        -----------
        Callable ``user -> (list of similar user ids, array of similarities)``;
        the threshold user is always appended as the last element.
        """
        def _recs_mapper(user):
            try:
                user_idx = user_mapping[user]
            except KeyError:
                # Cold user: only the threshold user will be returned.
                neighbour_idxs, sims = [], []
            else:
                # Kept outside of the try block so that a KeyError raised
                # inside the model is not mistaken for a cold user.
                neighbour_idxs, sims = model.similar_items(user_idx, N=N)

            # add threshold user to every similar items sequence
            neighbour_idxs = np.append(neighbour_idxs, [self.THRESHOLD_USER_ID])
            sims = np.append(sims, [self.THRESHOLD_SIMILARITY])
            return [user_inv_mapping[idx] for idx in neighbour_idxs], sims

        return _recs_mapper

    def get_mappings(self, train):
        """
        Build the user mappings via the base class and register the threshold user.

        ``users_mapping`` is rebuilt from ``users_inv_mapping`` so that both
        directions contain the threshold id.
        """
        super().get_mappings(train)
        # add threshold user_id
        self.users_inv_mapping[self.THRESHOLD_USER_ID] = self.THRESHOLD_USER_ID
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

    def fit(self, train: pd.DataFrame):
        """
        Fit the base model and store popular items as the threshold user's history.

        Parameters
        -----------
        train: pd.DataFrame with interactions; must be accepted by
            ``Dataset.construct`` and contain a ``user_id`` column
        """
        super().fit(train)
        dataset = Dataset.construct(
            interactions_df=train,
            user_features_df=None,
            item_features_df=None
        )
        # init and fit threshold model for cold users and 'not enough count' neighbours
        self.treshold_model = PopularModel()
        self.treshold_model.fit(dataset)

        # Any user known to the dataset can be used for the request, because
        # viewed items are not filtered out. The user is taken by POSITION:
        # `train.at[0, ...]` is a label lookup and raises KeyError when the
        # train frame is a slice whose index does not contain the label 0.
        any_known_user = train['user_id'].iloc[0]
        tr_recs = self.treshold_model.recommend(
            [any_known_user], dataset, self.N_POPULAR_ITEMS, False
        )

        # One row: threshold user -> array of popular item ids.
        df_tr_recs = pd.DataFrame(
            [[self.THRESHOLD_USER_ID, tr_recs['item_id'].values]],
            columns=['sim_user_id', 'item_id'],
        )
        self.watched = pd.concat([self.watched, df_tr_recs], axis=0)

    def predict(self, test: pd.DataFrame, N_recs: int = 10):
        """
        Recommend up to ``N_recs`` items for every distinct user of ``test``.

        Parameters
        -----------
        test: pd.DataFrame with a ``user_id`` column
        N_recs: maximum number of recommendations per user

        Returns
        -----------
        pd.DataFrame with columns ``user_id``, ``item_id``, ``score``, ``rank``
        (empty when ``test`` has no users).

        Raises
        -----------
        ValueError: if the model has not been fitted
        """
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        result_cols = ['user_id', 'item_id', 'score', 'rank']
        target_users = test['user_id'].unique()
        if len(target_users) == 0:
            # `zip(*...)` below cannot be unpacked for an empty user list.
            return pd.DataFrame(columns=result_cols)

        mapper = self._generate_recs_mapper(
            model=self.user_knn,
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users
        )

        recs = pd.DataFrame({'user_id': target_users})
        recs['sim_user_id'], recs['sim'] = zip(*recs['user_id'].map(mapper))
        # One row per (user, similar user) pair
        recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

        # Drop the user itself, attach the neighbours' watched items (one row
        # per item), keep the highest similarity per (user, item) and add idf.
        recs = recs[~(recs['user_id'] == recs['sim_user_id'])]\
            .merge(self.watched, on=['sim_user_id'], how='left')\
            .explode('item_id')\
            .sort_values(['user_id', 'sim'], ascending=False)\
            .drop_duplicates(['user_id', 'item_id'], keep='first')\
            .merge(self.item_idf, left_on='item_id', right_on='index', how='left')
        recs['score'] = recs['sim'] * recs['idf']
        recs = recs.sort_values(['user_id', 'score'], ascending=False)
        # Rank inside every user by descending score, starting from 1
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        return recs[recs['rank'] <= N_recs][result_cols]
    

In [13]:
# Smoke test on a small slice: fit on the first 100k rows, predict for the next 1k
train = interactions.iloc[:100000]
test = interactions.iloc[100000:101000]

m = UserKnnModified(TFIDFRecommender())
m.fit(train)
r = m.predict(test)



  0%|          | 0/83986 [00:00<?, ?it/s]

In [13]:
# Smoke test on a small slice: fit on the first 100k rows, predict for the next 1k
train = interactions.iloc[:100000]
test = interactions.iloc[100000:101000]

m = UserKnnModified(TFIDFRecommender())
m.fit(train)
r = m.predict(test)



  0%|          | 0/83986 [00:00<?, ?it/s]

In [14]:
# Number of recommendations produced per user
r['user_id'].value_counts()

1097513    10
362770     10
378295     10
377590     10
377561     10
           ..
715477     10
715376     10
714291     10
713970     10
2616       10
Name: user_id, Length: 997, dtype: int64

In [14]:
# Number of recommendations produced per user
r['user_id'].value_counts()

1097513    10
362770     10
378295     10
377590     10
377561     10
           ..
715477     10
715376     10
714291     10
713970     10
2616       10
Name: user_id, Length: 997, dtype: int64

In [10]:
# Single candidate: the modified UserKnn on top of TF-IDF similarity
models_3 = {}
models_3['userknnmodified_tfidf_50'] = UserKnnModified(
    TFIDFRecommender(num_threads=2),
    N_users=50,
)

res_3, models_3 = train_models(
    interactions=interactions,
    models=models_3,
    metrics=metrics,
    k=K_RECOS,
    cv=cv,
)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}




  0%|          | 0/797423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}




  0%|          | 0/850489 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}




  0%|          | 0/906071 [00:00<?, ?it/s]

In [11]:
# Average every metric over the folds (mean and std) to compare the models
pivot_results = (
    res_3
    .drop(columns="fold")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
# Only the "mean" columns take part in the min / max highlighting
mean_metric_subset = [column for column in pivot_results.columns if column[1] == 'mean']
pivot_results.style.highlight_min(
    subset=mean_metric_subset, color='lightcoral', axis=0
).highlight_max(
    subset=mean_metric_subset, color='lightgreen', axis=0
)

Unnamed: 0_level_0,time,time,map@10,map@10,novelty,novelty
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
userknnmodified_tfidf_50,3172.608017,1250.403026,0.007496,0.000812,8.052439,0.032318


# Prepare offline predictions

In [167]:
# Final model for the offline predictions: same TF-IDF / N_users=50
# configuration as in the cross-validation above, fitted on the whole
# interaction log (no hold-out part).
userknn = UserKnnModified(TFIDFRecommender(num_threads=2), N_users=50)
userknn.fit(interactions)



  0%|          | 0/962179 [00:00<?, ?it/s]

In [172]:
# Recommendations for every user listed in the users table. The cutoff comes
# from K_RECOS rather than a repeated literal, so it stays in sync with the
# value used during cross-validation.
userknn_reco = userknn.predict(users[['user_id']], N_recs=K_RECOS)

In [177]:
# {user_id: [item_id, ...]} lookup, items kept in the order of the reco frame
preds = (
    userknn_reco
    .groupby('user_id')['item_id']
    .agg(lambda user_items: user_items.tolist())
    .to_dict()
)

In [178]:
# Persist the fitted model with dill (written to the current working directory).
with open('userknn_model.dill', 'wb') as f:
    dill.dump(userknn, f)

# Persist the precomputed {user_id: [item_id, ...]} recommendations.
with open('userknn_reco.dill', 'wb') as f:
    dill.dump(preds, f)

# Prepare files for online model

In [14]:
import scipy as sp

def get_mapping(train_df, col):
    """
    Enumerate the distinct values of a column.

    Parameters
    -----------
    train_df: pd.DataFrame that contains ``col``
    col: name of the column with ids

    Returns
    -----------
    (inv_mapping, mapping): ``inv_mapping`` maps the position of a value (in
    order of first appearance) to the value, ``mapping`` is the reverse lookup.
    """
    distinct_values = train_df[col].unique()
    inv_mapping = {position: value for position, value in enumerate(distinct_values)}
    mapping = {value: position for position, value in inv_mapping.items()}
    return inv_mapping, mapping


def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=None,
                   items_mapping=None):
    """
    Build a sparse user-by-item interaction matrix in COO format.

    Parameters
    -----------
    df: pd.DataFrame with one interaction per row
    user_col: name of the column with user ids
    item_col: name of the column with item ids
    weight_col: name of the column with interaction weights; when it is not
        given every interaction gets the weight 1
    users_mapping: dict user id -> row index; built from the distinct values
        of ``user_col`` (in order of first appearance) when omitted
    items_mapping: dict item id -> column index; built the same way from
        ``item_col`` when omitted

    Returns
    -----------
    scipy.sparse.coo_matrix with float32 weights; the shape is inferred by
    scipy from the largest row / column index that occurs in ``df``.

    Raises
    -----------
    ValueError: if ``df`` contains a user or an item missing from the mappings
    """
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.sparse` submodule has been loaded.
    from scipy import sparse

    # The None defaults used to fail with AttributeError on `.get`.
    if users_mapping is None:
        users_mapping = {value: idx for idx, value in enumerate(df[user_col].unique())}
    if items_mapping is None:
        items_mapping = {value: idx for idx, value in enumerate(df[item_col].unique())}

    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    row_idx = df[user_col].map(users_mapping.get)
    col_idx = df[item_col].map(items_mapping.get)

    # Ids absent from a mapping turn into missing values, which would otherwise
    # reach scipy as invalid indices.
    if row_idx.isna().any() or col_idx.isna().any():
        raise ValueError(
            f"'{user_col}' or '{item_col}' contains ids that are missing from the mappings"
        )

    return sparse.coo_matrix((weights, (row_idx, col_idx)))

In [15]:
# Index <-> id lookups for users and items, built from the full interaction log
users_inv_mapping, users_mapping = get_mapping(train_df=interactions, col='user_id')
items_inv_mapping, items_mapping = get_mapping(train_df=interactions, col='item_id')

# Sparse matrix with users as rows and items as columns
interaction_matrix = get_coo_matrix(
    df=interactions,
    user_col='user_id',
    item_col='item_id',
    weight_col='weight',
    users_mapping=users_mapping,
    items_mapping=items_mapping,
)

In [18]:
# Persist the user id -> index lookup built by get_mapping.
with open('users_mapping.dill', 'wb') as f:
    dill.dump(users_mapping, f)

# Persist the reverse lookup (index -> user id).
with open('users_inv_mapping.dill', 'wb') as f:
    dill.dump(users_inv_mapping, f)

In [154]:
# TF-IDF similarity model fitted on the interaction matrix built above
# (rows are user indices, columns are item indices).
# NOTE(review): confirm that this orientation is the one the online service
# expects - the offline UserKnn fit above iterates over a much larger axis.
model = TFIDFRecommender()
model.fit(interaction_matrix)



  0%|          | 0/15706 [00:00<?, ?it/s]

In [155]:
# Persist the fitted TF-IDF model with dill.
with open('tfidf_model.dill', 'wb') as f:
    dill.dump(model, f)