In [1]:
from collections import Counter
from pathlib import Path
import pickle

import dill
import numpy as np
import pandas as pd
import scipy as sp
import tqdm

from implicit.nearest_neighbours import  BM25Recommender, CosineRecommender, TFIDFRecommender
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models.popular import PopularModel
from rectools.model_selection import TimeRangeSplitter

# 🎬 Get KION dataset 

In [2]:
!mkdir ../data

mkdir: cannot create directory ‘../data’: File exists


In [3]:
!wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip

--2022-12-06 15:04:03--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘../data/data_original.zip’


2022-12-06 15:04:11 (11,0 MB/s) - ‘../data/data_original.zip’ saved [78795385/78795385]



In [4]:
!unzip ../data/data_original.zip -d ../data

Archive:  ../data/data_original.zip
replace ../data/kion_train/interactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


# EDA

In [2]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

In [3]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# Simple popular by number of interactions

In [4]:
# constract dataset for popular
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [5]:
popular_model = PopularModel()
popular_model.fit(dataset);

In [6]:
# join titles from items 
popular_recommendations = popular_model.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=20, 
    filter_viewed=False
).merge(items[['item_id', 'title']], 
       on='item_id',
       how='left')

## Let's look at popular items

In [7]:
popular_recommendations.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,176549,10440,202457.0,1,Хрустальный
1,176549,15297,193123.0,2,Клиника счастья
2,176549,9728,132865.0,3,Гнев человеческий
3,176549,13865,122119.0,4,Девятаев
4,176549,4151,91167.0,5,Секреты семейной жизни
5,176549,3734,74803.0,6,Прабабушка легкого поведения
6,176549,2657,68581.0,7,Подслушано
7,176549,4880,55043.0,8,Афера
8,176549,142,45367.0,9,Маша
9,176549,6809,40372.0,10,Дуров


# UserKNN Model

In [8]:
# train test split 
# test = last 1 week 

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [9]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()

In [10]:
# Prepare train matrix 
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

In [11]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 842129
items_mapping amount: 15404


In [12]:
# Get sparse matrix 
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [13]:
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

## We will try CosineRecommender, TFIDFRecommender with different K

## Задание: наличие тюнинга гиперпараметров (например, векторного расстояния или типов kNN моделей (implicit/rectools/...)) (4 балла)

In [14]:
models = {
    'Cosine_10': CosineRecommender(K=10),
    'Cosine_20': CosineRecommender(K=20),
    'Cosine_50': CosineRecommender(K=50),
    'Cosine_100': CosineRecommender(K=100),
    'TFIDF_10' : TFIDFRecommender(K=10),
    'TFIDF_20' : TFIDFRecommender(K=20),
    'TFIDF_50' : TFIDFRecommender(K=50),
    'TFIDF_100' : TFIDFRecommender(K=100),
}

metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

## Train and save models

In [None]:
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

for model_name, model in tqdm.tqdm(models.items()):
    model.fit(interaction_matrix)
    # save model
    with open(models_dir / f'{model_name}.dill', 'wb') as f:
        dill.dump(model, f)

## Load saved models

In [15]:
models_dir = Path('../models')

for model_name in tqdm.tqdm(models.keys()):
    with open(models_dir / f'{model_name}.dill', 'rb') as f:
        models[model_name] = dill.load(f)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  6.63it/s]


### Utility function to get user neighbours

In [16]:
def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    def _recs_mapper(user):
        user_id = users_mapping[user]
        recs = model.similar_items(user_id, N=N)
        return [users_inv_mapping[user] for user, _ in recs], [sim for _, sim in recs]
    return _recs_mapper

### Utility function to add items IDF to recs

In [17]:
def add_items_idf(df: pd.DataFrame, train:pd.DataFrame = train):
    cnt = Counter(train['item_id'].values)
    idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
    n = train.shape[0]
    idf['idf'] = idf['doc_freq'].apply(lambda x: np.log((1 + n) / (1 + x) + 1))
    return df.merge(
        idf[['index', 'idf']], 
        left_on='item_id',
        right_on='index',
        how='left'
    ).drop(['index'], axis=1)

## Calculate metrcis for all models.

### We tried 3 items assessment techniques:
- vanilla IDF
- user-normalized IDF with multiplication
- user-normalized IDF with addition

## Задание: другие варианты ранжированивания айтемов похожих пользователей (2 балла)

In [18]:
results = []

# Calculate metrics with IDF ranking

for model_name, model in tqdm.tqdm(models.items()):
    # if model_name.startswith('BM25'):
    #     continue
    mapper = generate_implicit_recs_mapper(
        model, 
        N=model.K,
        users_mapping=users_mapping,
        users_inv_mapping=users_inv_mapping
    )
    recs = pd.DataFrame({
        'user_id': test['user_id'].unique()
    })
    recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))

    # explode lists to get vertical representation
    recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
    # delete recommendations of itself  
    recs = recs[~(recs['user_id']==recs['similar_user_id'])]

    # Join watched items of neighbour users to get item recommendations
    watched_items = train.groupby('user_id').agg({'item_id': list})
    watched_items.head()
    recs = recs.merge(watched_items, left_on=['similar_user_id'], right_on=['user_id'], how='left')
    recs = recs.explode('item_id')

    # Sort and keep only pairs user_id - item_id with highest users similarity
    recs = recs.sort_values(['user_id', 'similarity'], ascending=False)
    recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

    # Add IDF for items and calculate final ranks
    recs_with_idf = add_items_idf(recs, train)
    
    # Normalize IDF for each user
    max_idfs = recs_with_idf.groupby('user_id')['idf'].max()
    recs_with_idf = recs_with_idf.merge(
        max_idfs,
        left_on='user_id',
        right_index=True,
        how='left'
    ).rename(columns={'idf_x': 'idf', 'idf_y': 'idf_max'})
    recs_with_idf['norm_idf'] = recs_with_idf['idf'] / recs_with_idf['idf_max']

    # Calculate ranks based on vanilla IDF
    recs_with_idf['rank_idf'] = recs_with_idf['similarity'] * recs_with_idf['idf']
    recs_with_idf = recs_with_idf.sort_values(['user_id', 'rank_idf'], ascending=False)
    recs_with_idf['rank'] = recs_with_idf.groupby('user_id').cumcount() + 1
    model_result = {'name': model_name}
    metric_values = calc_metrics(
                metrics,
                reco=recs_with_idf[recs_with_idf['rank'] < 11], # We need only 10 recs for each user
                interactions=test,
                prev_interactions=train,
                catalog=catalog
            )
    model_result.update(metric_values)
    results.append(model_result)
    
    # Calculate ranks based on normalized IDF (multiply)
    recs_with_idf['rank_idf'] = recs_with_idf['similarity'] * recs_with_idf['norm_idf']
    recs_with_idf = recs_with_idf.sort_values(['user_id', 'rank_idf'], ascending=False)
    recs_with_idf['rank'] = recs_with_idf.groupby('user_id').cumcount() + 1 
    model_result = {'name': f'{model_name}_norm_idf_mult'}
    metric_values = calc_metrics(
                metrics,
                reco=recs_with_idf[recs_with_idf['rank'] < 11], # We need only 10 recs for each user
                interactions=test,
                prev_interactions=train,
                catalog=catalog
            )
    model_result.update(metric_values)
    results.append(model_result)
    
    # Calculate ranks based on normalized IDF (Additions)
    recs_with_idf['rank_idf'] = recs_with_idf['similarity'] + recs_with_idf['norm_idf']
    recs_with_idf = recs_with_idf.sort_values(['user_id', 'rank_idf'], ascending=False)
    recs_with_idf['rank'] = recs_with_idf.groupby('user_id').cumcount() + 1 
    model_result = {'name': f'{model_name}_norm_idf_add'}
    metric_values = calc_metrics(
                metrics,
                reco=recs_with_idf[recs_with_idf['rank'] < 11], # We need only 10 recs for each user
                interactions=test,
                prev_interactions=train,
                catalog=catalog
            )
    model_result.update(metric_values)
    results.append(model_result)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [11:00<00:00, 82.55s/it]


In [19]:
pd.DataFrame.from_dict(results)

Unnamed: 0,name,prec@10,recall@10,novelty,serendipity
0,Cosine_10,0.003556,0.016221,8.202711,6e-05
1,Cosine_10_norm_idf_mult,0.003556,0.016221,8.202711,6e-05
2,Cosine_10_norm_idf_add,0.003539,0.016146,8.207746,5.9e-05
3,Cosine_20,0.002736,0.012419,9.021152,6.3e-05
4,Cosine_20_norm_idf_mult,0.002736,0.012419,9.021152,6.3e-05
5,Cosine_20_norm_idf_add,0.002721,0.012377,9.031966,6.3e-05
6,Cosine_50,0.001832,0.008025,10.066978,6.2e-05
7,Cosine_50_norm_idf_mult,0.001832,0.008025,10.066978,6.2e-05
8,Cosine_50_norm_idf_add,0.001808,0.007937,10.088801,6.1e-05
9,Cosine_100,0.001344,0.005731,10.904754,5.6e-05


# Best model - TFIDF_10

In [135]:
model = models['TFIDF_10']

mapper = generate_implicit_recs_mapper(
        model, 
        N=model.K,
        users_mapping=users_mapping,
        users_inv_mapping=users_inv_mapping
    )

all_data = pd.concat([train, test])

recs = pd.DataFrame({
    'user_id': all_data['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))

# explode lists to get vertical representation
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
# delete recommendations of itself  
recs = recs[~(recs['user_id']==recs['similar_user_id'])]

# Join watched items of neighbour users to get item recommendations
watched_items = all_data.groupby('user_id').agg({'item_id': list})
recs = recs.merge(watched_items, left_on=['similar_user_id'], right_on=['user_id'], how='left')
recs = recs.explode('item_id')

# Sort and keep only pairs user_id - item_id with highest users similarity
recs = recs.sort_values(['user_id', 'similarity'], ascending=False)
recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

# Add IDF for items and calculate final ranks
recs_with_idf = add_items_idf(recs, all_data)
recs_with_idf['rank_idf'] = recs_with_idf['similarity'] * recs_with_idf['idf']
recs_with_idf = recs_with_idf.sort_values(['user_id', 'rank_idf'], ascending=False)

# Calculate final ranks
recs_with_idf['rank'] = recs_with_idf.groupby('user_id').cumcount() + 1 

# We need only 10 recs for each user
recs_with_idf = recs_with_idf[recs_with_idf['rank'] < 11]
    
max_idfs = recs_with_idf.groupby('user_id')['idf'].max()
recs_with_idf = recs_with_idf.merge(
    max_idfs,
    left_on='user_id',
    right_index=True,
    how='left'
).rename(columns={'idf_x': 'idf', 'idf_y': 'idf_max'})
recs_with_idf['norm_idf'] = recs_with_idf['idf'] / recs_with_idf['idf_max']
recs_with_idf

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,rank_idf,rank,idf_max,norm_idf
4,1097557,816790,0.968389,16509,6.371188,6.169792,1,6.371188,1.000000
1,1097557,991816,0.968476,3182,6.171464,5.976913,2,6.371188,0.968652
2,1097557,816790,0.968389,6192,6.084136,5.891813,3,6.371188,0.954945
3,1097557,816790,0.968389,4151,4.086500,3.957324,4,6.371188,0.641403
5,1097557,850075,0.923018,9728,3.796473,3.504212,5,6.371188,0.595881
...,...,...,...,...,...,...,...,...,...
7561571,0,4112,0.719012,849,5.884698,4.231168,6,9.832989,0.598465
7561573,0,1024203,0.710033,12192,5.101967,3.622563,7,9.832989,0.518862
7561568,0,341571,0.746481,3734,4.298216,3.208536,8,9.832989,0.437122
7561567,0,341571,0.746481,9728,3.796473,2.833994,9,9.832989,0.386096


# Save offline recs

In [None]:
groups_for_offline = recs_with_idf.groupby('user_id')['item_id'].agg(list)

In [142]:
tfidf_offline_recs = groups_for_offline.to_dict()

In [90]:
tfidf_offline_recs = {}
for user in tqdm.tqdm(recs_with_idf['user_id'].unique()):
    tfidf_offline_recs[user] = recs_with_idf[recs_with_idf['user_id']==user]['item_id'].values

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 842053/842053 [23:46<00:00, 590.37it/s]


## Задание в тесте вас ждут холодные пользователи. Сделайте рекомендации для них (обратите внимание на rectools.models.popular) (2 балла)

- в сервисе для холодных пользователей мы возвращаем 10 популярных

In [91]:
popular_items = list(popular_recommendations.head(20)['item_id'].values)
popular_items

[10440,
 15297,
 9728,
 13865,
 4151,
 3734,
 2657,
 4880,
 142,
 6809,
 12192,
 9996,
 8636,
 4740,
 7571,
 11237,
 1844,
 12995,
 4457,
 14431]

## ЗАДАНИЕ: сделать кол-во рекомендаций не меньше N (2 балла)

In [24]:
# Let's add popular items in case user has few recommendations

# for user in tfidf_offline_recs:
#     tfidf_offline_recs[user] = list(tfidf_offline_recs[user])
#     user_recs = tfidf_offline_recs[user]
#     i = 0
#     while len(user_recs) < 10:
#         if popular_items[i] not in user_recs:
#             user_recs.append(popular_items[i])
#         i += 1

In [152]:
with open(models_dir / 'tfidf_idf_10_offline.pkl', 'wb') as f:
    pickle.dump(tfidf_offline_recs, f, protocol=pickle.HIGHEST_PROTOCOL)

## Save artifacts for online recommender

In [24]:
with open(models_dir / 'popular.pkl', 'wb') as f:
    pickle.dump(popular_items, f, protocol=pickle.HIGHEST_PROTOCOL)

In [103]:
# Save train_test
pd.concat([train, test]).to_csv(models_dir / 'train.csv', index=False)

# Debug

In [93]:
%load_ext autoreload 
%autoreload 2

# Добавляем путь к основной папке проекта, чтобы иметь возможность делать импорт из src
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [154]:
from service.userknn import OnlineUserKnnRecommender, OfflineRecommender

In [145]:
online_model = OnlineUserKnnRecommender(
    'TFIDF_10.dill',
    Path('../models'),
    'train.csv'
)

In [155]:
offline_model = OfflineRecommender(
    'tfidf_idf_10_offline.pkl',
    Path('../models'),
    'popular.pkl'
)

In [161]:
different_recs = []

for user in tqdm.tqdm(pd.concat([train, test])['user_id'].unique()[:10000]):
    online = online_model.recommend(user)
    offline = offline_model.recommend(user)
    if set(online) != set(offline):
        different_recs.append(user)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:18<00:00, 127.34it/s]


In [162]:
len(different_recs)

0