In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
import dill
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.insert(1, '/Users/nadya/MTS/ITMORecoService_32/service' )
from userknn import UserKnn 


from rectools import Columns
from rectools.dataset import Dataset
from rectools.dataset import Interactions

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# 🎬 Get KION dataset 

<a href="https://ods.ai/competitions/competition-recsys-21/data"> Dataset description [ru] </a>


In [2]:
# # download dataset by chunks
# url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# req = requests.get(url, stream=True)

# with open('kion_train.zip', "wb") as fd:
#     total_size_in_bytes = int(req.headers.get('Content-Length', 0))
#     progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2 ** 20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

In [3]:
# !unzip kion_train.zip

# EDA

In [4]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [5]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# Simple popular by number of interactions

In [6]:
# constract dataset for popular
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [7]:
from rectools.models.popular import PopularModel 

pop = PopularModel()
pop.fit(dataset);

In [8]:
# join titles from items 
popm = pop.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=10, 
    filter_viewed=False  # True - throw away some items for each user
).merge(items[['item_id', 'title']], 
       on='item_id',
       how='left')

In [9]:
popm.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,176549,10440,202457.0,1,Хрустальный
1,176549,15297,193123.0,2,Клиника счастья
2,176549,9728,132865.0,3,Гнев человеческий
3,176549,13865,122119.0,4,Девятаев
4,176549,4151,91167.0,5,Секреты семейной жизни
5,176549,3734,74803.0,6,Прабабушка легкого поведения
6,176549,2657,68581.0,7,Подслушано
7,176549,4880,55043.0,8,Афера
8,176549,142,45367.0,9,Маша
9,176549,6809,40372.0,10,Дуров


# UserKNN Model

In [10]:
# train test split 
# test = last 1 week 
from rectools.model_selection import TimeRangeSplitter

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [11]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()

In [12]:
# Prepare train matrix 
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

In [13]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

In [14]:
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [15]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 842129
items_mapping amount: 15404


In [16]:
# Get sparse matrix 
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [17]:
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

In [18]:
interaction_matrix;

In [16]:
# fit simple UserKNN model
userknnTFIDFK100 = TFIDFRecommender(K=100)
userknnTFIDFK100.fit(interaction_matrix)

  0%|          | 0/842129 [00:00<?, ?it/s]

In [17]:
# save model
with open('userknnTFIDFK100.dill', 'wb') as f:
    dill.dump(userknnTFIDFK100, f)

In [19]:
with open('userknnTFIDFK100.dill', 'rb') as f:
    userknnTFIDFK100 = dill.load(f)

In [20]:
userknnTFIDFK100.similar_items(1)

[(1, 1.0000000000000002),
 (273835, 0.2674896661891822),
 (773911, 0.2632101885721496),
 (64222, 0.25914292101068126),
 (162556, 0.2537780088030859),
 (46469, 0.25300032508778697),
 (171713, 0.2510257070734939),
 (422766, 0.25097207864459437),
 (282657, 0.24713968487569477),
 (575, 0.24674976366579474)]

In [21]:
# function to recommend to get user neighbours
def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    def _recs_mapper(user):
        user_id = users_mapping[user]
        recs = model.similar_items(user_id, N=N)
        return [users_inv_mapping[user] for user, _ in recs], [sim for _, sim in recs]
    return _recs_mapper

In [22]:
mapper = generate_implicit_recs_mapper(
    userknnTFIDFK100, 
    N=10,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)

In [23]:
recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity
0,1016458,"[1016458, 542443, 983874, 263122, 1087135, 299198, 416417, 107556, 980255, 55914]","[0.9999999999999997, 0.30795056944195065, 0.2892556288628258, 0.2865786318030271, 0.2840258603538891, 0.28275509313396374, 0.2814949055823506, 0.2806520562356141, 0.27569737451648724, 0.2703263699..."
1,68478,"[68478, 620526, 695810, 20161, 412019, 879494, 589849, 988152, 691867, 1059229]","[0.9999999999999998, 0.6879105673985805, 0.6879105673985805, 0.6879105673985805, 0.6879105673985805, 0.6875684365910972, 0.682527643497476, 0.6810886673793897, 0.6649011119333184, 0.6500976114158797]"
2,580093,"[580093, 805265, 980409, 939663, 708439, 736490, 943049, 86046, 522603, 521456]","[1.0, 0.7937798109102163, 0.788744014339269, 0.7666797469287155, 0.7631606741401563, 0.750189495128039, 0.7501195711404515, 0.7474499190666032, 0.7473880080188159, 0.7471532852053147]"
3,1072552,"[1072552, 162216, 613583, 896269, 449057, 246357, 992547, 49193, 503934, 639316]","[1.0000000000000002, 0.45699036174245683, 0.45472129140483386, 0.4278037012555298, 0.42654520594852924, 0.42378450328959727, 0.41383525556374756, 0.411008423568197, 0.4078683617582729, 0.407690880..."
4,910002,"[910002, 301452, 454746, 537812, 691567, 801495, 578937, 858108, 541585, 457179]","[0.9999999999999997, 0.5504875268884805, 0.5503635062149785, 0.5497829629361086, 0.5497829629361086, 0.5497829629361086, 0.5497829629361086, 0.5497829629361086, 0.5497829629361086, 0.5497829629361..."


In [24]:
# explode lists to get vertical representation
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

In [25]:
recs.head(10+5)

Unnamed: 0,user_id,similar_user_id,similarity
0,1016458,1016458,1.0
1,1016458,542443,0.307951
2,1016458,983874,0.289256
3,1016458,263122,0.286579
4,1016458,1087135,0.284026
5,1016458,299198,0.282755
6,1016458,416417,0.281495
7,1016458,107556,0.280652
8,1016458,980255,0.275697
9,1016458,55914,0.270326


In [26]:
# delete recommendations of itself  
recs = recs[~(recs['user_id']==recs['similar_user_id'])]

In [27]:
recs.shape

(930323, 3)

In [28]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity
1,1016458,542443,0.307951
2,1016458,983874,0.289256
3,1016458,263122,0.286579
4,1016458,1087135,0.284026
5,1016458,299198,0.282755


In [29]:
# Join watched items of neighbour users to get item recommendations
watched = train.groupby('user_id').agg({'item_id': list})
watched.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[7102, 14359, 15297, 6006, 9728, 12192]"
2,"[7571, 3541, 15266, 13867, 12841, 10770, 4475, 9506, 8936, 11018, 11577, 561, 7106, 6774, 16029, 8482, 6825, 3594, 16166, 5819, 2954, 383, 11689, 12449, 2025, 6155, 3628, 334, 4024, 7210, 11539, 1..."
3,"[9728, 16406, 10440, 3475, 4151, 1418, 2220, 3734, 13789, 8581, 13849, 9550, 10464, 4880, 4436, 8801, 11790, 2657, 8252]"
4,"[4700, 6317]"
5,"[14397, 6445, 11437, 5651, 6167, 12466, 632, 8450, 2685, 7825, 3145, 7043, 4179, 15890, 5115, 4719, 10848]"


In [30]:
# join watched items
recs = recs.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
recs = recs.explode('item_id')

In [31]:
recs.head(30+5)

Unnamed: 0,user_id,similar_user_id,similarity,item_id
0,1016458,542443,0.307951,15411
0,1016458,542443,0.307951,12173
0,1016458,542443,0.307951,734
0,1016458,542443,0.307951,9728
1,1016458,983874,0.289256,9103
1,1016458,983874,0.289256,9761
2,1016458,263122,0.286579,9761
2,1016458,263122,0.286579,15096
2,1016458,263122,0.286579,4533
3,1016458,1087135,0.284026,3532


In [32]:
# drop duplicates pairs user_id-item_id 
# keep with the largest similiarity
recs = recs.sort_values(['user_id', 'similarity'], ascending=False)
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id
872651,1097544,731318,0.889545,6309
872652,1097544,477980,0.889545,6309
872653,1097544,775180,0.889545,6309
872654,1097544,641410,0.889545,6309
872655,1097544,743461,0.887554,9728


In [33]:
recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

In [34]:
recs.shape

(1299711, 4)

## IDF and normalization


In [35]:
# Make rank from similarity
from collections import Counter
cnt = Counter(train['item_id'].values)

In [36]:
# convert counter to dataframe
idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
idf.head()

Unnamed: 0,index,doc_freq
0,9506,3407
1,1659,891
2,7107,16279
3,7638,1067
4,6686,1529


In [37]:
# num of documents = num of recommendation list = dataframe shape
n = train.shape[0]
idf['idf'] = idf['doc_freq'].apply(lambda x: np.log((1 + n) / (1 + x) + 1))

In [38]:
idf.head()

Unnamed: 0,index,doc_freq,idf
0,9506,3407,7.205753
1,1659,891,8.54562
2,7107,16279,5.644741
3,7638,1067,8.365581
4,6686,1529,8.006202


In [39]:
# join idf 
recs = recs\
    .merge(
        idf[['index', 'idf']], 
        left_on='item_id',
        right_on='index',
        how='left')\
    .drop(['index'], axis=1)

recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf
0,1097544,731318,0.889545,6309,11.134213
1,1097544,743461,0.887554,9728,3.792339
2,1097544,743461,0.887554,14317,6.059406
3,1097544,756287,0.811298,5569,7.681125
4,1097544,756287,0.811298,9335,8.570588


In [40]:
# idf normalization
recs['norm_rank'] = ((recs.idf-recs.idf.min())/(recs.idf.max()-recs.idf.min()))

In [41]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,norm_rank
0,1097544,731318,0.889545,6309,11.134213,0.690168
1,1097544,743461,0.887554,9728,3.792339,0.042374
2,1097544,743461,0.887554,14317,6.059406,0.242403
3,1097544,756287,0.811298,5569,7.681125,0.385492
4,1097544,756287,0.811298,9335,8.570588,0.463972


In [42]:
recs['rank_idf'] = recs['similarity'] * recs['norm_rank']

In [43]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,norm_rank,rank_idf
0,1097544,731318,0.889545,6309,11.134213,0.690168,0.613935
1,1097544,743461,0.887554,9728,3.792339,0.042374,0.037609
2,1097544,743461,0.887554,14317,6.059406,0.242403,0.215146
3,1097544,756287,0.811298,5569,7.681125,0.385492,0.312749
4,1097544,756287,0.811298,9335,8.570588,0.463972,0.376419


In [44]:
# make order by rank_idf 
recs = recs.sort_values(['user_id', 'rank_idf'], ascending=False)

In [45]:
# make rank
recs['rank'] = recs.groupby('user_id').cumcount() + 1 

In [46]:
recs.head(15)

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,norm_rank,rank_idf,rank
17,1097544,1092610,0.710835,860,13.259452,0.877683,0.623888,1
0,1097544,731318,0.889545,6309,11.134213,0.690168,0.613935,2
12,1097544,756287,0.811298,140,9.212122,0.520576,0.422342,3
5,1097544,756287,0.811298,6467,8.974267,0.49959,0.405316,4
23,1097544,771092,0.639132,11317,10.486889,0.633053,0.404604,5
4,1097544,756287,0.811298,9335,8.570588,0.463972,0.376419,6
24,1097544,771092,0.639132,12635,9.883621,0.579825,0.370584,7
9,1097544,756287,0.811298,9702,8.392142,0.448227,0.363646,8
6,1097544,756287,0.811298,10593,8.275242,0.437913,0.355278,9
14,1097544,756287,0.811298,6220,8.124683,0.424629,0.3445,10


## 6. Metrics

In [47]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics

# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()
    
metric_values = calc_metrics(
            metrics,
            reco=recs,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )

In [48]:
metric_values

{'prec@10': 0.004748830687467519,
 'recall@10': 0.02260051509971515,
 'novelty': 7.579854207661611,
 'serendipity': 5.776636582983954e-05}

## Solving issue N<10

In [64]:
# number of recommendations per user < N
recs['rank'].value_counts().head(10)

1     101982
2      78029
3      70973
4      65946
5      61655
6      57489
7      53556
8      49902
9      46436
10     43050
Name: rank, dtype: int64

In [65]:
# deleting rows with a rank grater than 10
best_recs = recs[recs['rank'] < 11]
best_recs

Unnamed: 0,user_id,similar_user_id,similarity,item_id,idf,norm_rank,rank_idf,rank
17,1097544,1092610,0.710835,860,13.259452,0.877683,0.623888,1
0,1097544,731318,0.889545,6309,11.134213,0.690168,0.613935,2
12,1097544,756287,0.811298,140,9.212122,0.520576,0.422342,3
5,1097544,756287,0.811298,6467,8.974267,0.499590,0.405316,4
23,1097544,771092,0.639132,11317,10.486889,0.633053,0.404604,5
...,...,...,...,...,...,...,...,...
1299704,3,1014931,0.568152,3734,4.286431,0.085969,0.048843,6
1299709,3,206553,0.534367,2657,4.307105,0.087793,0.046914,7
1299702,3,252285,0.576888,4151,4.068069,0.066702,0.03848,8
1299700,3,252285,0.576888,9728,3.792339,0.042374,0.024445,9


In [66]:
best_recs['user_id'].value_counts()

1097544    10
515225     10
515496     10
515473     10
515452     10
           ..
389271      1
389317      1
389370      1
389378      1
387893      1
Name: user_id, Length: 101982, dtype: int64

In [67]:
user_id_list = best_recs['user_id'].tolist()

In [68]:
item_id_list = best_recs['item_id'].tolist()

In [73]:
# creating a list of dictionaries {user_id:[item_id_1, ...]}
user_recos_list = []
x = 0
for k, j in zip(user_id_list, item_id_list):
    if k == x:
        d[k].append(j) 
    else:
        d = {k:[j]}
        user_recos_list.append(d)
        x = k
        

In [74]:
user_recos_list

[{1097544: [860, 6309, 140, 6467, 11317, 9335, 12635, 9702, 10593, 6220]},
 {1097534: [15634, 2858, 1132, 5658, 14431, 9996, 6809, 4880, 2657, 4151]},
 {1097513: [9710, 8879, 2452, 7512, 7280, 10053, 11806, 8221, 1348, 12147]},
 {1097512: [3734, 10440]},
 {1097508: [16471, 13419, 7592, 1193, 2316, 4773, 3838, 15815, 1304, 8207]},
 {1097479: [15051, 10770, 14741, 4740, 3734, 9728, 13865, 15297]},
 {1097463: [9988]},
 {1097459: [10776, 10847, 10408, 13713, 15472, 4457, 11237]},
 {1097444: [11547, 2149, 5824, 5097, 9995, 2016, 1711, 8083, 13515, 15644]},
 {1097434: [15051, 3296, 7806, 6649, 14684, 15096, 10942, 12463, 101, 14470]},
 {1097432: [10811]},
 {1097429: [12779, 10986, 9000, 8681, 14683, 710, 7993, 11797, 8650, 9682]},
 {1097413: [2729, 13615, 9447, 7870, 8051, 13361, 11035, 11074, 6106, 4582]},
 {1097403: [12138, 8199, 6626, 11863, 12173, 3734, 9728, 13865, 15297, 10440]},
 {1097382: [13218]},
 {1097375: [9728]},
 {1097360: [6968, 12501, 11310, 14431, 9996, 4880, 9728]},
 {10973

In [71]:
pop_item_id_list = popm['item_id'].tolist()
pop_item_id_list

[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]

In [75]:
final_recs_list = user_recos_list

In [76]:
# creating a list of dictionaries {user_id:[item_id_1, item_id_2, ..., item_id_10]}
for users in final_recs_list:
    for value in users:
        if(len(users[value])) < 10:
            x = 10 - len(users[value])
            users[value].extend(pop_item_id_list[:x])
            print(users[value])
        else:
            print(users[value])

[860, 6309, 140, 6467, 11317, 9335, 12635, 9702, 10593, 6220]
[15634, 2858, 1132, 5658, 14431, 9996, 6809, 4880, 2657, 4151]
[9710, 8879, 2452, 7512, 7280, 10053, 11806, 8221, 1348, 12147]
[3734, 10440, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880]
[16471, 13419, 7592, 1193, 2316, 4773, 3838, 15815, 1304, 8207]
[15051, 10770, 14741, 4740, 3734, 9728, 13865, 15297, 10440, 15297]
[9988, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]
[10776, 10847, 10408, 13713, 15472, 4457, 11237, 10440, 15297, 9728]
[11547, 2149, 5824, 5097, 9995, 2016, 1711, 8083, 13515, 15644]
[15051, 3296, 7806, 6649, 14684, 15096, 10942, 12463, 101, 14470]
[10811, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]
[12779, 10986, 9000, 8681, 14683, 710, 7993, 11797, 8650, 9682]
[2729, 13615, 9447, 7870, 8051, 13361, 11035, 11074, 6106, 4582]
[12138, 8199, 6626, 11863, 12173, 3734, 9728, 13865, 15297, 10440]
[13218, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]
[9728, 10440, 15297, 9728, 1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [77]:
final_recs_list

[{1097544: [860, 6309, 140, 6467, 11317, 9335, 12635, 9702, 10593, 6220]},
 {1097534: [15634, 2858, 1132, 5658, 14431, 9996, 6809, 4880, 2657, 4151]},
 {1097513: [9710, 8879, 2452, 7512, 7280, 10053, 11806, 8221, 1348, 12147]},
 {1097512: [3734, 10440, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880]},
 {1097508: [16471, 13419, 7592, 1193, 2316, 4773, 3838, 15815, 1304, 8207]},
 {1097479: [15051,
   10770,
   14741,
   4740,
   3734,
   9728,
   13865,
   15297,
   10440,
   15297]},
 {1097463: [9988, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]},
 {1097459: [10776,
   10847,
   10408,
   13713,
   15472,
   4457,
   11237,
   10440,
   15297,
   9728]},
 {1097444: [11547, 2149, 5824, 5097, 9995, 2016, 1711, 8083, 13515, 15644]},
 {1097434: [15051, 3296, 7806, 6649, 14684, 15096, 10942, 12463, 101, 14470]},
 {1097432: [10811, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]},
 {1097429: [12779, 10986, 9000, 8681, 14683, 710, 7993, 11797, 8650, 9682]},
 {1097413: [27