# Metrics, validation strategies and baselines

В данном jupyter notebook рассматриваются примеры того, какие схемы валидации и метрики используются в рекомендательных системах.
Также построим простые модели (бейзлайны) на данных МТС Библиотеки. 

* [Preprocessing](#preprocessing)
* [General remarks](#general-remarks)
* [Metrics](#metrics)
    * [Regression](#regression)
    * [Classification](#classification)
    * [Ranking](#ranking)
* [Validation strategies](#validation)
* [Baselines](#baselines)

In [2]:
import os
import numpy as np 
import pandas as pd 
from itertools import islice, cycle
from more_itertools import pairwise



<a id="preprocessing"></a>
# Preprocessing

Загрузим наши данные, теперь уже с фичами, и применим знания из [pandas-scipy-for-recsys](https://www.kaggle.com/sharthz23/pandas-scipy-for-recsys)

In [3]:
df = pd.read_pickle('./interactions_preprocessed.pickle')
df_users = pd.read_pickle('./users_preprocessed.pickle')
df_items = pd.read_pickle('./items_preprocessed.pickle')

In [4]:
test_dates = df['start_date'].unique()[-7:]



In [5]:
test_dates = list(pairwise(test_dates))
test_dates

[(numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (numpy.datetime64('2019-12-26T00:00:00.000000000'),
  numpy.datetime64('2019-12-27T00:00:00.000000000')),
 (numpy.datetime64('2019-12-27T00:00:00.000000000'),
  numpy.datetime64('2019-12-28T00:00:00.000000000')),
 (numpy.datetime64('2019-12-28T00:00:00.000000000'),
  numpy.datetime64('2019-12-29T00:00:00.000000000')),
 (numpy.datetime64('2019-12-29T00:00:00.000000000'),
  numpy.datetime64('2019-12-30T00:00:00.000000000')),
 (numpy.datetime64('2019-12-30T00:00:00.000000000'),
  numpy.datetime64('2019-12-31T00:00:00.000000000'))]

In [6]:
split_dates = test_dates[0]

In [7]:
train = df[df['start_date'] < split_dates[0]]

In [8]:
test = df[(df['start_date'] >= split_dates[0]) & (df['start_date'] < split_dates[1])]

In [9]:
test = test[(test['rating'] >= 4) | (test['rating'].isnull())]

In [10]:
split_dates, train.shape, test.shape


((numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (1517994, 5),
 (2114, 5))

<a id="baselines"></a>
# Baselines

Будем строить бейзлайн по популярному в зависимости от возраста пользователя

In [11]:
df_users

Unnamed: 0,user_id,age,sex
0,1,45_54,
1,2,18_24,0.0
2,3,65_inf,0.0
3,4,18_24,0.0
4,5,35_44,0.0
...,...,...,...
142883,159606,25_34,0.0
142884,159607,25_34,
142885,159609,18_24,0.0
142886,159610,35_44,0.0


In [12]:
train.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01


In [13]:
test.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
1517914,38753,135245,0,,2019-12-25
1517915,101642,319500,67,5.0,2019-12-25


In [35]:
class PopularRecommender():
    def __init__(self, max_K=100, days=30, item_column='item_id', dt_column='date'):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        self.recommendations = []
        self.min_date = min_date

    def fit(self, df, ):
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        self.min_date = min_date
        self.recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values

    def recommend(self, users=None, N=10):
        recs = self.recommendations[:N]
        if users is None:
            return recs
        else:
            return list(islice(cycle([recs]), len(users)))

In [36]:
pop_model = PopularRecommender(days=77, dt_column='start_date')

In [37]:
vars(pop_model)

{'max_K': 100,
 'days': 77,
 'item_column': 'item_id',
 'dt_column': 'start_date',
 'recommendations': [],
 'min_date': Timestamp('2019-10-08 00:00:00')}

In [21]:
train['start_date'].nunique()

723

In [26]:
min_date = train['start_date'].max().normalize() - pd.DateOffset(days=77)

In [27]:
min_date

Timestamp('2019-10-08 00:00:00')

In [43]:
dt_column = 'start_date'
item_column = 'item_id'
df.loc[df[dt_column] > min_date, item_column]

1355528    158123
1355529    160933
1355530     63496
1355531    198094
1355532    299722
            ...  
1532914    285394
1532915     73789
1532916     77993
1532917    230195
1532924    233762
Name: item_id, Length: 177391, dtype: int64

In [25]:
train.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01


In [38]:
pop_model.fit(train)

In [40]:
vars(pop_model)

{'max_K': 100,
 'days': 77,
 'item_column': 'item_id',
 'dt_column': 'start_date',
 'recommendations': array([109201, 230067, 235407,  35265, 155266, 237760, 291806, 151190,
        270415, 285394, 218025, 115190, 147734, 282647, 208935, 271846,
         74650, 281005, 168037, 135032, 168900, 219099, 288531,  75579,
        226196,  96052,   9197, 169853, 312940, 103077,  99616,  63978,
          6136, 311394,  86588,  43622, 295007, 110811, 104010, 176898,
        210979,  48510, 107522,  55913,  67643, 181955,  30772, 141961,
        233762, 126581, 247959,  93249, 238200,  39878, 276498, 238856,
        303990,  86123,   5001,   8875,  31454, 241026,  28889, 209326,
        265984,  26963, 231923, 211217, 243711, 121687,  90674, 108759,
        232317, 163618, 251856,  11424, 233644,  58803, 240571,  49352,
        231257,  12301, 297013, 145016,  65912,  99357, 181857, 226128,
         81827,  35226, 121544, 206185, 312792,  90225,  34363, 223806,
        222102, 250772,  25173, 22

In [17]:
pop_model.recommendations

array([109201, 230067, 235407,  35265, 155266, 237760, 291806, 151190,
       270415, 285394, 218025, 115190, 147734, 282647, 208935, 271846,
        74650, 281005, 168037, 135032, 168900, 219099, 288531,  75579,
       226196,  96052,   9197, 169853, 312940, 103077,  99616,  63978,
         6136, 311394,  86588,  43622, 295007, 110811, 104010, 176898,
       210979,  48510, 107522,  55913,  67643, 181955,  30772, 141961,
       233762, 126581, 247959,  93249, 238200,  39878, 276498, 238856,
       303990,  86123,   5001,   8875,  31454, 241026,  28889, 209326,
       265984,  26963, 231923, 211217, 243711, 121687,  90674, 108759,
       232317, 163618, 251856,  11424, 233644,  58803, 240571,  49352,
       231257,  12301, 297013, 145016,  65912,  99357, 181857, 226128,
        81827,  35226, 121544, 206185, 312792,  90225,  34363, 223806,
       222102, 250772,  25173, 229749], dtype=int64)

In [18]:
pop_model.days

77

In [21]:
item_titles = dict(zip(df_items.id,df_items.title))

In [24]:
item_titles[128115]

'Ворон-челобитчик'

In [25]:
print(item_titles.get(128115))

Ворон-челобитчик


In [55]:
top10_recs = pop_model.recommend()
top10_recs

array([109201, 230067, 235407,  35265, 155266, 237760, 291806, 151190,
       270415, 285394], dtype=int64)

In [57]:
map(item_titles.get, top10_recs)

<map at 0x1dde3a08fa0>

In [56]:
list(map(item_titles.get, top10_recs))

['Яблоки из сада Шлицбутера',
 'Кавказский пленник',
 'Пикник на обочине',
 'Записки юного врача',
 'О любви',
 'Русские народные сказки',
 'Женская война',
 'История государства Российского. Том 2. От Великого князя Святополка до Великого князя Мстислава Изяславовича',
 'Черный человек',
 'Хитрость']

In [30]:
# create datframe with user_id fo recs
recs = pd.DataFrame({'user_id': test['user_id'].unique()})


In [32]:
recs.head(2)

Unnamed: 0,user_id
0,38753
1,101642


In [33]:

top_N = 10
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs.head()

Unnamed: 0,user_id,item_id
0,38753,"[109201, 230067, 235407, 35265, 155266, 237760..."
1,101642,"[109201, 230067, 235407, 35265, 155266, 237760..."
2,13548,"[109201, 230067, 235407, 35265, 155266, 237760..."
3,130425,"[109201, 230067, 235407, 35265, 155266, 237760..."
4,93986,"[109201, 230067, 235407, 35265, 155266, 237760..."


In [34]:
recs = recs.explode('item_id')
recs.head(top_N + 2)

Unnamed: 0,user_id,item_id
0,38753,109201
0,38753,230067
0,38753,235407
0,38753,35265
0,38753,155266
0,38753,237760
0,38753,291806
0,38753,151190
0,38753,270415
0,38753,285394


In [35]:
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(top_N + 2)

Unnamed: 0,user_id,item_id,rank
0,38753,109201,1
0,38753,230067,2
0,38753,235407,3
0,38753,35265,4
0,38753,155266,5
0,38753,237760,6
0,38753,291806,7
0,38753,151190,8
0,38753,270415,9
0,38753,285394,10


In [37]:
recs.rename(columns={'rank':'predict_rank'} , inplace = True)

In [41]:
recs.head(2)

Unnamed: 0,user_id,item_id,predict_rank
0,38753,109201,1
0,38753,230067,2


In [40]:
test.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
1517914,38753,135245,0,,2019-12-25
1517915,101642,319500,67,5.0,2019-12-25


In [43]:

test_recs = test.merge(recs, how = 'left' , on = ['user_id','item_id'])

In [45]:
test_recs.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank
0,38753,135245,0,,2019-12-25,
1,101642,319500,67,5.0,2019-12-25,


In [47]:
test_recs['users_item_count'] = test_recs.groupby(level='user_id', sort=False)['predict_rank'].transform(np.size)

ValueError: level name user_id is not the name of the index

In [51]:
uic = test_recs.groupby('user_id').agg({'item_id':'count'}).reset_index().rename(columns = {'item_id':'users_item_count'})

In [53]:
test_recs = test_recs.merge(uic , how = 'left' , on = 'user_id')

In [54]:
test_recs['pred_reciprocal_rank'] = 1 / test_recs['predict_rank']
test_recs['pred_reciprocal_rank'] = test_recs['pred_reciprocal_rank'].fillna(0)

In [55]:
test_recs.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank
0,38753,135245,0,,2019-12-25,,1,0.0
1,101642,319500,67,5.0,2019-12-25,,1,0.0


In [57]:
test_recs = test_recs.sort_values(by=['user_id', 'predict_rank'])



In [58]:
test_recs['cumulative_rank'] = test_recs.groupby('user_id').cumcount() + 1

In [59]:
test_recs

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank,cumulative_rank
1440,21,97894,100,,2019-12-25,,1,0.0,1
798,27,179635,99,,2019-12-25,,1,0.0,1
199,58,315050,65,4.0,2019-12-25,,1,0.0,1
1787,288,24595,11,,2019-12-25,,1,0.0,1
1596,430,218187,26,,2019-12-25,,1,0.0,1
...,...,...,...,...,...,...,...,...,...
994,159294,110617,18,,2019-12-25,,1,0.0,1
426,159303,197759,64,,2019-12-25,,1,0.0,1
5,159466,124115,84,,2019-12-25,,1,0.0,1
1121,159472,245992,78,,2019-12-25,,1,0.0,1


In [61]:
test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['predict_rank']

In [63]:
test_recs.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank,cumulative_rank
1440,21,97894,100,,2019-12-25,,1,0.0,
798,27,179635,99,,2019-12-25,,1,0.0,


In [64]:
test_recs[test_recs['predict_rank'].notnull()]

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank,cumulative_rank
1513,474,235407,100,5.0,2019-12-25,3.0,1,0.333333,0.333333
671,1672,230067,12,,2019-12-25,2.0,1,0.500000,0.500000
1595,7313,291806,0,,2019-12-25,7.0,5,0.142857,0.142857
1536,10260,35265,0,,2019-12-25,4.0,1,0.250000,0.250000
1489,11207,235407,0,,2019-12-25,3.0,1,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...
496,139770,285394,96,,2019-12-25,10.0,2,0.100000,0.200000
403,144116,155266,97,,2019-12-25,5.0,1,0.200000,0.200000
761,144818,151190,3,,2019-12-25,8.0,3,0.125000,0.125000
1283,146602,235407,11,,2019-12-25,3.0,1,0.333333,0.333333


In [68]:
print(f'Метрик по test ({str(split_dates[0])[:10]}, {str(split_dates[1])[:10]})')

Метрик по test (2019-12-25, 2019-12-26)


In [69]:
users_count = test_recs['user_id'].nunique()

In [70]:
users_count

1751

In [72]:
for k in range(1, top_N + 1):
    hit_k = f'hit@{k}'
    test_recs[hit_k] = test_recs['predict_rank'] <= k
    print(f'Precision@{k} = {(test_recs[hit_k] / k).sum() / users_count:.4f}')
    print(f"Recall@{k} = {(test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count:.4f}")

Precision@1 = 0.0000
Recall@1 = 0.0000
Precision@2 = 0.0026
Recall@2 = 0.0040
Precision@3 = 0.0048
Recall@3 = 0.0128
Precision@4 = 0.0050
Recall@4 = 0.0173
Precision@5 = 0.0045
Recall@5 = 0.0192
Precision@6 = 0.0039
Recall@6 = 0.0204
Precision@7 = 0.0037
Recall@7 = 0.0222
Precision@8 = 0.0035
Recall@8 = 0.0241
Precision@9 = 0.0034
Recall@9 = 0.0253
Precision@10 = 0.0035
Recall@10 = 0.0290


In [73]:
mapN = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
print(f"MAP@{top_N} = {mapN}")

MAP@10 = 0.007738812893010494


In [80]:
mrr = test_recs.groupby('user_id').agg({'pred_reciprocal_rank':'max'}).reset_index()['pred_reciprocal_rank'].mean()
print(f"MRR = {mrr}")

MRR = 0.008750101982540589
