# Metrics, validation strategies and baselines

В данном jupyter notebook рассматриваются примеры того, какие схемы валидации и метрики используются в рекомендательных системах.
Также построим простые модели (бейзлайны) на данных МТС Библиотеки. 

* [Preprocessing](#preprocessing)
* [General remarks](#general-remarks)
* [Metrics](#metrics)
    * [Regression](#regression)
    * [Classification](#classification)
    * [Ranking](#ranking)
* [Validation strategies](#validation)
* [Baselines](#baselines)

In [3]:
import os
import numpy as np 
import pandas as pd 
from itertools import islice, cycle
from more_itertools import pairwise



In [4]:
import math

def is_nan(value):
    """Return True when ``value``, converted with ``float()``, is NaN.

    Anything ``float()`` cannot convert (for example ``None`` or a
    non-numeric string) makes ``float()`` raise; that exception is not
    caught here.
    """
    as_float = float(value)
    return math.isnan(as_float)

def isNaN(string):
    """Return the result of comparing ``string`` with itself using ``!=``.

    A float NaN is the usual value that is not equal to itself, so this is
    True for NaN and False for ordinary strings, numbers and ``None``.
    """
    differs_from_itself = string != string
    return differs_from_itself

<a id="preprocessing"></a>
# Preprocessing

Загрузим наши данные, теперь уже с фичами, и применим знания из [pandas-scipy-for-recsys](https://www.kaggle.com/sharthz23/pandas-scipy-for-recsys)

In [5]:
# Load the three preprocessed tables: interactions, users and items.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
df = pd.read_pickle('./interactions_preprocessed.pickle')
df_users = pd.read_pickle('./users_preprocessed.pickle')
df_items = pd.read_pickle('./items_preprocessed.pickle')

In [6]:
# Last seven distinct interaction dates, in chronological order.
# Series.unique() returns values in order of appearance, so sort explicitly
# instead of relying on `df` already being ordered by date. Missing dates are
# dropped first because NaT cannot serve as a split boundary.
test_dates = np.sort(df['start_date'].dropna().unique())[-7:]



In [7]:
# Turn the date sequence into consecutive (start, end) pairs.
# NOTE: this overwrites `test_dates`, so re-running this cell without
# re-running the cell that builds the date sequence would pair the pairs.
test_dates = list(pairwise(test_dates))
test_dates

[(numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (numpy.datetime64('2019-12-26T00:00:00.000000000'),
  numpy.datetime64('2019-12-27T00:00:00.000000000')),
 (numpy.datetime64('2019-12-27T00:00:00.000000000'),
  numpy.datetime64('2019-12-28T00:00:00.000000000')),
 (numpy.datetime64('2019-12-28T00:00:00.000000000'),
  numpy.datetime64('2019-12-29T00:00:00.000000000')),
 (numpy.datetime64('2019-12-29T00:00:00.000000000'),
  numpy.datetime64('2019-12-30T00:00:00.000000000')),
 (numpy.datetime64('2019-12-30T00:00:00.000000000'),
  numpy.datetime64('2019-12-31T00:00:00.000000000'))]

In [8]:
# Use only the first (start, end) pair as the train/test boundary.
split_dates = test_dates[0]

In [9]:
# Training set: every interaction strictly before the start of the test window.
train = df.loc[df['start_date'].lt(split_dates[0])]

In [10]:
# Test set: interactions inside the half-open window [start, end).
test = df.loc[df['start_date'].ge(split_dates[0]) & df['start_date'].lt(split_dates[1])]

In [11]:
# Keep test interactions that are either unrated or rated 4 and above.
test = test.loc[test['rating'].isnull() | test['rating'].ge(4)]

In [12]:
split_dates, train.shape, test.shape


((numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (1517994, 5),
 (2114, 5))

<a id="baselines"></a>
# Baselines

Будем строить бейзлайн по популярному в зависимости от возраста и пола пользователя

In [13]:
df_users

Unnamed: 0,user_id,age,sex
0,1,45_54,
1,2,18_24,0.0
2,3,65_inf,0.0
3,4,18_24,0.0
4,5,35_44,0.0
...,...,...,...
142883,159606,25_34,0.0
142884,159607,25_34,
142885,159609,18_24,0.0
142886,159610,35_44,0.0


In [14]:
train.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01


In [15]:
test.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
1517914,38753,135245,0,,2019-12-25
1517915,101642,319500,67,5.0,2019-12-25


In [16]:
class PopularRecommender():
    """Popularity baseline with a global list and per-(age, sex) lists.

    ``fit`` builds one global top-``max_K`` item list from the interactions of
    the last ``days`` days. ``super_fit`` builds the same global list and, in
    addition, a top-``max_K`` list for every (age, sex) segment found after
    joining the interactions with the user table ``df_u``.

    Parameters:
        max_K: number of items kept in the global list and in each segment.
        days: size of the look-back window, counted back from the latest
            (normalized) timestamp in the data passed to ``fit``/``super_fit``.
        item_column: name of the item id column in the interactions.
        dt_column: name of the timestamp column in the interactions.
        df_u: user table with ``user_id``, ``age`` and ``sex`` columns. When
            omitted, the notebook-level ``df_users`` is looked up at
            construction time (not at class-definition time), so the class can
            be defined before the data is loaded and never holds a stale copy.
    """

    def __init__(self, max_K=10, days=30, item_column='item_id', dt_column='date', df_u=None):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        # Upper bound applied by recommend(); kept at 10 as in the original.
        self.N = 10
        self.recommendations = []
        self.min_date = None
        self.df_u = df_users if df_u is None else df_u
        # Per-segment table (age, sex, item, cnt); filled by super_fit().
        self.df_res = None
        # Global popular items; filled by fit() or super_fit().
        self.standart_recommendations = None

    def _recent_interactions(self, df):
        """Return the rows of ``df`` newer than the look-back cut-off and store the cut-off."""
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        self.min_date = min_date
        return df.loc[df[self.dt_column] > min_date]

    def _top_items(self, recent):
        """Return the ``max_K`` most frequent item ids in ``recent`` as an array."""
        return recent[self.item_column].value_counts().head(self.max_K).index.values

    def _require(self, attribute, hint):
        """Raise a clear error when a fitted attribute is still unset."""
        if getattr(self, attribute) is None:
            raise RuntimeError(f'PopularRecommender is not fitted: call {hint} first')

    def fit(self, df, ):
        """Build the global popular-items list from ``df``."""
        recent = self._recent_interactions(df)
        self.standart_recommendations = self._top_items(recent)

    def super_fit(self, df, ):
        """Build the global list and the per-(age, sex) popular-items table.

        Interactions whose user has no row in ``df_u``, or whose age/sex is
        missing, get NaN keys after the left join and are dropped by
        ``groupby``; they still count towards the global list.
        """
        recent = self._recent_interactions(df)
        counts = (
            recent
            .merge(self.df_u, how='left', on='user_id')
            # observed=True: with categorical age/sex only combinations that
            # actually occur are counted, so no zero-count rows are produced.
            .groupby(['age', 'sex', self.item_column], observed=True)
            .agg({'user_id': 'count'})
            .reset_index()
            .rename(columns={'user_id': 'cnt'})
        )
        self.df_res = (
            counts
            .sort_values(['age', 'sex', 'cnt'], ascending=False)
            .groupby(['age', 'sex'], observed=True)
            .head(self.max_K)
        )
        self.standart_recommendations = self._top_items(recent)

    def recommend(self, users=None,):
        """Return the global list (at most ``N`` items).

        With ``users=None`` a single array is returned; otherwise a list with
        one entry per user, every entry being that same array.

        Raises:
            RuntimeError: if neither ``fit`` nor ``super_fit`` has been called.
        """
        self._require('standart_recommendations', 'fit() or super_fit()')
        recs = self.standart_recommendations[:self.N]
        if users is None:
            return recs
        return [recs] * len(users)

    def super_recommend(self, user,):
        """Return the popular items of the (age, sex) segment of ``user``.

        Falls back to the global list when the user is unknown, when age or
        sex is missing, or when the segment has no items in the fitted table.

        Raises:
            RuntimeError: if ``super_fit`` has not been called.
        """
        self._require('df_res', 'super_fit()')
        profile = self.df_u.loc[self.df_u['user_id'] == user]
        if profile.empty:
            return self.standart_recommendations
        age = profile['age'].iloc[0]
        sex = profile['sex'].iloc[0]
        if pd.isna(age) or pd.isna(sex):
            return self.standart_recommendations
        in_segment = (self.df_res['age'] == age) & (self.df_res['sex'] == sex)
        segment = self.df_res.loc[in_segment]
        if segment.empty:
            return self.standart_recommendations
        return segment[self.item_column].values



In [17]:
# Popularity model over a 77-day look-back window; interaction timestamps
# are read from the 'start_date' column.
pop_model = PopularRecommender(days=77, dt_column='start_date')

In [18]:
# Build the global and the per-(age, sex) popular-items tables from the training set.
pop_model.super_fit(train)

  .groupby(['age','sex','item_id']).agg({'user_id' : 'count'}).reset_index()\


In [19]:
# Lookup table: item id -> title.
item_titles = dict(zip(df_items['id'], df_items['title']))

In [20]:
print(item_titles.get(271690))

None


In [21]:
# Segment-based recommendations for user 30.
# NOTE: despite the name, the list holds at most `max_K` items (10 by default).
top100_recs = pop_model.super_recommend(30)

In [22]:
top100_recs

array([109201, 218025, 211217, 169853, 230067,  74650, 238856, 141961,
       155266,  35265], dtype=int64)

In [23]:
list(map(item_titles.get, top100_recs))

['Яблоки из сада Шлицбутера',
 'Текст',
 'НИ СЫ. Восточная мудрость, которая гласит: будь уверен в своих силах и не позволяй сомнениям мешать тебе двигаться вперед',
 'Анна Каренина',
 'Кавказский пленник',
 'То, что делает меня',
 'Большая книга «ленивой мамы»',
 'Капитанская дочка',
 'О любви',
 'Записки юного врача']

In [45]:
# Create a dataframe with one row per distinct test user to hold the recommendations.
recs = pd.DataFrame({'user_id': test['user_id'].unique()})



In [46]:
recs.head(2)

Unnamed: 0,user_id
0,38753
1,101642


In [47]:
def super_rec( u_id, pop_model = pop_model ):
    """Return ``pop_model.super_recommend(u_id)``.

    Thin wrapper so the call can be used with ``Series.apply``. The default
    model is the ``pop_model`` object that existed when this function was
    defined.
    """
    return pop_model.super_recommend(u_id)

In [48]:
super_rec(34)

array([230067, 109201, 226196,  35265, 235407, 151190, 297013, 311394,
       115190, 237760], dtype=int64)

In [49]:
# One array of recommended item ids per user (exploded into rows in a later cell).
# Each call filters the whole user table, so this step scales with
# (number of test users) x (size of the user table).
recs['item_id'] = recs['user_id'].apply(super_rec)

In [50]:
recs

Unnamed: 0,user_id,item_id
0,38753,"[109201, 235407, 237760, 9197, 35265, 168900, ..."
1,101642,"[109201, 218025, 211217, 169853, 230067, 74650..."
2,13548,"[109201, 218025, 211217, 169853, 230067, 74650..."
3,130425,"[109201, 235407, 237760, 9197, 35265, 168900, ..."
4,93986,"[109201, 35265, 285394, 235407, 9197, 155266, ..."
...,...,...
1746,129222,"[109201, 218025, 99616, 237760, 230067, 281005..."
1747,18067,"[109201, 218025, 99616, 237760, 230067, 281005..."
1748,76378,"[230067, 109201, 237760, 235407, 281005, 27041..."
1749,135722,"[109201, 218025, 211217, 169853, 230067, 74650..."


In [51]:
# One row per (user, recommended item); the order inside each list is kept,
# so row order within a user is the recommendation order.
recs = recs.explode('item_id')
# `top_N` is not assigned in any earlier cell shown here, so a top-to-bottom
# run would fail with NameError. Derive it from the model's list length.
top_N = pop_model.max_K
recs.head(top_N + 2)

Unnamed: 0,user_id,item_id
0,38753,109201
0,38753,235407
0,38753,237760
0,38753,9197
0,38753,35265
0,38753,168900
0,38753,230067
0,38753,99616
0,38753,168037
0,38753,43622


In [52]:
# 1-based position of each item inside its user's recommendation list
# (cumcount numbers the rows of every user group starting from 0).
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(top_N + 2)

Unnamed: 0,user_id,item_id,rank
0,38753,109201,1
0,38753,235407,2
0,38753,237760,3
0,38753,9197,4
0,38753,35265,5
0,38753,168900,6
0,38753,230067,7
0,38753,99616,8
0,38753,168037,9
0,38753,43622,10


In [53]:
# Rename the position column so its meaning stays clear after joining with the test set.
recs = recs.rename(columns={'rank': 'predict_rank'})

In [54]:
recs.head(2)

Unnamed: 0,user_id,item_id,predict_rank
0,38753,109201,1
0,38753,235407,2


In [55]:
test.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
1517914,38753,135245,0,,2019-12-25
1517915,101642,319500,67,5.0,2019-12-25


In [56]:

# Attach the predicted position to every test interaction; interactions whose
# item was not recommended to that user get NaN in 'predict_rank'.
test_recs = test.merge(recs, how = 'left' , on = ['user_id','item_id'])

In [57]:
test_recs.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank
0,38753,135245,0,,2019-12-25,
1,101642,319500,67,5.0,2019-12-25,


In [59]:
# Same left join as in the earlier cell, repeated here so `test_recs` starts
# from a clean state before the metric columns are added.
test_recs = test.merge(recs, how = 'left' , on = ['user_id','item_id'])

In [60]:
# Number of test interactions per user (the denominator for recall and MAP).
uic = (
    test_recs.groupby('user_id')['item_id']
    .count()
    .rename('users_item_count')
    .reset_index()
)

In [61]:
# Add the per-user interaction count to every test row.
test_recs = test_recs.merge(uic , how = 'left' , on = 'user_id')

In [62]:
# Reciprocal of the predicted position; rows without a hit (NaN position) get 0.
test_recs['pred_reciprocal_rank'] = (1 / test_recs['predict_rank']).fillna(0)

In [63]:
test_recs.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank
0,38753,135245,0,,2019-12-25,,1,0.0
1,101642,319500,67,5.0,2019-12-25,,1,0.0


In [64]:
# Order each user's rows by predicted position; rows without a hit (NaN) go
# last, which is the pandas default made explicit here.
test_recs = test_recs.sort_values(['user_id', 'predict_rank'], na_position='last')



In [65]:
# Running 1-based row number inside each user. Because hits are sorted first
# by predicted position, for a hit row this equals the number of hits so far.
test_recs['cumulative_rank'] = test_recs.groupby('user_id').cumcount() + 1

In [66]:
test_recs

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank,cumulative_rank
1440,21,97894,100,,2019-12-25,,1,0.0,1
798,27,179635,99,,2019-12-25,,1,0.0,1
199,58,315050,65,4.0,2019-12-25,,1,0.0,1
1787,288,24595,11,,2019-12-25,,1,0.0,1
1596,430,218187,26,,2019-12-25,,1,0.0,1
...,...,...,...,...,...,...,...,...,...
994,159294,110617,18,,2019-12-25,,1,0.0,1
426,159303,197759,64,,2019-12-25,,1,0.0,1
5,159466,124115,84,,2019-12-25,,1,0.0,1
1121,159472,245992,78,,2019-12-25,,1,0.0,1


In [67]:
# Precision at each hit position: (hits so far) / (predicted position).
# The column name is reused; rows without a hit have NaN 'predict_rank' and
# therefore become NaN here.
test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['predict_rank']

In [68]:
test_recs.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank,cumulative_rank
1440,21,97894,100,,2019-12-25,,1,0.0,
798,27,179635,99,,2019-12-25,,1,0.0,


In [69]:
test_recs[test_recs['predict_rank'].notnull()]

Unnamed: 0,user_id,item_id,progress,rating,start_date,predict_rank,users_item_count,pred_reciprocal_rank,cumulative_rank
1513,474,235407,100,5.0,2019-12-25,2.0,1,0.5,0.5
671,1672,230067,12,,2019-12-25,2.0,1,0.5,0.5
1536,10260,35265,0,,2019-12-25,3.0,1,0.333333,0.333333
1489,11207,235407,0,,2019-12-25,3.0,1,0.333333,0.333333
672,12589,230067,5,,2019-12-25,2.0,1,0.5,0.5
1554,15277,35265,99,,2019-12-25,2.0,2,0.5,0.5
654,19679,230067,1,,2019-12-25,5.0,1,0.2,0.2
524,25849,285394,92,,2019-12-25,5.0,1,0.2,0.2
940,26416,311394,100,,2019-12-25,8.0,1,0.125,0.125
727,27884,208935,38,,2019-12-25,8.0,1,0.125,0.125


In [70]:
print(f'Метрик по test ({str(split_dates[0])[:10]}, {str(split_dates[1])[:10]})')

Метрик по test (2019-12-25, 2019-12-26)


In [71]:
# Number of distinct test users; every metric below is averaged over this count.
users_count = test_recs['user_id'].nunique()

In [72]:
users_count

1751

In [73]:
# Precision@k and Recall@k for k = 1..top_N, averaged over all test users.
for k in range(1, top_N + 1):
    hit_k = f'hit@{k}'
    # True when the test item was recommended within the first k positions;
    # NaN positions compare as False. The flag is stored as a new column.
    test_recs[hit_k] = test_recs['predict_rank'] <= k
    # Precision@k: each hit contributes 1/k.
    print(f'Precision@{k} = {(test_recs[hit_k] / k).sum() / users_count:.4f}')
    # Recall@k: each hit contributes 1 / (number of test items of that user).
    print(f"Recall@{k} = {(test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count:.4f}")

Precision@1 = 0.0006
Recall@1 = 0.0006
Precision@2 = 0.0043
Recall@2 = 0.0069
Precision@3 = 0.0044
Recall@3 = 0.0112
Precision@4 = 0.0040
Recall@4 = 0.0138
Precision@5 = 0.0045
Recall@5 = 0.0187
Precision@6 = 0.0038
Recall@6 = 0.0192
Precision@7 = 0.0037
Recall@7 = 0.0218
Precision@8 = 0.0034
Recall@8 = 0.0231
Precision@9 = 0.0032
Recall@9 = 0.0245
Precision@10 = 0.0031
Recall@10 = 0.0262


In [74]:
# MAP: precision at each hit divided by the user's number of test items,
# summed over all rows and averaged over users. Rows without a hit hold NaN
# in 'cumulative_rank' and are skipped by .sum().
mapN = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
print(f"MAP@{top_N} = {mapN}")

MAP@10 = 0.007808387648479025


In [75]:
# MRR: best (largest) reciprocal rank per user, averaged over users.
mrr = test_recs.groupby('user_id')['pred_reciprocal_rank'].max().mean()
print(f"MRR = {mrr}")

MRR = 0.008963358806305693
