# Metrics, validation strategies and baselines

В данном jupyter notebook рассматриваются примеры того, какие схемы валидации и метрики используются в рекомендательных системах.
Также построим простые модели (бейзлайны) на данных МТС Библиотеки. 

* [Preprocessing](#preprocessing)
* [General remarks](#general-remarks)
* [Metrics](#metrics)
    * [Regression](#regression)
    * [Classification](#classification)
    * [Ranking](#ranking)
* [Validation strategies](#validation)
* [Baselines](#baselines)

In [1]:
import os
import numpy as np 
import pandas as pd 
from itertools import islice, cycle
from more_itertools import pairwise



In [471]:
import math

def is_nan(value):
    return math.isnan(float(value))

def isNaN(string):
    return string != string

<a id="preprocessing"></a>
# Preprocessing

Загрузим наши данные, теперь уже с фичами, и применим знания из [pandas-scipy-for-recsys](https://www.kaggle.com/sharthz23/pandas-scipy-for-recsys)

In [2]:
df = pd.read_pickle('./interactions_preprocessed.pickle')
df_users = pd.read_pickle('./users_preprocessed.pickle')
df_items = pd.read_pickle('./items_preprocessed.pickle')

In [3]:
test_dates = df['start_date'].unique()[-7:]



In [4]:
test_dates = list(pairwise(test_dates))
test_dates

[(numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (numpy.datetime64('2019-12-26T00:00:00.000000000'),
  numpy.datetime64('2019-12-27T00:00:00.000000000')),
 (numpy.datetime64('2019-12-27T00:00:00.000000000'),
  numpy.datetime64('2019-12-28T00:00:00.000000000')),
 (numpy.datetime64('2019-12-28T00:00:00.000000000'),
  numpy.datetime64('2019-12-29T00:00:00.000000000')),
 (numpy.datetime64('2019-12-29T00:00:00.000000000'),
  numpy.datetime64('2019-12-30T00:00:00.000000000')),
 (numpy.datetime64('2019-12-30T00:00:00.000000000'),
  numpy.datetime64('2019-12-31T00:00:00.000000000'))]

In [5]:
split_dates = test_dates[0]

In [6]:
train = df[df['start_date'] < split_dates[0]]

In [7]:
test = df[(df['start_date'] >= split_dates[0]) & (df['start_date'] < split_dates[1])]

In [8]:
test = test[(test['rating'] >= 4) | (test['rating'].isnull())]

In [9]:
split_dates, train.shape, test.shape


((numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (1517994, 5),
 (2114, 5))

<a id="baselines"></a>
# Baselines

Будем строить бейзлайн по популярному в зависимости от возраста пользователя

In [10]:
df_users

Unnamed: 0,user_id,age,sex
0,1,45_54,
1,2,18_24,0.0
2,3,65_inf,0.0
3,4,18_24,0.0
4,5,35_44,0.0
...,...,...,...
142883,159606,25_34,0.0
142884,159607,25_34,
142885,159609,18_24,0.0
142886,159610,35_44,0.0


In [11]:
train.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01


In [12]:
test.head(2)

Unnamed: 0,user_id,item_id,progress,rating,start_date
1517914,38753,135245,0,,2019-12-25
1517915,101642,319500,67,5.0,2019-12-25


In [519]:
class PopularRecommender():
    def __init__(self, max_K=10, days=30, item_column='item_id', dt_column='date' , df_u = df_users):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        self.N = 10
        self.recommendations = []
        self.min_date = None
        self.df_u = df_u
        self.df_res = None


    def fit(self, df, ):
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        self.min_date = min_date
        self.standart_recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values

    def super_fit(self, df, ):
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        self.min_date = min_date
        self.df_res = df.loc[df[self.dt_column] > min_date]\
            .merge(self.df_u , how = 'left' , on ='user_id')\
            .groupby(['age','sex','item_id']).agg({'user_id' : 'count'}).reset_index()\
            .rename(columns = {'user_id':'cnt'})\
            .sort_values(['age','sex','cnt'] , ascending=False)\
            .groupby(['age','sex']).head(self.max_K)
        self.standart_recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values


    def recommend(self, users=None,):
        recs = self.standart_recommendations[:self.N]
        if users is None:
            return recs
        else:
            return list(islice(cycle([recs]), len(users)))

    def super_recommend(self, user,):
        x = self.df_u.loc[self.df_u['user_id'] == user]
        if x.empty:
            return self.standart_recommendations
        elif(is_nan(x['sex'].iloc[0])):
            return self.standart_recommendations
        elif(isNaN(x['age'].iloc[0])):
            return self.standart_recommendations
        else:
            #m = self.df_res.loc[(self.df_res['age'] == x['age'].iloc[0])]
            #m = self.df_res.loc[(self.df_res['sex'] == x['sex'].iloc[0])]
            m = self.df_res.loc[(self.df_res['age'] == x['age'].iloc[0]) & (self.df_res['sex'] == x['sex'].iloc[0])]
            print(m)
            n = m['item_id'].values
            return n



In [520]:
pop_model = PopularRecommender(days=77, dt_column='start_date')

In [521]:
pop_model.super_fit(train)

  .groupby(['age','sex','item_id']).agg({'user_id' : 'count'}).reset_index()\


In [522]:
k = pop_model.super_recommend(44)

          age  sex  item_id  cnt
284725  35_44  1.0   235407   22
283870  35_44  1.0   230067   18
265185  35_44  1.0   109201   15
283256  35_44  1.0   226196   15
253646  35_44  1.0    35265   13
271690  35_44  1.0   151190   13
261647  35_44  1.0    86588   11
282183  35_44  1.0   219099   11
266206  35_44  1.0   115190   10
272308  35_44  1.0   155266   10


In [523]:
k

array([235407, 230067, 109201, 226196,  35265, 151190,  86588, 219099,
       115190, 155266], dtype=int64)

In [506]:
pop_model.df_res

Unnamed: 0,age,sex,item_id,cnt
563061,65_inf,1.0,109201,71
582601,65_inf,1.0,235407,23
559523,65_inf,1.0,86588,21
569020,65_inf,1.0,147734,17
551522,65_inf,1.0,35265,15
...,...,...,...,...
11552,18_24,0.0,74650,35
36997,18_24,0.0,238856,34
22036,18_24,0.0,141961,32
24078,18_24,0.0,155266,32


In [513]:
df_items[df_items['id'] == 230067]

Unnamed: 0,id,title,genres,authors,year
417,230067,Кавказский пленник,"Стихи и поэзия,Литература 19 века,Русская клас...",Александр Пушкин,1822


In [515]:
item_titles = dict(zip(df_items.id,df_items.title))

In [516]:
item_titles[230067]

'Кавказский пленник'

In [494]:
print(item_titles.get(271690))

None


In [524]:
top100_recs = pop_model.super_recommend(30)

         age  sex  item_id  cnt
16955  18_24  0.0   109201   63
33790  18_24  0.0   218025   43
32729  18_24  0.0   211217   40
26325  18_24  0.0   169853   37
35640  18_24  0.0   230067   36
11552  18_24  0.0    74650   35
36997  18_24  0.0   238856   34
22036  18_24  0.0   141961   32
24078  18_24  0.0   155266   32
5416   18_24  0.0    35265   31


In [525]:
top100_recs

array([109201, 218025, 211217, 169853, 230067,  74650, 238856, 141961,
       155266,  35265], dtype=int64)

In [495]:
top10_recs = pop_model.recommend()
top10_recs

array([109201, 230067, 235407,  35265, 155266, 237760, 291806, 151190,
       270415, 285394], dtype=int64)

In [497]:
list(map(item_titles.get, top10_recs))

['Яблоки из сада Шлицбутера',
 'Кавказский пленник',
 'Пикник на обочине',
 'Записки юного врача',
 'О любви',
 'Русские народные сказки',
 'Женская война',
 'История государства Российского. Том 2. От Великого князя Святополка до Великого князя Мстислава Изяславовича',
 'Черный человек',
 'Хитрость']

In [526]:
list(map(item_titles.get, top100_recs))

['Яблоки из сада Шлицбутера',
 'Текст',
 'НИ СЫ. Восточная мудрость, которая гласит: будь уверен в своих силах и не позволяй сомнениям мешать тебе двигаться вперед',
 'Анна Каренина',
 'Кавказский пленник',
 'То, что делает меня',
 'Большая книга «ленивой мамы»',
 'Капитанская дочка',
 'О любви',
 'Записки юного врача']

In [None]:
# create datframe with user_id fo recs
recs = pd.DataFrame({'user_id': test['user_id'].unique()})


In [None]:
recs.head(2)

In [None]:

top_N = 10
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs.head()

In [None]:
recs = recs.explode('item_id')
recs.head(top_N + 2)

In [None]:
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(top_N + 2)

In [None]:
recs.rename(columns={'rank':'predict_rank'} , inplace = True)

In [None]:
recs.head(2)

In [None]:
test.head(2)

In [None]:

test_recs = test.merge(recs, how = 'left' , on = ['user_id','item_id'])

In [None]:
test_recs.head(2)

In [None]:
test_recs['users_item_count'] = test_recs.groupby(level='user_id', sort=False)['predict_rank'].transform(np.size)

In [None]:
uic = test_recs.groupby('user_id').agg({'item_id':'count'}).reset_index().rename(columns = {'item_id':'users_item_count'})

In [None]:
test_recs = test_recs.merge(uic , how = 'left' , on = 'user_id')

In [None]:
test_recs['pred_reciprocal_rank'] = 1 / test_recs['predict_rank']
test_recs['pred_reciprocal_rank'] = test_recs['pred_reciprocal_rank'].fillna(0)

In [None]:
test_recs.head(2)

In [None]:
test_recs = test_recs.sort_values(by=['user_id', 'predict_rank'])



In [None]:
test_recs['cumulative_rank'] = test_recs.groupby('user_id').cumcount() + 1

In [None]:
test_recs

In [None]:
test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['predict_rank']

In [None]:
test_recs.head(2)

In [None]:
test_recs[test_recs['predict_rank'].notnull()]

In [None]:
print(f'Метрик по test ({str(split_dates[0])[:10]}, {str(split_dates[1])[:10]})')

In [None]:
users_count = test_recs['user_id'].nunique()

In [None]:
users_count

In [None]:
for k in range(1, top_N + 1):
    hit_k = f'hit@{k}'
    test_recs[hit_k] = test_recs['predict_rank'] <= k
    print(f'Precision@{k} = {(test_recs[hit_k] / k).sum() / users_count:.4f}')
    print(f"Recall@{k} = {(test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count:.4f}")

In [None]:
mapN = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
print(f"MAP@{top_N} = {mapN}")

In [None]:
mrr = test_recs.groupby('user_id').agg({'pred_reciprocal_rank':'max'}).reset_index()['pred_reciprocal_rank'].mean()
print(f"MRR = {mrr}")