# Вебинар 6. Двухуровневые модели рекомендаций

In [1]:
!pip install implicit==0.4.4

Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[?25l[K     |▎                               | 10 kB 19.0 MB/s eta 0:00:01[K     |▋                               | 20 kB 10.8 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.5 MB/s eta 0:00:01[K     |█▏                              | 40 kB 8.2 MB/s eta 0:00:01[K     |█▌                              | 51 kB 4.3 MB/s eta 0:00:01[K     |█▊                              | 61 kB 5.1 MB/s eta 0:00:01[K     |██                              | 71 kB 5.5 MB/s eta 0:00:01[K     |██▍                             | 81 kB 5.7 MB/s eta 0:00:01[K     |██▋                             | 92 kB 6.4 MB/s eta 0:00:01[K     |███                             | 102 kB 5.1 MB/s eta 0:00:01[K     |███▎                            | 112 kB 5.1 MB/s eta 0:00:01[K     |███▌                            | 122 kB 5.1 MB/s eta 0:00:01[K     |███▉                            | 133 kB 5.1 MB/s eta 0:00:01[K  

__Import libs__

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

__Read data__

In [3]:
# Load the three source tables in one pass: purchase transactions, item
# attributes and household (user) attributes.
data, item_features, user_features = (
    pd.read_csv(csv_path)
    for csv_path in ('retail_train.csv', 'product.csv', 'hh_demographic.csv')
)

In [5]:
# Preview the first two rows of the frame loaded from retail_train.csv.
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [6]:
# Number of rows per week number (value_counts orders by count, largest first).
data.week_no.value_counts()

20    29872
25    28722
24    27862
21    27011
16    26859
17    26325
22    25794
18    25532
23    25158
19    24234
15    23590
14    21884
13    18634
26    18172
12    17512
11    16706
10    13543
9     10585
8     10428
7      8980
6      8896
5      7168
4      5379
3      4803
2      3675
1      1881
Name: week_no, dtype: int64

In [7]:
# Largest week number present in the data; the time split below is measured back from it.
data.week_no.max()

26

__Process features dataset__

In [8]:
# Column names for the item and user identifiers, shared by the cells below.
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [9]:
# Normalise the feature tables: lower-case every column name, then rename the
# id columns to the shared ITEM_COL / USER_COL names.
item_features = (
    item_features
    .rename(columns=str.lower)
    .rename(columns={'product_id': ITEM_COL})
)
user_features = (
    user_features
    .rename(columns=str.lower)
    .rename(columns={'household_key': USER_COL})
)

__Split dataset for train, eval, test__

In [35]:
# The training / validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# Tune the size of the 2nd dataset (6 weeks) with a learning curve (recall@k as a function of dataset size)


# Length, in weeks, of the matcher validation window (the same rows are later used to train the ranker).
VAL_MATCHER_WEEKS = 6
# Offset, in weeks back from the last week, at which the ranker validation window starts.
VAL_RANKER_WEEKS = 3

In [36]:
# Week-number boundaries of the consecutive time windows, counted back from the last week.
week_no = data['week_no']
last_week = week_no.max()
matcher_val_start = last_week - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)
ranker_val_start = last_week - VAL_RANKER_WEEKS

# Training data for the matching (first-level) model: everything before the matcher validation window.
data_train_matcher = data[week_no < matcher_val_start]

# Validation data for the matching model: [matcher_val_start, ranker_val_start).
in_matcher_val = (week_no >= matcher_val_start) & (week_no < ranker_val_start)
data_val_matcher = data[in_matcher_val]

# Training data for the ranking model: starts as a copy of the matcher validation
# window (kept separate for clarity; later cells are expected to modify it).
data_train_ranker = data_val_matcher.copy()

# Hold-out data for both the ranking and the matching model.
# NOTE: because of `>=`, this window covers week numbers ranker_val_start..last_week
# inclusive, i.e. VAL_RANKER_WEEKS + 1 distinct week numbers.
data_val_ranker = data[week_no >= ranker_val_start]

In [37]:
def print_stats_data(df_data, name_df):
    """Print a two-line summary of a split.

    The first line is the label `name_df`; the second shows the frame's shape
    and the number of distinct values in the USER_COL and ITEM_COL columns.
    """
    print(name_df)
    n_users = df_data[USER_COL].nunique()
    n_items = df_data[ITEM_COL].nunique()
    print(f"Shape: {df_data.shape} Users: {n_users} Items: {n_items}")

In [38]:
# Report size, user count and item count for every split, in split order.
for split_name, split_df in (
    ('train_matcher', data_train_matcher),
    ('val_matcher', data_val_matcher),
    ('train_ranker', data_train_ranker),
    ('val_ranker', data_val_ranker),
):
    print_stats_data(split_df, split_name)

train_matcher
Shape: (200523, 12) Users: 2269 Items: 31333
val_matcher
Shape: (158768, 12) Users: 2138 Items: 26684
train_ranker
Shape: (158768, 12) Users: 2138 Items: 26684
val_ranker
Shape: (99914, 12) Users: 1950 Items: 22084


In [39]:
# Preview the first two rows of the matcher training split.
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


__Prefilter items__

In [40]:
# Count distinct items before and after prefiltering to show how much the
# item catalogue of the matcher training data shrinks.
n_items_before = data_train_matcher['item_id'].nunique()

# prefilter_items comes from the local `utils` module; the split is replaced by its result.
data_train_matcher = prefilter_items(
    data_train_matcher,
    item_features=item_features,
    take_n_popular=5000,
)

n_items_after = data_train_matcher['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 31333 to 5001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


__Make cold-start to warm-start__

In [41]:
# Keep only "warm" users: every later split is restricted to users that occur
# in the prefiltered matcher training data.
common_users = data_train_matcher[USER_COL].values

data_val_matcher = data_val_matcher.loc[data_val_matcher[USER_COL].isin(common_users)]
data_train_ranker = data_train_ranker.loc[data_train_ranker[USER_COL].isin(common_users)]
data_val_ranker = data_val_ranker.loc[data_val_ranker[USER_COL].isin(common_users)]

# Re-print the split statistics to show the effect of the filtering.
for split_name, split_df in (
    ('train_matcher', data_train_matcher),
    ('val_matcher', data_val_matcher),
    ('train_ranker', data_train_ranker),
    ('val_ranker', data_val_ranker),
):
    print_stats_data(split_df, split_name)

train_matcher
Shape: (83129, 13) Users: 2201 Items: 5001
val_matcher
Shape: (141323, 12) Users: 1868 Items: 25163
train_ranker
Shape: (141323, 12) Users: 1868 Items: 25163
val_ranker
Shape: (89429, 12) Users: 1737 Items: 20920


__Init/train recommender__

In [42]:
# Build the first-level recommender from the prefiltered matcher training data.
# NOTE(review): MainRecommender is imported from the local `recommenders` module;
# it presumably fits its models inside the constructor — confirm.
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

__Eval recall of matching__

In [43]:
# Name of the column that holds, per user, the items actually bought in the evaluation window.
ACTUAL_COL = 'actual'

In [44]:
# One row per user of the matcher validation window, with the array of
# distinct items that user bought stored under ACTUAL_COL.
result_eval_matcher = (
    data_val_matcher
    .groupby(USER_COL)[ITEM_COL]
    .unique()
    .reset_index()
    .rename(columns={ITEM_COL: ACTUAL_COL})
)
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[849066, 866227, 960732, 965541, 995242, 10731..."
1,2,"[846833, 978318, 899824, 901606, 1075368, 1133..."


In [45]:
# Number of candidates requested from a first-level recommender (passed as its `N` argument below).
N_PREDICT = 50 

In [46]:
%%time
# для понятности расписано все в строчку, без функций, ваша задача уметь оборачивать все это в функции
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=50))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=50))

CPU times: user 15.2 s, sys: 10.1 s, total: 25.4 s
Wall time: 14 s


In [47]:
# Inspect the candidate columns produced by the three recommenders.
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,sim_item_rec,als_rec
0,1,"[849066, 866227, 960732, 965541, 995242, 10731...","[7167962, 855601, 1055831, 5565840, 1131115, 9...","[1103096, 6391517, 869728, 993592, 950575, 111...","[957839, 985160, 854852, 980263, 882308, 10864..."
1,2,"[846833, 978318, 899824, 901606, 1075368, 1133...","[9365106, 5567388, 5586255, 930118, 868542, 69...","[9296844, 1124029, 830941, 1040839, 1009321, 1...","[829685, 9368501, 824272, 974723, 1133826, 829..."


**Задание 1.**

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_matcher: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

In [48]:
def evalRecall(df_result, target_col_name, recommend_model, n_candidates=None, top_k=None):
    """Return the mean recall@k of one candidate generator over the rows of `df_result`.

    Parameters
    ----------
    df_result : pandas.DataFrame
        Must contain `target_col_name` and the ACTUAL_COL column. It is not
        modified: the generated recommendations are kept local, so helpers
        that iterate over the frame's columns (calc_recall, calc_precision)
        do not pick up a scratch column.
    target_col_name : str
        Column whose values are passed one by one to `recommend_model`.
    recommend_model : callable
        Called as ``recommend_model(value, N=n_candidates)``; its result is
        passed to `recall_at_k` as the recommended list.
    n_candidates : int, optional
        Number of recommendations to request. Defaults to N_PREDICT, so that
        the number of generated candidates matches the default cutoff.
    top_k : int, optional
        Cutoff `k` for recall@k. Defaults to `n_candidates`.

    Returns
    -------
    float
        Mean of the per-row recall@k values (NaN for an empty frame).
    """
    if n_candidates is None:
        n_candidates = N_PREDICT
    if top_k is None:
        top_k = n_candidates

    recommended = df_result[target_col_name].apply(
        lambda x: recommend_model(x, N=n_candidates)
    )
    recall_per_row = pd.Series(
        [
            recall_at_k(recs, actual, k=top_k)
            for recs, actual in zip(recommended, df_result[ACTUAL_COL])
        ],
        dtype=float,
    )
    return recall_per_row.mean()


def calc_recall(df_data, top_k):
    """Lazily yield ``(column_name, mean recall@top_k)`` pairs.

    Every column from the third one onward is treated as a column of
    recommendation lists and is scored row by row against ACTUAL_COL.
    """
    candidate_columns = df_data.columns[2:]
    for col_name in candidate_columns:
        recall_per_row = df_data.apply(
            lambda row, col=col_name: recall_at_k(row[col], row[ACTUAL_COL], k=top_k),
            axis=1,
        )
        yield col_name, recall_per_row.mean()

def calc_precision(df_data, top_k):
    """Lazily yield ``(column_name, mean precision@top_k)`` pairs.

    Every column from the third one onward is treated as a column of
    recommendation lists and is scored row by row against ACTUAL_COL.
    """
    candidate_columns = df_data.columns[2:]
    for col_name in candidate_columns:
        precision_per_row = df_data.apply(
            lambda row, col=col_name: precision_at_k(row[col], row[ACTUAL_COL], k=top_k),
            axis=1,
        )
        yield col_name, precision_per_row.mean()

__Recall@50 of matching__

In [49]:
# Cutoff k for the recall@k evaluation in the next cell.
TOPK_RECALL = 50

In [50]:
# (column, mean recall@k) pairs for every candidate column, highest recall first.
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.06756802374107197),
 ('sim_item_rec', 0.03551020497643118),
 ('als_rec', 0.0029339604750504332)]

__Recall@25 of matching__

In [51]:
# Smaller cutoff k: score only the first 25 candidates of each list.
TOPK_RECALL = 25

In [52]:
# (column, mean recall@k) pairs for every candidate column, highest recall first.
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.044326567173376026),
 ('sim_item_rec', 0.017717529299490876),
 ('als_rec', 0.0011459414172025077)]

__Recall@75 of matching__

In [53]:
# Larger cutoff k. NOTE(review): the candidate columns were generated with N=50,
# so a cutoff above 50 presumably cannot add any hits — confirm against recall_at_k.
TOPK_RECALL = 75

In [54]:
# (column, mean recall@k) pairs for every candidate column, highest recall first.
# NOTE(review): each candidate list was requested with N=50, so with k=75 the same
# items are presumably scored as with k=50; to measure recall@75, regenerate the
# candidates with N >= 75 first.
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.06756802374107197),
 ('sim_item_rec', 0.03551020497643118),
 ('als_rec', 0.0029339604750504332)]

__Вывод:__

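При уменьшении k показатели recall уменьшаются. При увеличении k выше 50 recall не изменяется, потому что для каждого пользователя было сгенерировано только 50 кандидатов (N_PREDICT = 50): в top-75 попадают те же 50 товаров, что и в top-50. Чтобы честно оценить recall@75, нужно заново сгенерировать кандидатов с N ≥ 75.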