# Вебинар 6. Двухуровневые модели рекомендаций

## Заглушка, выложу окончательный вариант сегодня к вечеру.

Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
recommender.get_als_recommendations(2375, N=5)

[899624, 871756, 844179, 1044078, 1106523]

In [6]:
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [7]:
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1044078, 934399, 15778319]

In [8]:
recommender.get_similar_users_recommendation(2375, N=5)

[1012801, 820612, 974265, 10457044, 1102003]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [9]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index().copy()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [10]:
train_users = data_train_lvl_1['user_id'].unique()
result_lvl_1 = result_lvl_1[result_lvl_1['user_id'].isin(train_users)]
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_als_recommendations(u, 500))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1037332, 962615, 883616, 856942, 916990, 1046..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[866211, 5569230, 1127831, 916122, 1029743, 11..."


In [11]:
result_lvl_1['own'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_own_recommendations(u, 500))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als,own
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1037332, 962615, 883616, 856942, 916990, 1046...","[856942, 9297615, 5577022, 877391, 9655212, 88..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[866211, 5569230, 1127831, 916122, 1029743, 11...","[911974, 1076580, 1103898, 5567582, 1056620, 9..."


In [12]:
result_lvl_1['similar_items'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_similar_items_recommendation(u, 500))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als,own,similar_items
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1037332, 962615, 883616, 856942, 916990, 1046...","[856942, 9297615, 5577022, 877391, 9655212, 88...","[1132789, 1007512, 9297615, 5577022, 920200, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[866211, 5569230, 1127831, 916122, 1029743, 11...","[911974, 1076580, 1103898, 5567582, 1056620, 9...","[8090509, 5569845, 1044078, 985999, 880888, 81..."


In [13]:
#result_lvl_1['similar_users'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_similar_users_recommendation(u, 5))
#result_lvl_1.head(2)
# не работает

In [14]:
for model_name in ['als', 'own', 'similar_items']:
    for k in [20, 50, 100, 200, 500]:
        mean_recall_at_k = result_lvl_1.apply(lambda row: recall_at_k(row[model_name], row['actual'], k), axis=1).mean()
        print(f'Model {model_name}, k = {k}:   Mean recall@k = {mean_recall_at_k}')


Model als, k = 20:   Mean recall@k = 0.02976588995428441
Model als, k = 50:   Mean recall@k = 0.048853669036638715
Model als, k = 100:   Mean recall@k = 0.06854377248122953
Model als, k = 200:   Mean recall@k = 0.09884143005110785
Model als, k = 500:   Mean recall@k = 0.14770428873385313
Model own, k = 20:   Mean recall@k = 0.03928427679372909
Model own, k = 50:   Mean recall@k = 0.06525657038145175
Model own, k = 100:   Mean recall@k = 0.09604492955885034
Model own, k = 200:   Mean recall@k = 0.13537278412833242
Model own, k = 500:   Mean recall@k = 0.18205324555508678
Model similar_items, k = 20:   Mean recall@k = 0.01772720182971755
Model similar_items, k = 50:   Mean recall@k = 0.033834068115221
Model similar_items, k = 100:   Mean recall@k = 0.05375860395028277
Model similar_items, k = 200:   Mean recall@k = 0.08562398404144855
Model similar_items, k = 500:   Mean recall@k = 0.1357875146124941


### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [115]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique(), columns = ['user_id']) 
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2.head(2)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350


In [213]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
display(targets_lvl_2.head(2))
targets_lvl_2['target'].mean()

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0


0.10460593102333061

In [214]:
item_feature_columns = ['brand', 'department', 'sub_commodity_desc']
user_feature_columns = ['hh_comp_desc', 'income_desc', 'age_desc']
user_item_feature_columns = ['store_id', 'sales_value']

display(targets_lvl_2.shape)
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
display(targets_lvl_2.shape)
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
display(targets_lvl_2.shape)
targets_lvl_2.head(2)

(111313, 3)

(111313, 9)

(111313, 16)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [215]:
baskets_by_user = data_train_lvl_2.groupby('user_id')['basket_id'].nunique().reset_index().rename(columns={'basket_id': 'baskets_by_user'})
sales_value_by_user = data_train_lvl_2.groupby('user_id')['sales_value'].sum().reset_index().rename(columns={'sales_value': 'sales_value_by_user'})

targets_lvl_2 = targets_lvl_2.merge(baskets_by_user, on='user_id').merge(sales_value_by_user, on='user_id')
targets_lvl_2['user_mean_check'] = targets_lvl_2['sales_value_by_user'] / targets_lvl_2['baskets_by_user'] 

train_with_features = data_train_lvl_2.merge(item_features, on='item_id')

quantity_by_user_commodity_desc = train_with_features.groupby(['user_id', 'commodity_desc'])['quantity'].sum().reset_index().rename(columns={'quantity': 'quantity_by_user_commodity_desc'})
targets_lvl_2 = targets_lvl_2.merge(total_quantity_by_user_commodity_desc, on=['user_id', 'commodity_desc']).fillna({'quantity_by_user_commodity_desc':0})

weeks_by_user = data_train_lvl_2.groupby('user_id')['week_no'].nunique().reset_index().rename(columns={'week_no': 'weeks_by_user'})
targets_lvl_2 = targets_lvl_2.merge(weeks_by_user, on='user_id')

targets_lvl_2['baskets_per_week_by_user'] = targets_lvl_2['baskets_by_user'] / targets_lvl_2['weeks_by_user'] 

weeks_by_item = data_train_lvl_2.groupby('item_id')['week_no'].nunique().reset_index().rename(columns={'week_no': 'weeks_by_item'})
quanity_by_item = data_train_lvl_2.groupby('item_id')['quantity'].sum().reset_index().rename(columns={'quantity': 'quantity_by_item'})
targets_lvl_2 = targets_lvl_2.merge(weeks_by_item, on='item_id').merge(quanity_by_item, on='item_id')

targets_lvl_2['quanity_per_week_by_item'] = targets_lvl_2['quantity_by_item'] / targets_lvl_2['weeks_by_item'] 

targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,kid_category_desc,baskets_by_user,sales_value_by_user,user_mean_check,total_quantity_by_user_commodity_desc,weeks_by_user,baskets_per_week_by_user,weeks_by_item,quantity_by_item,quanity_per_week_by_item
0,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,None/Unknown,43,617.29,14.355581,1,6,7.166667,5,11,2.200000
1,1314,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,,...,,26,906.32,34.858462,2,6,4.333333,5,11,2.200000
2,40,928263,1.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,None/Unknown,30,1113.34,37.111333,13,6,5.000000,5,11,2.200000
3,40,928263,1.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,None/Unknown,30,1113.34,37.111333,13,6,5.000000,5,11,2.200000
4,40,928263,1.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,None/Unknown,30,1113.34,37.111333,13,6,5.000000,5,11,2.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45097,693,984957,0.0,867,DELI,National,SALADS/DIPS,SAL:OLIVES/PICKLS-PRPCK,32 OZ,45-54,...,None/Unknown,2,116.23,58.115000,1,1,2.000000,5,7,1.400000
45098,2458,1134210,0.0,69,GROCERY,Private,SPICES & EXTRACTS,PURE EXTRACTS,1 OZ,,...,,3,123.78,41.260000,1,2,1.500000,6,19,3.166667
45099,1315,1077373,0.0,16,GROCERY,Private,PNT BTR/JELLY/JAMS,HONEY,N 12 OZ,,...,,1,119.21,119.210000,2,1,1.000000,6,39,6.500000
45100,1999,1008547,0.0,673,PRODUCE,National,CITRUS,ORANGES NAVELS ALL,4 LB,,...,,1,19.10,19.100000,1,1,1.000000,6,63,10.500000


In [216]:
cat_features = [f for f, t in zip(targets_lvl_2.dtypes.index, targets_lvl_2.dtypes) if t == 'object']
cat_features

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [234]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
for c in cat_features:
    X_train[c] = X_train[c].astype('category')

In [235]:
lgb = LGBMClassifier(objective='binary', max_depth=7, num_leaves=256)
lgb.fit(X_train, y_train, categorical_feature=cat_features)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=7,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=256, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [236]:
train_preds = lgb.predict(X_train)
print(pd.Series(train_preds).value_counts())
print(y_train.value_counts())

0.0    37438
1.0     7664
dtype: int64
0.0    33458
1.0    11644
Name: target, dtype: int64
