# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
recommender.get_als_recommendations(2375, N=5)

[871756, 899624, 925862, 1044078, 1106523]

In [6]:
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [7]:
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1042907, 842125, 15778319]

In [8]:
recommender.get_similar_users_recommendation(2375, N=5)

[1012801, 1026945, 820612, 974265, 1102003]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [9]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index().copy()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [10]:
train_users = data_train_lvl_1['user_id'].unique()
result_lvl_1 = result_lvl_1[result_lvl_1['user_id'].isin(train_users)]
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_als_recommendations(u, 500))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1037332, 1082212, 1055425, 885290, 912704, 85..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[5569230, 934399, 1029743, 882308, 1127831, 99..."


In [11]:
result_lvl_1['own'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_own_recommendations(u, 500))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als,own
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1037332, 1082212, 1055425, 885290, 912704, 85...","[856942, 9297615, 5577022, 877391, 9655212, 88..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[5569230, 934399, 1029743, 882308, 1127831, 99...","[911974, 1076580, 1103898, 5567582, 1056620, 9..."


In [12]:
result_lvl_1['similar_items'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_similar_items_recommendation(u, 500))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als,own,similar_items
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1037332, 1082212, 1055425, 885290, 912704, 85...","[856942, 9297615, 5577022, 877391, 9655212, 88...","[842762, 1007512, 990656, 5577022, 9803545, 98..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[5569230, 934399, 1029743, 882308, 1127831, 99...","[911974, 1076580, 1103898, 5567582, 1056620, 9...","[1137346, 5569845, 1044078, 985999, 880888, 81..."


In [13]:
#result_lvl_1['similar_users'] = result_lvl_1['user_id'].apply(lambda u: recommender.get_similar_users_recommendation(u, 5))
#result_lvl_1.head(2)
# не работает

In [14]:
for model_name in ['als', 'own', 'similar_items']:
    for k in [20, 50, 100, 200, 500]:
        mean_recall_at_k = result_lvl_1.apply(lambda row: recall_at_k(row[model_name], row['actual'], k), axis=1).mean()
        print(f'Model {model_name}, k = {k}:   Mean recall@k = {mean_recall_at_k}')


Model als, k = 20:   Mean recall@k = 0.029936029167757377
Model als, k = 50:   Mean recall@k = 0.049233143762609094
Model als, k = 100:   Mean recall@k = 0.06950404919177722
Model als, k = 200:   Mean recall@k = 0.09827608717654128
Model als, k = 500:   Mean recall@k = 0.14748560935103644
Model own, k = 20:   Mean recall@k = 0.03928427679372909
Model own, k = 50:   Mean recall@k = 0.06525657038145175
Model own, k = 100:   Mean recall@k = 0.09604492955885034
Model own, k = 200:   Mean recall@k = 0.13537278412833242
Model own, k = 500:   Mean recall@k = 0.18205324555508678
Model similar_items, k = 20:   Mean recall@k = 0.018189114141727265
Model similar_items, k = 50:   Mean recall@k = 0.034190612171024135
Model similar_items, k = 100:   Mean recall@k = 0.05445933414783967
Model similar_items, k = 200:   Mean recall@k = 0.08649117439314544
Model similar_items, k = 500:   Mean recall@k = 0.137187102283578


### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [15]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique(), columns = ['user_id']) 
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2.head(2)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350


In [16]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
display(targets_lvl_2)
targets_lvl_2['target'].mean()

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0
2,2070,879194,0.0
3,2070,948640,0.0
4,2070,928263,0.0
...,...,...,...
111308,1745,903454,0.0
111309,1745,9419888,0.0
111310,1745,1076769,0.0
111311,1745,1092588,0.0


0.10460593102333061

In [17]:
def add_features(ds, data_ds=None):
    if data_ds is None:
        data_ds = ds
        
    ds = ds.merge(item_features, on='item_id', how='left')
    ds = ds.merge(user_features, on='user_id', how='left')

    baskets_by_user = data_ds.groupby('user_id')['basket_id'].nunique().reset_index().rename(columns={'basket_id': 'baskets_by_user'})
    sales_value_by_user = data_ds.groupby('user_id')['sales_value'].sum().reset_index().rename(columns={'sales_value': 'sales_value_by_user'})

    ds = ds.merge(baskets_by_user, on='user_id', how='left').merge(sales_value_by_user, on='user_id', how='left')
    ds['user_mean_check'] = ds['sales_value_by_user'] / ds['baskets_by_user'] 
    # print(ds.shape)

    train_with_features = data_ds.merge(item_features, on='item_id', how='left')

    quantity_by_user_commodity_desc = train_with_features.groupby(['user_id', 'commodity_desc'])['quantity'].sum().reset_index().rename(columns={'quantity': 'quantity_by_user_commodity_desc'})
    ds = ds.merge(quantity_by_user_commodity_desc, on=['user_id', 'commodity_desc'], how='left').fillna({'quantity_by_user_commodity_desc':0})
    # print(ds.shape)

    weeks_by_user = data_ds.groupby('user_id')['week_no'].nunique().reset_index().rename(columns={'week_no': 'weeks_by_user'})
    ds = ds.merge(weeks_by_user, on='user_id', how='left')

    ds['baskets_per_week_by_user'] = ds['baskets_by_user'] / ds['weeks_by_user'] 
    # print(ds.shape)

    weeks_by_item = data_ds.groupby('item_id')['week_no'].nunique().reset_index().rename(columns={'week_no': 'weeks_by_item'})
    quanity_by_item = data_ds.groupby('item_id')['quantity'].sum().reset_index().rename(columns={'quantity': 'quantity_by_item'})
    ds = ds.merge(weeks_by_item, on='item_id', how='left').merge(quanity_by_item, on='item_id', how='left')

    ds['quanity_per_week_by_item'] = ds['quantity_by_item'] / ds['weeks_by_item'] 
    #print(ds.shape)

    return ds

targets_lvl_2 = add_features(targets_lvl_2, data_train_lvl_2)

targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,kid_category_desc,baskets_by_user,sales_value_by_user,user_mean_check,quantity_by_user_commodity_desc,weeks_by_user,baskets_per_week_by_user,weeks_by_item,quantity_by_item,quanity_per_week_by_item
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,None/Unknown,43,617.29,14.355581,0.0,6,7.166667,3.0,5.0,1.666667
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,None/Unknown,43,617.29,14.355581,0.0,6,7.166667,1.0,1.0,1.000000
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,None/Unknown,43,617.29,14.355581,1.0,6,7.166667,,,
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,None/Unknown,43,617.29,14.355581,0.0,6,7.166667,,,
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,None/Unknown,43,617.29,14.355581,1.0,6,7.166667,5.0,11.0,2.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111308,1745,903454,0.0,1216,MEAT-PCKGD,National,FROZEN MEAT,OTHER - FULLY COOKED,32 OZ,45-54,...,None/Unknown,1,13.97,13.970000,0.0,1,1.000000,,,
111309,1745,9419888,0.0,759,GROCERY,National,YOGURT,YOGURT MULTI-PACKS,48 OZ,45-54,...,None/Unknown,1,13.97,13.970000,0.0,1,1.000000,5.0,9.0,1.800000
111310,1745,1076769,0.0,3859,DELI,National,DELI MEATS,MEAT: LUNCHMEAT BULK,,45-54,...,None/Unknown,1,13.97,13.970000,0.0,1,1.000000,5.0,13.0,2.600000
111311,1745,1092588,0.0,709,GROCERY,National,FLUID MILK PRODUCTS,MISCELLANEOUS MILK,32 OZ,45-54,...,None/Unknown,1,13.97,13.970000,0.0,1,1.000000,6.0,27.0,4.500000


In [18]:
cat_features = [f for f, t in zip(targets_lvl_2.dtypes.index, targets_lvl_2.dtypes) if t == 'object']
cat_features

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [19]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
for c in cat_features:
    X_train[c] = X_train[c].astype('category')

In [20]:
lgb = LGBMClassifier(objective='binary', max_depth=7, num_leaves=256)
lgb.fit(X_train, y_train, categorical_feature=cat_features)



LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=7,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=256, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [21]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index().copy()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.shape

(2042, 2)

In [26]:
# предполагаем, что информация для выполнения рекомендации ограничена X_train

pred_ds = X_train[['user_id', 'item_id']].copy()
pred_ds['proba'] = lgb.predict_proba(X_train).T[0].T
pred_ds = pred_ds.groupby(['user_id', 'item_id'])['proba'].mean().reset_index() # пары user_id-item_id не уникальны в X_train
pred_s = pred_ds.groupby('user_id').apply(lambda x: x.sort_values('proba', ascending=False)['item_id'].tolist())#.reset_index().rename(columns={0:'lvl_2'})

top = pred_ds.groupby('item_id')['proba'].mean().reset_index().sort_values('proba', ascending=False)['item_id'].tolist()

def get_lvl_2_recommendations(user_id, N=5):
    r = pred_s[user_id]
    return r[:N]

def get_recommendations(fn, user_id, N=5):
    r = []
    try:
        r = fn(user_id, N)
    except:
        print(f'Нет рекомендации для user_id={user_id} из {fn}, возвращаем {N} из наилучших для lvl_2')
    if len(r) < N:
        print(f'Недостаточно рекомендации для user_id={user_id} из {fn}, добавляем {N - len(r)} из наилучших для lvl_2')
        r += top[:N - len(r)] 
    return r[:N]

In [27]:
get_lvl_2_recommendations(1)

[7152319, 825123, 9296837, 979674, 936685]

In [28]:
result_lvl_2['lvl_2'] = result_lvl_2['user_id'].apply(lambda u: get_recommendations(get_lvl_2_recommendations, u, 5))
result_lvl_2['als'] = result_lvl_2['user_id'].apply(lambda u: get_recommendations(recommender.get_als_recommendations, u, 5))

result_lvl_2.head(2)

Нет рекомендации для user_id=3 из <function get_lvl_2_recommendations at 0x0000027907C67828>, возвращаем 5 из наилучших для lvl_2
Недостаточно рекомендации для user_id=3 из <function get_lvl_2_recommendations at 0x0000027907C67828>, добавляем 5 из наилучших для lvl_2
Нет рекомендации для user_id=48 из <function get_lvl_2_recommendations at 0x0000027907C67828>, возвращаем 5 из наилучших для lvl_2
Недостаточно рекомендации для user_id=48 из <function get_lvl_2_recommendations at 0x0000027907C67828>, добавляем 5 из наилучших для lvl_2
Нет рекомендации для user_id=60 из <function get_lvl_2_recommendations at 0x0000027907C67828>, возвращаем 5 из наилучших для lvl_2
Недостаточно рекомендации для user_id=60 из <function get_lvl_2_recommendations at 0x0000027907C67828>, добавляем 5 из наилучших для lvl_2
Нет рекомендации для user_id=73 из <function get_lvl_2_recommendations at 0x0000027907C67828>, возвращаем 5 из наилучших для lvl_2
Недостаточно рекомендации для user_id=73 из <function get_lvl

Нет рекомендации для user_id=1984 из <bound method MainRecommender.get_als_recommendations of <src.recommenders.MainRecommender object at 0x000002790CD2BC08>>, возвращаем 5 из наилучших для lvl_2
Недостаточно рекомендации для user_id=1984 из <bound method MainRecommender.get_als_recommendations of <src.recommenders.MainRecommender object at 0x000002790CD2BC08>>, добавляем 5 из наилучших для lvl_2
Нет рекомендации для user_id=2259 из <bound method MainRecommender.get_als_recommendations of <src.recommenders.MainRecommender object at 0x000002790CD2BC08>>, возвращаем 5 из наилучших для lvl_2
Недостаточно рекомендации для user_id=2259 из <bound method MainRecommender.get_als_recommendations of <src.recommenders.MainRecommender object at 0x000002790CD2BC08>>, добавляем 5 из наилучших для lvl_2


Unnamed: 0,user_id,actual,lvl_2,als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[7152319, 825123, 9296837, 979674, 936685]","[1037332, 1082212, 1055425, 885290, 912704]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[819594, 1066869, 1080354, 13040302, 1043956]","[1044078, 1106523, 844179, 12301100, 1029743]"


In [29]:
for model_name in ['lvl_2', 'als']:
    mean_precision_at_k = result_lvl_2.apply(lambda row: precision_at_k(row[model_name], row['actual']), axis=1).mean()
    print(f'Model: {model_name}, Mean precision@k = {mean_precision_at_k}')

Model: lvl_2, Mean precision@k = 0.014397649363369212
Model: als, Mean precision@k = 0.09755142017629724


Результат lvl_2 пока плохой