# Двухуровневые модели рекомендаций

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]


data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

In [4]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


In [5]:
recommender = MainRecommender(data_train_lvl_1)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [6]:
recommender.get_als_recommendations(2375, N=200)

[899624,
 1106523,
 871756,
 1044078,
 5569230,
 883932,
 844179,
 1116376,
 925862,
 8090521,
 9835223,
 1046545,
 1025535,
 1092937,
 832678,
 854852,
 1003616,
 8090537,
 841220,
 999714,
 870547,
 1051323,
 999779,
 9836106,
 944534,
 1081177,
 832442,
 1026984,
 947858,
 865528,
 5569845,
 12731517,
 1022428,
 823990,
 850925,
 830304,
 1000753,
 12262978,
 1119454,
 863447,
 1134678,
 12301839,
 1035207,
 965267,
 850102,
 1133312,
 1038663,
 1059902,
 1004390,
 999270,
 865456,
 963727,
 1090931,
 1097458,
 6534480,
 869322,
 950935,
 937292,
 12731544,
 940090,
 1012587,
 910032,
 828106,
 12731432,
 998556,
 5585510,
 1029743,
 823704,
 973181,
 9832469,
 10198378,
 895166,
 861279,
 847066,
 1096573,
 965766,
 1079067,
 1131344,
 9835451,
 1107420,
 837304,
 945998,
 12731685,
 948650,
 896862,
 888543,
 1081262,
 895930,
 839818,
 1017369,
 835098,
 8065410,
 1004906,
 1080867,
 826666,
 1104349,
 1009770,
 12731543,
 933067,
 1021324,
 1102207,
 1138467,
 1068719,
 12302069

In [7]:
recommender.get_own_recommendations(2375, N=200)

[948640,
 918046,
 847962,
 907099,
 873980,
 884694,
 10285454,
 1107760,
 7169090,
 979674,
 10308345,
 1069531,
 974766,
 1015474,
 950935,
 847066,
 1102207,
 1020770,
 9521787,
 974265,
 940996,
 8019845,
 5567194,
 12811490,
 1003616,
 973181,
 890719,
 982955,
 9677152,
 998519,
 1072685,
 1131382,
 1021715,
 12263119,
 960791,
 7441873,
 986021,
 956666,
 1038692,
 9677748,
 9297223,
 927030,
 12757653,
 1046919,
 6391532,
 989069,
 1068451,
 951954,
 835300,
 937343,
 1047249,
 13876348,
 1061732,
 981601,
 1121028,
 1087547,
 828393,
 996269,
 951951,
 1036093,
 1023815,
 5570408,
 827667,
 1082454,
 1006878,
 5570048,
 841309,
 1078652,
 1115553,
 1056492,
 1138467,
 1004945,
 947858,
 1092885,
 1121694,
 938138,
 8019916,
 827919,
 984315,
 10341855,
 883932,
 8291322,
 1096794,
 1028938,
 1087618,
 8020166,
 1082185,
 866871,
 930666,
 825994,
 910151,
 823990,
 848029,
 896613,
 12301839,
 1117219,
 1135258,
 869868,
 1046545,
 899624,
 6442594,
 1137775,
 825343,
 104290

In [8]:
recommender.get_similar_items_recommendation(880150, N=200)

[1029743,
 1106523,
 5569230,
 916122,
 844179,
 1044078,
 1126899,
 1070820,
 1127831,
 866211,
 8090521,
 878996,
 8090537,
 5569471,
 1004906,
 854852,
 899624,
 986912,
 933835,
 1075368,
 1081177,
 6034857,
 5585510,
 965267,
 834117,
 940947,
 983584,
 12810393,
 913210,
 874972,
 5569845,
 5568378,
 999858,
 908318,
 985999,
 901062,
 1040807,
 1018740,
 951412,
 1101010,
 1105488,
 1037840,
 880150,
 1043751,
 857503,
 1122358,
 1132771,
 823704,
 854405,
 909714,
 993638,
 930118,
 839419,
 971922,
 910032,
 863447,
 835098,
 865456,
 976199,
 12301109,
 1070702,
 5569374,
 8090532,
 872137,
 883003,
 1023720,
 1012587,
 1000753,
 893018,
 1024306,
 897954,
 1042438,
 907631,
 5568729,
 1068719,
 1137775,
 1087102,
 944534,
 1020581,
 999270,
 832678,
 838186,
 1138443,
 999104,
 1112238,
 1050851,
 902172,
 965766,
 12301100,
 952163,
 957951,
 8090509,
 852856,
 885863,
 1056509,
 845307,
 1062002,
 944836,
 819978,
 957736,
 999779,
 1027168,
 1051323,
 8065410,
 1026118,
 

In [9]:
recommender.get_similar_users_recommendation(2375, N=200)

[950935,
 894360,
 1097398,
 929410,
 861494,
 884694,
 1081533,
 977559,
 835618,
 918046,
 1069256,
 1133654,
 935578,
 1116253,
 1065259,
 10198378,
 1008288,
 928749,
 1074040,
 1065259,
 1134296,
 967994,
 1040416,
 1088771,
 7146026,
 918638,
 1055403,
 1080867,
 1064002,
 10198378,
 1096573,
 1057168,
 1138596,
 869322,
 873324,
 861494,
 8090542,
 1056212,
 1088771,
 1138596,
 938165,
 917033,
 8019902,
 10204149,
 875392,
 1029549,
 935578,
 947412,
 13007846,
 891632,
 928749,
 1104349,
 10198378,
 882489,
 1057749,
 1096573,
 821741,
 1027216,
 953539,
 1036093,
 865233,
 879922,
 12262832,
 917406,
 1056212,
 839208,
 1075765,
 898121,
 847962,
 5574108,
 918638,
 911311,
 1086740,
 935460,
 841365,
 842423,
 820011,
 993838,
 875089,
 874563,
 894439,
 1052294,
 993838,
 1055403,
 820486,
 1094744,
 835988,
 5981267,
 1070129,
 992650,
 7082640,
 1069039,
 902377,
 1069256,
 1086740,
 1102207,
 1134296,
 835351,
 879922,
 1065017,
 1136341,
 985605,
 1083043,
 1105426,
 82

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 200 кандидатов (k=200)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [10]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.count()

user_id    2154
actual     2154
dtype: int64

In [11]:
recommend_list= []
mean_recall = []
for user, bought_list in zip(result_lvl_1['user_id'],result_lvl_1['actual']):
    recommend_list = recommender.get_similar_items_recommendation(user, N=200)
    # print(f'Recall {user} равен {recall_at_k(recommend_list, bought_list, k=200)}')
    mean_recall.append(recall_at_k(recommend_list, bought_list, k=200))

print(f'Средний Recall равен {np.array(mean_recall).mean() * 100}')

Средний Recall равен 8.565419315494673


In [12]:
recommend_list= []
mean_precision_at_k = []
for user, bought_list in zip(result_lvl_1['user_id'],result_lvl_1['actual']):
    recommend_list = recommender.get_similar_items_recommendation(user, N=200)
    mean_precision_at_k.append(precision_at_k(recommend_list, bought_list, k=200))
    
print(f'Средний precision_at_k равен {np.array(mean_precision_at_k).mean() * 100}')

Средний precision_at_k равен 2.451949860724234


### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [13]:
n_items_before = data_train_lvl_2['item_id'].nunique()

data_train_lvl_2 = prefilter_items(data_train_lvl_2, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_2['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 27649 to 5001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


In [14]:
data_train_lvl_2 = data_train_lvl_2.astype({'user_id': 'int32','basket_id': 'int64','day': 'int32','item_id': 'int32','quantity': 'int32','user_id': 'int32',})

In [15]:
# Создаем фичу число покупок юзером за месяц
def create_pur_per_month(data):
    values = {}
    df = data.drop_duplicates(['basket_id'])
    for user in data['user_id']:
        values[user] = df.loc[df['user_id'] == user]['basket_id'].count() / 4
        
    for user in values.keys():
        data.loc[data_train_lvl_2['user_id'] == user, 'pur_per_month'] = values[user]
    return data

data_train_lvl_2 = create_pur_per_month(data_train_lvl_2)

In [16]:
# Создаем фичу число покупок товара за месяц
def create_sel_per_month(data):
    values = {}
    df = data
    for item in data['item_id']:
        values[item] = df.loc[df['item_id'] == item]['basket_id'].count() * df.loc[df['item_id'] == item]['quantity'].sum() / 4
        
    for item in values.keys():
        data.loc[data_train_lvl_2['item_id'] == item, 'sell_per_month'] = values[item]
    return data

data_train_lvl_2 = create_sel_per_month(data_train_lvl_2)

In [17]:
data_train_lvl_2

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,pur_per_month,sell_per_month
2107471,2021,40618753059,594,896862,2,5.00,443,-2.98,101,86,0.0,0.0,2.50,1.00,600.00
2107473,2021,40618753059,594,1019142,2,5.00,443,-1.98,101,86,0.0,0.0,2.50,1.00,30.00
2107476,2021,40618753059,594,9835223,1,9.27,443,-3.63,101,86,0.0,0.0,9.27,1.00,304.00
2108010,1753,40618809138,594,999999,1,29.99,345,0.00,8,86,0.0,0.0,29.99,5.00,91870339.00
2108685,2120,40629515323,594,999999,1,2.09,136,0.00,830,86,0.0,0.0,2.09,2.50,91870339.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2282311,2088,41297771158,635,13877226,1,15.65,304,-5.25,1258,91,0.0,0.0,15.65,3.25,8.75
2282315,1541,41297771180,635,999999,1,5.19,304,0.00,1301,91,0.0,0.0,5.19,2.00,91870339.00
2282316,1168,41297772063,635,836262,1,12.40,304,0.00,1526,91,0.0,0.0,12.40,1.25,4.00
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0,3.00,1.50,49.00


Значения в 91870339 для не популярных товаров

In [18]:
def create_xsandys(data_train_lvl_2):
    users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
    users_lvl_2.columns = ['user_id']
    
    targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
    targets_lvl_2['target'] = 1

    # Пока только warm start
    train_users = data_train_lvl_1['user_id'].unique()
    users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

    users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))
    s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
    users_lvl_2['drop'] = 1  # фиктивная пересенная
    
    targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
    targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
    
    X_train = targets_lvl_2.drop('target', axis=1)
    y_train = targets_lvl_2[['target']]
    
    cat_feats = X_train.columns[2:].tolist()
    X_train[cat_feats] = X_train[cat_feats].astype('category')
    
    return X_train, y_train, cat_feats
    

In [19]:
X_train, y_train, cat_feats = create_xsandys(data_train_lvl_2)

In [20]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

  return f(**kwargs)


Wall time: 454 ms


In [21]:
X_val, y_val, cat_feats = create_xsandys(data_val_lvl_2)

In [22]:
print(f'Средний precision_at_k равен {precision_at_k(train_preds,y_val, k=200) / 100}')

Средний precision_at_k равен 5.9157


При использовании 2ух-уровневой модели precision_at_k вырос в 2 раза