In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

# Make the parent of the current working directory importable so that the
# project-local `src` package below can be resolved from inside the notebook.
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Helper functions written for this project
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
# Purchase log plus the item and household reference tables.
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# Column processing: lower-case every header, then align the key columns with `data`.
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})


# The train / validation scheme matters!
# -- old purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the 2nd dataset (6 weeks) should be tuned via a learning curve
# (recall@k as a function of the dataset size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries are derived once from the last observed week.
week_no = data['week_no']
last_week = week_no.max()
lvl_2_start_week = last_week - val_lvl_2_size_weeks
lvl_1_start_week = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)

data_train_lvl_1 = data[week_no < lvl_1_start_week]
data_val_lvl_1 = data[(week_no >= lvl_1_start_week) & (week_no < lvl_2_start_week)]

# Copied for clarity: later changes will make the two frames diverge.
data_train_lvl_2 = data_val_lvl_1.copy()
# NOTE: the bound is inclusive, so this fold spans weeks last_week - 3 .. last_week.
data_val_lvl_2 = data[week_no >= lvl_2_start_week]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Report how many distinct items survive the pre-filter (top-5000 popular requested).
n_items_before = data_train_lvl_1.item_id.nunique()

data_train_lvl_1 = prefilter_items(
    data_train_lvl_1,
    item_features=item_features,
    take_n_popular=5000,
)

n_items_after = data_train_lvl_1.item_id.nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
# Build the first-level recommender from the pre-filtered level-1 training data.
# The progress bars in the recorded output suggest the constructor fits its
# models here -- confirm in src.recommenders.
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
# Smoke test: ask the ALS-based getter for N=5 items for user 2375.
recommender.get_als_recommendations(2375, N=5)

[899624, 1044078, 844179, 871756, 1106523]

In [6]:
# Smoke test: ask the "own recommendations" getter for N=5 items for user 2375.
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [7]:
# Smoke test: N=5 items from the similar-items getter for user 2375.
# NOTE(review): the recorded output is a numpy array, whereas the other getters
# return plain lists -- worth unifying in src.recommenders.
recommender.get_similar_items_recommendation(2375, N=5)

array([ 918846,  983584,  819978, 8020001, 6463877], dtype=int64)

In [8]:
# Smoke test: N=5 items from the similar-users getter for user 2375.
recommender.get_similar_users_recommendation(2375, N=5)

[879373, 974265, 5574377, 894360, 873715]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [9]:
# Ground truth for level 1: the unique items each user bought in the
# level-1 validation window, one row per user.
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [None]:
# For now we try selecting 50 candidates (k=50).
# One result column per candidate generator, filled in this order.
candidate_getters = {
    'own': recommender.get_own_recommendations,
    'als': recommender.get_als_recommendations,
    'similar_items': recommender.get_similar_items_recommendation,
    'similar_users': recommender.get_similar_users_recommendation,
}

for column_name, get_candidates in candidate_getters.items():
    # Bind the getter as a default argument so each lambda keeps its own function.
    result_lvl_1[column_name] = result_lvl_1['user_id'].apply(
        lambda x, getter=get_candidates: getter(x, N=50)
    )

In [None]:
# Peek at the candidate columns that were just added next to `actual`.
result_lvl_1.head(2)

In [None]:
# Mean recall@50 for every candidate column (everything after `user_id` and `actual`).
recall_arr = []
for col in result_lvl_1.columns[2:]:
    mean_recall = result_lvl_1.apply(
        lambda row: recall_at_k(row[col], row['actual'], k=50),
        axis=1,
    ).mean()
    recall_arr.append(mean_recall)
    print(col, mean_recall, sep=': ')

In [None]:
# Recall@k of the "own recommendations" generator for a grid of k values.
# Each k also leaves an `own_<k>` column with the candidates in result_lvl_1.
k_grid = [50, 100, 200, 250, 300, 500]

recall_arr = []
for k in k_grid:
    own_col = f'own_{k}'
    result_lvl_1[own_col] = result_lvl_1['user_id'].apply(
        lambda x: recommender.get_own_recommendations(x, N=k)
    )

    mean_recall = result_lvl_1.apply(
        lambda row: recall_at_k(row[own_col], row['actual'], k=k),
        axis=1,
    ).mean()
    recall_arr.append(mean_recall)

In [None]:
import matplotlib.pyplot as plt

# Recall@k curve for the "own recommendations" candidates.
# The line carries a label so that plt.legend() has an artist to show
# (an unlabelled plot yields an empty legend and a warning), and the
# placeholder title is replaced with a descriptive one.
k_values = [50, 100, 200, 250, 300, 500]  # must match the grid used to fill recall_arr
plt.plot(k_values, recall_arr, marker='o', label='own recommendations')

plt.xlabel('K')
plt.ylabel('recall')
plt.title('recall@k vs k (own recommendations)')
plt.legend(loc="lower right", prop={'size': 15})
plt.show()

Вывод: recall_at_k увеличивается с увеличением k.
Оптимальный k зависит от того, кому мы рекомендуем - оптовым или розничным покупателям, какие категории товаров и т.п.

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [None]:
# Peek at the level-2 training transactions (a copy of the level-1 validation window).
data_train_lvl_2.head(2)

In [None]:
# Peek at the item reference table (headers lower-cased, key renamed to item_id).
item_features.head(2)

In [None]:
# Peek at the household reference table (headers lower-cased, key renamed to user_id).
user_features.head(2)

In [None]:
# Attach item and user attributes to every level-2 training transaction.
# Left joins keep transactions whose item or user has no reference row.
X_train = (
    data_train_lvl_2
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

In [None]:
# Peek at the transactions enriched with item and user attributes.
X_train.head(2)

In [None]:
# Users that appear in the level-2 training window.
users_lvl_2 = pd.DataFrame({'user_id': data_train_lvl_2['user_id'].unique()})

# Warm start only for now: keep users the level-1 model has seen.
# .copy() so that the column assignment below does not write into a slice.
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

# Flatten to one (user_id, item_id) row per candidate.
# Each user is repeated by the length of their own candidate list, rather than
# by the length of the list stored under index label 0: that label may have
# been filtered out above, and the lists need not all have the same length.
candidates_per_user = users_lvl_2['candidates'].apply(len).values
df = pd.DataFrame({'user_id': np.repeat(users_lvl_2['user_id'].values, candidates_per_user),
                   'item_id': np.concatenate(users_lvl_2['candidates'].values)})

# Positive pairs: the user bought the item in the level-2 training window.
# Repeat purchases are collapsed first so the left merge below cannot
# multiply candidate rows.
purchased_pairs = data_train_lvl_2[['user_id', 'item_id']].drop_duplicates().assign(target=1)

targets_lvl_2 = df.merge(purchased_pairs, on=['user_id', 'item_id'], how='left')

# Candidates that were not bought get target 0. Assign the result back instead
# of calling fillna(inplace=True) on a column selection.
targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)

targets_lvl_2

In [None]:
# Enrich every (user, candidate item) pair with item and user attributes.
targets_lvl_2 = (
    targets_lvl_2
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

In [None]:
# Peek at the candidate pairs after the item and user attributes were joined.
targets_lvl_2.head()

In [None]:
# User feature: mean basket total over the level-2 training window.
# Step 1: total spend per (user, basket).
basket_totals = data_train_lvl_2.groupby(['user_id', 'basket_id'], as_index=False)['sales_value'].sum()

# Step 2: average those totals per user.
average_check = (
    basket_totals
    .groupby('user_id', as_index=False)['sales_value']
    .mean()
    .rename(columns={'sales_value': 'average_check'})
)

targets_lvl_2 = targets_lvl_2.merge(average_check, on='user_id', how='left')

In [None]:
# User-item feature: total quantity the user bought in the candidate item's
# department during the level-2 training window.
num_pur = (
    data_train_lvl_2
    .merge(item_features[['item_id', 'department']], on='item_id', how='left')
    .groupby(['user_id', 'department'], as_index=False)['quantity']
    .sum()
    .rename(columns={'quantity': 'num_pur'})
)

targets_lvl_2 = targets_lvl_2.merge(num_pur, on=['user_id', 'department'], how='left')

# Missing values only appear after the left merge (user never bought from that
# department), so the zero-fill has to happen here; filling the aggregated
# table before the merge had no effect.
targets_lvl_2['num_pur'] = targets_lvl_2['num_pur'].fillna(0)

In [None]:
# Item feature: average unit price = total sales value / total quantity.
# Only weeks before the level-2 validation window are used, so the feature
# does not leak information from the period the model is evaluated on.
price_history = data.loc[
    data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks,
    ['item_id', 'quantity', 'sales_value'],
]
price = price_history.groupby('item_id', as_index=False)[['quantity', 'sales_value']].sum()

# A zero total quantity would give inf (or 0/0); treat the price as unknown instead.
price['price'] = price['sales_value'] / price['quantity'].replace(0, np.nan)
price = price[['item_id', 'price']]

targets_lvl_2 = targets_lvl_2.merge(price, on='item_id', how='left')

In [None]:
# Item feature: mean weekly quantity sold in the level-2 training window
# (averaged over the weeks in which the item was sold at all).
weekly_quantity = data_train_lvl_2.groupby(['item_id', 'week_no'], as_index=False)['quantity'].sum()
goods_per_week = (
    weekly_quantity
    .groupby('item_id', as_index=False)['quantity']
    .mean()
    .rename(columns={'quantity': 'goods_per_week'})
)

targets_lvl_2 = targets_lvl_2.merge(goods_per_week, on='item_id', how='left')

# Candidates with no sales in the window get NaN from the left merge; fill
# them with 0 here. Filling the aggregated table before the merge had no effect.
targets_lvl_2['goods_per_week'] = targets_lvl_2['goods_per_week'].fillna(0)

In [None]:
# Split the candidate table into the feature matrix and the target.
# y_train stays a one-column DataFrame.
X_train = targets_lvl_2.drop(columns='target')
y_train = targets_lvl_2.loc[:, ['target']]

In [None]:
# Engineered continuous features must stay numeric. Casting them to `category`
# would turn every distinct value into its own level and discard their ordering.
numeric_feats = ['average_check', 'num_pur', 'price', 'goods_per_week']

# Everything after the (user_id, item_id) key columns, except the numeric
# features, is treated as categorical.
cat_feats = [col for col in X_train.columns[2:] if col not in numeric_feats]
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

In [None]:
# Second-level model: binary classifier over (user, candidate item) rows.
lgb_params = dict(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb = LGBMClassifier(**lgb_params)
lgb.fit(X_train, y_train)

# Class probabilities for the same rows the model was fitted on.
train_preds = lgb.predict_proba(X_train)

In [None]:
# Inspect the raw predict_proba output (one row per candidate pair).
train_preds

In [None]:
# Peek at the level-2 validation transactions (the last weeks of the log).
data_val_lvl_2.head(2)

In [None]:
# Keep the second probability column; for this binary target that is
# presumably the positive class (target == 1) -- check lgb.classes_ to confirm.
# These are scores for the training rows.
preds = train_preds[:,1]

In [None]:
# Peek at the candidate table before the predictions are attached.
targets_lvl_2.head()

In [None]:
# Ground truth for level 2: unique items each user bought in the level-2
# validation window.
result = (
    data_val_lvl_2
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result.head(2)

In [None]:
# Attach the model score to every candidate row, then order rows by user and,
# within a user, from the highest score to the lowest.
targets_lvl_2 = (
    targets_lvl_2
    .assign(pred=preds)
    .sort_values(by=['user_id', 'pred'], ascending=False)
)
targets_lvl_2

In [None]:
# Collapse the score-ordered candidate rows into one list per user and keep
# the first five items of each list (the rows were sorted by score beforehand).
ranked_pairs = targets_lvl_2[['user_id', 'item_id']].drop_duplicates()
preds_lgbm = ranked_pairs.groupby('user_id')['item_id'].apply(list).reset_index()
preds_lgbm['item_id'] = preds_lgbm['item_id'].map(lambda items: items[:5])
preds_lgbm

In [None]:
# Rebuild the level-2 ground truth and keep only users known to the level-1 model.
result = (
    data_val_lvl_2
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
is_warm_user = result['user_id'].isin(train_users)
result = result[is_warm_user]

result.head(2)

In [None]:
# Put each user's top-5 re-ranked items next to their actual purchases, in
# column `lgb`. Users without predictions get NaN from the left join.
lgb_top_items = preds_lgbm.rename(columns={'item_id': 'lgb'})
result = result.merge(lgb_top_items, on='user_id', how='left')

result.head(2)

In [None]:
# Drop users for whom the left merge found no re-ranked predictions.
result = result[result['lgb'].notna()]

In [None]:
# Mean precision@5 of the LightGBM-ranked lists against level-2 validation purchases.
# NOTE(review): the task also asks for precision@5 of the level-1 model alone on
# data_val_lvl_2; that baseline is not computed anywhere in this section.
result.apply(lambda row: precision_at_k(row['lgb'], row['actual'], k=5), axis=1).mean()