# Course project


## **Основное**
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5.

## Результат курсовой работы:
precision@5 = 0.2407

# Import libs

In [5]:
!pip install implicit
!pip install catboost

from IPython.display import clear_output

clear_output()

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
# module_path = os.path.abspath(os.path.join(os.pardir))
# if module_path not in sys.path:
#     sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [7]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [200]:
!nvidia-smi

Wed Dec  1 18:30:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P0    59W / 149W |    601MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Read data

In [201]:
# PATH_DATA = "../data"
PATH_DATA = "/content/gdrive/MyDrive/GeekBrains/recommendations/data"

In [202]:
data = pd.read_csv(os.path.join(PATH_DATA,'retail_train.csv'))
item_features = pd.read_csv(os.path.join(PATH_DATA,'product.csv'))
user_features = pd.read_csv(os.path.join(PATH_DATA,'hh_demographic.csv'))

# Set global const

In [203]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 200

# Process features dataset

In [204]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [205]:
# Разделим датасет только на 2 части. Вместо data_val_ranker будем сразу оценивать качество на "закрытой" части датасета
# -- давние покупки -- | -- 6 недель -- | -- retail_test1.csv -- 

VAL_MATCHER_WEEKS = 6

In [206]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - VAL_MATCHER_WEEKS]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - VAL_MATCHER_WEEKS)]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

In [207]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [208]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')

train_matcher
Shape: (2193515, 12) Users: 2499 Items: 85334
val_matcher
Shape: (203289, 12) Users: 2197 Items: 30040
train_ranker
Shape: (203289, 12) Users: 2197 Items: 30040


# Prefilter items

In [209]:
# оставил в фильтрации только топ пользователей.
# качество в разы повышается
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 85334 to 5001


# Make cold-start to warm-start

In [210]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')

train_matcher
Shape: (2193515, 12) Users: 2499 Items: 5001
val_matcher
Shape: (203289, 12) Users: 2197 Items: 30040
train_ranker
Shape: (203289, 12) Users: 2197 Items: 30040


# Init/train recommender

In [211]:
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

# Eval recall of matching

В 6 домашнем задании уже было произведено сравнение различных рекомендаций.
Поэтому будем давать рекомендации на основании собственных покупок,
то есть recommender.get_own_recommendations

In [212]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[829323, 835108, 836423, 851515, 875240, 87737..."
1,2,"[895388, 8357614, 12301772, 821083, 828106, 83..."


In [213]:
def calc_recall(df_data, col_name, top_k, print_res=False):
    recall = df_data.apply(lambda row: recall_at_k(row[col_name],
                                                 row[ACTUAL_COL], k=top_k), axis=1).mean()
    if print_res:
        print(f"Recall@{top_k}: {recall}")
        
    return recall
        
def calc_precision(df_data, col_name, top_k, print_res=False):
    precision = df_data.apply(lambda row: precision_at_k(row[col_name],
                                                    row[ACTUAL_COL], k=top_k), axis=1).mean()
    
    if print_res:
        print(f"Precision@{top_k}: {precision}")
        
    return precision

def base_predict(df, target_col_name, result_col_name, recommend_model, TOPK_RECALL, TOPK_PRECISION):
    df[result_col_name] = df[target_col_name]\
                                    .apply(lambda x: recommend_model(x, N=TOPK_RECALL))
    
    recall = calc_recall(df, result_col_name, TOPK_RECALL)
    precision = calc_recall(df, result_col_name, TOPK_PRECISION)  
    
    return df, recall, precision

In [214]:
%%time

TOPK_RECALL = N_PREDICT
TOPK_PRECISION = 5
recommend_model = recommender.get_own_recommendations
result_col_name = "own_rec"

result_eval_matcher, recall, precision = base_predict(result_eval_matcher,
                                                      USER_COL,
                                                      result_col_name,
                                                      recommend_model,
                                                      TOPK_RECALL,
                                                      TOPK_PRECISION)

CPU times: user 29.3 s, sys: 161 ms, total: 29.5 s
Wall time: 29.2 s


In [215]:
print(f"Recall@50 of matching: {recall}")
print(f"Precision@5 of matching: {precision}")

Recall@50 of matching: 0.22539991640788526
Precision@5 of matching: 0.025087974200688264


# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Генерируем топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

## Подготовка данных для трейна

In [216]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL]\
                    .apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,84,"[1001646, 829722, 899580, 883916, 926903, 9065..."
1,1753,"[13842224, 1094371, 1089066, 862981, 901543, 1..."


In [217]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,84,1001646
0,84,829722
0,84,899580
0,84,883916


### Check warm start

In [218]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (439400, 2) Users: 2197 Items: 4904


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [219]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [220]:
df_ranker_train.target.value_counts()

0.0    395136
1.0     33530
Name: target, dtype: int64

In [221]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,84,1001646,0.0
1,84,829722,0.0


## Подготавливаем фичи для обучения модели

### Описательные фичи

In [222]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [223]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [224]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,84,1001646,0.0,1225,GROCERY,National,CAT FOOD,CANNED CAT FOOD (9 LIVES/FRISK,5.5 OZ,,,,,,,
1,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,,,,,,,


### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [225]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [226]:
df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/data.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/data.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/data.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/data.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/data.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(data.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/data.basket_id.nunique(), how='left',on=USER_COL)

### Добавим фичи, написанные в домашнем задании к 6 уроку

In [227]:
big_data = data.copy()

big_data = big_data.merge(item_features[[ITEM_COL,'department']], on='item_id', how='left')

big_data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,PRODUCE
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,PRODUCE


#### Генерация фичей для user_id

In [228]:
# Средний чек
add_user_features = pd.DataFrame(big_data.groupby('user_id')['sales_value'].median()).reset_index()
add_user_features.rename(columns={'sales_value': 'avg_sales_value'}, inplace=True)

In [229]:
# Средняя сумма покупки 1 товара в каждой категории
departments = big_data['department'].unique().tolist()
departments.remove(' ')
for department in departments:
    add_user_features[f'avg_buy_{department}'] = 0

user_department_price = pd.DataFrame(big_data.groupby(['user_id', 'department'])['sales_value'].median()).reset_index()
user_department_price = user_department_price[user_department_price['department'] != ' ']

for user_id, department, avg_buy in user_department_price.values:
    add_user_features.loc[add_user_features['user_id'] == user_id, 
                         f"avg_buy_{department}"] = avg_buy

add_user_features.head()

Unnamed: 0,user_id,avg_sales_value,avg_buy_PRODUCE,avg_buy_GROCERY,avg_buy_DRUG GM,avg_buy_MEAT,avg_buy_MEAT-PCKGD,avg_buy_DELI,avg_buy_SEAFOOD-PCKGD,avg_buy_PASTRY,avg_buy_NUTRITION,avg_buy_VIDEO RENTAL,avg_buy_MISC SALES TRAN,avg_buy_FLORAL,avg_buy_SEAFOOD,avg_buy_SALAD BAR,avg_buy_AUTOMOTIVE,avg_buy_SPIRITS,avg_buy_COSMETICS,avg_buy_MISC. TRANS.,avg_buy_GARDEN CENTER,avg_buy_CHEF SHOPPE,avg_buy_TRAVEL & LEISUR,avg_buy_COUP/STR & MFG,avg_buy_KIOSK-GAS,avg_buy_FROZEN GROCERY,avg_buy_RESTAURANT,avg_buy_HOUSEWARES,avg_buy_PORK,avg_buy_POSTAL CENTER,avg_buy_GM MERCH EXP,avg_buy_CNTRL/STORE SUP,avg_buy_PROD-WHS SALES,avg_buy_DAIRY DELI,avg_buy_HBC,avg_buy_CHARITABLE CONT,avg_buy_RX,avg_buy_TOYS,avg_buy_PHOTO,avg_buy_DELI/SNACK BAR,avg_buy_GRO BAKERY,avg_buy_PHARMACY SUPPLY,avg_buy_ELECT &PLUMBING,avg_buy_MEAT-WHSE,avg_buy_VIDEO
0,1,2.29,1.535,2.18,2.29,4.11,2.99,3.385,0.0,2.99,2.69,0.0,10.0,7.99,0.0,4.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
1,2,2.39,1.76,2.0,2.99,5.31,2.99,3.35,8.99,2.245,2.49,0.0,0.0,21.99,0.0,0.0,0.0,0.0,4.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2,3,2.0,1.29,2.0,1.49,6.93,3.29,6.17,4.99,3.29,0.0,0.0,0.0,0.0,0.0,1.36,0.0,0.0,1.25,0.0,0.0,0.0,0.0,0.0,20.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
3,4,2.59,1.69,2.5,3.995,4.16,3.995,7.175,4.69,0.0,3.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,5,2.58,1.095,2.49,2.99,7.12,4.495,3.5,0.0,2.38,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [230]:
df_ranker_train = df_ranker_train.merge(add_user_features, on=USER_COL, how='left')

#### Генерация фичей для item_id

In [231]:
# Среднее кол-во покупок в неделю
# (отличается от того среднего, который был в основной группе доп фичей)
item_week_quantity = pd.DataFrame(big_data.groupby(['item_id', 'week_no'])['quantity'].sum()).reset_index()
add_item_features = pd.DataFrame(item_week_quantity.groupby('item_id')['quantity'].mean()).reset_index()
add_item_features.rename(columns={'quantity': 'my_quantity_per_week'}, inplace=True)


# Цена товара
big_data['price'] = big_data['sales_value'] / big_data['quantity']
# У товара может меняться цена во времени,
# поэтому усредним полученные значения цены по каждому товару
item_prices = pd.DataFrame(big_data.groupby('item_id')['price'].mean()).reset_index()
add_item_features = add_item_features.merge(item_prices, on='item_id', how='left')

add_item_features.head()

Unnamed: 0,item_id,my_quantity_per_week,price
0,25671,2.0,3.49
1,26081,1.0,0.99
2,26093,1.0,1.59
3,26190,1.0,1.54
4,26355,2.0,0.99


In [232]:
df_ranker_train = df_ranker_train.merge(add_item_features, on=ITEM_COL, how='left')

#### Генерация фичей user_id - item_id

In [233]:
# (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
avg_price_in_department = pd.DataFrame(big_data.groupby('department')['sales_value'].mean()).reset_index()

avg_price_in_department = dict(zip(avg_price_in_department.department.tolist(),
                                   avg_price_in_department.sales_value.tolist()))

df_ranker_train['hard_feature_1'] = df_ranker_train[['department', 'price']]\
        .apply(lambda row: avg_price_in_department[row[0]] - row[1], axis=1)

In [234]:
# (Кол-во покупок юзером конкретной категории в неделю) -
# - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

# Сначала посчитаем сколько покупок сделал каждый юзер в каждой категории
user_dep_num_purchases = pd.DataFrame(df_ranker_train.groupby(['user_id', 'department'])['item_id'].count()).reset_index()
user_dep_num_purchases.rename(columns={'item_id': 'number_purchases'}, inplace=True)

# Теперь усредним количество покупок по каждой категории
avg_purchases_per_dep = dict(user_dep_num_purchases.groupby('department')['number_purchases'].mean())

user_dep_num_purchases['hard_feature_2'] = user_dep_num_purchases.apply(lambda row: row[2] - avg_purchases_per_dep[row[1]],
                                                                                            axis=1)

# Удалим колонку перед мержем
user_dep_num_purchases.drop('number_purchases', axis=1, inplace=True)

user_dep_num_purchases.head(2)

Unnamed: 0,user_id,department,hard_feature_2
0,1,DELI,4.294236
1,1,DRUG GM,-5.558891


In [235]:
df_ranker_train = df_ranker_train.merge(user_dep_num_purchases,
                                       on=['user_id', 'department'], how='left')

In [236]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,avg_sales_value,avg_buy_PRODUCE,avg_buy_GROCERY,avg_buy_DRUG GM,avg_buy_MEAT,avg_buy_MEAT-PCKGD,avg_buy_DELI,avg_buy_SEAFOOD-PCKGD,avg_buy_PASTRY,avg_buy_NUTRITION,avg_buy_VIDEO RENTAL,avg_buy_MISC SALES TRAN,avg_buy_FLORAL,avg_buy_SEAFOOD,avg_buy_SALAD BAR,avg_buy_AUTOMOTIVE,avg_buy_SPIRITS,avg_buy_COSMETICS,avg_buy_MISC. TRANS.,avg_buy_GARDEN CENTER,avg_buy_CHEF SHOPPE,avg_buy_TRAVEL & LEISUR,avg_buy_COUP/STR & MFG,avg_buy_KIOSK-GAS,avg_buy_FROZEN GROCERY,avg_buy_RESTAURANT,avg_buy_HOUSEWARES,avg_buy_PORK,avg_buy_POSTAL CENTER,avg_buy_GM MERCH EXP,avg_buy_CNTRL/STORE SUP,avg_buy_PROD-WHS SALES,avg_buy_DAIRY DELI,avg_buy_HBC,avg_buy_CHARITABLE CONT,avg_buy_RX,avg_buy_TOYS,avg_buy_PHOTO,avg_buy_DELI/SNACK BAR,avg_buy_GRO BAKERY,avg_buy_PHARMACY SUPPLY,avg_buy_ELECT &PLUMBING,avg_buy_MEAT-WHSE,avg_buy_VIDEO,my_quantity_per_week,price,hard_feature_1,hard_feature_2
0,84,1001646,0.0,1225,GROCERY,National,CAT FOOD,CANNED CAT FOOD (9 LIVES/FRISK,5.5 OZ,,,,,,,,84.59,257,131,439,1153.65,2.705263,91.294737,0.001,0.033747,0.00051,0.001708,1.99,1.16,1.99,2.19,6.46,3.29,0.0,0.0,5.745,0.0,0.0,0.0,2.99,0.0,0.0,0.0,0.0,5.49,0.99,0.0,0.0,0.0,0.0,20.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,3.893939,0.332521,2.14878,0.212107
1,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,,,,,,,,656.76,165,147,439,1153.65,1.736842,91.294737,0.000642,0.033747,0.000572,0.001708,1.99,1.16,1.99,2.19,6.46,3.29,0.0,0.0,5.745,0.0,0.0,0.0,2.99,0.0,0.0,0.0,0.0,5.49,0.99,0.0,0.0,0.0,0.0,20.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,2.291667,3.994762,-1.513461,0.212107
2,84,899580,0.0,1225,GROCERY,National,CAT FOOD,CANNED CAT FOOD (9 LIVES/FRISK,5.5 OZ,,,,,,,,137.79,417,206,439,1153.65,4.389474,91.294737,0.001623,0.033747,0.000802,0.001708,1.99,1.16,1.99,2.19,6.46,3.29,0.0,0.0,5.745,0.0,0.0,0.0,2.99,0.0,0.0,0.0,0.0,5.49,0.99,0.0,0.0,0.0,0.0,20.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,5.085366,0.331026,2.150275,0.212107
3,84,883916,0.0,1225,GROCERY,National,CAT FOOD,CANNED CAT FOOD (9 LIVES/FRISK,5.5 OZ,,,,,,,,83.62,249,119,439,1153.65,2.621053,91.294737,0.000969,0.033747,0.000463,0.001708,1.99,1.16,1.99,2.19,6.46,3.29,0.0,0.0,5.745,0.0,0.0,0.0,2.99,0.0,0.0,0.0,0.0,5.49,0.99,0.0,0.0,0.0,0.0,20.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,3.772727,0.335933,2.145368,0.212107
4,84,926903,0.0,418,GROCERY,National,BAKED SWEET GOODS,SNACK CAKE - MULTI PACK,11 OZ,,,,,,,,161.52,128,104,439,1153.65,1.347368,91.294737,0.000498,0.033747,0.000405,0.001708,1.99,1.16,1.99,2.19,6.46,3.29,0.0,0.0,5.745,0.0,0.0,0.0,2.99,0.0,0.0,0.0,0.0,5.49,0.99,0.0,0.0,0.0,0.0,20.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,2.37037,1.274314,1.206988,0.212107


In [237]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [238]:
cat_feats = [column for column in X_train.columns
                    if X_train[column].dtypes == 'object']
# X_train[cat_feats] = X_train[cat_feats].astype('category')
X_train[cat_feats] = X_train[cat_feats].astype('str')

cat_feats

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Обучение модели ранжирования

In [96]:
import catboost as catb

In [251]:
%%time

params = {
    'cat_features': cat_feats,
    'silent': True,
    'random_state': 15,
    'iterations': 1500,
    'max_depth': 10,
    'l2_leaf_reg': 2,
    "task_type": "GPU"
}


model = catb.CatBoostClassifier(**params)

# Обучение на train_test для получения метрик
model.fit(X_train, y_train) 

CPU times: user 7min 46s, sys: 28.4 s, total: 8min 14s
Wall time: 7min 18s


In [252]:
train_preds = model.predict_proba(X_train)
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

# Оценка на тесте для выполнения курсового проекта

In [253]:
df_test = pd.read_csv(os.path.join(PATH_DATA,'retail_test1.csv'))

result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


В тестовом датасете есть уникальный пользователь, который не был в данных для этого.
Добавил в recommenders._get_recommendations try except,
поэтому для такого пользователя предсказание не делается,
а просто предлагаются самые популярные товары

In [254]:
%%time
result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

calc_precision(result_test, 'own_rec', TOPK_PRECISION, print_res=True)

Precision@5: 0.18885941644562135
CPU times: user 29 s, sys: 178 ms, total: 29.2 s
Wall time: 29 s


In [255]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id]\
                .sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [256]:
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

calc_precision(result_test, 'reranked_own_rec', TOPK_PRECISION, print_res=True)

Precision@5: 0.24077348066298113


  return flags.sum() / len(recommended_list)


0.24077348066298113