# Course project


# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender
import itertools

## Read data

In [2]:
data = pd.read_csv('transaction_data.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# Set global const

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

# Process features dataset

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [5]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)


VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [6]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_for_featurization = pd.concat([data_train_matcher, data_val_matcher])

In [8]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [9]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [10]:
# выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [11]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


# Prefilter items

In [12]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


# Make cold-start to warm-start

In [13]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (784420, 13) Users: 1915 Items: 4999
val_matcher
Shape: (163261, 12) Users: 1915 Items: 27118
train_ranker
Shape: (163261, 12) Users: 1915 Items: 27118
val_ranker
Shape: (115989, 12) Users: 1915 Items: 24042


# Init/train recommender

In [14]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4999 [00:00<?, ?it/s]

# Eval recall of matching

In [15]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [16]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [17]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [18]:
# # простая функция добавления кандидатов в результирующую таблицу + расчет точности
def evalRecall(df_result, target_col_name, result_col_name, recommend_model, N_predictions):
#    result_col_name = 'result'
    df_result[result_col_name] = df_result[target_col_name].apply(lambda x: recommend_model(x, N=N_predictions))
    return df_result.apply(lambda row: recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N_predictions), axis=1).mean()

In [19]:
dict_models = {'own_rec': recommender.get_own_recommendations,
              'sim_item_rec': recommender.get_similar_items_recommendation,
              'als_rec': recommender.get_als_recommendations,
              'sim_user_rec': recommender.get_similar_users_recommendation}

In [20]:
# N = Neighbors
N_PREDICT = 50

In [21]:
# for model in dict_models:
#     print(f'{model} : {evalRecall(result_eval_matcher, USER_COL, model, dict_models[model], N_PREDICT)}')

In [22]:
# N_PREDICTIONS = [20, 50, 100, 200, 500]

# for k in N_PREDICTIONS:
#     print(f"own_rec_{k}: {evalRecall(result_eval_matcher, USER_COL, f'own_rec_{k}', recommender.get_own_recommendations, k)}")

In [23]:
# sorted(calc_recall(result_eval_matcher, 500), key=lambda x: x[1],reverse=True)

In [24]:
# sorted(calc_precision(result_eval_matcher, 500), key=lambda x: x[1],reverse=True)

Думаю, что наиболее разумным - найти не более 200 кандидатов. Да, у 500 была самая высокая точность, но это слишком усложняет модель в дальнейшем и не факт, что это поднимет финальные метрики после 2-ого этапа.

По итогу 200 кандидатов даже уменьшили метрики, оставила 50

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [25]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

## Подготовка данных для трейна

In [26]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [27]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [28]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1097398, 101..."


In [29]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [30]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [31]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640


### Check warm start

In [32]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (95750, 2) Users: 1915 Items: 4437


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [33]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)
df_ranker_train['target'].mean()

0.11119830179378062

In [34]:
df_ranker_train.target.value_counts()

0.0    88346
1.0    11053
Name: target, dtype: int64

In [35]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0


(!) На каждого юзера 50 item_id-кандидатов

In [44]:
df_ranker_train['target'].mean()

0.11119830179378062

## Подготавливаем фичи для обучения модели

### Описательные фичи

### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

## !!! Пока выполните нотбук без этих строк, потом вернитесь и запустите их, обучите ранкер и посмотрите на метрики с ранжированием

In [37]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [39]:
# Всего продано по выручке и товару
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
# Всего продано по количеству и товару
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
# Частота покупок товара пользователем
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
# Частота покупок пользователем
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
# сколько всего потратил пользователь
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
# количество покупок товара в неделю
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_for_featurization.week_no.nunique(), how='left',on=ITEM_COL)
# количество покупок юзером в неделю
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_for_featurization.week_no.nunique(), how='left',on=USER_COL)
# среднее количество товара в корзине
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_for_featurization.basket_id.nunique(), how='left',on=ITEM_COL)
# вопрос. Группировка по юзеру, считаем количество товаров и делим на ВСЕ корзины, без группировки по юзеру?
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_for_featurization.basket_id.nunique(), how='left',on=USER_COL)
# частота товара в корзине (как часто он появляется в корзине)
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_for_featurization.basket_id.nunique(), how='left',on=ITEM_COL)
# частота корзин у пользователя
df_ranker_train = df_ranker_train.merge(df_for_featurization.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_for_featurization.basket_id.nunique(), how='left',on=USER_COL)


Создаем датафрейм для фичеризации

In [41]:
week_num = df_for_featurization['week_no'].nunique()

In [42]:
df_for_featurization = df_for_featurization.merge(item_features, on='item_id', how='left')
df_for_featurization = df_for_featurization.merge(user_features, on='user_id', how='left')
df_for_featurization.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB,,,,,,,
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,ONIONS,ONIONS SWEET (BULK&BAG),40 LB,,,,,,,


In [43]:
item_features = item_features.iloc[0:0]
user_features = user_features.iloc[0:0]

# item_features

In [47]:
# Фичи item_id: Цена / Средняя цена товара в категории
# узнаем среднюю цену товара (вдруг были скидки)
df_total_price_item = df_for_featurization.groupby(['item_id', 'commodity_desc'])[['sales_value', 'quantity']].sum().reset_index()
df_total_price_item['aver_price_item'] = df_total_price_item['sales_value']/df_total_price_item['quantity']
df_total_price_item = df_total_price_item.fillna(0)
df_total_price_item.head(2)

Unnamed: 0,item_id,commodity_desc,sales_value,quantity,aver_price_item
0,25671,FRZN ICE,20.94,6,3.49
1,26081,NO COMMODITY DESCRIPTION,0.99,1,0.99


In [48]:
# узнаем среднюю цену товара в категории
df_total_price_category = df_for_featurization.groupby('commodity_desc')[['sales_value', 'quantity']].sum().reset_index()
df_total_price_category['aver_price_category'] = df_total_price_category['sales_value']/df_total_price_category['quantity']
df_total_price_category = df_total_price_category.fillna(0)
df_total_price_category.head(2)

Unnamed: 0,commodity_desc,sales_value,quantity,aver_price_category
0,,0.0,0,0.0
1,(CORP USE ONLY),253.73,70,3.624714


In [49]:
# соединим 2 датасета
df_total_price_item = df_total_price_item.merge(df_total_price_category[['commodity_desc', 'aver_price_category']], on='commodity_desc', how='left')
df_total_price_item.head(2)

Unnamed: 0,item_id,commodity_desc,sales_value,quantity,aver_price_item,aver_price_category
0,25671,FRZN ICE,20.94,6,3.49,2.281239
1,26081,NO COMMODITY DESCRIPTION,0.99,1,0.99,3.04497


In [50]:
# считаем новую фичу и добавляем её в финальный датасет для добавления
df_total_price_item['price_per_category'] = df_total_price_item['aver_price_item']/df_total_price_item['aver_price_category']
df_total_price_item = df_total_price_item[['item_id', 'aver_price_item', 'aver_price_category', 'price_per_category']]
df_total_price_item.head(2)

Unnamed: 0,item_id,aver_price_item,aver_price_category,price_per_category
0,25671,3.49,2.281239,1.52987
1,26081,0.99,3.04497,0.325126


In [51]:
df_for_item_features = df_for_featurization.groupby(['item_id', 'commodity_desc'])[['sales_value', 'quantity']].sum().reset_index().merge(df_for_featurization.groupby('commodity_desc')[['sales_value', 'quantity']].sum().reset_index(), on='commodity_desc', how='left')
df_for_item_features['item_sales_value_per_category'] = df_for_item_features['sales_value_x']/df_for_item_features['sales_value_y']
df_for_item_features['item_quantity_per_category'] = df_for_item_features['quantity_x']/df_for_item_features['quantity_y']
df_for_item_features = df_for_item_features[['item_id', 'item_sales_value_per_category', 'item_quantity_per_category']]
df_for_item_features.head(2)

Unnamed: 0,item_id,item_sales_value_per_category,item_quantity_per_category
0,25671,0.003345,0.002187
1,26081,0.00011,0.000338


In [52]:
# df_for_item_features по item_id \\сколько по выручке было продано товара относительно категории, сколько по количеству было продано товара относительно категории
df_ranker_train = df_ranker_train.merge(df_for_item_features, on='item_id', how='left')
df_for_item_features = df_for_item_features.iloc[0:0]
df_total_price_item = df_total_price_item.iloc[0:0]
df_total_price_category = df_total_price_category.iloc[0:0]

In [54]:
# Фичи user_id: - Средний чек 
add_features_user = df_for_featurization.groupby('user_id')['sales_value'].sum()/df_for_featurization.groupby('user_id')['basket_id'].nunique()
add_features_user = add_features_user.reset_index()
add_features_user.rename(columns={0: 'aver_invoice'}, inplace=True)
add_features_user.head(2)

Unnamed: 0,user_id,aver_invoice
0,1,50.130533
1,2,41.442045


In [56]:
# add_features_user по user_id \\средний чек пользователя
df_ranker_train = df_ranker_train.merge(add_features_user, on='user_id', how='left')
add_features_user = add_features_user.iloc[0:0]

In [56]:
# Фичи пары user_id - item_id : Средняя сумма покупки 1 товара покупателя в каждой категории (по item_id)

In [58]:
# Фичи пары user_id - item_id : Средняя сумма покупки 1 товара покупателя в каждой категории (по item_id)
# находим общие траты пользователя в категории за весь период/# находим количество товаров, купленных пользователем, в категории за весь период
df_user_aver_per_category = df_for_featurization.groupby(['user_id', 'commodity_desc'])['sales_value'].sum().reset_index().merge(df_for_featurization.groupby(['user_id', 'commodity_desc'])['quantity'].sum().reset_index(), on=['user_id', 'commodity_desc'], how='left')
df_user_aver_per_category['user_aver_sum_per_category'] = df_user_aver_per_category['sales_value']/df_user_aver_per_category['quantity']
df_user_aver_per_category = df_user_aver_per_category.fillna(0)
df_user_aver_per_category.rename(columns = {'sales_value':'user_sales_value_per_category', 'quantity':'user_quantity_per_category'}, inplace = True)
df_user_aver_per_category.head(2)

Unnamed: 0,user_id,commodity_desc,user_sales_value_per_category,user_quantity_per_category,user_aver_sum_per_category
0,1,,0.0,0,0.0
1,1,AIR CARE,57.66,22,2.620909


In [59]:
# df_user_aver_per_category по user_id \\ сумма покупки пользователя в каждой категории, количество в каждой категории, среднее (сумма\количество)
df_ranker_train = df_ranker_train.merge(df_user_aver_per_category, on=['user_id', 'commodity_desc'], how='left')
df_user_aver_per_category = df_user_aver_per_category.iloc[0:0]
df_ranker_train.shape

(99399, 33)

In [60]:
#Фичи пары user_id - item_id: Кол-во покупок юзером конкретной категории в неделю

df_user_count_per_category = df_for_featurization.groupby(['user_id', 'commodity_desc', 'week_no'])['quantity'].sum().reset_index()
df_user_count_per_category = df_user_count_per_category.groupby(['user_id', 'commodity_desc'])['quantity'].apply(list).reset_index()
df_user_count_per_category['user_count_per_category'] = df_user_count_per_category['quantity'].apply(lambda x: sum(x)/week_num)
df_user_count_per_category = df_user_count_per_category[['user_id', 'commodity_desc', 'user_count_per_category']]
df_user_count_per_category.head(2)

Unnamed: 0,user_id,commodity_desc,user_count_per_category
0,1,,0.0
1,1,AIR CARE,0.241758


In [61]:
# df_user_count_per_category по user_id \\ количество покупок пользователем конкретной категории в неделю
df_ranker_train = df_ranker_train.merge(df_user_count_per_category, on=['user_id', 'commodity_desc'], how='left')
df_user_count_per_category = df_user_count_per_category.iloc[0:0]
df_ranker_train.shape

(99399, 34)

# age

In [62]:
# пользователь - товар (сколько людей такого же возраста купили данный товар)
df_item_per_age = df_for_featurization.groupby(['item_id', 'age_desc'])['quantity'].sum().reset_index()
df_item_per_age.rename(columns = {'quantity':'quantity_item_per_age'}, inplace = True)
df_item_per_age.head(2) # по item и age_desc - добавлено

Unnamed: 0,item_id,age_desc,quantity_item_per_age
0,25671,35-44,5
1,25671,45-54,1


In [63]:
# df_item_per_age по item_id, age_desc \\ сколько людей такого же возраста купили данный товар
df_ranker_train = df_ranker_train.merge(df_item_per_age, on=['item_id', 'age_desc'], how='left')
df_item_per_age = df_item_per_age.iloc[0:0]
df_ranker_train.shape

(99399, 35)

In [64]:
# пользователь - товар (сколько товаров и сколько потратили пользователи одной возрастной группы относительно всего объема)
df_per_age = df_for_featurization.groupby(['age_desc'])[['quantity', 'sales_value']].sum().reset_index()
df_per_age['quantity_age_per_total'] = df_per_age['quantity']/df_for_featurization['quantity'].sum()
df_per_age['sales_value_age_desc_per_total'] = df_per_age['sales_value']/df_for_featurization['sales_value'].sum()
df_per_age = df_per_age.drop(['quantity', 'sales_value'], axis=1)
df_per_age.head(2) # по age_desc - добавлено

Unnamed: 0,age_desc,quantity_age_per_total,sales_value_age_desc_per_total
0,19-24,0.019962,0.027124
1,25-34,0.119066,0.096003


In [65]:
# df_per_age по age_desc \\ количество товаров купленных возрастной группой относительно всего объема, также по выручке
df_ranker_train = df_ranker_train.merge(df_per_age, on='age_desc', how='left')
df_per_age = df_per_age.iloc[0:0]
df_ranker_train.shape

(99399, 37)

# marital status

In [66]:
# пользователь - товар (сколько товаров и сколько потратили пользователи marital_status_code относительно всего объема)
df_per_marital_status_code = df_for_featurization.groupby(['marital_status_code'])[['quantity', 'sales_value']].sum().reset_index()
df_per_marital_status_code['quantity_marital_status_code_total'] = df_per_marital_status_code['quantity']/df_for_featurization['quantity'].sum()
df_per_marital_status_code['sales_value_marital_status_code_total'] = df_per_marital_status_code['sales_value']/df_for_featurization['sales_value'].sum()
df_per_marital_status_code = df_per_marital_status_code.drop(['quantity', 'sales_value'], axis=1)
df_per_marital_status_code.head(2) #

Unnamed: 0,marital_status_code,quantity_marital_status_code_total,sales_value_marital_status_code_total
0,A,0.314709,0.259012
1,B,0.070961,0.074058


In [67]:
# df_per_marital_status_code по marital_status_code \\ количество товаров купленных marital_status_code относительно всего объема, также по выручке
df_ranker_train = df_ranker_train.merge(df_per_marital_status_code, on='marital_status_code', how='left')
df_per_marital_status_code = df_per_marital_status_code.iloc[0:0]
df_ranker_train.shape

(99399, 39)

In [68]:
# пользователи - товар (сколько людей такого же marital_status_code купили данный товар)
df_item_marital_status_code = df_for_featurization.groupby(['item_id', 'marital_status_code'])['quantity'].sum().reset_index()
df_item_marital_status_code.rename(columns = {'quantity':'quantity_item_per_marital_status_code'}, inplace = True)
df_item_marital_status_code.head(2) # по item и marital_status_code

Unnamed: 0,item_id,marital_status_code,quantity_item_per_marital_status_code
0,25671,A,4
1,25671,U,2


In [69]:
# df_item_marital_status_code по item_id marital_status_code\\ количество товаров купленных marital_status_code
df_ranker_train = df_ranker_train.merge(df_item_marital_status_code, on=['item_id', 'marital_status_code'], how='left')
df_item_marital_status_code = df_item_marital_status_code.iloc[0:0]
df_ranker_train.shape

(99399, 40)

# homeowner_desc

In [70]:
# пользователь - товар (сколько людей такого же homeowner_desc купили данный товар)
df_item_per_homeowner_desc = df_for_featurization.groupby(['item_id', 'homeowner_desc'])['quantity'].sum().reset_index()
df_item_per_homeowner_desc.rename(columns = {'quantity':'quantity_item_per_homeowner_desc'}, inplace = True)
df_item_per_homeowner_desc.head(2) # по item и homeowner_desc - добавлено

Unnamed: 0,item_id,homeowner_desc,quantity_item_per_homeowner_desc
0,25671,Homeowner,5
1,25671,Unknown,1


In [71]:
# df_item_per_homeowner_desc по item_id homeowner_desc\\ количество товаров купленных homeowner_desc 
df_ranker_train = df_ranker_train.merge(df_item_per_homeowner_desc, on=['item_id', 'homeowner_desc'], how='left')
df_item_per_homeowner_desc = df_item_per_homeowner_desc.iloc[0:0]
df_ranker_train.shape

(99399, 41)

In [72]:
# пользователь - товар (сколько товаров и сколько потратили пользователи homeowner_desc относительно всего объема)
df_homeowner_desc = df_for_featurization.groupby(['homeowner_desc'])[['quantity', 'sales_value']].sum().reset_index()
df_homeowner_desc['quantity_homeowner_desc_total'] = df_homeowner_desc['quantity']/df_for_featurization['quantity'].sum()
df_homeowner_desc['sales_value_homeowner_desc_total'] = df_homeowner_desc['sales_value']/df_for_featurization['sales_value'].sum()
df_homeowner_desc = df_homeowner_desc.drop(['quantity', 'sales_value'], axis=1)
df_homeowner_desc.head(2) #

Unnamed: 0,homeowner_desc,quantity_homeowner_desc_total,sales_value_homeowner_desc_total
0,Homeowner,0.465942,0.377072
1,Probable Owner,0.003876,0.006719


In [73]:
# df_homeowner_desc по homeowner_desc \\ количество товаров купленных homeowner_desc относительно всего объема, также по выручке
df_ranker_train = df_ranker_train.merge(df_homeowner_desc, on='homeowner_desc', how='left')
df_homeowner_desc = df_homeowner_desc.iloc[0:0]
df_ranker_train.shape

(99399, 43)

# household_size_desc

In [74]:
# пользователь - товар (сколько людей такого же household_size_desc купили данный товар)
df_item_per_household_size_desc = df_for_featurization.groupby(['item_id', 'household_size_desc'])['quantity'].sum().reset_index()
df_item_per_household_size_desc.rename(columns = {'quantity':'quantity_item_per_household_size_desc'}, inplace = True)
df_item_per_household_size_desc.head(5) # по item и household_size_desc - добавлено

Unnamed: 0,item_id,household_size_desc,quantity_item_per_household_size_desc
0,25671,1,1
1,25671,2,1
2,25671,4,4
3,26601,1,1
4,26636,1,1


In [75]:
# df_item_per_household_size_desc по item_id household_size_desc\\ количество товаров купленных household_size_desc 
df_ranker_train = df_ranker_train.merge(df_item_per_household_size_desc, on=['item_id', 'household_size_desc'], how='left')
df_item_per_household_size_desc = df_item_per_household_size_desc.iloc[0:0]

In [76]:
# пользователь - товар (сколько товаров и сколько потратили пользователи household_size_desc относительно всего объема)
df_household_size_desc = df_for_featurization.groupby('household_size_desc')[['quantity', 'sales_value']].sum().reset_index()
df_household_size_desc['quantity_household_size_desc_total'] = df_household_size_desc['quantity']/df_for_featurization['quantity'].sum()
df_household_size_desc['sales_value_household_size_desc_total'] = df_household_size_desc['sales_value']/df_for_featurization['sales_value'].sum()
df_household_size_desc = df_household_size_desc.drop(['quantity', 'sales_value'], axis=1)
df_household_size_desc.head(2) #

Unnamed: 0,household_size_desc,quantity_household_size_desc_total,sales_value_household_size_desc_total
0,1,0.174283,0.159194
1,2,0.244989,0.219726


In [77]:
# df_household_size_desc по household_size_desc \\ количество товаров купленных household_size_desc относительно всего объема, также по выручке
df_ranker_train = df_ranker_train.merge(df_household_size_desc, on='household_size_desc', how='left')
df_household_size_desc = df_household_size_desc.iloc[0:0]
df_ranker_train.shape

(99399, 46)

# kid_category_desc

In [78]:
# разберемся с kid_category_desc
# пользователь - товар (сколько людей такого же kid_category_desc купили данный товар)
df_item_per_kid_category_desc = df_for_featurization.groupby(['item_id', 'kid_category_desc'])['quantity'].sum().reset_index()
df_item_per_kid_category_desc.rename(columns = {'quantity':'quantity_item_per_kid_category_desc'}, inplace = True)
df_item_per_kid_category_desc.head(2) # по item и kid_category_desc - добавлено

Unnamed: 0,item_id,kid_category_desc,quantity_item_per_kid_category_desc
0,25671,2,4
1,25671,None/Unknown,2


In [79]:
# df_item_per_kid_category_desc по item_id kid_category_desc\\ количество товаров купленных kid_category_desc 
df_ranker_train = df_ranker_train.merge(df_item_per_kid_category_desc, on=['item_id', 'kid_category_desc'], how='left')
df_item_per_kid_category_desc = df_item_per_kid_category_desc.iloc[0:0]

In [80]:
# пользователь - товар (сколько товаров и сколько потратили пользователи kid_category_desc относительно всего объема)
df_kid_category_desc = df_for_featurization.groupby('kid_category_desc')[['quantity', 'sales_value']].sum().reset_index()
df_kid_category_desc['quantity_kid_category_desc_total'] = df_kid_category_desc['quantity']/df_for_featurization['quantity'].sum()
df_kid_category_desc['sales_kid_category_desc_desc_total'] = df_kid_category_desc['sales_value']/df_for_featurization['sales_value'].sum()
df_kid_category_desc = df_kid_category_desc.drop(['quantity', 'sales_value'], axis=1)
df_kid_category_desc.head(2) #

Unnamed: 0,kid_category_desc,quantity_kid_category_desc_total,sales_kid_category_desc_desc_total
0,1,0.079687,0.083135
1,2,0.06654,0.050062


In [81]:
# df_kid_category_desc по kid_category_desc \\ количество товаров купленных kid_category_desc относительно всего объема, также по выручке
df_ranker_train = df_ranker_train.merge(df_kid_category_desc, on='kid_category_desc', how='left')
df_kid_category_desc = df_kid_category_desc.iloc[0:0]
df_ranker_train.shape

(99399, 49)

# brand

In [82]:
df_quantity_per_brand = df_for_featurization.groupby(['user_id', 'brand'])['quantity'].sum().reset_index()
df_quantity_per_brand = df_quantity_per_brand.merge(df_quantity_per_brand.groupby('user_id')['quantity'].sum().reset_index(), on = 'user_id', how='left') 
df_quantity_per_brand['user_brand_perc'] = df_quantity_per_brand['quantity_x'] / df_quantity_per_brand['quantity_y']
df_quantity_per_brand = df_quantity_per_brand.drop(['quantity_x', 'quantity_y'], axis=1)
df_quantity_per_brand.head(2)

Unnamed: 0,user_id,brand,user_brand_perc
0,1,National,0.890919
1,1,Private,0.109081


In [83]:
# df_quantity_per_brand по user_id brand \\ какой бренд больше предпоситает конкретный пользователь
df_ranker_train = df_ranker_train.merge(df_quantity_per_brand, on=['user_id', 'brand'], how='left')
df_quantity_per_brand = df_quantity_per_brand.iloc[0:0]
df_ranker_train.shape

(99399, 50)

# Категория от возраста

In [84]:
# пройдемся по категориям
# в какой тратит больше всего денег в среднем на единицу товара
# какой возраст в какой категории чаще всего покупает
df_age_per_category = df_for_featurization.groupby(['age_desc', 'commodity_desc'])[['quantity', 'sales_value']].sum().reset_index()
df_age_per_category.rename(columns = {'quantity':'quantity_commodity_desc_per_age_desc', 'sales_value':'sales_value_commodity_desc_per_age_desc'}, inplace = True)
df_age_per_category['sales_value_commodity_desc_per_age_desc_aver'] = df_age_per_category['sales_value_commodity_desc_per_age_desc']/df_age_per_category['quantity_commodity_desc_per_age_desc']
df_age_per_category = df_age_per_category.fillna(0)
df_age_per_category.head(2) # добавить по age_desc и commodity_desc

Unnamed: 0,age_desc,commodity_desc,quantity_commodity_desc_per_age_desc,sales_value_commodity_desc_per_age_desc,sales_value_commodity_desc_per_age_desc_aver
0,19-24,,0,0.0,0.0
1,19-24,ADULT INCONTINENCE,32,157.48,4.92125


In [85]:
# df_age_per_category по age_desc и commodity_desc \\ какой возраст покупает в какой категории (выручка, колчество, среднее на единицу товара)
df_ranker_train = df_ranker_train.merge(df_age_per_category, on=['age_desc', 'commodity_desc'], how='left')
df_age_per_category = df_age_per_category.iloc[0:0]
df_ranker_train.shape

(99399, 53)

# Фичи пары user_id - item_id : 
Средняя сумма покупки 1 товара покупателя в каждой подкатегориии

In [86]:
# находим общие траты пользователя в категории за весь период
df_user_sum_per_sub_category = df_for_featurization.groupby(['user_id', 'sub_commodity_desc'])[['sales_value', 'quantity']].sum().reset_index()
df_user_sum_per_sub_category['user_sum_per_sub_category'] = df_user_sum_per_sub_category['sales_value']/df_user_sum_per_sub_category['quantity']
df_user_sum_per_sub_category = df_user_sum_per_sub_category.fillna(0)
df_user_sum_per_sub_category.rename(columns = {'sales_value':'user_sales_value_sub_commodity_desc', 'quantity':'user_quantity_sub_commodity_desc'}, inplace = True)
df_user_sum_per_sub_category.head(2) # добавить по user_id sub_commodity_desc добавлено

Unnamed: 0,user_id,sub_commodity_desc,user_sales_value_sub_commodity_desc,user_quantity_sub_commodity_desc,user_sum_per_sub_category
0,1,,0.0,0,0.0
1,1,ADULT ANALGESICS,9.98,2,4.99


In [87]:
# df_user_sum_per_sub_category по user_id и sub_commodity_desc \\ сколько пользователь покупает в подкатегории, сколько выручка, сколько среднее за товар
df_ranker_train = df_ranker_train.merge(df_user_sum_per_sub_category, on=['user_id', 'sub_commodity_desc'], how='left')
df_user_sum_per_sub_category = df_user_sum_per_sub_category.iloc[0:0]

средняя цена товара в категории, которую выбирает пользователь относительно уровня цены данного товара в категории в целом

In [89]:
# средняя цена товара в категории, которую выбирает пользователь относительно уровня цены данного товара в категории в целом
df_user_aver_price_per_category = df_for_featurization.groupby(['user_id', 'commodity_desc'])['sales_value'].mean()/df_for_featurization.groupby(['user_id', 'commodity_desc'])['quantity'].mean()
df_user_aver_price_per_category = df_user_aver_price_per_category.reset_index()
df_user_aver_price_per_category = df_user_aver_price_per_category.fillna(0)
df_user_aver_price_per_category.head(2) # сколько в среднем пользователь тратит в категории на 1 единицу

Unnamed: 0,user_id,commodity_desc,0
0,1,,0.0
1,1,AIR CARE,2.620909


In [90]:
# средняя цена товара в категории
df_item_aver_price_per_category = df_for_featurization.groupby(['item_id'])['sales_value'].mean()/df_for_featurization.groupby(['item_id'])['quantity'].mean()
df_item_aver_price_per_category = df_item_aver_price_per_category.reset_index().fillna(0)
df_item_aver_price_per_category.head(2) # средняя цена товара 

Unnamed: 0,item_id,0
0,25671,3.49
1,26081,0.99


In [91]:
df_merge_user_item_cat = df_for_featurization.groupby(['user_id', 'item_id','commodity_desc'])['sales_value'].sum().reset_index()
df_merge_user_item_cat = df_merge_user_item_cat.merge(df_user_aver_price_per_category, on=['user_id', 'commodity_desc'], how='left')
df_merge_user_item_cat = df_merge_user_item_cat.merge(df_item_aver_price_per_category, on='item_id', how='left')
df_merge_user_item_cat['user_choise_per_category'] = df_merge_user_item_cat['0_x']/df_merge_user_item_cat['0_y']
df_merge_user_item_cat = df_merge_user_item_cat[['user_id', 'item_id', 'user_choise_per_category']]
df_merge_user_item_cat.head(2) # по user_id и item_id

Unnamed: 0,user_id,item_id,user_choise_per_category
0,1,819312,1.108914
1,1,820165,1.739603


In [92]:
# df_merge_user_item_cat по user_id и item_id \\ цена товара относительно средней суммы, которую тратит пользователь в категории
df_ranker_train = df_ranker_train.merge(df_merge_user_item_cat, on=['user_id', 'item_id'], how='left')
df_merge_user_item_cat = df_merge_user_item_cat.iloc[0:0]
df_item_aver_price_per_category = df_item_aver_price_per_category.iloc[0:0]
df_user_aver_price_per_category = df_user_aver_price_per_category.iloc[0:0]
df_ranker_train.shape

(99399, 57)

# income_desc

In [93]:
#Добавим фичи с income_desc
# пользователь - товар (сколько людей такого же income_desc купили данный товар)
df_item_per_income_desc_desc = df_for_featurization.groupby(['item_id', 'income_desc'])['quantity'].sum().reset_index()
df_item_per_income_desc_desc.rename(columns = {'quantity':'quantity_item_per_income_desc_desc'}, inplace = True)
df_item_per_income_desc_desc.head(2) # по item и income_desc 

Unnamed: 0,item_id,income_desc,quantity_item_per_income_desc_desc
0,25671,100-124K,1
1,25671,50-74K,1


In [94]:
# df_item_per_income_desc_desc по item_id income_desc\\ количество товаров купленных income_desc 
df_ranker_train = df_ranker_train.merge(df_item_per_income_desc_desc, on=['item_id', 'income_desc'], how='left')
df_item_per_income_desc_desc = df_item_per_income_desc_desc.iloc[0:0]
df_ranker_train.shape

(99399, 58)

In [95]:
# пользователь - товар (сколько товаров и сколько потратили пользователи kid_category_desc относительно всего объема)
df_income_desc = df_for_featurization.groupby('income_desc')[['quantity', 'sales_value']].sum().reset_index()
df_income_desc['quantity_income_desc_total'] = df_income_desc['quantity']/df_for_featurization['quantity'].sum()
df_income_desc['sales_income_desc_total'] = df_income_desc['sales_value']/df_for_featurization['sales_value'].sum()
df_income_desc = df_income_desc.drop(['quantity', 'sales_value'], axis=1)
df_income_desc.head(2) #

Unnamed: 0,income_desc,quantity_income_desc_total,sales_income_desc_total
0,100-124K,0.041523,0.025087
1,125-149K,0.052149,0.037274


In [96]:
# df_income_desc по income_desc \\ количество товаров купленных income_desc относительно всего объема, также по выручке
df_ranker_train = df_ranker_train.merge(df_income_desc, on='income_desc', how='left')
df_income_desc = df_income_desc.iloc[0:0]
df_ranker_train.shape

(99399, 60)

# income_desc + commodity_desc

In [97]:
# сколько в целом люди с доходом тратят в разных категориях на единицу товара
df_sales_per_income_desc_per_category = df_for_featurization.groupby(['income_desc', 'commodity_desc'])[['sales_value', 'quantity']].sum().reset_index()
df_sales_per_income_desc_per_category['income_desc_per_category_aver_sum'] = df_sales_per_income_desc_per_category['sales_value']/df_sales_per_income_desc_per_category['quantity']
df_sales_per_income_desc_per_category.rename(columns={'sales_value': 'sales_value_income_desc_per_category', 'quantity': 'quantity_income_desc_per_category'}, inplace=True)
df_sales_per_income_desc_per_category.head(2) # добавить по income_desc и commodity_desc

Unnamed: 0,income_desc,commodity_desc,sales_value_income_desc_per_category,quantity_income_desc_per_category,income_desc_per_category_aver_sum
0,100-124K,,0.0,0,
1,100-124K,(CORP USE ONLY),24.14,6,4.023333


In [98]:
# df_sales_per_income_desc_per_category по income_desc и commodity_desc \\ количество товаров, выручка, среднее на единицу по категориям, котоыре покупают с income_desc
df_ranker_train = df_ranker_train.merge(df_sales_per_income_desc_per_category, on=['income_desc', 'commodity_desc'], how='left')
df_sales_per_income_desc_per_category = df_sales_per_income_desc_per_category.iloc[0:0]
df_ranker_train = df_ranker_train.fillna(0)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_sales_value_sub_commodity_desc,user_quantity_sub_commodity_desc,user_sum_per_sub_category,user_choise_per_category,quantity_item_per_income_desc_desc,quantity_income_desc_total,sales_income_desc_total,sales_value_income_desc_per_category,quantity_income_desc_per_category,income_desc_per_category_aver_sum
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,49.86,14.0,3.561429,0.908651,62.0,0.168539,0.135362,1175.56,302.0,3.892583
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,0.0,0.0,0.0,0.0,38.0,0.168539,0.135362,6254.6,696.0,8.986494


In [99]:
df_ranker_train.shape

(99399, 63)

In [100]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [101]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value',
 'total_quantity_value',
 'item_freq',
 'user_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'user_quantity_per_week',
 'item_quantity_per_basket',
 'user_quantity_per_baskter',
 'item_freq_per_basket',
 'user_freq_per_basket',
 'item_sales_value_per_category',
 'item_quantity_per_category',
 'aver_invoice',
 'user_sales_value_per_category',
 'user_quantity_per_category',
 'user_aver_sum_per_category',
 'user_count_per_category',
 'quantity_item_per_age',
 'quantity_age_per_total',
 'sales_value_age_desc_per_total',
 'quantity_marital_status_code_total',
 'sales_value_marital_status_code_total',
 'quantity_item_per_marital_status_code',
 'quantity_item_per_homeowner_desc',
 'quantity_homeowner_desc_total'

## Обучение модели ранжирования

In [117]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=15,
                     n_estimators=100,
                     learning_rate=0.5,
                     categorical_column=cat_feats,
                     random_state = 42)


lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(*args, **kwargs)


In [118]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

# Evaluation on test dataset

In [119]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


## Eval matching on test dataset

In [120]:
TOPK_PRECISION = 5

In [121]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [122]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.16428198433420363)
('own_rec', 0.1462140992167102)
Wall time: 24.1 s


# Оценка на тесте для выполнения курсового проекта

In [123]:
df_test = pd.read_csv('retail_test1.csv')
df_transactions = pd.read_csv('transaction_data.csv')

In [124]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [125]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [126]:
# оставляем юзеров, по которым есть история
result_test = result_test[result_test.user_id.isin(common_users)]

result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.16428198433420363)
('own_rec', 0.1462140992167102)
