### структура модели

Отбираем  *кандидатов* с помощью простой модели (als) --> переранжируем их сложной моделью (lightgbm)
и выберем top-k (5).


In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit import als

# Матричная факторизация
#from implicit.als import AlternatingLeastSquares
#from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src_right.utils import prefilter_items
from src_right.metrics import recall_at_k, precision_at_k
from src_right.recommenders import MainRecommender

In [123]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# Предобработка данных

1) Обработаем классовые фичи
    - Для фичь где классы линейно связанны, переведем их в int.
    - Для остальных переведем в dummy переменные
    
Для того чтоб модель могла использовать максимально количество инфы о зависимостях.

In [124]:
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


Соберем фичи для get_dummies

In [125]:
dummies_features = ['marital_status_code', 'homeowner_desc', 'hh_comp_desc']

In [126]:
user_features = pd.get_dummies(user_features, columns=dummies_features)

In [127]:
user_features.head()

Unnamed: 0,age_desc,income_desc,household_size_desc,kid_category_desc,user_id,marital_status_code_A,marital_status_code_B,marital_status_code_U,homeowner_desc_Homeowner,homeowner_desc_Probable Owner,homeowner_desc_Probable Renter,homeowner_desc_Renter,homeowner_desc_Unknown,hh_comp_desc_1 Adult Kids,hh_comp_desc_2 Adults Kids,hh_comp_desc_2 Adults No Kids,hh_comp_desc_Single Female,hh_comp_desc_Single Male,hh_comp_desc_Unknown
0,65+,35-49K,2,None/Unknown,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0
1,45-54,50-74K,2,None/Unknown,7,1,0,0,1,0,0,0,0,0,0,1,0,0,0
2,25-34,25-34K,3,1,8,0,0,1,0,0,0,0,1,0,1,0,0,0,0
3,25-34,75-99K,4,2,13,0,0,1,1,0,0,0,0,0,1,0,0,0,0
4,45-54,50-74K,1,None/Unknown,16,0,1,0,1,0,0,0,0,0,0,0,1,0,0


Переведем классовые фичи в float для тех, чьи классы линейно связанны.

In [128]:
user_features[['age_desc']].value_counts()

age_desc
45-54       288
35-44       194
25-34       142
65+          72
55-64        59
19-24        46
dtype: int64

In [129]:
user_features.loc[user_features['age_desc'] == '65+', 'age_desc'] = 5
user_features.loc[user_features['age_desc'] == '55-64', 'age_desc'] = 4
user_features.loc[user_features['age_desc'] == '45-54', 'age_desc'] = 3
user_features.loc[user_features['age_desc'] == '35-44', 'age_desc'] = 2
user_features.loc[user_features['age_desc'] == '25-34', 'age_desc'] = 1
user_features.loc[user_features['age_desc'] == '19-24', 'age_desc'] = 0

In [130]:
user_features[['income_desc']].value_counts()

income_desc
50-74K         192
35-49K         172
75-99K          96
25-34K          77
15-24K          74
Under 15K       61
125-149K        38
100-124K        34
150-174K        30
250K+           11
175-199K        11
200-249K         5
dtype: int64

In [131]:
user_features.loc[user_features['income_desc'] == 'Under 15K', 'income_desc'] = 0
user_features.loc[user_features['income_desc'] == '15-24K', 'income_desc'] = 1
user_features.loc[user_features['income_desc'] == '25-34K', 'income_desc'] = 2
user_features.loc[user_features['income_desc'] == '35-49K', 'income_desc'] = 3
user_features.loc[user_features['income_desc'] == '50-74K', 'income_desc'] = 4
user_features.loc[user_features['income_desc'] == '75-99K', 'income_desc'] = 5
user_features.loc[user_features['income_desc'] == '100-124K', 'income_desc'] = 6
user_features.loc[user_features['income_desc'] == '125-149K', 'income_desc'] = 7
user_features.loc[user_features['income_desc'] == '150-174K', 'income_desc'] = 8
user_features.loc[user_features['income_desc'] == '175-199K', 'income_desc'] = 9
user_features.loc[user_features['income_desc'] == '200-249K', 'income_desc'] = 10
user_features.loc[user_features['income_desc'] == '250K+', 'income_desc'] = 11

In [132]:
user_features[['kid_category_desc']].value_counts()

kid_category_desc
None/Unknown         558
1                    114
3+                    69
2                     60
dtype: int64

In [133]:
user_features.loc[user_features['kid_category_desc'] == 'None/Unknown', 'kid_category_desc'] = 0
user_features.loc[user_features['kid_category_desc'] == '1', 'kid_category_desc'] = 1
user_features.loc[user_features['kid_category_desc'] == '2', 'kid_category_desc'] = 2
user_features.loc[user_features['kid_category_desc'] == '3+', 'kid_category_desc'] = 3

Посмотрим на item_features

In [134]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [135]:
item_features = pd.get_dummies(item_features, columns=['brand'])

## Построим новые фичи

**Фичи user_id:**

    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Частотность покупок раз/месяц

In [136]:
#Взять средний чек за каждый поход в магазин и потом среднюю по всем походам, чтоб данные
#были как можно ближе к реальности
mean_sales_value = data.groupby(['user_id','day']).agg({'sales_value':'mean'})\
                .reset_index().groupby('user_id').agg({'sales_value':'mean'})

In [137]:
#Средний чек
user_features = user_features.join(mean_sales_value, on='user_id', how='left')

In [138]:
user_features.rename(columns={'sales_value':'mean_sales_value'}, inplace=True)

In [139]:
#для следующих фичей надо будет собрать датасет со всей информацией в один
full_data = data.join(user_features.set_index('user_id'), on='user_id', how='left')
full_data = full_data.join(item_features.set_index('item_id'), on='item_id', how='left')

In [140]:
full_data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,hh_comp_desc_Single Male,hh_comp_desc_Unknown,mean_sales_value,manufacturer,department,commodity_desc,sub_commodity_desc,curr_size_of_product,brand_National,brand_Private
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,,,,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB,0,1
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,,,,2,PRODUCE,ONIONS,ONIONS SWEET (BULK&BAG),40 LB,1,0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,...,,,,69,PRODUCE,VEGETABLES - ALL OTHERS,CELERY,,0,1
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,...,,,,2,PRODUCE,TROPICAL FRUIT,BANANAS,40 LB,1,0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,...,,,,69,PRODUCE,ORGANICS FRUIT & VEGETABLES,ORGANIC CARROTS,1 LB,0,1


In [141]:
#Средняя сумма покупки 1 товара в каждой категории
mean_check_per_department = full_data.groupby(['user_id', 'basket_id', 'department']).agg({'sales_value':'sum'})\
    .groupby(['user_id', 'department']).mean()

In [142]:
mean_check_per_department = mean_check_per_department.reset_index().set_index('user_id')

In [143]:
user_features = user_features.join(mean_check_per_department, on='user_id', how='left')\
                .rename(columns={'sales_value':'mean_check_per_department'})

In [144]:
user_features.head()

Unnamed: 0,age_desc,income_desc,household_size_desc,kid_category_desc,user_id,marital_status_code_A,marital_status_code_B,marital_status_code_U,homeowner_desc_Homeowner,homeowner_desc_Probable Owner,...,homeowner_desc_Unknown,hh_comp_desc_1 Adult Kids,hh_comp_desc_2 Adults Kids,hh_comp_desc_2 Adults No Kids,hh_comp_desc_Single Female,hh_comp_desc_Single Male,hh_comp_desc_Unknown,mean_sales_value,department,mean_check_per_department
0,5,3,2,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,2.741772,,0.0
0,5,3,2,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,2.741772,DELI,4.846757
0,5,3,2,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,2.741772,DRUG GM,8.596415
0,5,3,2,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,2.741772,FLORAL,7.99
0,5,3,2,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,2.741772,GROCERY,38.251642


In [145]:
#Количество покупок в неделю
mean_itmes_per_week = data.groupby(['user_id', 'week_no']).agg({'basket_id':'count'}).groupby('user_id').mean()
user_features = user_features.join(mean_itmes_per_week, on='user_id', how='left')\
                .rename(columns={'day':'mean_itmes_per_week'})

**Фичи item_id**:

    - Кол-во покупок в неделю
    
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории

In [146]:
item_features.shape

(92353, 8)

In [147]:
#Сколько каждого товара в среднем покупается в неделю
mean_itmes_per_week_itms = data.groupby(['item_id', 'week_no']).agg({'basket_id':'count'}).groupby('item_id').mean()
item_features = item_features.join(mean_itmes_per_week_itms, on='item_id', how='left')\
                .rename(columns={'basket_id':'mean_itmes_per_week'})

In [148]:
item_features.shape

(92353, 9)

In [149]:
item_features.mean_itmes_per_week.value_counts()

1.000000     59045
1.500000      1272
1.333333      1258
1.250000      1156
1.200000      1013
             ...  
3.465517         1
12.181818        1
2.366197         1
3.260870         1
3.144578         1
Name: mean_itmes_per_week, Length: 4098, dtype: int64

In [150]:
#Среднее кол-во покупок товара определеной категории в неделю
mean_itms_per_week_per_department = full_data.groupby(['department', 'week_no']).agg({'item_id':'count'}).reset_index()\
        .groupby('department').agg({'item_id':'mean'})\
        .rename(columns={'item_id':'mean_itms_per_week_per_department'})

In [151]:
mean_itms_per_week_per_department.head()

Unnamed: 0_level_0,mean_itms_per_week_per_department
department,Unnamed: 1_level_1
,75.894737
AUTOMOTIVE,1.404762
CHARITABLE CONT,2.0
CHEF SHOPPE,9.865672
CNTRL/STORE SUP,1.75


In [152]:
item_features = item_features.\
    join(mean_itms_per_week_per_department, on='department', how='left')

In [153]:
#Добавим цену
item_features = item_features.\
    join(data.groupby('item_id').agg({'sales_value':'mean'}), on='item_id', how='left')

In [154]:
item_features.sales_value.mean()

4.3484381373079675

In [155]:
#добавим стандартизованую цену(+смещение)
item_features['std_value'] = item_features.sales_value / item_features.sales_value.mean()

In [156]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,commodity_desc,sub_commodity_desc,curr_size_of_product,brand_National,brand_Private,mean_itmes_per_week,mean_itms_per_week_per_department,sales_value,std_value
0,25671,2,GROCERY,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1,0,1.0,15989.789474,6.98,1.605174
1,26081,2,MISC. TRANS.,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1,0,1.0,22.8,0.99,0.227668
2,26093,69,PASTRY,BREAD,BREAD:ITALIAN/FRENCH,,0,1,1.0,371.326316,1.59,0.365649
3,26190,69,GROCERY,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,0,1,1.0,15989.789474,1.54,0.35415
4,26355,69,GROCERY,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,0,1,1.0,15989.789474,1.98,0.455336


In [157]:
item_features.shape

(92353, 12)


**Фичи пары user_id - item_id**

    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Средний чек покупок юзером в конкретной категории) / (Средний чек покупок в среднем в конкретной

In [158]:
# Обновим full_data с учетом уже новых фичей
full_data = full_data.join(mean_check_per_department.sales_value.rename('mean_check_per_department'), on='user_id', how='left')

In [159]:
# насколько цена item-а дешевле чем средняя по категории (если с минусом, значит дороже)
price_difference_per_itm = full_data.set_index('item_id').mean_check_per_department - full_data.set_index('item_id').sales_value

In [160]:
# Коеффициент - насколько пользователь сильнее склонен к покупке товара данной категории чем другие

### Поделим данные на train и test для модели первого и второго уровня

In [161]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [162]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


# Обучим модель первого уровня

В results будем складывать промежуточные и финальные предсказания и метрики 

In [163]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

data_train_lvl_1
data_val_lvl_1
data_train_lvl_2
data_val_lvl_2

In [164]:
user_item_matrix = recommender._prepare_matrix(data_train_lvl_1)
# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = \
    recommender._prepare_dicts(user_item_matrix)

user_features_lfm = user_features.set_index('user_id')
item_features_lfm = item_features.set_index('item_id')

model = LightFM(no_components=10,
                loss='bpr', # 'warp'
                learning_rate=0.05, 
                item_alpha=0.1, user_alpha=0.1, 
                random_state=42)

model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix),
          user_features=user_features_lfm,
          item_features=item_features_lfm,
          epochs=15, 
          num_threads=4) 

In [165]:
result = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [166]:
recommender = MainRecommender(data_train_lvl_1, n_factors=10, regularization=0.001, iterations=15, num_threads=4)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [178]:
def get_1lvl_recommendations(user_id, recommender, N=5): 
    als_rec = recommender.get_als_recommendations(user_id, N=N)
    return als_rec[:N]

In [168]:
def prefilter_test(data_test, data_train):
    data_test.loc[~data_test['user_id'].isin(data_train['user_id'])] = 999999
    data_test = data_test.drop(index=data_test.loc[data_val_lvl_1['user_id']==999999].index)
    return data_test

In [169]:
def get_rec_all_users(data, recommender, N=5):
    rec = []
    for x in set(data['user_id']):
        rec.append(get_1lvl_recommendations(user_id=x, recommender=recommender, N=N))
    rec_series = pd.Series(rec)
    return rec_series

In [170]:
data_val_lvl_1 = prefilter_test(data_val_lvl_1, data_train_lvl_1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [179]:
%%time
rec_series = get_rec_all_users(data_val_lvl_1, recommender=recommender, N=5)

CPU times: user 17.4 s, sys: 1.49 s, total: 18.9 s
Wall time: 9.98 s


In [180]:
result['rec_1'] = rec_series

In [181]:
result = result.dropna(axis=0)

In [182]:
# 0,036 sim_usr
# 0,011 sim_item
# 0,024 own
# 0.052 als

In [183]:
result.apply(lambda row: precision_at_k(row['rec_1'], row['actual']), axis=1).mean()

0.0523477452347749

In [185]:
result.head()

Unnamed: 0,user_id,actual,rec_1
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1135983, 856942, 885290, 959455, 5566472]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1029743, 866211, 916122, 5569230, 844179]"
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[5568378, 1116020, 1137346, 8090521, 907631]"
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[854852, 896613, 871611, 878996, 965267]"
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[839419, 912451, 8273950, 1067779, 8273833]"


In [1]:
# get_1lvl_recommendations(, recommender, N=5)

# Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_lvl_2
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [None]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

In [186]:
%%time
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: get_1lvl_recommendations(x, recommender=recommender, N=50))


CPU times: user 17.8 s, sys: 1.39 s, total: 19.2 s
Wall time: 11.4 s


In [187]:
users_lvl_2.head()

Unnamed: 0,user_id,candidates
0,2070,"[899624, 844179, 1029743, 5569230, 12810391, 8..."
1,2021,"[1044078, 944534, 844179, 871756, 899624, 1106..."
2,1753,"[1106523, 1029743, 5569230, 8090537, 1013928, ..."
3,2120,"[1044078, 1106523, 1005186, 5585510, 844179, 1..."
4,1346,"[8293439, 5569374, 12352172, 6533936, 5577022,..."


In [188]:
users_lvl_2.shape

(2151, 2)

In [189]:
len(users_lvl_2['candidates'][0])

50

[Как эффективно вытянуть нашу обучающую выборку](https://stackoverflow.com/questions/53218931/how-to-unnest-explode-a-column-in-a-pandas-dataframe)

In [190]:
df = pd.DataFrame({'user_id':users_lvl_2.user_id.values.repeat(len(users_lvl_2.candidates[0])),
                 'item_id':np.concatenate(users_lvl_2.candidates.values)})

In [191]:
len(df) == users_lvl_2.shape[0] * len(users_lvl_2['candidates'][0])

True

In [192]:
df[df.duplicated()]

Unnamed: 0,user_id,item_id


In [193]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)


In [194]:
targets_lvl_2['target'].mean()

0.0771046847156516

In [195]:
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target
0,2070,899624,1.0
1,2070,844179,0.0
2,2070,1029743,0.0
3,2070,5569230,0.0
4,2070,12810391,1.0


(!) На каждого юзера 50 item_id-кандидатов

In [196]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,commodity_desc,sub_commodity_desc,curr_size_of_product,brand_National,brand_Private,mean_itmes_per_week,mean_itms_per_week_per_department,sales_value,std_value
0,25671,2,GROCERY,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1,0,1.0,15989.789474,6.98,1.605174
1,26081,2,MISC. TRANS.,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1,0,1.0,22.8,0.99,0.227668


In [197]:
user_features.head(2)

Unnamed: 0,age_desc,income_desc,household_size_desc,kid_category_desc,user_id,marital_status_code_A,marital_status_code_B,marital_status_code_U,homeowner_desc_Homeowner,homeowner_desc_Probable Owner,...,hh_comp_desc_1 Adult Kids,hh_comp_desc_2 Adults Kids,hh_comp_desc_2 Adults No Kids,hh_comp_desc_Single Female,hh_comp_desc_Single Male,hh_comp_desc_Unknown,mean_sales_value,department,mean_check_per_department,basket_id
0,5,3,2,0,1,1,0,0,1,0,...,0,0,1,0,0,0,2.741772,,0.0,24.828125
0,5,3,2,0,1,1,0,0,1,0,...,0,0,1,0,0,0,2.741772,DELI,4.846757,24.828125


In [198]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department_x,commodity_desc,sub_commodity_desc,curr_size_of_product,brand_National,brand_Private,...,hh_comp_desc_1 Adult Kids,hh_comp_desc_2 Adults Kids,hh_comp_desc_2 Adults No Kids,hh_comp_desc_Single Female,hh_comp_desc_Single Male,hh_comp_desc_Unknown,mean_sales_value,department_y,mean_check_per_department,basket_id
0,2070,899624,1.0,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,,0.0,29.45122
1,2070,899624,1.0,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,CHEF SHOPPE,3.24,29.45122


In [199]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [200]:
y_train.head()

Unnamed: 0,target
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [201]:
X_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department_x,commodity_desc,sub_commodity_desc,curr_size_of_product,brand_National,brand_Private,mean_itmes_per_week,...,hh_comp_desc_1 Adult Kids,hh_comp_desc_2 Adults Kids,hh_comp_desc_2 Adults No Kids,hh_comp_desc_Single Female,hh_comp_desc_Single Male,hh_comp_desc_Unknown,mean_sales_value,department_y,mean_check_per_department,basket_id
0,2070,899624,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,27.452381,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,,0.0,29.45122
1,2070,899624,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,27.452381,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,CHEF SHOPPE,3.24,29.45122
2,2070,899624,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,27.452381,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,COSMETICS,21.132,29.45122
3,2070,899624,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,27.452381,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,COUP/STR & MFG,4.99,29.45122
4,2070,899624,69,PRODUCE,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,0,1,27.452381,...,0.0,0.0,0.0,0.0,0.0,1.0,2.390334,DELI,3.482174,29.45122


In [202]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department_x',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'brand_National',
 'brand_Private',
 'mean_itmes_per_week',
 'mean_itms_per_week_per_department',
 'sales_value',
 'std_value',
 'age_desc',
 'income_desc',
 'household_size_desc',
 'kid_category_desc',
 'marital_status_code_A',
 'marital_status_code_B',
 'marital_status_code_U',
 'homeowner_desc_Homeowner',
 'homeowner_desc_Probable Owner',
 'homeowner_desc_Probable Renter',
 'homeowner_desc_Renter',
 'homeowner_desc_Unknown',
 'hh_comp_desc_1 Adult Kids',
 'hh_comp_desc_2 Adults Kids',
 'hh_comp_desc_2 Adults No Kids',
 'hh_comp_desc_Single Female',
 'hh_comp_desc_Single Male',
 'hh_comp_desc_Unknown',
 'mean_sales_value',
 'department_y',
 'mean_check_per_department',
 'basket_id']

In [203]:
y_train.mean()

target    0.117298
dtype: float64

In [204]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(*args, **kwargs)




Берем топ-k предсказаний, ранжированных по вероятности, для каждого юзера

## Применим LightGBM Ranker

пример ранжирования LightGBM https://medium.com/@tacucumides/learning-to-rank-with-lightgbm-code-example-in-python-843bd7b44574  
код для ранжирования запросов
https://mlexplained.com/2019/05/27/learning-to-rank-explained-with-code/  
туториал ранжирования CatBoost https://github.com/catboost/tutorials/blob/master/ranking/ranking_tutorial.ipynb  



In [205]:
from lightgbm import LGBMRanker

lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 500,
    'learning_rate': 0.1,
    'max_depth': 4,
    'categorical_column': cat_feats,
    'random_state': 27,
    'verbose': 1,
    'is_unbalance': True
}

### Делаем группы запросов - сложный момент

In [206]:
X_train['num'] = np.arange(len(X_train))

In [232]:
X_train.shape

(711027, 36)

In [208]:
gr = X_train[['user_id', 'num']].groupby('user_id', sort=False).max()['num'].values

In [209]:
gr

array([   999,   1050,   1914, ..., 710476, 710526, 711026])

In [210]:
grs = np.concatenate((np.array([gr[0]+1]), gr[1:] - gr[:-1]))

In [211]:
grs

array([1000,   51,  864, ...,   50,   50,  500])

In [212]:
X_train.shape

(711027, 36)

In [213]:
y_train.shape

(711027, 1)

In [214]:
np.sum(grs)

711027

In [215]:
lgb = LGBMRanker(**lgb_params, silent=False)

In [216]:
lgb = LGBMRanker(**lgb_params, silent=False)
lgb.fit(X_train, y_train, verbose_eval=-1, group=grs,
        
        eval_set=[(X_train, y_train)], eval_group=[grs],  
        eval_metric=['ndcg'],
        eval_at=[5, 10 ], early_stopping_rounds=50)

In [218]:
train_preds_proba = pd.Series(lgb.predict(X_train)).apply(lambda x: sigmoid(x))

In [219]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [233]:
train_preds_proba

0         0.872815
1         0.839333
2         0.838013
3         0.822713
4         0.803729
            ...   
711022    0.007286
711023    0.007775
711024    0.007427
711025    0.006347
711026    0.006122
Length: 711027, dtype: float64

Соберем dataframe с предсказанными вероятностями покупки для всех N пар пользователь товар

In [220]:
df_preds = X_train[['user_id', 'item_id']].join(other=train_preds_proba.rename('target_proba'), how='left')

In [231]:
df_preds

Unnamed: 0,user_id,item_id,target_proba
0,2070,899624,0.872815
1,2070,899624,0.839333
2,2070,899624,0.838013
3,2070,899624,0.822713
4,2070,899624,0.803729
...,...,...,...
711022,1745,918046,0.007286
711023,1745,918046,0.007775
711024,1745,918046,0.007427
711025,1745,918046,0.006347


In [221]:
def get_lvl2_rec(df_preds, user_id, N=5):
    """Возварщает N самых вероятных к покупке товаров для пользователся с ID = user_id"""
    
    df_user = df_preds.loc[df_preds['user_id']==user_id]
    df_user.sort_values(by='target_proba', ascending=False, axis=0, inplace=True)
    return list(df_user['item_id'])[:N]

In [222]:
get_lvl2_rec(df_preds, 2070)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user.sort_values(by='target_proba', ascending=False, axis=0, inplace=True)


[916122, 916122, 916122, 916122, 916122]

In [223]:
def get_lvl2_rec_all(df_preds, N=5):
    rec = []
    for x in set(df_preds['user_id']):
        rec.append(get_lvl2_rec(df_preds, x, N=N))
    rec_series = pd.Series(rec)
    return rec_series

In [None]:
result['rec_2_test'] = get_lvl2_rec_all(df_preds, N=5)

In [225]:
result.head()

Unnamed: 0,user_id,actual,rec_1,rec_2_test
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1135983, 856942, 885290, 959455, 5566472]","[856942, 856942, 856942, 856942, 856942]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1029743, 866211, 916122, 5569230, 844179]","[1106523, 1029743, 866211, 1081177, 916122]"
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[5568378, 1116020, 1137346, 8090521, 907631]","[1029743, 891423, 831063, 9881593, 1075368]"
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[854852, 896613, 871611, 878996, 965267]","[1082185, 1082185, 1024306, 1024306, 1024306]"
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[839419, 912451, 8273950, 1067779, 8273833]","[1126899, 1126899, 1126899, 1126899, 1126899]"


In [226]:
#?precision_at_k 0.011 mix_lvl1
#precision_at_k 0.075 mix_lvl1

In [230]:
result.apply(lambda row: precision_at_k(row['rec_2_test'], row['actual'], k=50), axis=1).mean()

0.06815434681543482

# Не нужно

## Предобработка данных из статьи
https://mlexplained.com/2019/05/27/learning-to-rank-explained-with-code/

In [None]:
def save_data(group_data,output_feature,output_group):
    if len(group_data) == 0:
        return

    output_group.write(str(len(group_data))+"\n")
    for data in group_data:
        # only include nonzero features
        feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]        
        output_feature.write(data[0] + " " + " ".join(feats) + "\n")

In [None]:
path_ = 'MQ2008/Fold1/train.txt'
fi = open(path_)
output_feature = open('o_f.txt', "w")
output_group = open('o_g.txt', "w")

group_data = []
group = ""
for line in fi:
    if not line:
        break
    if "#" in line:
        line = line[:line.index("#")]
    splits = line.strip().split(" ")
    if splits[1] != group:
        save_data(group_data,output_feature,output_group)
        group_data = []
    group = splits[1]
    group_data.append(splits)

save_data(group_data,output_feature,output_group)