<a href="https://colab.research.google.com/github/naoncorp/geekbrains-ml/blob/main/example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'Times New Roman'

import warnings
warnings.filterwarnings('ignore')

from scipy.sparse import csr_matrix
from scipy.stats import mode

from implicit import als

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from metrics import precision, recall
from utils import prefilter_items
from recommenders import MainRecommender

from IPython.display import display, HTML
import os, sys
os.environ["MKL_NUM_THREADS"] = "1" 
os.environ["NUMEXPR_NUM_THREADS"] = "1" 
os.environ["OMP_NUM_THREADS"] = "1" 

import warnings
warnings.simplefilter('ignore')

ModuleNotFoundError: ignored

In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 30 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1


In [None]:
# Load the transactions table plus the item and household feature tables
data, item_features, user_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../ex2/webinar_2/retail_train.csv',
        '../ex2/webinar_2/product.csv',
        '../ex2/webinar_2/hh_demographic.csv',
    )
)

# Preview the first two rows of each table
for loaded_frame in (data, item_features, user_features):
    display(loaded_frame.head(2))

# Canonical names of the item / user key columns used throughout the notebook
ITEM_COL = 'item_id'
USER_COL = 'user_id'

# Lower-case every feature column name, then rename each table's key column
# to the canonical name so the feature tables share keys with the transactions.
item_features = (
    item_features
    .rename(columns=str.lower)
    .rename(columns={'product_id': ITEM_COL})
)
user_features = (
    user_features
    .rename(columns=str.lower)
    .rename(columns={'household_key': USER_COL})
)

# The training / validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# Tune the size of the 2nd window (6 weeks) with a learning curve
# (recall@k as a function of dataset size).

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

# Window boundaries, counted back from the most recent week present in the data
week_numbers = data['week_no']
last_week_no = week_numbers.max()
val_ranker_start_week = last_week_no - VAL_RANKER_WEEKS
val_matcher_start_week = val_ranker_start_week - VAL_MATCHER_WEEKS

# Matching model, training part: everything before the matcher validation window
data_train_matcher = data[week_numbers < val_matcher_start_week]

# Matching model, validation part: the window between the two boundaries
in_matcher_val_window = (
    (week_numbers >= val_matcher_start_week) & (week_numbers < val_ranker_start_week)
)
data_val_matcher = data[in_matcher_val_window]

# Ranking model, training part: starts as a copy of the matcher validation
# window (kept separate for clarity; the two are modified independently later).
data_train_ranker = data_val_matcher.copy()

# Hold-out data used to test both the ranking and the matching model.
# NOTE(review): `>=` keeps week numbers val_ranker_start_week..last_week_no
# inclusive, i.e. VAL_RANKER_WEEKS + 1 distinct values if week_no is a
# consecutive integer; confirm this is the intended window size.
data_val_ranker = data[week_numbers >= val_ranker_start_week]

# Summarise every split: frame shape, number of distinct users and items
frames_by_split = {
    'train_matcher': data_train_matcher,
    'val_matcher': data_val_matcher,
    'train_ranker': data_train_ranker,
    'val_ranker': data_val_ranker,
}
stats_data = [
    [split_frame.shape, split_frame[USER_COL].nunique(), split_frame[ITEM_COL].nunique()]
    for split_frame in frames_by_split.values()
]

stats_df = pd.DataFrame(
    stats_data,
    columns=['Shapes', 'Users', 'Items'],
    index=list(frames_by_split),
)
display(stats_df)

# The table above shows how user and item counts vary between the splits
display(data_train_matcher.head(2))

# Pre-filter the matcher training items and report how many distinct items remain.
# NOTE(review): the recorded output of this cell is
# "TypeError: prefilter_items() got an unexpected keyword argument 'group_col'",
# so these keyword names do not match the `prefilter_items` that was imported.
# Align them with the signature defined in utils.py before re-running.
prefilter_options = dict(
    group_col='item_id',
    popular_col='quantity',
    top_popular_filter_choose=5000,
)

n_items_before = data_train_matcher[ITEM_COL].nunique()
data_train_matcher = prefilter_items(data_train_matcher, **prefilter_options)
n_items_after = data_train_matcher[ITEM_COL].nunique()

report_template = 'Decreased # items from {} to {}'
print(report_template.format(n_items_before, n_items_after))

# Turn cold start into warm start: keep in the later splits only the users
# that are present in the matcher training data.
common_users = data_train_matcher[USER_COL].values

data_val_matcher, data_train_ranker, data_val_ranker = (
    later_split[later_split[USER_COL].isin(common_users)]
    for later_split in (data_val_matcher, data_train_ranker, data_val_ranker)
)

# Summarise every split again after the filtering: shape, distinct users, distinct items
frames_by_split = {
    'train_matcher': data_train_matcher,
    'val_matcher': data_val_matcher,
    'train_ranker': data_train_ranker,
    'val_ranker': data_val_ranker,
}
stats_data = [
    [split_frame.shape, split_frame[USER_COL].nunique(), split_frame[ITEM_COL].nunique()]
    for split_frame in frames_by_split.values()
]

stats_df = pd.DataFrame(
    stats_data,
    columns=['Shapes', 'Users', 'Items'],
    index=list(frames_by_split),
)
display(stats_df)

# Build the project's MainRecommender (verbose output switched off) and fit it
# on the pre-filtered matcher training interactions.
recommender = MainRecommender(verbose=False)
recommender.fit(data_train_matcher)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


Unnamed: 0,Shapes,Users,Items
train_matcher,"(2108779, 12)",2498,83685
val_matcher,"(169711, 12)",2154,27649
train_ranker,"(169711, 12)",2154,27649
val_ranker,"(118314, 12)",2042,24329


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


TypeError: prefilter_items() got an unexpected keyword argument 'group_col'

Unnamed: 0,A,B
x,1,2
y,3,4


Unnamed: 0,A,B
x,5,6
y,7,8
