# importing modules

In [1]:
!pip install grader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
from itertools import product
from grader import grader

In [3]:
# Raw-download URL of the gzipped daily sales CSV in the
# hse-aml/competitive-data-science GitHub repository.
url = 'https://github.com/hse-aml/competitive-data-science/blob/master/readonly/final_project_data/sales_train.csv.gz?raw=true'
# The compression is passed explicitly rather than left to be inferred from the URL.
sales = pd.read_csv(url, compression='gzip')
# Preview the first rows as the cell output.
sales.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
# Summary statistics of the numeric columns, rounded to whole numbers.
# Only the last expression of a cell is rendered automatically, so this
# intermediate result is shown explicitly (previously it was computed and discarded).
display(sales.describe().round())

# Rows belonging to the first month block (date_block_num == 0).
# The earlier `== 2` mask was overwritten before use and has been removed.
mask = sales['date_block_num'] == 0

# Last five rows of that block.
sales.loc[mask].tail(5)

## `date_block_num` identifies the month of each sale (block 0 covers January 2013 in the rows shown)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
115685,19.01.2013,0,46,32,349.0,1.0
115686,18.01.2013,0,46,32,349.0,1.0
115687,26.01.2013,0,46,35,399.0,2.0
115688,31.01.2013,0,46,621,149.0,1.0
115689,07.01.2013,0,46,32,349.0,1.0


# data processing

In [5]:
# Key columns, in the order used for the grid, the aggregation and the merge.
index_cols = ['shop_id', 'item_id', 'date_block_num']

# date block unique element list [dbuel]: distinct month blocks present in `sales`.
dbuel = sales['date_block_num'].unique()

# For every month block, enumerate each (shop, item, block) combination of the
# shops and the items that occur in that block.
grid = []
for block in dbuel:
    block_sales = sales[sales['date_block_num'] == block]
    cur_shops = block_sales['shop_id'].unique()
    cur_items = block_sales['item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block]))
    grid.append(np.array(combos, dtype='int32'))

# Stack the per-block arrays vertically into a single int32 DataFrame.
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

In [6]:
# Aggregate the daily counts: total item_cnt_day per (shop_id, item_id, date_block_num),
# keeping the key columns as regular columns.
gb = sales.groupby(index_cols, as_index=False)['item_cnt_day'].sum()

In [7]:

# Left-join the aggregated counts onto the full grid, treat combinations without
# sales as 0, order the rows by month / shop / item, and call the count `target`.
all_data = (
    grid.merge(gb, how='left', on=index_cols)
    .fillna(0)
    .sort_values(['date_block_num', 'shop_id', 'item_id'])
    .rename(columns={'item_cnt_day': 'target'})
)

# mean encoding with target

In [8]:
# Global mean of the target; used as the fallback encoding value.
gl_mean_target = all_data['target'].mean()

# Mean target per item_id, computed over all rows (no regularization).
item_id_target_mean = all_data.groupby('item_id')['target'].mean()

# In the non-regularized case we just *map* the computed means to the `item_id`'s
# and fill any gaps with the global mean.
# The filled Series is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place update
# is deprecated in pandas 2.x and does not modify `all_data` under Copy-on-Write.
all_data['item_target_enc'] = (
    all_data['item_id'].map(item_id_target_mean).fillna(gl_mean_target)
)

In [9]:
# Pearson correlation between the plain mean encoding and the target.
co_x = all_data['item_target_enc'].values
co_y = all_data['target'].values

np.corrcoef(co_x, co_y)[0, 1]

0.4830386988621698

# Mean Encoding Regularization

---

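The plain mean encoding above is computed from the same rows it is applied to, so each row's own target leaks into its feature. The methods below regularize the encoding to reduce this leakage.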

## K_fold method :

In [23]:
%%time

# import package StratifiedKFold
from sklearn.model_selection import KFold


# intialize k_fold
k_fold = KFold( n_splits=5, shuffle=False)

# regularization and assign new value :
for tr_index, val_index in k_fold.split(all_data):
  tr, val = all_data.loc[tr_index], all_data.loc[val_index]
  mean = tr.groupby(['item_id']).target.mean()
  val['cv_Item_mean'] = val['item_id'].map(mean)
  all_data.loc[val_index] = val
all_data['cv_Item_mean'].fillna(gl_mean_target, inplace=True)


KeyError: ignored

In [24]:
# Pearson correlation between the K-fold encoding and the target.
co_x = all_data['cv_Item_mean'].values
co_y = all_data['target'].values

np.corrcoef(co_x, co_y)[0, 1]

## Leave-one-out (loo) :

In [None]:
# Sum of the target over all rows that share the row's item_id.
target_sum = all_data.groupby('item_id')['target'].transform('sum')

# Number of rows that share the row's item_id.
n_objects = all_data.groupby('item_id')['target'].transform('count')

# Leave-one-out mean: remove the row's own target from the sum and divide by
# n_objects - 1. An item with a single row yields 0 / 0 = NaN, which is replaced
# by the constant below.
# NOTE(review): 0.3343 is presumably a rounded copy of the global target mean
# (cf. gl_mean_target) -- confirm before replacing it.
# The filled Series is assigned directly: `fillna(..., inplace=True)` on a column
# selection is a chained in-place update that does not modify `all_data` under
# pandas Copy-on-Write.
all_data['loo_item_mean'] = (
    (target_sum - all_data['target']) / (n_objects - 1)
).fillna(0.3343)

In [None]:
# Pearson correlation between the leave-one-out encoding and the target.
co_x = all_data['loo_item_mean'].values
co_y = all_data['target'].values

np.corrcoef(co_x, co_y)[0, 1]

## Smoothing method :

In [None]:
# Smoothing strength: weight given to the prior value in the formula below.
alpha = 100

# Per-row item mean and item row count (both aligned with all_data's rows).
item_id_target_mean = all_data.groupby('item_id')['target'].transform('mean')
n_objects = all_data.groupby('item_id')['target'].transform('count')

# Smoothed encoding: (mean * n + prior * alpha) / (n + alpha), with prior 0.3343.
# NOTE(review): 0.3343 is presumably a rounded copy of the global target mean
# (cf. gl_mean_target) -- confirm before replacing it.
# The filled Series is assigned directly: `fillna(..., inplace=True)` on a column
# selection is a chained in-place update that does not modify `all_data` under
# pandas Copy-on-Write.
all_data['smooth_item_mean'] = (
    (item_id_target_mean * n_objects + 0.3343 * alpha) / (n_objects + alpha)
).fillna(0.3343)



In [None]:
# Pearson correlation between the smoothed encoding and the target.
co_x = all_data['smooth_item_mean'].values
co_y = all_data['target'].values

np.corrcoef(co_x, co_y)[0, 1]

## Expanding mean :

In [None]:
# Running sum of the target per item_id over the *preceding* rows only
# (the row's own target is subtracted). Depends on the current row order of all_data.
cumsum = all_data.groupby('item_id')['target'].cumsum() - all_data['target']

# Number of preceding rows with the same item_id (0 for an item's first row).
cumcnt = all_data.groupby('item_id').cumcount()

# Expanding mean: mean target of the earlier rows of the same item. An item's
# first row gives 0 / 0 = NaN and is replaced by the constant below.
# NOTE(review): 0.3343 is presumably a rounded copy of the global target mean
# (cf. gl_mean_target) -- confirm before replacing it.
# The filled Series is assigned directly: `fillna(..., inplace=True)` on a column
# selection is a chained in-place update that does not modify `all_data` under
# pandas Copy-on-Write.
all_data['em_item_mean'] = (cumsum / cumcnt).fillna(0.3343)

In [None]:
# Pearson correlation between the expanding-mean encoding and the target.
co_x = all_data['em_item_mean'].values
co_y = all_data['target'].values

np.corrcoef(co_x, co_y)[0, 1]