## Load Libraries and Data

In [1]:
import gc
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import numpy as np
from tqdm import tqdm_notebook
from itertools import product
from sklearn.preprocessing import LabelEncoder

# Folder holding the input CSV files, relative to the notebook.
data_folder = "../input/"

# Lookup tables: items, shops and item categories.
items = pd.read_csv(data_folder + 'items.csv')
shops = pd.read_csv(data_folder + 'shops.csv')
cats = pd.read_csv(data_folder + 'item_categories.csv')

# Gzip-compressed sales records; the first row is the header.
train = pd.read_csv(
    data_folder + 'sales_train.csv.gz',
    compression='gzip',
    header=0,
)

# Test set: ID is moved into the index so it never has to be dropped later.
test = pd.read_csv(
    data_folder + 'test.csv.gz',
    compression='gzip',
    header=0,
)
test = test.set_index('ID')

## Data cleaning and feature generation

In [3]:
# Remove outliers: keep only rows under both the price and the daily-count cap.
within_limits = (train.item_price < 59201.00) & (train.item_cnt_day < 625)
train = train[within_limits]

# Any negative price is overwritten with a fixed replacement value.
negative_price = train.item_price < 0
train.loc[negative_price, 'item_price'] = 2499.00

# Re-label shop ids so that each listed id is folded into its counterpart.
# Comments translated from the original notebook:
#   11 -> 10 : Zhukovsky, Chkalova St. 39 m²
#   40 -> 39
#    0 -> 57 : Yakutsk, Ordzhonikidze 56
#    1 -> 58 : Yakutsk, "Tsentralny" shopping centre
shop_id_remap = {11: 10, 40: 39, 0: 57, 1: 58}
for old_shop_id, new_shop_id in shop_id_remap.items():
    train.loc[train.shop_id == old_shop_id, 'shop_id'] = new_shop_id

# Drop the re-labelled shops from the shop details frame.
# NOTE(review): rows are dropped by index label, which assumes the default
# row index of shops.csv coincides with shop_id - confirm.
shops = shops.drop(index=[0, 1, 11, 40]).reset_index(drop=True)

def get_icat(x):
    """Split a category name into its main and sub category.

    The name is split at the FIRST hyphen only, so a sub category that
    itself contains a hyphen (e.g. ``'Blu-Ray 3D'``) is kept whole instead
    of being cut off at its inner hyphen.

    Parameters
    ----------
    x : str
        Category name, e.g. ``'Main - Sub'``.

    Returns
    -------
    tuple of (str, str)
        ``(main, sub)`` with surrounding whitespace stripped. When the name
        contains no hyphen, ``sub`` equals ``main``.
    """
    main, hyphen, remainder = x.partition('-')
    main = main.strip()
    # No hyphen at all -> the category has no separate sub part.
    sub = remainder.strip() if hyphen else main
    return main, sub

# Extract shop location city and item main / sub category.
# Correct the one shop name whose first two words are separated by a space,
# so that the first space-separated token below covers both words.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
# City = first space-separated token of the shop name, label-encoded to an int.
shops['city'] = shops.shop_name.apply(lambda x: x.split(' ')[0].strip())
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
# .copy() makes the trimmed frame independent of the original, so later
# column assignments never act on a slice.
shops = shops[['shop_id', 'city_code']].copy()

# Extract main and sub category codes from the category name.
main_sub = cats.item_category_name.apply(get_icat)
cats['main'], cats['sub'] = [x[0] for x in main_sub], [x[1] for x in main_sub]
cats['main_code'] = LabelEncoder().fit_transform(cats['main'])
cats['sub_code'] = LabelEncoder().fit_transform(cats['sub'])
cats = cats[['item_category_id', 'main_code', 'sub_code']].copy()
# `items` receives a new column further down the notebook; take a real copy
# so that assignment does not raise SettingWithCopyWarning.
items = items[['item_id', 'item_category_id']].copy()
del main_sub
gc.collect()

# Tag every test row with month index 34 (the loop below covers months 0-33).
test['date_block_num'] = 34

# Give train a similar structure to test: for each month, build every
# (month, shop, item) combination of the shops and items that appear in
# train for that month.
index_cols = ['date_block_num', 'shop_id', 'item_id']
grid = []
for i in range(34):
    curr_shops = train.loc[train.date_block_num == i, 'shop_id'].unique()
    curr_items = train.loc[train.date_block_num == i, 'item_id'].unique()
    grid.append(np.array(list(product([i], curr_shops, curr_items)), dtype=np.int16))

grid = pd.DataFrame(np.vstack(grid), columns=index_cols)
grid = grid.sort_values(index_cols)

# Append test to grid. Columns are aligned by name and the index is rebuilt.
# `keys=` is deliberately not passed: it has no effect together with
# ignore_index=True, and a key list whose length differs from the number of
# concatenated objects is not valid input.
grid = pd.concat([grid, test], ignore_index=True, sort=False)
grid['date_block_num'] = grid['date_block_num'].astype(np.int8)
grid['shop_id'] = grid['shop_id'].astype(np.int8)
grid['item_id'] = grid['item_id'].astype(np.int16)

del test
gc.collect()

# Season feature: one row per month index 0..34, with a numeric season code.
seasons = pd.DataFrame({'date_block_num': range(35)})
# Calendar month (1-12) derived from the running month index.
month = seasons.date_block_num % 12 + 1

# Season code -> calendar months that belong to it.
season_months = {
    0: [12, 1, 2, 3],   # 'winter'
    1: [6, 7, 8],       # 'summer'
    2: [4, 5],          # 'spring'
    3: [9, 10, 11],     # 'autumn'
}
month_to_season = {
    m: code for code, months in season_months.items() for m in months
}
# Stored as float64, the dtype the column has before the later int8 downcast.
seasons['season'] = month.map(month_to_season).astype('float64')

# Revenue feature: units sold times unit price, per sales row.
train['revenue'] = train.item_cnt_day * train.item_price

# Prices ending with 9: look at the last digit of the integer part of the
# price (the text before the decimal point).
str_prices   = train.item_price.astype(str)
str_prices   = str_prices.apply(lambda x: x.split('.')[0])
ends_with_9  = str_prices.apply(lambda x: x.endswith('9'))
# Flag items whose price ends with 9 in more than 70% of their sales rows.
items['end_with_9'] = items['item_id'].map((ends_with_9.groupby(train.item_id).mean() > 0.70) * 1)
# Items with no sales rows get no flag from the map above; default them to 1.
# Assign the result back instead of a chained in-place fillna, which may
# operate on a temporary and leave the NaNs in place.
items['end_with_9'] = items['end_with_9'].fillna(1)

del str_prices, ends_with_9
gc.collect()

# import math
# def cube_root(x):
#     if x > 0:
#         return math.pow(x, float(1)/3)
#     elif x < 0:
#         return -math.pow(abs(x), float(1)/3)
#     else:
#         return 0

# # pre-process the numerical features
# train['item_price'] = np.log(train.item_price)
# train['item_cnt_day'] = train.item_cnt_day.apply(cube_root)
# train['revenue'] = train.revenue.apply(cube_root)
# The raw date and price columns are not used after this point.
train = train.drop(columns=['date', 'item_price'])

# Attach the generated lookup features to the grid (left joins, so every
# grid row is kept).
grid = (
    grid
    .merge(items, on='item_id', how='left')
    .merge(seasons, on='date_block_num', how='left')
    .merge(cats, on='item_category_id', how='left')
    .merge(shops, on='shop_id', how='left')
)

# Downcast the merged code columns to 8-bit integers.
for code_col in ('item_category_id', 'end_with_9', 'season',
                 'main_code', 'sub_code', 'city_code'):
    grid[code_col] = grid[code_col].astype(np.int8)

del items, cats, seasons, shops
gc.collect()

126

### Aggregate sales data

In [None]:
# Monthly target: total units sold per (month, shop, item), clipped to [0, 20].
group = (
    train.groupby(index_cols, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
grid = pd.merge(grid, group, on=index_cols, how='left').fillna(0)
grid['item_cnt_month'] = grid['item_cnt_month'].clip(0, 20)

# Mean revenue at three granularities: (month, shop, item), (month, item)
# and (month, shop). Grid rows without a matching group are filled with 0.
revenue_aggregates = [
    (index_cols, 'mean_revenue'),
    (['date_block_num', 'item_id'], 'item_mean_revenue'),
    (['date_block_num', 'shop_id'], 'shop_mean_revenue'),
]
for group_keys, revenue_name in revenue_aggregates:
    group = (
        train.groupby(group_keys, as_index=False)['revenue']
        .mean()
        .rename(columns={'revenue': revenue_name})
    )
    grid = pd.merge(grid, group, on=group_keys, how='left').fillna(0)

del train, group
gc.collect()

### Mean encodings

In [None]:
from sklearn.model_selection import KFold
# Global mean of the monthly target over the rows with date_block_num < 34;
# used below as the fill value for encodings that come out as NaN.
gb_mean = grid.loc[grid.date_block_num < 34, 'item_cnt_month'].mean()

def mean_encode_regul(cat_feat, all_data, n_splits=5):
    """Out-of-fold mean (target) encoding of one categorical column.

    Only rows with ``date_block_num < 34`` take part. They are cut, in their
    existing order, into ``n_splits`` contiguous folds (the same partition an
    unshuffled K-fold split produces). Each fold is encoded with the mean of
    ``item_cnt_month`` per ``cat_feat`` value computed on the OTHER folds,
    so a row's own target never contributes to its encoding.

    Parameters
    ----------
    cat_feat : str
        Name of the column to encode.
    all_data : pandas.DataFrame
        Frame containing ``date_block_num``, ``item_cnt_month`` and
        ``cat_feat``.
    n_splits : int, default 5
        Number of contiguous folds.

    Returns
    -------
    pandas.Series
        Encoded values indexed like the participating rows of ``all_data``.
        Rows with ``date_block_num >= 34`` are absent, and a category value
        not seen in the other folds yields NaN; callers decide how to fill.
    """
    # Work on the training rows explicitly so fold positions always refer to
    # the frame they were computed on.
    train_rows = all_data.loc[all_data.date_block_num < 34]
    n_rows = len(train_rows)

    encoded_parts = []
    for val_pos in np.array_split(np.arange(n_rows), n_splits):
        is_val = np.zeros(n_rows, dtype=bool)
        is_val[val_pos] = True
        tr, val = train_rows.iloc[~is_val], train_rows.iloc[val_pos]
        means = tr.groupby(cat_feat).item_cnt_month.mean()
        # Map the column being encoded (not a fixed column) onto the means.
        encoded_parts.append(val[cat_feat].map(means))

    return pd.concat(encoded_parts)

In [None]:
# Out-of-fold mean encoding for each single categorical column; every result
# is stored as '<column>_enc', with missing encodings replaced by gb_mean.
menc_cols = [
    'item_id',
    'date_block_num',
    'main_code',
    'shop_id',
    'item_category_id',
    'end_with_9',
    'season',
    'sub_code',
    'city_code',
]

for col in tqdm_notebook(menc_cols):
    grid[f'{col}_enc'] = mean_encode_regul(cat_feat=col, all_data=grid).fillna(gb_mean)

In [None]:
# Combination-of-categorical-columns encodings.
# Each entry is (new column name, grouping keys). For every entry the mean
# of item_cnt_month per key combination is computed on the grid and merged
# back onto it under the given name. One data-driven loop replaces the
# previously repeated stanzas; names, keys and order are unchanged.
month_encodings = [
    # item-month encoding
    ('item_month_enc', ['date_block_num', 'item_id']),
    # shop-month encoding
    ('shop_month_enc', ['date_block_num', 'shop_id']),
    # item category-month encoding
    ('itc_month_enc', ['date_block_num', 'item_category_id']),
    # month-item-item category encoding
    ('mon_item_itc_enc', ['date_block_num', 'item_id', 'item_category_id']),
    # month-item-end_with_9 encoding
    ('mon_item_end9_enc', ['date_block_num', 'item_id', 'end_with_9']),
    # month-item-main-sub encoding
    ('mon_it_m-s_enc', ['date_block_num', 'item_id', 'main_code', 'sub_code']),
    # month-item-main encoding
    ('mon_it_m_enc', ['date_block_num', 'item_id', 'main_code']),
    # month-item-sub encoding
    ('mon_it_s_enc', ['date_block_num', 'item_id', 'sub_code']),
    # month-season encoding
    ('mon_season_enc', ['date_block_num', 'season']),
    # month-shop-season encoding
    ('mon_sh_se_enc', ['date_block_num', 'shop_id', 'season']),
    # month-shop-city encoding
    ('mon_sh_cty_enc', ['date_block_num', 'shop_id', 'city_code']),
]

for enc_name, enc_keys in month_encodings:
    group = grid.groupby(enc_keys).agg({'item_cnt_month': ['mean']})
    group.columns = [enc_name]
    # Assign the filled column back rather than filling a column selection
    # in place, which may act on a temporary and silently do nothing.
    group[enc_name] = group[enc_name].fillna(gb_mean)
    group.reset_index(inplace=True)
    grid = pd.merge(grid, group, on=enc_keys, how='left')

In [None]:
# Store these two revenue means as 32-bit floats.
for revenue_col in ('mean_revenue', 'item_mean_revenue'):
    grid[revenue_col] = grid[revenue_col].astype(np.float32)

In [None]:
# Downcast the mean encodings and the target to float16.
# Only float64 columns are selected, explicitly: `include=float` can also
# match float32 columns depending on the pandas version, which would squeeze
# the revenue columns cast to float32 above into float16 as well (float16
# tops out at 65504, so large values would become inf).
# NOTE(review): shop_mean_revenue is not cast to float32 above, so it is
# still downcast here - confirm its values stay within float16 range.
for col in tqdm_notebook(grid.select_dtypes(include=[np.float64]).columns):
    grid[col] = grid[col].astype(np.float16)

### Lags

In [None]:
def lag_feature(df, lags, col):
    """Append lagged copies of ``col`` to ``df``.

    For every lag ``k`` in ``lags`` a column ``'<col>_lag_<k>'`` is added
    that holds, for each (date_block_num, shop_id, item_id) row, the value
    ``col`` had for the same shop and item ``k`` months earlier. Rows with
    no such earlier record get NaN (left join).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_block_num``, ``shop_id``, ``item_id`` and ``col``.
    lags : iterable of int
        Month offsets to generate.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns appended; ``df`` is not modified.
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    source = df[join_keys + [col]]
    for lag in lags:
        lagged = source.copy()
        lagged.columns = join_keys + [col + '_lag_' + str(lag)]
        # Moving the month forward by `lag` lines a past value up with the
        # later month it should be attached to.
        lagged['date_block_num'] += lag
        df = df.merge(lagged, on=join_keys, how='left')
    return df

In [None]:
#grid.drop(['item_category_id','end_with_9','season','main_code','sub_code','city_code'], axis=1,inplace=True)

In [None]:
# Create lags: each entry pairs a column with the month offsets to generate.
col_lags = [
    ('item_id_enc', [2, 3]),
    ('date_block_num_enc', [2, 3]),
    ('item_cnt_month', [1, 2, 3, 6]),
    ('main_code_enc', [2, 3]),
]

for lag_col, lag_steps in tqdm_notebook(col_lags):
    grid = lag_feature(grid, lag_steps, lag_col)

In [None]:
# Columns that all receive the same three lags (1, 2 and 3 months).
col_lags = [
    'mean_revenue',
    'item_mean_revenue',
    'shop_mean_revenue',
    'item_month_enc',
    'shop_month_enc',
    'itc_month_enc',
    'mon_item_itc_enc',
    'mon_item_end9_enc',
    'mon_it_m-s_enc',
    'mon_it_m_enc',
    'mon_it_s_enc',
    'mon_season_enc',
    'mon_sh_se_enc',
    'mon_sh_cty_enc',
    'shop_id_enc',
    'item_category_id_enc',
    'end_with_9_enc',
    'season_enc',
    'sub_code_enc',
    'city_code_enc',
]

for col in tqdm_notebook(col_lags):
    grid = lag_feature(df=grid, lags=[1, 2, 3], col=col)

In [None]:
# Drop every column whose name ends with 'enc' plus the three revenue means;
# columns with a '_lag_<k>' suffix do not match and are kept.
revenue_means = ['mean_revenue', 'item_mean_revenue', 'shop_mean_revenue']
cols_to_drop = [name for name in grid.columns if name.endswith('enc')] + revenue_means
grid = grid.drop(columns=cols_to_drop)
# Keep month indices 6 and later, then replace the remaining NaNs with 0.
grid = grid[grid['date_block_num'] > 5]
grid = grid.fillna(0)

In [None]:
# Run a garbage-collection pass; the returned count is shown as the cell output.
gc.collect()

In [None]:
# Save the built dataframe as a pickle file named 'grid.pkl'
# (relative path, so it lands in the current working directory).
grid.to_pickle('grid.pkl')