## Load Libraries

In [2]:
import gc
import time
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import numpy as np
from tqdm import tqdm_notebook
from itertools import product
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the competition files from the local data folder.
data_folder = "data/"
gzip_opts = dict(compression='gzip', header=0)
items, shops, cats = (
    pd.read_csv(data_folder + file_name)
    for file_name in ('items.csv', 'shops.csv', 'item_categories.csv')
)
train = pd.read_csv(data_folder + 'sales_train.csv.gz', **gzip_opts)
# set index to ID to avoid dropping it later
test = pd.read_csv(data_folder + 'test.csv.gz', **gzip_opts).set_index('ID')

In [4]:
import math
def cube_root(x):
    """Return the real cube root of ``x``, keeping the sign of ``x``.

    ``math.pow`` cannot take a fractional power of a negative number, so the
    root is computed on ``abs(x)`` and the sign is re-applied afterwards.
    Any value that is neither greater nor less than zero yields ``0``.
    """
    one_third = float(1) / 3
    if x > 0:
        sign = 1
    elif x < 0:
        sign = -1
    else:
        return 0
    return sign * math.pow(abs(x), one_third)

## Data cleaning and feature generation

In [5]:
# remove outliers: keep only rows below both the price and the daily-count cut-offs
price_ok = train.item_price < 59201.00
count_ok = train.item_cnt_day < 625
train = train[price_ok & count_ok]

In [6]:
# correct price of negative priced item: negative prices are replaced by 2499.00
train['item_price'] = train['item_price'].mask(train['item_price'] < 0, 2499.00)

In [7]:
# correct shop ids: remap each retired shop id onto the id that is kept
# (shop names are taken from the original notes)
duplicate_shop_ids = {
    11: 10,  # Zhukovsky, Chkalova St. 39m²
    40: 39,
    0: 57,   # Yakutsk, Ordzhonikidze 56
    1: 58,   # Yakutsk, "Tsentralny" shopping centre
}
for old_id, new_id in duplicate_shop_ids.items():
    train.loc[train.shop_id == old_id, 'shop_id'] = new_id
# drop the retired ids from the shop details df.
# Filter on the shop_id column rather than on index labels: dropping labels
# [0, 1, 11, 40] only removes the right rows while the index equals shop_id,
# which stops being true after reset_index, so re-running the cell would
# remove four unrelated shops.
shops = shops[~shops.shop_id.isin(list(duplicate_shop_ids))].reset_index(drop=True)

In [8]:
# ids of catalogue items that never appear in the sales history
catalogue_ids = set(items.item_id.unique())
unsold_items = catalogue_ids.difference(train.item_id.unique())

#### Extract each shop's city and each item's main and sub category

In [16]:
len(unsold_items)

364

In [10]:
def get_icat(x):
    """Split a category name of the form ``"main - sub"`` into ``(main, sub)``.

    The name is cut on ``'-'`` and each piece is stripped of surrounding
    whitespace. The first piece is the main category and the second piece is
    the sub category; when the name contains no ``'-'`` the main category is
    returned for both. Pieces after the second are ignored.
    """
    pieces = [piece.strip() for piece in x.split('-')]
    main = pieces[0]
    sub = pieces[1] if len(pieces) > 1 else main
    return main, sub

In [11]:
# correct shop name so that its first space-separated token is a single word
needs_fix = shops.shop_name == 'Сергиев Посад ТЦ "7Я"'
shops.loc[needs_fix, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
# extract city code: the city is the first space-separated token of the shop name
shops['city'] = [name.split(' ')[0].strip() for name in shops.shop_name]
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id', 'city_code']]

# extract main and sub category code
main_sub = cats.item_category_name.apply(get_icat)
main_names, sub_names = [], []
for main_name, sub_name in main_sub:
    main_names.append(main_name)
    sub_names.append(sub_name)
cats['main'] = main_names
cats['sub'] = sub_names
for name_col, code_col in (('main', 'main_code'), ('sub', 'sub_code')):
    cats[code_col] = LabelEncoder().fit_transform(cats[name_col])
# keep only the id/code columns needed for the later merges
cats = cats[['item_category_id', 'main_code', 'sub_code']]
items = items[['item_id', 'item_category_id']]

In [12]:
# main_sub was only needed to build the category columns; release it
del main_sub
gc.collect()

71

In [13]:
# add date_block_num to test: the test rows get block 34, the one after training blocks 0-33
test['date_block_num'] = 34

# give train similar structure as test: for every month, one row per
# (shop, item) pair drawn from the shops and items that had sales that month
index_cols = ['date_block_num', 'shop_id', 'item_id']
monthly_grids = []
for block in range(34):
    month_sales = train[train.date_block_num == block]
    shop_ids = month_sales['shop_id'].unique()
    item_ids = month_sales['item_id'].unique()
    combos = list(product([block], shop_ids, item_ids))
    monthly_grids.append(np.array(combos, dtype=np.int16))

grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols).sort_values(index_cols)

In [14]:
# append test to grid
# `keys` was dropped from this call: it labels the concatenated pieces in a
# hierarchical index, was given three column names for two frames, and has no
# effect when ignore_index=True.
grid = pd.concat([grid, test], ignore_index=True, sort=False)
# downcast the key columns to the smallest integer types used for them in this notebook
grid = grid.astype({
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
})

In [15]:
# the test rows were appended to grid above, so the separate frame can be released
del test
gc.collect()

49

In [None]:
# season feature: one row per date_block_num (0-34) with a numeric season code
season_of_month = {
    12: 0, 1: 0, 2: 0, 3: 0,   # 'winter'
    6: 1, 7: 1, 8: 1,          # 'summer'
    4: 2, 5: 2,                # 'spring'
    9: 3, 10: 3, 11: 3,        # 'autumn'
}
seasons = pd.DataFrame({'date_block_num': range(35)})
# calendar month (1-12) derived from the block number
month = seasons.date_block_num % 12 + 1
# kept as float here; the column is downcast to int8 after merging into grid
seasons['season'] = month.map(season_of_month).astype(float)

# revenue feature: daily count times unit price
train['revenue'] = train.item_price * train.item_cnt_day

In [None]:
# prices ending with 9: look at the last digit of the integer part of each price
str_prices = train.item_price.astype(str).str.split('.').str[0]
ends_with_9 = str_prices.str.endswith('9')
# take items with prices ending with 9 more than 70% of the time
share_ending_9 = ends_with_9.groupby(train.item_id).mean()
mostly_ends_9 = (share_ending_9 > 0.70) * 1
# Items without any sales have no share and default to 1.
# The fill is assigned back explicitly: a chained
# `items.end_with_9.fillna(..., inplace=True)` is not guaranteed to modify
# `items`, which would leave NaN for the later int8 cast. `assign` also avoids
# writing into `items` in place.
items = items.assign(end_with_9=items['item_id'].map(mostly_ends_9).fillna(1))

In [None]:
# the intermediate price strings/flags are no longer needed; release them
del str_prices, ends_with_9
gc.collect()

In [None]:
# pre-process the numerical features: signed cube root of the daily count and revenue.
# (The log transform of item_price was removed: the column is dropped just below,
# so the transformed values were never used.)
train['item_cnt_day'] = train.item_cnt_day.apply(cube_root)
train['revenue'] = train.revenue.apply(cube_root)
# drop unwanted cols
train = train.drop(['date', 'item_price'], axis=1)

In [None]:
# add the generated features into grid (left joins keep every grid row)
grid = (
    grid
    .merge(items, on='item_id', how='left')
    .merge(seasons, on='date_block_num', how='left')
    .merge(cats, on='item_category_id', how='left')
    .merge(shops, on='shop_id', how='left')
)

In [None]:
# downcast the datatypes: every merged-in feature column is stored as int8
int8_columns = ('item_category_id', 'end_with_9', 'season',
                'main_code', 'sub_code', 'city_code')
for column in int8_columns:
    grid[column] = grid[column].astype(np.int8)

In [None]:
# the lookup frames have been merged into grid; release them
del items, cats, seasons, shops
gc.collect()

### Aggregate sales data

In [None]:
# aggregate sale counts: sum the (already transformed) daily counts per month/shop/item
group = (
    train.groupby(index_cols)['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)
# grid rows without any sales (including all test rows) get 0
grid = grid.merge(group, on=index_cols, how='left').fillna(0)

In [None]:
grid.head()

In [None]:
# the monthly target is now in grid; release the raw sales and the aggregate
del train, group
gc.collect()

### Mean encodings

In [None]:
from sklearn.model_selection import KFold
# global mean of the target over the training blocks (date_block_num < 34);
# used below to fill encodings that come back as NaN
gb_mean = grid.loc[grid.date_block_num < 34, 'item_cnt_month'].mean()

def mean_encode(cat_feat, all_data, n_splits=5, test_block=34):
    """Out-of-fold mean encoding of ``cat_feat`` against ``item_cnt_month``.

    Rows with ``date_block_num < test_block`` are cut into ``n_splits``
    consecutive (unshuffled) folds, and each fold is encoded with the
    per-category target means computed on the other folds. Rows with
    ``date_block_num == test_block`` are cut the same way, and their k-th
    fold is encoded with the means fitted for the k-th training split.

    Parameters
    ----------
    cat_feat : str
        Column of ``all_data`` to encode.
    all_data : pandas.DataFrame
        Must contain ``date_block_num``, ``item_cnt_month`` and ``cat_feat``.
    n_splits : int, default 5
        Number of folds.
    test_block : int, default 34
        ``date_block_num`` value that marks the rows to predict.

    Returns
    -------
    pandas.Series
        Encoded values labelled with the index of the rows they belong to
        (training rows first, then test rows). Categories that do not occur
        in the fitting folds are NaN.

    Raises
    ------
    ValueError
        Raised by ``KFold.split`` when either subset has fewer rows than
        ``n_splits``.
    """
    # random_state is only accepted together with shuffle=True, so it is not passed
    kf = KFold(n_splits=n_splits, shuffle=False)
    train_part = all_data.loc[all_data.date_block_num < test_block]
    test_part = all_data.loc[all_data.date_block_num == test_block]

    encoded_tr = []
    encoded_te = []
    fold_pairs = zip(kf.split(train_part), kf.split(test_part))
    for (fit_pos, val_pos), (_, te_pos) in fold_pairs:
        means = train_part.iloc[fit_pos].groupby(cat_feat).item_cnt_month.mean()
        # map through the encoded column itself (not a fixed 'item_id'), and take
        # fold positions from the subset they were generated for
        encoded_tr.append(train_part.iloc[val_pos][cat_feat].map(means))
        encoded_te.append(test_part.iloc[te_pos][cat_feat].map(means))

    # one concat replaces the repeated Series.append calls; the original row
    # labels are kept so the result aligns with all_data on assignment
    return pd.concat(encoded_tr + encoded_te)

In [None]:
# columns to mean-encode; each produces a '<column>_enc' feature
menc_lag = ['item_id','date_block_num', 'item_category_id', 'season','main_code']

for feature in tqdm_notebook(menc_lag):
    encoded = mean_encode(feature, grid)
    # encodings that come back as NaN fall back to the global target mean
    grid[feature + '_enc'] = encoded.fillna(gb_mean)

In [None]:
# downcast the mean encodings and the target to float16 to save memory
float_columns = grid.select_dtypes(include=float).columns
for name in tqdm_notebook(float_columns):
    grid[name] = grid[name].astype(np.float16)

In [None]:
grid.head()

### Lags

In [None]:
def lag_feature(df, lags, col):
    """Add lagged copies of ``col`` to ``df``, one new column per lag.

    For each ``lag`` in ``lags`` a column ``'<col>_lag_<lag>'`` is joined on
    ``(date_block_num, shop_id, item_id)``; it holds the value ``col`` had for
    the same shop and item ``lag`` blocks earlier, or NaN when there is no
    such row. ``df`` itself is not modified; the merged frame is returned.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    # taken once, before any lag column is added
    snapshot = df[keys + [col]]
    for lag in lags:
        lagged = snapshot.copy()
        lagged.columns = keys + ['{}_lag_{}'.format(col, lag)]
        # moving the block forward makes block t line up with block t + lag
        lagged['date_block_num'] += lag
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
# create lags: (column, list of lags in months) pairs
col_lags = [
    ('item_id_enc', [1,2,3]),
    ('date_block_num_enc', [2]),
    ('item_category_id_enc', [3]),
    ('season_enc', [3]),
    ('item_cnt_month', [1,2,3,4,6]),
    ('main_code_enc', [2]),
]

for feature, lags in tqdm_notebook(col_lags):
    grid = lag_feature(grid, lags, feature)

In [None]:
grid.head()

In [None]:
# drop unwanted cols: the un-lagged encodings (names ending in 'enc'); their lagged copies stay
encoding_cols = [name for name in grid.columns if name.endswith('enc')]
grid = (
    grid
    .drop(columns=encoding_cols)
    # blocks 0-5 are removed; the longest lag created above is 6 months
    .loc[lambda frame: frame.date_block_num > 5]
    # lags with no matching earlier row become 0
    .fillna(0)
)

In [None]:
grid.info()

In [None]:
# downcast any remaining float columns (lag features, target) to float16
float_columns = grid.select_dtypes(include=float).columns
for name in tqdm_notebook(float_columns):
    grid[name] = grid[name].astype(np.float16)

In [None]:
# reclaim memory left over from feature building before training starts
gc.collect()

### Training

In [None]:
from catboost import CatBoostRegressor
from catboost import Pool

In [None]:
# grid.head()

In [None]:
# Disabled alternative feature selection. All three lines are commented out:
# with only the first one commented, the continuation lines were a syntax error.
# grid = grid[['date_block_num', 'shop_id', 'item_id','season_enc_lag_3',\
#              'date_block_num_enc_lag_2','main_code_enc_lag_2','item_id_enc_lag_2','item_id_enc_lag_3',\
#              'item_cnt_month_lag_3','item_cnt_month_lag_2','item_cnt_month_lag_1', 'item_cnt_month']]

In [None]:
def create_pool(X_train, y_train, X_test=None, y_test=None, cat_features=None, b_eval=False, only_train=False):
    """Wrap the train (and optionally test/eval) data in CatBoost ``Pool`` objects.

    Parameters
    ----------
    X_train, y_train
        Features and label for the training pool.
    X_test, y_test
        Features and label for the second pool. ``y_test`` is attached only
        when ``b_eval`` is true.
    cat_features
        Passed to every ``Pool`` as ``cat_features``.
    b_eval : bool
        When true the second pool is labelled with ``y_test``; otherwise it
        is built without a label.
    only_train : bool
        When true only the training pool is built and returned.

    Returns
    -------
    The training ``Pool`` alone when ``only_train`` is true, otherwise the
    tuple ``(train_pool, test_pool)``.
    """
    train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
    if only_train:
        return train_pool

    # an unlabelled pool is a Pool whose label is left at None
    second_label = y_test if b_eval else None
    test_pool = Pool(X_test, label=second_label, cat_features=cat_features)
    return train_pool, test_pool

In [None]:
# pool_train, pool_eval = create_pool(X_train=grid.loc[grid.date_block_num < 33, :].drop('item_cnt_month', axis=1), \
#                         y_train=grid.loc[grid.date_block_num < 33, 'item_cnt_month'],\
#                         X_test=grid.loc[grid.date_block_num == 33, :].drop('item_cnt_month', axis=1),\
#                         y_test=grid.loc[grid.date_block_num == 33, 'item_cnt_month'], cat_features=[i for i in range(3)], b_eval=True) 

In [None]:
# del grid
# gc.collect()

In [None]:
# model = CatBoostRegressor(
#         random_seed=8,
#         bootstrap_type='Poisson',
#         learning_rate=0.05,
#         depth=7,
#         l2_leaf_reg=5, 
#         subsample=0.2,
#         task_type = "GPU", 
#         snapshot_file='snapshot.bkp',
#         max_ctr_complexity=4, 
#         boosting_type='Plain',
#         od_type='Iter',
#         od_wait=10
#     )

# model.fit(
#     pool_train,
#     eval_set=pool_eval,
#     logging_level='Silent', 
#     plot=True
# )

In [None]:
# import matplotlib.pyplot as plt 

# def plot_feat_imp(model):
#     feat_imps = model.get_feature_importance(prettified=True)
#     feats = []
#     values = []
    
#     for pair in feat_imps:
#         feats.append(pair[0])
#         values.append(pair[1])
    
#     plt.figure(figsize=(15,10))
#     plt.barh(feats, values)

In [None]:
# plot_feat_imp(model)

In [None]:
# sorted(model.get_feature_importance(prettified=True), key=lambda x: x[1])

In [None]:
# del pool_train, pool_eval
# gc.collect()

In [None]:
# del model
# gc.collect()

In [None]:
# keep the three id columns, the selected lag features and the target (last column)
model_columns = [
    'date_block_num', 'shop_id', 'item_id',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_id_enc_lag_2',
    'item_cnt_month_lag_3', 'item_id_enc_lag_3', 'main_code_enc_lag_2',
    'date_block_num_enc_lag_2', 'item_cnt_month',
]
grid = grid[model_columns]

In [None]:
# train on every block before 34; block 34 holds the rows to predict
is_train = grid.date_block_num < 34
is_test = grid.date_block_num == 34
pool_train, pool_test = create_pool(
    X_train=grid.loc[is_train, :].drop('item_cnt_month', axis=1),
    y_train=grid.loc[is_train, 'item_cnt_month'],
    X_test=grid.loc[is_test, :].drop('item_cnt_month', axis=1),
    # the first three columns (date_block_num, shop_id, item_id) are categorical
    cat_features=[0, 1, 2],
)

In [None]:
# hyper-parameters of the final model, gathered in one place
catboost_params = dict(
    iterations=115,
    random_seed=8,
    bootstrap_type='Poisson',
    learning_rate=0.05,
    depth=7,
    l2_leaf_reg=5,
    subsample=0.2,
    task_type="GPU",
    snapshot_file='snapshot.bkp',
    max_ctr_complexity=4,
    boosting_type='Plain',
)
model = CatBoostRegressor(**catboost_params)

# fit on the full training pool; no eval set is passed here
model.fit(pool_train, logging_level='Silent', plot=True)

In [None]:
# feature importances as (feature, importance) pairs, least important first
feat_imp = model.get_feature_importance(prettified=True)
if isinstance(feat_imp, pd.DataFrame):
    # iterating a DataFrame yields its column names, so turn the rows into
    # tuples before sorting; a list of pairs is passed through unchanged
    feat_imp = list(feat_imp.itertuples(index=False, name=None))
sorted(feat_imp, key=lambda pair: pair[1])

In [None]:
# predict block 34 and clip to [0, 20]
# NOTE(review): item_cnt_month was summed from cube-rooted daily counts and no
# inverse transform is applied before clipping — confirm this is intended
preds = np.clip(model.predict(pool_test), 0, 20)

In [1]:
import datetime
import time
# current local time as a 'YYYY-MM-DD HH:MM:SS' string
ts = time.time()
st = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.fromtimestamp(ts))
#make_submission(st+':submit.csv', preds)

In [2]:
st

'2018-11-25 11:08:19'

In [None]:
# two-column submission: a running ID (0..214199) next to its prediction
submission = pd.DataFrame({"item_cnt_month": preds})
submission.insert(0, "ID", range(214200))

In [None]:
# write the submission without the DataFrame index, so the file has only ID and item_cnt_month
submission.to_csv('submit.csv', index=False)

In [None]:
#preds = model.predict(pool_test).clip(0,20)
# def make_submission(file_name, preds):
#     submission = pd.DataFrame({
#     "ID": range(214200), 
#     "item_cnt_month": preds
#     })
    
#     submission.to_csv(file_name, index=False)

In [10]:
# NOTE(review): no cell visible in this notebook writes a file with this name
# (the submission above is saved as 'submit.csv') — confirm the file exists
# before re-running
sub = pd.read_csv('2018-11-25 11:08:19.csv')

In [11]:
sub

Unnamed: 0,ID,item_cnt_month
0,0,0.605774
1,1,0.173049
2,2,0.664610
3,3,0.366243
4,4,0.173049
5,5,0.417017
6,6,1.227461
7,7,0.232728
8,8,0.884269
9,9,0.358302
