# Bench Mark - A_Wish Based

In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from multiprocessing import Pool
from tqdm import tqdm_notebook
# import GPyOpt

items_df = pd.read_csv('Data/items.csv')
shops_df = pd.read_csv('Data/shops.csv')
icats_df = pd.read_csv("Data/item_categories.csv")
train_df = pd.read_csv("Data/sales_train.csv.gz")
test_df  = pd.read_csv('Data/test.csv.gz') # 214200 rows

shops_df['city_id'] = shops_df.shop_name.apply(lambda x: str.replace(x, '!', '')).apply(lambda x: x.split(' ')[0])
shops_df['city_id'] = pd.Categorical(shops_df['city_id']).codes

icats_df['item_category_group'] = icats_df['item_category_name'].apply(lambda x: str(x).split(' ')[0])
icats_df['item_category_group'] = pd.Categorical(icats_df['item_category_group']).codes

train_piv = train_df.pivot_table(index=['shop_id','item_id'], columns='date_block_num', values='item_cnt_day',aggfunc='sum').fillna(0.0)    
train_piv = train_piv.reset_index()
train_piv.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
X_train = pd.merge(train_piv, shops_df, how='left', on=['shop_id'])
X_train = pd.merge(X_train, items_df, how='left', on=['item_id'])
X_train = pd.merge(X_train, icats_df, how='left', on=['item_category_id'])
Y_train = train_piv[33]

X_train.drop(labels=['shop_name', 'item_name', 'item_category_name', 33], axis=1, inplace=True)

In [3]:
X_test = pd.merge(test_df, train_piv, how='left', on=['shop_id', 'item_id']).fillna(0)
X_test = pd.merge(X_test, shops_df, how='left', on=['shop_id'])
X_test = pd.merge(X_test, items_df, how='left', on=['item_id'])
X_test = pd.merge(X_test, icats_df, how='left', on=['item_category_id'])

X_test.drop(labels=['shop_name', 'item_name', 'item_category_name', 'ID', 0], axis=1, inplace=True)

for i in range(33):
    X_test.rename(columns={i+1: i}, inplace=True)

# KNN Clustering on city

In [13]:
class NearestNeighborsFeats(BaseEstimator, ClassifierMixin):
    '''
        This class should implement KNN features extraction 
    '''
    def __init__(self, n_jobs, k_list, metric, month_history, n_classes=None, n_neighbors=None, eps=0.000001):
        self.n_jobs = n_jobs
        self.k_list = k_list
        self.metric = metric
        self.month_history = month_history
        
        if n_neighbors is None:
            self.n_neighbors = max(k_list) 
        else:
            self.n_neighbors = n_neighbors
            
        self.eps = eps        
        self.n_classes_ = n_classes
    
    def fit(self, X, y):
        '''
            Set's up the train set and self.NN object
        '''
        
        self.X = X
        
        # Create a NearestNeighbors (NN) object. We will use it in `predict` function 
        self.NN = NearestNeighbors(n_neighbors=max(self.k_list), 
                                   metric=self.metric, 
                                   n_jobs=1, 
                                   algorithm='auto')
        self.NN.fit(X)
        
        # Store labels 
        self.y_train = y
        
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
        
        
    def predict(self, X):       
        '''
            Produces KNN features for every object of a dataset X
        '''
        if self.n_jobs == 1:
            test_feats = []
            for i in tqdm_notebook(range(X.shape[0])):
                test_feats.append(self.get_features_for_one(X[i:i+1]))
        else:
            '''
                 *Make it parallel*
                     Number of threads should be controlled by `self.n_jobs`  
                     
                     
                     You can use whatever you want to do it
                     For Python 3 the simplest option would be to use 
                     `multiprocessing.Pool` (but don't use `multiprocessing.dummy.Pool` here)
                     You may try use `joblib` but you will most likely encounter an error, 
                     that you will need to google up (and eventually it will work slowly)
                     
                     For Python 2 I also suggest using `multiprocessing.Pool` 
                     You will need to use a hint from this blog 
                     http://qingkaikong.blogspot.ru/2016/12/python-parallel-method-in-class.html
                     I could not get `joblib` working at all for this code 
                     (but in general `joblib` is very convenient)
                     
            '''
            test_feats = []
            
            pool = Pool(processes=self.n_jobs)
            
            for i in range(X.shape[0]):
                test_feats.append(pool.apply_async(self.get_features_for_one, (X[i:i+1],)))
            
            pool.close()
            
            test_feats = [res.get() for res in test_feats]
            pool.join()
            
            
#             assert False, 'You need to implement it for n_jobs > 1'
            
        return np.vstack(test_feats)
        
        
    def get_features_for_one(self, x):
        '''
            Computes KNN features for a single object `x`
        '''

        NN_output = self.NN.kneighbors(x) # return dist: array representing the lengths to points
                                          #        ind: indices of the nearest points in the population index
        # Vector of size `n_neighbors`
        # Stores indices of the neighbors
        neighs = NN_output[1][0] # cast down to 1-d array
        
        # Vector of size `n_neighbors`
        # Stores distances to corresponding neighbors
        neighs_dist = NN_output[0][0] # cast down to 1-d array
        
        # Vector of size `n_neighbors`
        # Stores labels of corresponding neighbors
        neighs_y = self.y_train[neighs] 

#         return_list = []
        
        for k in self.k_list:
            mean = self.X[neighs[:k]].mean(axis=0)
            std = self.X[neighs[:k]].std(axis=0)
            
        return_list = [mean[-self.month_history:], std[-self.month_history:]]
        
        knn_feats = np.hstack(return_list)
        
        return knn_feats

In [14]:
from sklearn.model_selection import KFold

def KNN_feature_transformation(X_train, X_test, categorical_features, metrics):
    
    x_train = X_train.drop(labels=categorical_features, axis=1).values
    x_test = X_test.drop(labels=categorical_features, axis=1).values
    
    k_list = [10, 20, 30]

    kf = KFold(n_splits=5)
    
    df_train = pd.DataFrame()
    
    print "5-folds training on the x_train"
    
    for metric in metrics:
        for train_index, test_index in tqdm_notebook(kf.split(x_train)):
    
            # Create instance of our KNN feature extractor
            NNF = NearestNeighborsFeats(n_jobs=1, k_list=k_list, metric=metric, month_history=6)

            train_kf = x_train[train_index]

            # Fit on train_kf set
            NNF.fit(train_kf, X_train['city_id'].values[train_index])

            # Get features for "train" with KFold regularization
            test_kf = x_train[test_index]
            test_kf_knn = NNF.predict(test_kf)

            test_kf_knn_cat = X_train[categorical_features].values[test_index]

            data = np.hstack((test_kf_knn_cat, test_kf_knn))

            columns = categorical_features + ["feature_{:03d}".format(i) for i in range(test_kf_knn.shape[1])]

            df_0 = pd.DataFrame(data=data, columns=columns)

            # concat the result of each fold

            df_train = pd.concat([df_train, df_0], ignore_index=True)
            
    NNF.fit(x_train, x_train['city_id'].values)
    test_knn = NNF.predict(x_test)
    
    test_cat = X_test[categorical_features].values

    data = np.hstack((test_cat, test_knn))

    columns = categorical_features + ["feature_{:03d}".format(i) for i in range(test_knn.shape[1])]
    
    df_test = pd.DataFrame(data=data, columns=columns)
    
    return df_train, df_test

In [15]:
metrics = ['cosine']
categorical_features = ['shop_id', 'item_id', 'city_id', 'item_category_group']

X_train_city, X_test_city = KNN_feature_transformation(X_train, X_test, categorical_features, metrics)

5-folds training on the x_train







IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Mean and standard deviation of the last six month

In [None]:
prev_6_month_mean = X_train[[27,28,29,30,31,32]].mean(axis=1)
prev_6_month_std = X_train[[27,28,29,30,31,32]].std(axis=1)

X_train['prev_6_month_mean'] = prev_6_month_mean
X_train['prev_6_month_std'] = prev_6_month_std

In [None]:
change_from_previous_month = X_train[[ix for ix in range(1, 33)]]
change_from_previous_month.columns = [ix for ix in range(0, 32)]

change_from_previous_month = change_from_previous_month - X_train[[ix for ix in range(0, 32)]]
change_from_previous_month.columns = ["change_{}".format(ix) for ix in range(0, 32)]

X_train['change_prev_6_month_mean'] = change_from_previous_month[["change_{}".format(ix) for ix in range(26, 32)]].mean(axis=1)

RandomForest

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400, max_depth=10, random_state=42)

rf.fit(X_train, Y_train)

preds = rf.predict(X_train)

print r2_score(Y_train, preds) 
print mean_squared_error(Y_train, preds) 

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

X_new = SelectKBest(mutual_info_classif, k=12).fit_transform(X_train, Y_train)

rf.fit(X_new, Y_train)

preds = rf.predict(X_new)

print r2_score(Y_train, preds) 
print mean_squared_error(Y_train, preds) 

In [None]:
feature_importance = []

for column, importance in zip(X_train.columns, rf.feature_importances_):
    feature_importance.append((column, importance))
    
feature_importance.sort(key=lambda x: x[1], reverse=True)

In [None]:
feature_importance

In [None]:
summation = 0

for _, a in feature_importance:
    if a > 0.001:
        summation +=a
        
print summation

In [None]:
%%time

baseline = -cross_val_score(rf, X_train, Y_train, scoring='mean_squared_error', cv=5, n_jobs=-1).mean()
print baseline # 15.041

In [None]:
def f(parameters):
    parameters = parameters[0]
    
    rf = RandomForestRegressor(max_depth=int(parameters[1]),
                                              n_estimators=int(parameters[0]))
    
    score = -cross_val_score(rf, X_train, Y_train, scoring='mean_squared_error', cv=5, n_jobs=-1).mean()
    score = np.array(score)
    return score

In [None]:
# Bounds (NOTE: define continuous variables first, then discrete!)
bounds = [
            {'name': 'max_depth', 'type': 'discrete', 'domain': (10, 30)},
            {'name': 'n_estimators', 'type': 'discrete', 'domain': (100, 1000)}
             ]

np.random.seed(777)
optimizer = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds,
                                                initial_design_numdata=4,
                                                model_type='sparseGP',
                                                acquisition_type='MPI',
                                                acquisition_par=0.1,
                                                exact_eval=True)

max_iter = 50

optimizer.run_optimization(max_iter)

In [None]:
print('MSE:', np.min(optimizer.Y), 'Gain:', baseline/np.min(optimizer.Y)*100)

In [None]:
optimizer.Y

In [None]:
X_train_red = X_train.copy()

for column in X_train.columns:
    if my_dict[column] <=0.005:
        X_train_red.drop(labels=[column], axis=1, inplace=True)

rf_red = RandomForestRegressor(n_estimators=400, max_depth=10)        
        
rf_red.fit(X_train_red, Y_train)

preds = rf_red.predict(X_train_red)

print r2_score(Y_train, preds) 
print mean_squared_error(Y_train, preds) 

xgboost analysis

In [None]:
%%time

from xgboost import XGBRegressor

xgb = XGBRegressor(tree_method='gpu_hist')

xgb.fit(X_train, Y_train)

preds = xgb.predict(X_train)

print(r2_score(Y_train, preds)) 
print(mean_squared_error(Y_train, preds))

In [None]:
%%time

baseline = -cross_val_score(xgb, X_train, Y_train, scoring='mean_squared_error', cv=5).mean()
print(baseline)

In [None]:
def f(parameters):
    parameters = parameters[0]
    
    xgb = XGBRegressor(learning_rate=parameters[0],
                       max_depth=int(parameters[2]),
                       n_estimators=int(parameters[3]),
                       gamma=int(parameters[1]),
                       min_child_weight = parameters[4],
                       tree_method='gpu_hist')
    
    score = -cross_val_score(xgb, X_train, Y_train, scoring='mean_squared_error', cv=5).mean()
    score = np.array(score)
    return score

bounds = [
            {'name': 'learning_rate', 'type': 'continuous', 'domain': (0.001, 0.5)},
            {'name': 'gamma', 'type': 'continuous', 'domain': (0, 5)},
            {'name': 'max_depth', 'type': 'discrete', 'domain': (3, 20)},
            {'name': 'n_estimators', 'type': 'discrete', 'domain': (1, 1000)},
            {'name': 'min_child_weight', 'type': 'discrete', 'domain': (1, 10)}
         ]

np.random.seed(777)
optimizer = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds,
                                                initial_design_numdata=4,
                                                model_type='GP',
                                                acquisition_type='MPI',
                                                acquisition_par=0.1,
                                                exact_eval=True)

max_iter = 200

optimizer.run_optimization(max_iter)

In [None]:
print('MSE:', np.min(optimizer.Y), 'Gain:', baseline/np.min(optimizer.Y)*100)

LightGBM

In [None]:
%%time

from lightgbm import LGBMRegressor

lgb = LGBMRegressor(n_estimators=400, max_depth=10)

lgb.fit(X_train, Y_train)

preds = lgb.predict(X_train)

print r2_score(Y_train, preds) 
print mean_squared_error(Y_train, preds) 

baseline = -cross_val_score(lgb, X_train, Y_train, scoring='mean_squared_error', cv=5, n_jobs=-1).mean()
print baseline # 15.041

In [None]:
def f(parameters):
    parameters = parameters[0]
    
    lgb = LGBMRegressor(learning_rate=parameters[0],
                        max_depth=int(parameters[1]),
                        n_estimators=int(parameters[2]),
                        num_leaves=int(parameters[3]),
                        min_data_in_leaf = int(parameters[4]))
    
    score = -cross_val_score(lgb, X_train, Y_train, scoring='mean_squared_error', cv=5).mean()
    score = np.array(score)
    return score

bounds = [
            {'name': 'learning_rate', 'type': 'continuous', 'domain': (0.001, 0.1)},
            {'name': 'max_depth', 'type': 'discrete', 'domain': (3, 20)},
            {'name': 'n_estimators', 'type': 'discrete', 'domain': (100, 1000)},
            {'name': 'num_leaves', 'type': 'discrete', 'domain': (10, 50)},
            {'name': 'min_data_in_leaf', 'type': 'discrete', 'domain': (20, 50)}
         ]

np.random.seed(777)
optimizer = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds,
                                                initial_design_numdata=4,
                                                model_type='GP',
                                                acquisition_type='MPI',
                                                acquisition_par=0.1,
                                                exact_eval=True)

max_iter = 200

optimizer.run_optimization(max_iter)

print('MSE:', np.min(optimizer.Y), 'Gain:', baseline/np.min(optimizer.Y)*100)

# Bench Mark

In [None]:
import pandas as pd
import numpy as np

items_df = pd.read_csv('Data/items.csv')
shops_df = pd.read_csv('Data/shops.csv')
icats_df = pd.read_csv("Data/item_categories.csv")
train_df = pd.read_csv("Data/sales_train.csv.gz")
test_df  = pd.read_csv('Data/test.csv.gz') # 214200 rows

In [None]:
test_shops = test_df.shop_id.unique()
train_df = train_df[train_df.shop_id.isin(test_shops)]
test_items = test_df.item_id.unique()
train_df = train_df[train_df.item_id.isin(test_items)]

print('train:', train_df.shape, 'test:', test_df.shape, 'items:', items_df.shape, 'shops:', shops_df.shape)

In [None]:
test_only = test_df[~test_df['item_id'].isin(train_df['item_id'].unique())]['item_id'].unique()

In [None]:
# group by
train_grp = train_df.groupby(['date_block_num','shop_id','item_id'])

In [None]:
# price mean by month
train_price = pd.DataFrame(train_grp.mean()['item_price']).reset_index()
train_price.head()

In [None]:
# count summary by month
train_monthly = pd.DataFrame(train_grp.sum()['item_cnt_day']).reset_index()
train_monthly.rename(columns={'item_cnt_day':'item_cnt'}, inplace=True)
train_monthly.head()

In [None]:
train_piv = train_df.pivot_table(index=['shop_id','item_id'], columns='date_block_num', values='item_cnt_day',aggfunc='sum').fillna(0.0)    
train_piv = train_piv.reset_index()
train_piv.head()

In [None]:
grp = train_monthly.groupby(['shop_id', 'item_id'])
train_shop = grp.agg({'item_cnt':['mean','median','std']}).reset_index()
train_shop.columns = ['shop_id','item_id','cnt_mean_shop','cnt_med_shop','cnt_std_shop']
train_shop.head()

In [None]:
train_cat_monthly = pd.merge(train_monthly, items_df, on=['item_id'], how='left')
grp = train_cat_monthly.groupby(['shop_id', 'item_category_id'])
train_shop_cat = grp.agg({'item_cnt':['mean']}).reset_index()
train_shop_cat.columns = ['shop_id','item_category_id','cnt_mean_cat_shop']
train_shop_cat.head()

In [None]:
train_last = train_monthly[train_monthly['date_block_num']==33]
train_last = train_last.drop(['date_block_num'], axis=1).rename(columns={'item_cnt':'cnt_sum_last'})
train_last.head()

In [None]:
# Prev month
train_prev = train_monthly.copy()
train_prev['date_block_num'] = train_prev['date_block_num'] + 1
train_prev = train_prev.rename(columns={'item_cnt':'cnt_sum_prev'})
train_prev.head()

In [None]:
train_cat_prev = pd.merge(train_prev, items_df, on=['item_id'], how='left')
grp = train_cat_prev.groupby(['date_block_num','shop_id','item_category_id'])
train_cat_prev = grp['cnt_sum_prev'].sum().reset_index()
train_cat_prev = train_cat_prev.rename(columns={'cnt_sum_prev':'cnt_sum_cat_prev'})
train_cat_prev.head()

In [None]:
col = np.arange(34)
pivT = train_piv[col].T
evm_s = pivT.ewm(span=12).mean().T
evm_l = pivT.ewm(span=26).mean().T

macd = evm_s - evm_l
sig = macd.ewm(span=9).mean()

train_piv_key = train_piv.loc[:,['shop_id','item_id']]
train_evm_list = []

for c in col:
    sub_evm_s = pd.DataFrame(evm_s.loc[:,c]).rename(columns={c:'cnt_evm_s_prev'})
    sub_evm_l = pd.DataFrame(evm_l.loc[:,c]).rename(columns={c:'cnt_evm_l_prev'})
    sub_macd = pd.DataFrame(macd.loc[:,c]).rename(columns={c:'cnt_macd_prev'})
    sub_sig = pd.DataFrame(sig.loc[:,c]).rename(columns={c:'cnt_sig_prev'})
    
    sub_evm = pd.concat([train_piv_key, sub_evm_s, sub_evm_l, sub_macd, sub_sig], axis=1)
    sub_evm['date_block_num'] = c + 1
    train_evm_list.append(sub_evm)
    
train_evm_prev = pd.concat(train_evm_list)
#train_evm_prev.head()
train_evm_prev.query("shop_id == 2 & item_id == 30").tail()

In [None]:
icats_df['item_category_group'] = icats_df['item_category_name'].apply(lambda x: str(x).split(' ')[0])
icats_df['item_category_group'] = pd.Categorical(icats_df['item_category_group']).codes

item_cats = pd.merge(icats_df, pd.get_dummies(icats_df['item_category_group'], prefix='item_category_group', drop_first=True), left_index=True, right_index=True)
item_cats.drop(['item_category_group'], axis=1, inplace=True)

shops_df['city'] = shops_df.shop_name.apply(lambda x: str.replace(x, '!', '')).apply(lambda x: x.split(' ')[0])
shops_df['city'] = pd.Categorical(shops_df['city']).codes

In [None]:
def mergeFeature(source): 
    d = source
    d = pd.merge(d, items_df, on=['item_id'], how='left')
    d = pd.merge(d, item_cats, on=['item_category_id'], how='left')
    d = pd.merge(d, shops_df, on=['shop_id'], how='left')

    d = pd.merge(d, train_price, on=['date_block_num','shop_id','item_id'], how='left')
    d = pd.merge(d, train_shop, on=['shop_id','item_id'], how='left')
    #d = pd.merge(d, train_shop_cat, on=['shop_id','item_category_id'], how='left')
    #d = pd.merge(d, train_last, on=['shop_id','item_id'], how='left')
    d = pd.merge(d, train_prev, on=['date_block_num','shop_id','item_id'], how='left')
    d = pd.merge(d, train_evm_prev, on=['date_block_num','shop_id','item_id'], how='left')
    d = pd.merge(d, train_cat_prev, on=['date_block_num','shop_id','item_category_id'], how='left')

    d.drop(['shop_id','shop_name','item_id','item_name','item_category_id','item_category_name'], axis=1, inplace=True)
    d.fillna(0.0, inplace=True)
    return d


In [None]:
train_set = mergeFeature(train_monthly)

In [None]:
test_df['date_block_num'] = 34

X_test = mergeFeature(test_df.drop(['ID'], axis=1))

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

X_train = train_set.drop(['item_cnt'], axis=1)
Y_train = train_set['item_cnt']

xgb = XGBRegressor(n_estimators=25, max_depth=12, learning_rate=0.1, subsample=1, colsample_bytree=1, eval_metric='rmse')

xgb.fit(X_train, Y_train)

preds = xgb.predict(X_train)

print r2_score(Y_train, preds) #0.955330774186
print mean_squared_error(Y_train, preds) #5.69332522603

# Modification

In [None]:
import pandas as pd
import numpy as np

items_df = pd.read_csv('Data/items.csv')
shops_df = pd.read_csv('Data/shops.csv')
icats_df = pd.read_csv("Data/item_categories.csv")
train_df = pd.read_csv("Data/sales_train.csv.gz")
test_df  = pd.read_csv('Data/test.csv.gz') 

test_shops = test_df.shop_id.unique()
train_df = train_df[train_df.shop_id.isin(test_shops)]
test_items = test_df.item_id.unique()
train_df = train_df[train_df.item_id.isin(test_items)]

print('train:', train_df.shape, 'test:', test_df.shape, 'items:', items_df.shape, 'shops:', shops_df.shape)

In [None]:
train_grp = train_df.groupby(['date_block_num','shop_id','item_id'])

train_price = pd.DataFrame(train_grp.mean()['item_price']).reset_index()

test_grp = train_df.groupby(['shop_id', 'item_id'])

test_price = pd.DataFrame(test_grp.mean()['item_price']).reset_index()

test_df['date_block_num'] = 34
test_df['season'] = np.sin(34 * np.pi * 2/12.0)

a = test_df[['date_block_num', 'shop_id', 'item_id']]
a = pd.merge(a, test_price, how='left', on=['shop_id', 'item_id'])
train_price = pd.concat((train_price, a), ignore_index=True)

train_monthly = pd.DataFrame(train_grp.sum()['item_cnt_day']).reset_index()

train_monthly.rename(columns={'item_cnt_day':'item_cnt'}, inplace=True)

train_monthly['season'] = np.sin(train_monthly['date_block_num']*2*np.pi/12.0)

In [None]:
train_piv = train_df.pivot_table(index=['shop_id','item_id'], columns='date_block_num', values='item_cnt_day',aggfunc='sum').fillna(0.0)    
train_piv = train_piv.reset_index()

grp = train_monthly.groupby(['shop_id', 'item_id'])
train_shop = grp.agg({'item_cnt':['mean','median','std']}).reset_index()
train_shop.columns = ['shop_id','item_id','cnt_mean_shop','cnt_med_shop','cnt_std_shop']

In [None]:
train_cat_monthly = pd.merge(train_monthly, items_df, on=['item_id'], how='left')
grp = train_cat_monthly.groupby(['shop_id', 'item_category_id'])
train_shop_cat = grp.agg({'item_cnt':['mean']}).reset_index()
train_shop_cat.columns = ['shop_id','item_category_id','cnt_mean_cat_shop']
train_shop_cat.head()

In [None]:
shops_df['city_id'] = shops_df.shop_name.apply(lambda x: str.replace(x, '!', '')).apply(lambda x: x.split(' ')[0])
shops_df['city_id'] = pd.Categorical(shops_df['city_id']).codes
shops_df.head(5)

In [None]:
# construct cnt_mean_cat_shop

train_cat_shop_monthly = pd.merge(train_cat_monthly, shops_df, on=['shop_id'], how='left')

In [None]:
grp = train_cat_shop_monthly.groupby(['city_id', 'item_category_id'])
train_city_cat = grp.agg({'item_cnt':['mean']}).reset_index()
train_city_cat.columns = ['city_id', 'item_category_id', 'cnt_mean_cat_city']
train_city_cat.head()

In [None]:
train_last = train_monthly[train_monthly['date_block_num']==33]
train_last = train_last.drop(['date_block_num'], axis=1).rename(columns={'item_cnt':'cnt_sum_last'})
train_last.head()

In [None]:
# Prev month
train_prev = train_monthly.copy().drop(['season'], axis=1)
train_prev['date_block_num'] = train_prev['date_block_num'] + 1
train_prev = train_prev.rename(columns={'item_cnt':'cnt_sum_prev'})

train_cat_prev = pd.merge(train_prev, items_df, on=['item_id'], how='left')
grp = train_cat_prev.groupby(['date_block_num','shop_id','item_category_id'])
train_cat_prev = grp['cnt_sum_prev'].sum().reset_index()
train_cat_prev = train_cat_prev.rename(columns={'cnt_sum_prev':'cnt_sum_cat_prev'})
train_cat_prev.head()

In [None]:
col = np.arange(34)
pivT = train_piv[col].T
evm_s = pivT.ewm(span=12).mean().T
evm_l = pivT.ewm(span=26).mean().T

macd = evm_s - evm_l
sig = macd.ewm(span=9).mean()

train_piv_key = train_piv.loc[:,['shop_id','item_id']]
train_evm_list = []

for c in col:
    sub_evm_s = pd.DataFrame(evm_s.loc[:,c]).rename(columns={c:'cnt_evm_s_prev'})
    sub_evm_l = pd.DataFrame(evm_l.loc[:,c]).rename(columns={c:'cnt_evm_l_prev'})
    sub_macd = pd.DataFrame(macd.loc[:,c]).rename(columns={c:'cnt_macd_prev'})
    sub_sig = pd.DataFrame(sig.loc[:,c]).rename(columns={c:'cnt_sig_prev'})
    
    sub_evm = pd.concat([train_piv_key, sub_evm_s, sub_evm_l, sub_macd, sub_sig], axis=1)
    sub_evm['date_block_num'] = c + 1
    train_evm_list.append(sub_evm)
    
train_evm_prev = pd.concat(train_evm_list)
#train_evm_prev.head()
train_evm_prev.query("shop_id == 2 & item_id == 30").tail()

In [None]:
train_cat_monthly.head(5)

In [None]:
icats_df['item_category_group'] = icats_df['item_category_name'].apply(lambda x: str(x).split(' ')[0])
icats_df['item_category_group'] = pd.Categorical(icats_df['item_category_group']).codes

train_cat_group_monthly = pd.merge(train_cat_monthly, icats_df, on=['item_category_id'], how='left')

grp = train_cat_group_monthly.groupby(['item_category_group'])
train_group = grp.agg({'item_cnt':['mean', 'median', 'std']}).reset_index()
train_group.columns = ['item_category_group', 'cnt_mean_group', 'cnt_median_group', 'cnt_std_group']
train_group.head()

In [None]:
def mergeFeature(source): 
    d = source
    d = pd.merge(d, items_df, on=['item_id'], how='left')
    d = pd.merge(d, icats_df, on=['item_category_id'], how='left')
    d = pd.merge(d, shops_df, on=['shop_id'], how='left')

    d = pd.merge(d, train_price, on=['date_block_num','shop_id','item_id'], how='left')
    d = pd.merge(d, train_shop, on=['shop_id','item_id'], how='left')
    d = pd.merge(d, train_shop_cat, on=['shop_id','item_category_id'], how='left')
    d = pd.merge(d, train_city_cat, on=['city_id','item_category_id'], how='left')
    d = pd.merge(d, train_group, on=['item_category_group'], how='left')
    #d = pd.merge(d, train_last, on=['shop_id','item_id'], how='left')
    d = pd.merge(d, train_prev, on=['date_block_num','shop_id','item_id'], how='left')
    d = pd.merge(d, train_evm_prev, on=['date_block_num','shop_id','item_id'], how='left')
    d = pd.merge(d, train_cat_prev, on=['date_block_num','shop_id','item_category_id'], how='left')

    d.drop(['date_block_num', 'shop_id','shop_name','item_id','item_name','item_category_id','item_category_name', 'item_category_group', 'city_id'], axis=1, inplace=True)
    d.fillna(0.0, inplace=True)
    
    return d

In [None]:
def num_rescale(train, test):
    
    d = pd.concat([train, test], ignore_index=True)
    
    d['item_price_inv'] = d['item_price'].values.min()/d['item_price']
    d.drop(['item_price'], axis=1, inplace=True)
    d['item_price_inv'].fillna(0, inplace=True)
    
    columns = ['cnt_mean_shop', 'cnt_std_shop', 'cnt_mean_cat_shop', 'cnt_mean_cat_city', 'cnt_mean_group',
               'cnt_median_group', 'cnt_std_group', 'cnt_sum_prev', 'cnt_evm_s_prev', 'cnt_evm_l_prev',
               'cnt_macd_prev', 'cnt_sig_prev', 'cnt_sum_cat_prev']
    
    for column in columns:
    
        d[column] = d[column]/d[column].values.std()
    
    return d.iloc[:len(train)], d.iloc[len(train):]

In [None]:
train_set = mergeFeature(train_monthly)

X_train = train_set.drop(['item_cnt'], axis=1)
Y_train = train_set[['item_cnt']]

X_test = mergeFeature(test_df.drop(['ID'], axis=1))

X_train, X_test = num_rescale(X_train, X_test)
X_test.reset_index(drop=True, inplace=True)

X_train.to_csv("X_train.csv.gz", index=False, compression='gzip')
X_test['ID'] = test_df['ID']
X_test.to_csv("X_test.csv.gz", index=False, compression='gzip')
Y_train.to_csv("Y_train.csv.gz", index=False, compression='gzip')

After the X_train, Y_train, X_test are produced, we can work from this step