In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

from tqdm import tqdm_notebook
from itertools import product

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import gc

In [2]:
DATA_FOLDER = './data/'

sales = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [3]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [10]:
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
index_cols = ['shop_id', 'item_id', 'date_block_num']

In [5]:
def create_grid(sales, index_cols):
    from tqdm import tqdm_notebook
    
    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in tqdm_notebook(sales['date_block_num'].unique()):
        cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
        cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into a dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Groupby data to get shop-item-month aggregates
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    # Fix column names
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
    # Clip
    gb.target = gb.target.clip(0,20) #TODO
    # Join it to the grid    
    all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

    # Same as above but with shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # Same as above but with item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

    # Same as above but with category-month aggregates
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    
    gb = pd.merge(sales, item_category_mapping, how='left', on='item_id')
    gb = gb.groupby(['item_category_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_category':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_category_id', 'date_block_num']).fillna(0)
    all_data = all_data.drop(['item_category_id'], axis=1)

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    del grid, gb 
    gc.collect();
    
    return all_data

In [6]:
max_train_date_block_num = sales.date_block_num.max()
max_train_date_block_num

33

In [7]:
all_data = create_grid(
    pd.concat(
        [sales, test.assign(date_block_num=max_train_date_block_num+1)],
        ignore_index=True, sort=False),
    index_cols)
all_data.head()

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_category
0,59,22154,0,1.0,2017.0,18.0,6094.0
1,59,2552,0,0.0,2017.0,0.0,287.0
2,59,2554,0,0.0,2017.0,1.0,287.0
3,59,2555,0,0.0,2017.0,2.0,268.0
4,59,2564,0,0.0,2017.0,5.0,701.0


In [11]:
def create_lags(all_data, shift_range = [1, 2, 3, 4, 5, 12]):

    # List of columns that we will use to create lags
    cols_to_rename = list(all_data.columns.difference(index_cols)) 

    for month_shift in tqdm_notebook(shift_range):
        train_shift = all_data[index_cols + cols_to_rename].copy()

        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

    del train_shift

    # List of all lagged features
    fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]]  
    # We will drop these at fitting stage
    to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 
    
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')

    all_data = downcast_dtypes(all_data)
    gc.collect();
    
    return all_data, to_drop_cols

In [12]:
lagged_data, to_drop_cols = create_lags(all_data)
lagged_data.head()
to_drop_cols

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_category,target_lag_1,target_category_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_category_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_category_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_4,target_category_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_category_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_category_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,59,22154,0,1.0,2017.0,18.0,6094.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,59,2552,0,0.0,2017.0,0.0,287.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
2,59,2554,0,0.0,2017.0,1.0,287.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
3,59,2555,0,0.0,2017.0,2.0,268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56
4,59,2564,0,0.0,2017.0,5.0,701.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59


['target_item', 'target', 'target_shop', 'target_category', 'date_block_num']

In [None]:
# TODO shop_id one-hot encoding, category_id one-hot encoding

In [13]:
train = lagged_data.loc[lagged_data.date_block_num <= max_train_date_block_num]
test_lagged = lagged_data.loc[lagged_data.date_block_num == max_train_date_block_num + 1]

In [None]:
# TODO
#train = train.loc[train.shop_id.isin([26, 27, 28])]

In [14]:
X_train = train.drop(to_drop_cols, axis=1).values
print(X_train) #X.head()

[[5.9000e+01 2.2154e+04 0.0000e+00 ... 0.0000e+00 0.0000e+00 3.7000e+01]
 [5.9000e+01 2.5520e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.8000e+01]
 [5.9000e+01 2.5540e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.8000e+01]
 ...
 [2.1000e+01 7.6400e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 6.4000e+01]
 [2.1000e+01 7.6320e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 6.4000e+01]
 [2.1000e+01 7.4400e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.7000e+01]]


In [15]:
target_col = 'target'
y_train = train.loc[:, [target_col]].values.ravel()
print(y_train) #y.head()

[1. 0. 0. ... 0. 0. 0.]


In [None]:
def gen_time_split(data, n_splits):
    for i in range(n_splits):
        #print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = data.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = data.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index.values, vali_indices[vali_indices].index.values)

In [None]:
cv = gen_time_split(train, 3)
#for (tidx, vidx) in cv:
#    print(X_train[tidx])
#    print(X_train[vidx])
#    print(y_train[tidx])
#    print(y_train[vidx])
#for split in cv:
#    print(split)

In [None]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

In [16]:
X_test = test.merge(test_lagged, how='left').drop(to_drop_cols + ['ID'], axis=1).values
print(X_test)

[[5.0000e+00 5.0370e+03 0.0000e+00 ... 6.5000e+01 1.4450e+03 1.9000e+01]
 [5.0000e+00 5.3200e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.5000e+01]
 [5.0000e+00 5.2330e+03 1.0000e+00 ... 0.0000e+00 0.0000e+00 1.9000e+01]
 ...
 [4.5000e+01 1.5757e+04 0.0000e+00 ... 9.0000e+00 1.2510e+03 5.5000e+01]
 [4.5000e+01 1.9648e+04 0.0000e+00 ... 0.0000e+00 0.0000e+00 4.0000e+01]
 [4.5000e+01 9.6900e+02 0.0000e+00 ... 6.0000e+00 1.2510e+03 3.7000e+01]]


In [17]:
import xgboost as xgb
params = {'max_depth':7, 'eta':0.3, 'silent':0, 'objective':'reg:linear', 'eval_metric':'rmse'}
dtrain = xgb.DMatrix(X_train, label=y_train)

In [None]:
folds = [fold for fold in gen_time_split(train, 3)]
#print(folds)
eval_hist = xgb.cv(param, dtrain, num_boost_round=1000, folds=folds, metrics=['rmse'], early_stopping_rounds=10, verbose_eval=True)

In [None]:
opt_num_rounds = eval_hist.shape[0]
#eval_hist

In [24]:
xgb_model = xgb.train(params, dtrain, num_boost_round=34)#opt_num_rounds)

[21:25:32] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=7
[21:25:35] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[21:25:38] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[21:25:42] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[21:25:45] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[21:25:49] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=7
[21:25:52] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=7
[21:25:55] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_

In [25]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred_xgb = xgb_model.predict(dtrain)
print('RMSE for XGBoost is %f' % np.sqrt(mean_squared_error(y_train, y_train_pred_xgb)))
print('Train R-squared for XGBoost is %f' % r2_score(y_train, y_train_pred_xgb))

RMSE for XGBoost is 0.902616
Train R-squared for XGBoost is 0.454747


RMSE for XGBoost is 0.857923
Train R-squared for XGBoost is 0.507406

In [None]:
from xgboost import plot_importance
plot_importance(xgb_model)

In [20]:
# make prediction
dtest = xgb.DMatrix(X_test)
y_test_pred_xgb = xgb_model.predict(dtest)
print(y_test_pred_xgb)

[0.5309142  0.14552623 0.87035084 ... 0.04141507 0.04115519 0.01889181]


In [None]:
plt.scatter(y_train, y_train_pred_xgb)

In [21]:
submission = test.assign(item_cnt_month=y_test_pred_xgb)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,0.277889
std,0.739555
min,-1.609954
25%,0.027626
50%,0.10415
75%,0.273682
max,21.945091


In [22]:
submission.to_csv('Ensemble3.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [23]:
!gzip Ensemble3.csv
!ls

Baseline.ipynb	  Ensemble3.csv.gz  Refactor.ipynb
Combos.ipynb	  Ensemble3.ipynb   RefactorXGB.csv.gz
data		  Ensemble.csv.gz   Shop and item category means.ipynb
EDA.ipynb	  Ensemble.ipynb
Ensemble2.csv.gz  Lagged.ipynb


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

def create_keras_model():
    model = Sequential()
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

keras_estimator = KerasRegressor(build_fn=create_keras_model, epochs=10, batch_size=1000, verbose=True)
#results = cross_val_score(estimator, X_train, Y_train, cv=gen_time_split(train, 3))
model_keras = keras_estimator.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred_keras = keras_estimator.predict(X_train)
print('RMSE for Keras is %f' % np.sqrt(mean_squared_error(y_train, y_train_pred_keras)))
print('Train R-squared for Keras is %f' % r2_score(y_train, y_train_pred_keras))