In [1]:
import os
import numpy as np
import pandas as pd
import sklearn as skl
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)
np.set_printoptions(edgeitems=100)

#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

from tqdm import tqdm_notebook
from itertools import product

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import gc

In [2]:
for p in [np, pd, skl]:
    print (p.__name__, p.__version__)

numpy 1.14.5
pandas 0.23.0
sklearn 0.19.1


In [3]:
DATA_FOLDER = './data/'

sales = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [4]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [5]:
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
index_cols = ['shop_id', 'item_id', 'date_block_num']

In [6]:
def create_grid(sales, index_cols):
    from tqdm import tqdm_notebook
    
    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in tqdm_notebook(sales['date_block_num'].unique()):
        cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
        cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into a dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Groupby data to get shop-item-month aggregates
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    # Fix column names
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
    # Clip
    gb.target = gb.target.clip(0,20) #TODO
    # Join it to the grid    
    all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

    # Same as above but with shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # Same as above but with item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

    # Same as above but with category-month aggregates
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    
    gb = pd.merge(sales, item_category_mapping, how='left', on='item_id')
    gb = gb.groupby(['item_category_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_category':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_category_id', 'date_block_num']).fillna(0)
    all_data = all_data.drop(['item_category_id'], axis=1)

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    del grid, gb 
    gc.collect();
    
    return all_data

In [7]:
max_train_date_block_num = sales.date_block_num.max()
max_train_date_block_num

33

In [8]:
all_data = create_grid(
    pd.concat(
        [sales, test.assign(date_block_num=max_train_date_block_num+1)],
        ignore_index=True, sort=False),
    index_cols)
all_data.head()

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_category
0,59,22154,0,1.0,2017.0,18.0,6094.0
1,59,2552,0,0.0,2017.0,0.0,287.0
2,59,2554,0,0.0,2017.0,1.0,287.0
3,59,2555,0,0.0,2017.0,2.0,268.0
4,59,2564,0,0.0,2017.0,5.0,701.0


In [9]:
def create_lags(all_data, shift_range = [1, 2, 3, 4, 5, 12]):

    # List of columns that we will use to create lags
    cols_to_rename = list(all_data.columns.difference(index_cols)) 

    for month_shift in tqdm_notebook(shift_range):
        train_shift = all_data[index_cols + cols_to_rename].copy()

        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

    del train_shift

    # List of all lagged features
    fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]]  
    # We will drop these at fitting stage
    to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 
    
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')

    all_data = downcast_dtypes(all_data)
    gc.collect();
    
    return all_data, to_drop_cols

In [10]:
lagged_data, to_drop_cols = create_lags(all_data)
lagged_data.head()
to_drop_cols

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_category,target_lag_1,target_category_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_category_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_category_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_4,target_category_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_category_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_category_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,59,22154,0,1.0,2017.0,18.0,6094.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,59,2552,0,0.0,2017.0,0.0,287.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
2,59,2554,0,0.0,2017.0,1.0,287.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
3,59,2555,0,0.0,2017.0,2.0,268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56
4,59,2564,0,0.0,2017.0,5.0,701.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59


['target', 'target_item', 'target_category', 'target_shop', 'date_block_num']

In [11]:
#train = lagged_data.loc[lagged_data.date_block_num <= max_train_date_block_num]
#test_lagged = lagged_data.loc[lagged_data.date_block_num == max_train_date_block_num + 1]
train_indices = lagged_data.date_block_num <= max_train_date_block_num
test_indices = lagged_data.date_block_num == max_train_date_block_num + 1

In [12]:
numeric_features = sorted(list(set(list(lagged_data.columns.values)) 
                               - set(to_drop_cols) - set(index_cols) - set(['item_category_id'])))
numeric_features

#categorical_features = list(set(index_cols + ['item_category_id']) - set(['date_block_num']))
categorical_features = ['item_category_id']
categorical_features

['target_category_lag_1',
 'target_category_lag_12',
 'target_category_lag_2',
 'target_category_lag_3',
 'target_category_lag_4',
 'target_category_lag_5',
 'target_item_lag_1',
 'target_item_lag_12',
 'target_item_lag_2',
 'target_item_lag_3',
 'target_item_lag_4',
 'target_item_lag_5',
 'target_lag_1',
 'target_lag_12',
 'target_lag_2',
 'target_lag_3',
 'target_lag_4',
 'target_lag_5',
 'target_shop_lag_1',
 'target_shop_lag_12',
 'target_shop_lag_2',
 'target_shop_lag_3',
 'target_shop_lag_4',
 'target_shop_lag_5']

['item_category_id']

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer #, SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
#from sklearn.compose import ColumnTransformer
from sklearn_pandas import DataFrameMapper, gen_features

def only_numeric_features(data):
    return data.loc[:,data.columns.isin(numeric_features)]

def create_preprocessor(data, categorical_features):
    numeric_transformer = Pipeline(steps=[
    #    ('imputer', SimpleImputer(strategy='median')),
        ('selecter', FunctionTransformer(only_numeric_features)),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
    #    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(dtype=np.float32, sparse=False, handle_unknown='ignore', categorical_features=data.columns.isin(categorical_features)))])

    preprocessor = FeatureUnion([
#            ('num', numeric_features),
            ('cat', categorical_transformer)])
    
    return categorical_transformer

def create_mapper():
#    return DataFrameMapper([
#        (categorical_features, OneHotEncoder(dtype=np.float32, sparse=False, handle_unknown='ignore'))
#    ], default=None)
    categorial_maps = gen_features(
        columns=[categorical_features],
        classes=[{'class': OneHotEncoder, 'dtype': np.float32, 'sparse':False, 'handle_unknown':'ignore'}])
    return DataFrameMapper(categorial_maps, default=None)


In [14]:
mapper = create_mapper()
mapped_data = mapper.fit_transform(lagged_data.drop(to_drop_cols, axis=1))
mapper.transformed_names_

['item_category_id_0',
 'item_category_id_1',
 'item_category_id_2',
 'item_category_id_3',
 'item_category_id_4',
 'item_category_id_5',
 'item_category_id_6',
 'item_category_id_7',
 'item_category_id_8',
 'item_category_id_9',
 'item_category_id_10',
 'item_category_id_11',
 'item_category_id_12',
 'item_category_id_13',
 'item_category_id_14',
 'item_category_id_15',
 'item_category_id_16',
 'item_category_id_17',
 'item_category_id_18',
 'item_category_id_19',
 'item_category_id_20',
 'item_category_id_21',
 'item_category_id_22',
 'item_category_id_23',
 'item_category_id_24',
 'item_category_id_25',
 'item_category_id_26',
 'item_category_id_27',
 'item_category_id_28',
 'item_category_id_29',
 'item_category_id_30',
 'item_category_id_31',
 'item_category_id_32',
 'item_category_id_33',
 'item_category_id_34',
 'item_category_id_35',
 'item_category_id_36',
 'item_category_id_37',
 'item_category_id_38',
 'item_category_id_39',
 'item_category_id_40',
 'item_category_id_41',
 '

In [16]:
X_train = mapped_data[train_indices]
X_test = mapped_data[test_indices]

target_col = 'target'
y_train = lagged_data.loc[train_indices, [target_col]].values.ravel()

In [None]:
from scipy.stats import describe
describe(X_train)

In [18]:
#print(X_train) 
print(X_test)
#print(y_train) 

[[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  5.0000e+00 5.0370e+03 0.0000e+00 2.4430e+03 2.

In [None]:
def gen_time_split(data, n_splits):
    for i in range(n_splits):
        #print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = data.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = data.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index.values, vali_indices[vali_indices].index.values)

In [None]:
cv = gen_time_split(lagged_data, 3)
#for (tidx, vidx) in cv:
#    print(X_train[tidx])
#    print(X_train[vidx])
#    print(y_train[tidx])
#    print(y_train[vidx])
#for split in cv:
#    print(split)

In [None]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

In [None]:
#X_test = test.merge(test_lagged, how='left').drop(to_drop_cols + ['ID'], axis=1).values
#print(X_test)

In [19]:
import xgboost as xgb
xgb_params = {'max_depth':7, 'eta':0.3, 'silent':0, 'objective':'reg:linear', 'eval_metric':'rmse'}
dtrain = xgb.DMatrix(X_train, label=y_train)

In [None]:
folds = [fold for fold in gen_time_split(train, 3)]
#print(folds)
eval_hist = xgb.cv(xgb_params, dtrain, num_boost_round=1000, folds=folds, metrics=['rmse'], early_stopping_rounds=10, verbose_eval=True)

In [None]:
opt_num_rounds = eval_hist.shape[0]
#eval_hist

In [20]:
xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=100)#opt_num_rounds)

[12:37:43] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
[12:38:04] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[12:38:14] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[12:38:24] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[12:38:33] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[12:38:43] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[12:38:53] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[12:39:03] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 

[12:48:20] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 0 pruned nodes, max_depth=7
[12:48:30] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=7
[12:48:40] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=7
[12:48:49] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned nodes, max_depth=7
[12:48:59] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 234 extra nodes, 0 pruned nodes, max_depth=7
[12:49:08] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 214 extra nodes, 0 pruned nodes, max_depth=7
[12:49:18] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[12:49:28] /mnt/xgboost/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_

In [21]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred_xgb = xgb_model.predict(dtrain)
print('RMSE for XGBoost is %f' % np.sqrt(mean_squared_error(y_train, y_train_pred_xgb)))
print('Train R-squared for XGBoost is %f' % r2_score(y_train, y_train_pred_xgb))

RMSE for XGBoost is 0.861517
Train R-squared for XGBoost is 0.503271


RMSE for XGBoost is 0.857923
Train R-squared for XGBoost is 0.507406

In [41]:
xgb_model.save_model('Pipeline.model')

In [None]:
from xgboost import plot_importance
plot_importance(xgb_model)

In [22]:
# make prediction
dtest = xgb.DMatrix(X_test)
y_test_pred_xgb = xgb_model.predict(dtest)
print(y_test_pred_xgb)

[ 4.56317842e-01  1.65202200e-01  1.01598799e+00  4.60495472e-01
  2.94543529e+00  4.15736437e-01  5.01594424e-01  1.84917271e-01
  9.35702085e-01  2.83184767e-01  2.44372916e+00  5.07269561e-01
  1.28759593e-01  4.78896767e-01  1.35575688e+00  2.32887983e+00
  1.29792988e-02  1.87394351e-01  1.40559673e+00  2.19666660e-01
  5.00200212e-01  3.78364116e-01  2.14191055e+00  5.69336176e-01
  1.36088681e+00  4.95707810e-01  4.33984309e-01  5.56297898e-01
  8.99378181e-01  2.86618590e+00  8.01769078e-01  1.10505986e+00
  8.37616444e-01  9.47826624e-01  1.86360717e-01  3.24673951e-01
  2.04735309e-01  1.06710291e+00  1.44468457e-01  7.50213563e-02
  6.85839653e-01  2.16660452e+00  6.24728727e+00  3.56377172e+00
  7.91018546e-01  2.06946421e+00  6.93715215e-01  2.36140192e-01
  5.04642546e-01  7.58429050e-01  8.87397408e-01  3.78522247e-01
  3.18999439e-01  1.98120654e-01  7.07955503e+00  3.00846553e+00
  1.13395095e+00  1.39287341e+00  6.73263597e+00  2.72423339e+00
  9.20109653e+00  2.71544

In [39]:
y_test_pred = y_test_pred_xgb
predictions = lagged_data.loc[test_indices, ['shop_id', 'item_id']]
predictions = predictions.assign(item_cnt_month=y_test_pred)
predictions.head()

Unnamed: 0,shop_id,item_id,item_cnt_month
10913850,5,5037,0.456318
10913851,5,5320,0.165202
10913852,5,5233,1.015988
10913853,5,5232,0.460495
10913854,5,5268,2.945435


In [None]:
submission = test.merge(predictions, how='left')[['item_cnt_month']]
submission.head()
#submission.describe()

In [42]:
submission.to_csv('Pipeline.csv', index_label='ID') #header=['ID', 'item_cnt_month'])
!gzip Pipeline.csv

OSError: [Errno 12] Cannot allocate memory

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

def create_keras_model():
    model = Sequential()
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

keras_estimator = KerasRegressor(build_fn=create_keras_model, epochs=10, batch_size=1000, verbose=True)
#results = cross_val_score(estimator, X_train, Y_train, cv=gen_time_split(train, 3))
model_keras = keras_estimator.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred_keras = keras_estimator.predict(X_train)
print('RMSE for Keras is %f' % np.sqrt(mean_squared_error(y_train, y_train_pred_keras)))
print('Train R-squared for Keras is %f' % r2_score(y_train, y_train_pred_keras))