In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from datetime import datetime
from src.sales_data import SalesData
import src.gb_lag as gbl

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
model_input = gbl.feature_matrix(daily_threshold=300)

Formatting sales data...
Aggregating for monthly sales...


In [3]:
model_input.head()

year_month,201301,201302,201303,201304,201305,201306,201307,201308,201309,201310,...,201501,201502,201503,201504,201505,201506,201507,201508,201509,201510
shop_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_1000,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_10004,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_1001,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_10012,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_1002,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Quick and Dirty
Let's see if this idea works with all the data. We'll train on data from 201301 to 201509 to see how well we can predict 201510 with RSME as our scoring method. Break up the data into independent and dependent arrays, the do a train-test split. With this data, we'll train a gradient boosting regressor model to predict on the test set.

In [4]:
X = model_input.iloc[:, :-1].copy()
y = model_input.iloc[:, -1:].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=123)

In [5]:
# scale
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)

In [10]:
gb = GradientBoostingRegressor(loss='ls', 
                               learning_rate=0.075, 
                               n_estimators=200, 
                               subsample=1.0, 
                               criterion='friedman_mse', 
                               min_samples_split=2, 
                               min_samples_leaf=1, 
                               min_weight_fraction_leaf=0.0, 
                               max_depth=3, 
                               min_impurity_decrease=0.0, 
                               min_impurity_split=None, 
                               init=None, 
                               random_state=0, 
                               max_features=None, 
                               alpha=0.9, 
                               verbose=1, 
                               max_leaf_nodes=None, 
                               warm_start=False, 
                               presort='deprecated', 
                               validation_fraction=0.1, 
                               n_iter_no_change=None, 
                               tol=0.0001, 
                               ccp_alpha=0.0)

gb.fit(X_train, np.array(y_train).ravel())
y_pred = gb.predict(X_test)

      Iter       Train Loss   Remaining Time 
         1           0.9334            1.27m
         2           0.8761            1.24m
         3           0.8267            1.22m
         4           0.7840            1.22m
         5           0.7452            1.20m
         6           0.7119            1.18m
         7           0.6841            1.18m
         8           0.6580            1.16m
         9           0.6372            1.15m
        10           0.6167            1.15m
        20           0.5151            1.06m
        30           0.4852           59.07s
        40           0.4739           55.02s
        50           0.4692           51.40s
        60           0.4641           47.68s
        70           0.4609           44.10s
        80           0.4591           40.60s
        90           0.4568           37.13s
       100           0.4549           33.71s
       200           0.4449            0.00s


In [11]:
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

RMSE: 1.6234056556770544


In [12]:
y_pred = sc_y.inverse_transform(y_pred)

In [13]:
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

RMSE: 1.21873544385586


This is a better score than what I was getting before with the baseline models. Additionally, the train loss looks to be converging.

## Gradient Boosting Regressor to Predict 201511
EDA on the dataset when looking at possibly implementing an ARIMA model showed a 12-period seasonality. So, in this attempt, we'll use the P11 of each year as the dependent variable. However, since the earliest period is 201301, we don't have an entire 12 periods to train on for 2013.

Let's see what it looks like when we train on P1 to P11 for years 2013 & 2014. In this case, P1-P10 will be the independent array and P11 will be the dependent one. Then, we'll use 2015 data to predict 201511.

In [14]:
model_input.head()

year_month,201301,201302,201303,201304,201305,201306,201307,201308,201309,201310,...,201501,201502,201503,201504,201505,201506,201507,201508,201509,201510
shop_item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_1000,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_10004,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_1001,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_10012,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_1002,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
cols_2013 = []
cols_2014 = []
cols_2015 = []

for col in model_input.columns:
    if '2013' in col:
        cols_2013.append(col)
    if '2014' in col:
        cols_2014.append(col)
    if '2015' in col:
        cols_2015.append(col)

input_2013 = model_input.loc[:, cols_2013].copy()
input_2014 = model_input.loc[:, cols_2014].copy()

input_2015 = model_input.loc[:, cols_2015].copy()

input_2013.drop('201312', axis=1, inplace=True)
input_2014.drop('201412', axis=1, inplace=True)

for df in [input_2013, input_2014, input_2015]:
    for col in df:
        df.rename(columns={col: f'P{col[-2:]}'}, inplace=True)

In [16]:
train_df = pd.concat([input_2013, input_2014], axis=0)

In [17]:
X = train_df.iloc[:, :-1].copy()
y = train_df.iloc[:, -1:].copy()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=123)

In [19]:
# scale
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)

In [20]:
gb = GradientBoostingRegressor(loss='ls', 
                               learning_rate=0.075, 
                               n_estimators=400, 
                               subsample=1.0, 
                               criterion='friedman_mse', 
                               min_samples_split=2, 
                               min_samples_leaf=1, 
                               min_weight_fraction_leaf=0.0, 
                               max_depth=2, 
                               min_impurity_decrease=0.0, 
                               min_impurity_split=None, 
                               init=None, 
                               random_state=0, 
                               max_features=None, 
                               alpha=0.9, 
                               verbose=1, 
                               max_leaf_nodes=None, 
                               warm_start=False, 
                               presort='deprecated', 
                               validation_fraction=0.1, 
                               n_iter_no_change=None, 
                               tol=0.0001, 
                               ccp_alpha=0.0)

gb.fit(X_train, np.array(y_train).ravel())
y_pred = gb.predict(X_test)

      Iter       Train Loss   Remaining Time 
         1           0.9270            1.47m
         2           0.8639            1.53m
         3           0.8093            1.52m
         4           0.7616            1.53m
         5           0.7206            1.56m
         6           0.6847            1.55m
         7           0.6537            1.53m
         8           0.6265            1.52m
         9           0.5987            1.53m
        10           0.5778            1.51m
        20           0.4518            1.42m
        30           0.4059            1.38m
        40           0.3774            1.35m
        50           0.3666            1.33m
        60           0.3607            1.29m
        70           0.3560            1.24m
        80           0.3516            1.19m
        90           0.3484            1.15m
       100           0.3455            1.10m
       200           0.3289           42.82s
       300           0.3198           21.29s
       40

In [21]:
y_pred = sc_y.inverse_transform(y_pred)

In [22]:
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

RMSE: 2.2001159275205935


# Train
Train the model on the entire set.


In [23]:
# scale
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
# X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)

In [24]:
gb.fit(X, np.array(y).ravel())

      Iter       Train Loss   Remaining Time 
         1           0.9250            1.83m
         2           0.8604            1.83m
         3           0.8046            1.85m
         4           0.7559            1.87m
         5           0.7126            1.90m
         6           0.6754            1.92m
         7           0.6429            1.92m
         8           0.6137            1.90m
         9           0.5887            1.91m
        10           0.5660            1.91m
        20           0.4307            1.83m
        30           0.3848            1.72m
        40           0.3660            1.64m
        50           0.3519            1.57m
        60           0.3433            1.51m
        70           0.3368            1.46m
        80           0.3318            1.41m
        90           0.3286            1.37m
       100           0.3262            1.32m
       200           0.3125           52.61s
       300           0.3059           26.11s
       40

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.075, loss='ls',
                          max_depth=2, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=400,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=0, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

# Test Set
Merge test set to 2015 data. Then get predictions.

In [25]:
test_set = pd.read_csv('data/test.csv')

test_set['shop_item'] = \
    test_set['shop_id'].astype(str) + '_' + test_set['item_id'].astype(str)

input_2015 = input_2015.reset_index()

test_input = pd.merge(test_set.loc[:, 'shop_item'], 
                    input_2015, 
                    on='shop_item', how='left')

test_input.fillna(0, inplace=True)

In [26]:
test_input.head()

Unnamed: 0,shop_item,P01,P02,P03,P04,P05,P06,P07,P08,P09,P10
0,5_5037,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,5_5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5_5233,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,5_5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5_5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
X_test = test_input.iloc[:, 1:]
X_test = sc_X.transform(X_test)

In [28]:
y_pred = gb.predict(X_test)

In [31]:
y_pred = sc_y.inverse_transform(y_pred)

In [32]:
test_set.head()

Unnamed: 0,ID,shop_id,item_id,shop_item
0,0,5,5037,5_5037
1,1,5,5320,5_5320
2,2,5,5233,5_5233
3,3,5,5232,5_5232
4,4,5,5268,5_5268


In [33]:
test_set['item_cnt_month'] = y_pred

output = test_set.loc[:, ['ID', 'item_cnt_month']].copy()
output.set_index(keys='ID', inplace=True)
output = output.clip(0, 20)

output.to_csv('output/submission_gb_ts_ss.csv', index=True)

In [34]:
output.head()

Unnamed: 0_level_0,item_cnt_month
ID,Unnamed: 1_level_1
0,0.363416
1,0.146229
2,0.758032
3,0.152871
4,0.146229
