# Introduction

In this notebook, I go through how I worked to find a decent solution for this challenge using simple uncomplicated techniques. No machine learning, no fancy black-box models. Throw away your ARIMAs and Gradient Boosts. Think simple.

# Setup and Loading Data

In [1]:
%load_ext autoreload
# %autoreload 2
%matplotlib inline
%load_ext ipython_unittest

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

pd.set_option('display.max_rows', 12)

In [2]:
train = pd.read_csv("train.csv", low_memory=False, 
                    parse_dates=['date'], index_col=['date'])
train.sort_index(inplace=True)

test = train.loc['2017-09':, :]
train = train.loc[:'2017-08', :]
# train = train.loc['2015-09':'2017-08', :]  #short
# test = train.loc['2017-09': '2017-10', :] #short


In [3]:
test.head()

Unnamed: 0_level_0,store,item,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-09-01,2,5,29
2017-09-01,1,48,63
2017-09-01,2,45,138
2017-09-01,9,40,36
2017-09-01,8,17,48


In [4]:
def smape(A, F):
    A, F = A.astype(float), F.astype(float)

    return (100. / len(A)) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)  + np.finfo(float).eps))

### unit tests

In [5]:
%%unittest_testcase

def test_smape(self):
    A1, F1 = np.array([1, 2]), np.array([1.1, 2.2])
    expected_result1 = 100 * .5 * (.1 / 1.05 + .2 / 2.1)
    A2, F2 = np.array([1, 2]), np.array([3, 4])
    expected_result2 = 100 * .5 * (2 / 2 + 2. / 3)
    A3 = np.arange(101)
    F3 = np.arange(101)
    F3[100] = 101
    expected_result3 = (100. / 101) * (1 / (201. / 2))
    
    self.assertAlmostEqual(smape(A1,F1), expected_result1, places = 7)
    self.assertAlmostEqual(smape(A2,F2), expected_result2, places = 7)
    self.assertAlmostEqual(smape(A3,F3), expected_result3, places = 7)



Success

.
----------------------------------------------------------------------
Ran 1 test in 0.000s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

### This solution gets a score of 20.72 for the 2 month test


## Exploring the data

In [25]:
# Expand dataframe with more useful columns
def expand_df(df):
    data = df.copy()
    data['day'] = data.index.day
    data['month'] = data.index.month
    data['year'] = data.index.year
    data['dayofweek'] = data.index.dayofweek
#     data['week_of_year']  = data.index.weekofyear
    return data

data = expand_df(train)
data_test = expand_df(test)



In [26]:
train_colums = ['store', 'item', 'month', 'year', 'dayofweek']
labels_column = 'sales'
data_x = data[train_colums].values
data_y = data[labels_column].values

data_test_x = data_test[train_colums].values
data_test_y = data_test[labels_column].values

In [27]:
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=50)

regr_1.fit(data_x, data_y)
regr_2.fit(data_x, data_y)

DecisionTreeRegressor(criterion='mse', max_depth=50, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [28]:
y_1 = regr_1.predict(data_test_x)
y_2 = regr_2.predict(data_test_x)

In [29]:
smape(y_1, data_test_y)

39.48035565007049

In [30]:
smape(y_2, data_test_y)

13.58438011991653

## One hot transformation

In [11]:
smape(y_3, data_test_y)

13.5842842652896

In [55]:
train_colums = ['store', 'item', 'month', 'year', 'dayofweek']
labels_column = 'sales'
# colums_for_one_hot = ['store', 'item']
colums_for_one_hot = train_colums
data_x = pd.get_dummies(data[train_colums], columns=colums_for_one_hot).values
data_y = data[labels_column].values
print colums_for_one_hot
data_test_x = pd.get_dummies(data_test[train_colums], columns=colums_for_one_hot).values
data_test_y = data_test[labels_column].values

['store', 'item', 'month', 'year', 'dayofweek']


In [59]:
colums_for_one_hot

['store', 'item', 'month', 'year', 'dayofweek']

In [58]:
pd.get_dummies(data[train_colums], columns=colums_for_one_hot).shape

(852000, 84)

In [57]:
pd.get_dummies(data_test[train_colums], columns=colums_for_one_hot).shape

(61000, 72)

In [49]:
pd.get_dummies(data[train_colums], columns=colums_for_one_hot).shape

(852000, 84)

In [50]:
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=200)

regr_1.fit(data_x, data_y)
regr_2.fit(data_x, data_y)

DecisionTreeRegressor(criterion='mse', max_depth=200, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [51]:
y_1 = regr_1.predict(data_test_x)
y_2 = regr_2.predict(data_test_x)

ValueError: Number of features of the model must match the input. Model n_features is 84 and input n_features is 72 

In [None]:
print smape(y_1, data_test_y), smape(y_2, data_test_y)

In [52]:
data_test_x.shape

(61000, 72)