# Introduction

In this notebook, I walk through how I arrived at a decent solution for this challenge using simple, uncomplicated techniques. No machine learning, no fancy black-box models. Throw away your ARIMAs and Gradient Boosts. Think simple.

# Setup and Loading Data

In [1]:
%load_ext autoreload
# %autoreload 2
%matplotlib inline
%load_ext ipython_unittest

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Cap the number of rows pandas shows when displaying a frame, so that
# large DataFrames are truncated instead of flooding the cell output.
pd.set_option('display.max_rows', 12)

In [2]:
%%time
# Read the training set, using the parsed 'date' column as the index, and
# order the rows by that index so date-based slicing works later on.
data = pd.read_csv(
    "train.csv",
    parse_dates=['date'],
    index_col=['date'],
    low_memory=False,
).sort_index()



CPU times: user 816 ms, sys: 108 ms, total: 925 ms
Wall time: 927 ms


In [3]:
def smape(A, F):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``100 / n * sum(2 * |F - A| / (|A| + |F| + eps))`` where ``n`` is
    ``len(A)`` and ``eps`` is the float machine epsilon.

    Parameters
    ----------
    A : array-like
        Actual values. Lists, tuples, numpy arrays and pandas Series are
        accepted; values are converted to float.
    F : array-like
        Forecast values, same length as ``A``. Elements are compared with
        ``A`` by position (a pandas index, if any, is ignored).

    Returns
    -------
    float
        The SMAPE score; 0 means a perfect forecast.
    """
    # np.asarray (rather than .astype) accepts plain Python sequences and
    # drops any pandas index, so the element-wise maths below is positional.
    A, F = np.asarray(A, dtype=float), np.asarray(F, dtype=float)

    # eps keeps a pair where actual and forecast are both zero at 0 rather
    # than producing 0/0 = NaN.
    denominator = np.abs(A) + np.abs(F) + np.finfo(float).eps
    return (100. / len(A)) * np.sum(2 * np.abs(F - A) / denominator)

 
def expand_df(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    The input frame is not modified. The following columns are added, all
    read from the frame's index: ``day``, ``month``, ``year``, ``dayofweek``
    and ``week_of_year`` (ISO week number).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes datetime attributes (``.day``, ``.month``,
        ...). Assumes the index contains no missing timestamps -- TODO confirm
        for other data sources.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the five extra columns appended.
    """
    data = df.copy()
    idx = data.index
    data['day'] = idx.day
    data['month'] = idx.month
    data['year'] = idx.year
    data['dayofweek'] = idx.dayofweek
    # DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in
    # 2.0; isocalendar().week is its replacement. Keep the old attribute as a
    # fallback so the notebook still runs on older pandas.
    if hasattr(idx, 'isocalendar'):
        # Assign raw values: isocalendar() returns a frame indexed by the
        # dates, and positional assignment avoids any index alignment.
        data['week_of_year'] = idx.isocalendar().week.to_numpy(dtype='int64')
    else:
        data['week_of_year'] = idx.weekofyear
    return data

### unit tests

In [4]:
%%unittest_testcase

def test_smape(self):
    """Check smape against three hand-computed expectations."""
    # Case 1: forecasts 10% above the actuals.
    close_actual, close_forecast = np.array([1, 2]), np.array([1.1, 2.2])
    close_expected = 100 * .5 * (.1 / 1.05 + .2 / 2.1)

    # Case 2: forecasts off by a constant 2.
    far_actual, far_forecast = np.array([1, 2]), np.array([3, 4])
    far_expected = 100 * .5 * (2 / 2 + 2. / 3)

    # Case 3: 101 points that agree everywhere except the last one
    # (actual 100, forecast 101).
    ramp_actual = np.arange(101)
    ramp_forecast = np.arange(101)
    ramp_forecast[100] = 101
    ramp_expected = (100. / 101) * (1 / (201. / 2))

    cases = (
        (close_actual, close_forecast, close_expected),
        (far_actual, far_forecast, far_expected),
        (ramp_actual, ramp_forecast, ramp_expected),
    )
    for actual, forecast, expected in cases:
        self.assertAlmostEqual(smape(actual, forecast), expected, places=7)



Success

.
----------------------------------------------------------------------
Ran 1 test in 0.000s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

## One-hot transformation

In [5]:
%%time
# Add the calendar columns, then one-hot encode the 'store' and 'item'
# columns.
data = pd.get_dummies(expand_df(data), columns=[u'store', u'item'])

# Split by date: everything up to and including August 2017 is used for
# training, September 2017 onwards is held out for evaluation.
test = data.loc['2017-09':]
train = data.loc[:'2017-08']

# Feature matrices (every column except the target) and target vectors as
# plain numpy arrays.
data_x, data_y = train.drop('sales', axis=1).values, train['sales'].values
data_test_x, data_test_y = test.drop('sales', axis=1).values, test['sales'].values


CPU times: user 582 ms, sys: 279 ms, total: 861 ms
Wall time: 860 ms


In [10]:
%%time
# Three tree-based regressors fitted on the same training matrix: a shallow
# tree, a deep tree, and a random forest. random_state is fixed on every
# model so that re-running the cell reproduces the same fits.
regr_1 = DecisionTreeRegressor(max_depth=10, random_state=0)
regr_2 = DecisionTreeRegressor(max_depth=100, random_state=0)
# Only the non-default settings are spelled out. The keyword arguments that
# were dropped all carried their default values, and some of them
# (criterion='mse', max_features='auto', min_impurity_split) are rejected by
# recent scikit-learn releases.
regr_3 = RandomForestRegressor(n_estimators=20, max_depth=50,
                               min_samples_leaf=20, min_samples_split=20,
                               n_jobs=8, random_state=0)


regr_1.fit(data_x, data_y)
regr_2.fit(data_x, data_y)
regr_3.fit(data_x, data_y)

CPU times: user 3min 35s, sys: 1.06 s, total: 3min 36s
Wall time: 43.9 s


In [11]:
# Predict the held-out rows with each fitted model.
y_1 = regr_1.predict(data_test_x)
y_2 = regr_2.predict(data_test_x)
y_3 = regr_3.predict(data_test_x)


# SMAPE per model, printed space-separated on one line. Arguments follow the
# smape(actual, forecast) signature. A single parenthesised argument keeps
# the print valid on both Python 2 and Python 3.
print(' '.join(str(smape(data_test_y, y_hat)) for y_hat in (y_1, y_2, y_3)))

38.025085882405925 17.57035299822435 13.871961174349924
