# Introduction

In this notebook, I walk through how I arrived at a decent solution for this challenge using simple, uncomplicated techniques. No machine learning, no fancy black-box models. Throw away your ARIMAs and Gradient Boosts. Think simple.

# Setup and Loading Data

In [62]:
# IPython extensions: autoreload (its automatic mode is left commented out
# below), inline matplotlib rendering, and ipython_unittest, loaded before the
# %%unittest_testcase cell further down.
%load_ext autoreload
# %autoreload 2
%matplotlib inline
%load_ext ipython_unittest
from math import sin, cos, pi
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Truncate displayed DataFrames/Series that are longer than 12 rows.
pd.set_option('display.max_rows', 12)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The ipython_unittest extension is already loaded. To reload it, use:
  %reload_ext ipython_unittest


## Reading the data

In [50]:
%%time
# Read the training set, parse the 'date' column into the index, and order the
# rows by that index.
data = pd.read_csv(
    "train.csv",
    index_col=['date'],
    parse_dates=['date'],
    low_memory=False,
).sort_index()

CPU times: user 896 ms, sys: 127 ms, total: 1.02 s
Wall time: 1.02 s


# Auxiliary functions

In [49]:
def smape(A, F):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``(100 / n) * sum(2 * |F - A| / (|A| + |F| + eps))`` where ``n``
    is ``len(A)`` and ``eps`` is the float64 machine epsilon.

    Parameters
    ----------
    A : array-like
        Actual values (ndarray, list, tuple, pandas Series, ...).
    F : array-like
        Forecast values, compared with ``A`` position by position.

    Returns
    -------
    float
        The SMAPE score; 0 means the forecasts match the actuals exactly.
    """
    # np.asarray accepts any array-like input; the previous ``.astype`` call
    # only worked on objects that were already arrays.
    A = np.asarray(A, dtype=float)
    F = np.asarray(F, dtype=float)

    # eps keeps a 0/0 term (actual == forecast == 0) at 0 instead of NaN.
    denominator = np.abs(A) + np.abs(F) + np.finfo(float).eps
    return (100. / len(A)) * np.sum(2 * np.abs(F - A) / denominator)

 
def expand_df(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra columns ``day``, ``month``, ``year``,
        ``dayofweek`` and ``week_of_year``.
    """
    data = df.copy()
    index = data.index
    data['day'] = index.day
    data['month'] = index.month
    data['year'] = index.year
    data['dayofweek'] = index.dayofweek

    if hasattr(index, 'isocalendar'):
        # ``DatetimeIndex.weekofyear`` was deprecated in pandas 1.1 and removed
        # in 2.0; ``isocalendar().week`` is its documented replacement.
        iso_week = index.isocalendar().week
        # Convert the nullable UInt32 result to a plain NumPy array: int64 as
        # before, or float (NaN) if the index contains NaT. Assigning raw
        # values also avoids index alignment on repeated dates.
        week_dtype = float if iso_week.isna().any() else 'int64'
        data['week_of_year'] = iso_week.astype(week_dtype).values
    else:
        # Older pandas without ``isocalendar``.
        data['week_of_year'] = index.weekofyear
    return data


def cycle(t, T):
    """Map a cyclic value (hour, day, month, ...) onto the unit circle.

    ``t`` is a position inside a cycle of period ``T``. The result is the
    tuple ``(sin(alpha), cos(alpha))`` with ``alpha = 2 * pi * t / T``.
    """
    alpha = 2 * pi * t / T
    on_circle = (sin(alpha), cos(alpha))
    return on_circle


def cyclic_features(data, cyclic_features={'month': 12, 'day': 31, 'dayofweek': 7}):
    """Replace cyclic columns with their sin/cos encodings.

    For every ``column -> period`` entry, the column value ``t`` is mapped to
    ``sin(2 * pi * t / period)`` and ``cos(2 * pi * t / period)``, stored as
    ``<column>_sin`` and ``<column>_cos``. The original column is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``cyclic_features``. It is not
        modified.
    cyclic_features : dict
        Maps column name to the period of that column. The default dict is
        only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns followed by the ``_sin`` /
        ``_cos`` pair of each encoded column, in dict iteration order.

    Raises
    ------
    KeyError
        If a named column is missing from ``data``.
    """
    # Start from the frame without the raw cyclic columns; ``drop`` returns a
    # new object, so the caller's frame never gains the encoded columns.
    encoded = data.drop(columns=list(cyclic_features))

    for col_name, T in cyclic_features.items():
        # %-formatting prints the same "name period" line on Python 2 and 3.
        print("%s %s" % (col_name, T))
        # Vectorised over the whole column instead of a per-row Python loop.
        angle = 2 * np.pi * data[col_name].values / T
        encoded[col_name + '_sin'] = np.sin(angle)
        encoded[col_name + '_cos'] = np.cos(angle)

    return encoded

### Unit tests

In [30]:
%%unittest_testcase
from scipy.spatial.distance import euclidean

def test_smape(self):
    """Compare smape with hand-computed values for three small inputs."""
    # Forecasts 10% above the actuals.
    case_near = (np.array([1, 2]), np.array([1.1, 2.2]),
                 100 * .5 * (.1 / 1.05 + .2 / 2.1))
    # Forecasts well away from the actuals.
    case_far = (np.array([1, 2]), np.array([3, 4]),
                100 * .5 * (2 / 2 + 2. / 3))
    # 101 points that agree everywhere (including the 0 vs 0 term) except the
    # last one, where the forecast is 101 instead of 100.
    actual_long = np.arange(101)
    forecast_long = np.arange(101)
    forecast_long[100] = 101
    case_single_miss = (actual_long, forecast_long,
                        (100. / 101) * (1 / (201. / 2)))

    for actual, forecast, expected in (case_near, case_far, case_single_miss):
        self.assertAlmostEqual(smape(actual, forecast), expected, places=7)
    
    
def test_cycle(self):
    """Check that cycle() wraps around the period.

    Minutes 59 and 1 are two minutes apart across the hour boundary, so their
    encoded points must be as far apart as those of minutes 1 and 3.
    """
    T = 60
    wrapped_gap = euclidean(cycle(59, T), cycle(1, T))
    plain_gap = euclidean(cycle(1, T), cycle(3, T))
    # Both distances come out of floating-point trigonometry, so compare them
    # with a tolerance rather than bit-for-bit.
    self.assertAlmostEqual(wrapped_gap, plain_gap, places=7)
    



Success

..
----------------------------------------------------------------------
Ran 2 tests in 0.000s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>

## One-hot and cyclic transformations


## Adding Cyclic and One-Hot Encoded Features

In [51]:
%%time
# Period used for the sin/cos encoding of each calendar column.
features = {'month': 12, 'day': 31, 'dayofweek': 7, 'week_of_year': 52}

# Calendar columns -> one-hot store/item columns -> sin/cos encoding.
data = (
    expand_df(data)
    .pipe(pd.get_dummies, columns=[u'store', u'item'])
    .pipe(cyclic_features, features)
)

# Chronological hold-out: rows up to 2017-08 train, rows from 2017-09 test.
train = data.loc[:'2017-08']
test = data.loc['2017-09':]

# Plain NumPy feature matrices and 'sales' targets for the estimators.
data_x, data_y = train.drop(columns='sales').values, train['sales'].values
data_test_x, data_test_y = test.drop(columns='sales').values, test['sales'].values


dayofweek 7
week_of_year 52
day 31
month 12
CPU times: user 9.44 s, sys: 803 ms, total: 10.2 s
Wall time: 10.1 s


In [None]:
# (number of rows, number of columns) of `data`.
data.shape

In [63]:
# Candidate regressors, all fitted on the same training matrix.
regr_0 = LinearRegression()

# random_state is fixed so that ties between equally good splits are broken
# the same way on every run (same seed as the forest below).
regr_1 = DecisionTreeRegressor(max_depth=50, random_state=0)
regr_2 = DecisionTreeRegressor(max_depth=200, random_state=0)

# Only the non-default settings are passed. The previous call spelled out every
# default (copied from the estimator repr), including criterion='mse',
# max_features='auto' and min_impurity_split=None, which newer scikit-learn
# releases deprecate or remove.
regr_3 = RandomForestRegressor(n_estimators=20, max_depth=100,
                               min_samples_leaf=20, min_samples_split=20,
                               n_jobs=8, random_state=0)


regr_0.fit(data_x, data_y)
regr_1.fit(data_x, data_y)
regr_2.fit(data_x, data_y)
regr_3.fit(data_x, data_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=20, min_samples_split=20,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=8,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [65]:

# Hold-out predictions of each fitted model.
y_1 = regr_1.predict(data_test_x)
y_2 = regr_2.predict(data_test_x)
y_3 = regr_3.predict(data_test_x)
y_0 = regr_0.predict(data_test_x)


# smape(A, F) takes the actual values first and the forecasts second. Scores
# are printed in the order regr_1, regr_2, regr_3, regr_0. The %-formatted
# print() call emits the same line on Python 2 and Python 3.
print("%s %s %s %s" % (smape(data_test_y, y_1), smape(data_test_y, y_2),
                       smape(data_test_y, y_3), smape(data_test_y, y_0)))

16.66853935540901 17.837339757967474 13.445495547733241 20.336638157503923
