# Kaggle Demand Forecasting with Fast.ai

See [competition details](https://www.kaggle.com/c/demand-forecasting-kernels-only)

This notebook is largely based on the fast.ai lesson 3 notebook for the Rossmann store sales forecasting challenge.

In [1]:

%matplotlib  inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from fastai.structured import *
from fastai.column_data import *
# Abbreviate printed arrays: summarise once they exceed 50 elements,
# showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)




  from numpy.core.umath_tests import inner1d


# Load Data

In [2]:

# Read the three competition CSVs from the working directory.
train, test, ssub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report the (rows, columns) shape of the train and test frames.
print(f'train: {train.shape}', f'test {test.shape}')

train: (913000, 4) test (45000, 4)


In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [4]:
# Preview the first rows of the test data (has an `id` column and no `sales`).
test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [5]:
# Treat the store and item identifiers as categories rather than numbers,
# in both the train and the test frame.
for frame in (train, test):
    for col in ['store', 'item']:
        frame[col] = frame[col].astype('category')

# Summarise every column, including the non-numeric ones.
train.describe(include='all')

Unnamed: 0,date,store,item,sales
count,913000,913000.0,913000.0,913000.0
unique,1826,10.0,50.0,
top,2017-04-28,10.0,50.0,
freq,500,91300.0,18260.0,
mean,,,,52.250287
std,,,,28.801144
min,,,,0.0
25%,,,,30.0
50%,,,,47.0
75%,,,,70.0


In [6]:
# Count missing values per column.
train.isnull().sum()

date     0
store    0
item     0
sales    0
dtype: int64

# Feature Engineering

In [7]:
# Work on copies so the frames loaded above are left as they were.
train2, test2 = train.copy(), test.copy()

# Expand "date" into calendar features (Year, Month, Week, ..., Elapsed) on each frame;
# drop=False keeps the original date column alongside the new ones.
for frame in (train2, test2):
    add_datepart(frame, "date", drop=False)

train2.head()

Unnamed: 0,date,store,item,sales,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,2013-01-01,1,1,13,2013,1,1,1,1,1,False,True,False,True,False,True,1356998400
1,2013-01-02,1,1,11,2013,1,1,2,2,2,False,False,False,False,False,False,1357084800
2,2013-01-03,1,1,14,2013,1,1,3,3,3,False,False,False,False,False,False,1357171200
3,2013-01-04,1,1,13,2013,1,1,4,4,4,False,False,False,False,False,False,1357257600
4,2013-01-05,1,1,10,2013,1,1,5,5,5,False,False,False,False,False,False,1357344000


In [8]:
# Preview the test frame after the date features were added.
test2.head()

Unnamed: 0,id,date,store,item,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,2018-01-01,1,1,2018,1,1,1,0,1,False,True,False,True,False,True,1514764800
1,1,2018-01-02,1,1,2018,1,1,2,1,2,False,False,False,False,False,False,1514851200
2,2,2018-01-03,1,1,2018,1,1,3,2,3,False,False,False,False,False,False,1514937600
3,3,2018-01-04,1,1,2018,1,1,4,3,4,False,False,False,False,False,False,1515024000
4,4,2018-01-05,1,1,2018,1,1,5,4,5,False,False,False,False,False,False,1515110400


In [9]:
# Every train2 column is treated as categorical except the target ('sales'),
# the 'Elapsed' feature (cast to float32 in a later cell) and the raw 'date'.
non_categorical = ('sales', 'Elapsed', 'date')
cat_vars = [col for col in train2.columns if col not in non_categorical]

# Convert each categorical column to an ordered pandas category.
for v in cat_vars:
    train2[v] = train2[v].astype('category').cat.as_ordered()

# Align test2's categorical columns with train2's.
# NOTE(review): apply_cats is presumed to re-encode test2 using the categories
# found in train2 -- confirm against fastai.structured. The df_test preview
# further down shows Year encoded as 0 for the 2018 rows, i.e. values never
# seen in training do not get a category of their own.
apply_cats(test2, train2)

In [10]:
# Cast the target and the 'Elapsed' feature to float32, filling gaps with 0.
# test2 is only converted for the columns it actually contains.
for v in ('sales', 'Elapsed'):
    train2[v] = train2[v].fillna(0).astype('float32')
    if v not in test2:
        continue
    test2[v] = test2[v].fillna(0).astype('float32')

In [11]:
# Index both frames by date so rows can later be selected by time.
# NOTE: this cell is not re-runnable as-is -- after the first run 'date' is the
# index and no longer a column.
train2, test2 = train2.set_index('date'), test2.set_index('date')

# Split the target off and turn the frame into model inputs. `nas` and `mapper`
# are passed back into proc_df when the test frame is processed.
df, y, nas, mapper = proc_df(train2, 'sales', do_scale=True)

# Train on log(1 + sales).
yl = np.log(1 + y)

In [12]:
# The test frame has no target; add a placeholder so proc_df can be called with
# the same target name as for training (presumably required -- confirm).
test2['sales'] = 0

# Reuse the training `mapper` and `nas` so the test frame is processed
# consistently with the training frame; the 'id' column is skipped.
df_test, _, nas, mapper = proc_df(
    test2, 'sales',
    do_scale=True,
    skip_flds=['id'],
    mapper=mapper,
    na_dict=nas,
)

In [13]:
# Preview the processed test inputs (categoricals as integer codes, Elapsed scaled).
df_test.head()

Unnamed: 0_level_0,store,item,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-01-01,1,1,0,1,1,1,1,1,1,2,1,2,1,2,1.733004
2018-01-02,1,1,0,1,1,2,2,2,1,1,1,1,1,1,1.734902
2018-01-03,1,1,0,1,1,3,3,3,1,1,1,1,1,1,1.736799
2018-01-04,1,1,0,1,1,4,4,4,1,1,1,1,1,1,1.738696
2018-01-05,1,1,0,1,1,5,5,5,1,1,1,1,1,1,1.740593


In [14]:
# Preview the processed training inputs (categoricals as integer codes, Elapsed scaled).
df.head()

Unnamed: 0_level_0,store,item,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-01,1,1,1,1,1,1,2,1,1,2,1,2,1,2,-1.731097
2013-01-02,1,1,1,1,1,2,3,2,1,1,1,1,1,1,-1.7292
2013-01-03,1,1,1,1,1,3,4,3,1,1,1,1,1,1,-1.727303
2013-01-04,1,1,1,1,1,4,5,4,1,1,1,1,1,1,-1.725406
2013-01-05,1,1,1,1,1,5,6,5,1,1,1,1,1,1,-1.723509


Use a time-based validation split (the last three months of the training data), because the test set covers a later period than the training data.

In [15]:
# Hold out 2017-10-01 .. 2017-12-31 as the validation window.
val_start = datetime.datetime(2017, 10, 1)
val_end = datetime.datetime(2018, 1, 1)  # exclusive upper bound
in_val_window = (df.index >= val_start) & (df.index < val_end)

# Positional row numbers of the validation rows within df.
val_idx = np.flatnonzero(in_val_window)

# Model

First we need to make sure the metric we track matches the competition's evaluation metric, SMAPE (symmetric mean absolute percentage error).

In [16]:
def inv_y(a):
    """Invert the log(y + 1) target transform, returning values on the sales scale."""
    return np.exp(a) - 1


def smape(y_pred, targ):
    """Symmetric mean absolute percentage error, computed on the sales scale.

    Both arguments are expected in log(1 + sales) space and are mapped back
    with `inv_y` before the error is computed.

    Args:
        y_pred: predictions in log(1 + sales) space (array-like).
        targ: targets in log(1 + sales) space (array-like, same shape).

    Returns:
        Mean of 2 * |pred - targ| / (|pred| + |targ|) as a fraction
        (0 to 2, not a percentage).
    """
    targ = inv_y(np.asarray(targ))
    pred = inv_y(np.asarray(y_pred))
    numer = 2 * np.abs(pred - targ)
    denom = np.abs(pred) + np.abs(targ)
    # When prediction and target are both 0 the ratio is 0/0. Count such pairs
    # as zero error instead of letting a NaN poison the mean.
    ape = np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    return ape.mean()

# Largest log-target in the training data.
max_log_y = np.max(yl)

# Output range handed to the learner: from 0 up to 20% above that maximum.
y_range = (0, 1.2 * max_log_y)

In [17]:
# Bundle the training inputs, the float32 log-target, the validation row
# numbers and the test inputs into one model-data object (batch size 128).
md = ColumnarModelData.from_data_frame(
    '.', val_idx, df, yl.astype(np.float32),
    cat_flds=cat_vars,
    bs=128,
    test_df=df_test,
)

Determine the number of levels of each categorical variable (plus one extra slot), which sets the input size of its embedding.

In [18]:
# For each categorical variable: (name, number of categories + 1).
cat_sz = []
for c in cat_vars:
    n_levels = len(train2[c].cat.categories)
    cat_sz.append((c, n_levels + 1))
cat_sz

[('store', 11),
 ('item', 51),
 ('Year', 6),
 ('Month', 13),
 ('Week', 54),
 ('Day', 32),
 ('Dayofweek', 8),
 ('Dayofyear', 367),
 ('Is_month_end', 3),
 ('Is_month_start', 3),
 ('Is_quarter_end', 3),
 ('Is_quarter_start', 3),
 ('Is_year_end', 3),
 ('Is_year_start', 3)]

In [19]:
# Embedding width per variable: half the cardinality (rounded up), capped at 50.
emb_szs = []
for _name, n_cat in cat_sz:
    emb_width = min(50, (n_cat + 1) // 2)
    emb_szs.append((n_cat, emb_width))
emb_szs

[(11, 6),
 (51, 26),
 (6, 3),
 (13, 7),
 (54, 27),
 (32, 16),
 (8, 4),
 (367, 50),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2)]

In [20]:
# Continuous inputs are the columns of df that are not categorical.
n_cont = len(df.columns) - len(cat_vars)

# NOTE(review): the positional arguments after n_cont are presumably embedding
# dropout (0.04), output size (1), hidden layer sizes ([1000, 500]) and their
# dropouts ([0.001, 0.01]) -- confirm against fastai's get_learner signature.
m = md.get_learner(
    emb_szs, n_cont,
    0.04, 1, [1000, 500], [0.001, 0.01],
    y_range=y_range,
)
m.summary()



OrderedDict([('Embedding-1',
              OrderedDict([('input_shape', [-1]),
                           ('output_shape', [-1, 6]),
                           ('trainable', True),
                           ('nb_params', tensor(66))])),
             ('Embedding-2',
              OrderedDict([('input_shape', [-1]),
                           ('output_shape', [-1, 26]),
                           ('trainable', True),
                           ('nb_params', tensor(1326))])),
             ('Embedding-3',
              OrderedDict([('input_shape', [-1]),
                           ('output_shape', [-1, 3]),
                           ('trainable', True),
                           ('nb_params', tensor(18))])),
             ('Embedding-4',
              OrderedDict([('input_shape', [-1]),
                           ('output_shape', [-1, 7]),
                           ('trainable', True),
                           ('nb_params', tensor(91))])),
             ('Embedding-5',
              Or

In [None]:
# Learning rate used by the fit cell below. It is fixed here, before the finder
# runs, so the finder's result does not set it automatically.
lr = 1e-3
# Run the learner's learning-rate finder.
m.lr_find()

In [None]:
# Plot the result of the learning-rate finder.
# NOTE(review): 100 is presumably the number of initial iterations to skip -- confirm.
m.sched.plot(100)

In [None]:
# Train with learning rate `lr`, reporting SMAPE as the metric.
# NOTE(review): 3 is presumably the number of cycles/epochs -- confirm.
m.fit(lr, 3, metrics=[smape])

In [None]:
# Save the trained weights under the name 'val0'.
m.save('val0')

In [21]:
# Load the weights saved as 'val0'. This needs the checkpoint written by the
# save cell above, which has no execution count in this copy of the notebook --
# on a fresh kernel the training and save cells must be run first.
m.load('val0')

In [22]:
# Get predictions and matching targets from the learner (presumably for the
# validation set -- confirm).
# NOTE(review): this rebinds `y`, which previously held the training target
# returned by proc_df; re-running the `yl = np.log(y+1)` cell after this one
# would use the wrong array.
x,y=m.predict_with_targs()







In [23]:
# SMAPE of the predictions `x` against the targets `y`, returned as a fraction.
smape(x, y)

0.12447625

In [24]:
# Predict in log(1 + sales) space.
# NOTE(review): the True argument presumably selects the test set -- confirm.
pred_test=m.predict(True)









In [26]:
# Map the predictions from log(1 + sales) space back to the sales scale.
# NOTE: this rebinds pred_test, so running the cell twice applies the
# transform twice; re-run the predict cell first.
pred_test = inv_y(pred_test)

In [27]:
# The original cell was the unfinished expression test2[''], which raises
# KeyError because test2 has no such column.
# NOTE(review): storing the predictions on the test frame is the presumed
# intent -- confirm. pred_test has shape (n, 1), so it is flattened first.
test2['sales'] = pred_test.ravel()

# Show the sales-scale predictions (matches the recorded cell output).
pred_test

array([[12.66984],
       [15.39304],
       [15.04482],
       [16.02245],
       [17.66339],
       [18.37978],
       [19.66224],
       [12.95197],
       [15.3669 ],
       [15.24631],
       [16.51021],
       [17.58158],
       [18.294  ],
       [19.41352],
       [12.99131],
       [15.10318],
       [15.12862],
       [16.50207],
       [17.09075],
       [18.48904],
       ...,
       [63.61925],
       [72.21856],
       [73.93838],
       [78.40424],
       [83.2681 ],
       [88.9375 ],
       [92.63979],
       [63.23325],
       [73.79839],
       [73.22952],
       [77.7774 ],
       [84.27683],
       [87.34531],
       [92.45547],
       [64.01272],
       [73.74506],
       [74.59523],
       [77.27998],
       [83.91173],
       [88.83168]], dtype=float32)