# Kaggle Demand Forecasting with Fast.ai

See [competition details](https://www.kaggle.com/c/demand-forecasting-kernels-only)

This is largely based on the lesson3 notebook for the Rossman forecasting challenge.

In [3]:

%matplotlib  inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

from IPython.display import HTML, display



# Load Data

In [4]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
ssub = pd.read_csv('sample_submission.csv')

print(f'train: {train.shape}', f'test {test.shape}')

train: (913000, 4) test (45000, 4)


In [5]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [6]:
test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [7]:
for col in ['store', 'item']:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
    
train.describe(include='all')

Unnamed: 0,date,store,item,sales
count,913000,913000.0,913000.0,913000.0
unique,1826,10.0,50.0,
top,2014-01-13,10.0,50.0,
freq,500,91300.0,18260.0,
mean,,,,52.250287
std,,,,28.801144
min,,,,0.0
25%,,,,30.0
50%,,,,47.0
75%,,,,70.0


In [8]:
train.isnull().sum()

date     0
store    0
item     0
sales    0
dtype: int64

# Feature Engineering

In [57]:
train2 = train.copy()
test2 = test.copy()

add_datepart(train2, "date", drop=False)
add_datepart(test2, "date", drop=False)
train2.head()

Unnamed: 0,date,store,item,sales,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,2013-01-01,1,1,13,2013,1,1,1,1,1,False,True,False,True,False,True,1356998400
1,2013-01-02,1,1,11,2013,1,1,2,2,2,False,False,False,False,False,False,1357084800
2,2013-01-03,1,1,14,2013,1,1,3,3,3,False,False,False,False,False,False,1357171200
3,2013-01-04,1,1,13,2013,1,1,4,4,4,False,False,False,False,False,False,1357257600
4,2013-01-05,1,1,10,2013,1,1,5,5,5,False,False,False,False,False,False,1357344000


In [58]:
test2.head()

Unnamed: 0,id,date,store,item,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,2018-01-01,1,1,2018,1,1,1,0,1,False,True,False,True,False,True,1514764800
1,1,2018-01-02,1,1,2018,1,1,2,1,2,False,False,False,False,False,False,1514851200
2,2,2018-01-03,1,1,2018,1,1,3,2,3,False,False,False,False,False,False,1514937600
3,3,2018-01-04,1,1,2018,1,1,4,3,4,False,False,False,False,False,False,1515024000
4,4,2018-01-05,1,1,2018,1,1,5,4,5,False,False,False,False,False,False,1515110400


In [59]:
cat_vars = list(train2)
[cat_vars.remove(col) for col in ['sales', 'Elapsed', 'date']]
for v in cat_vars: train2[v] = train2[v].astype('category').cat.as_ordered()
apply_cats(train2, test2)

In [60]:
for v in ['sales', 'Elapsed']:
    train2[v] = train2[v].fillna(0).astype('float32')
    if v in test2:
        test2[v] = test2[v].fillna(0).astype('float32')

In [61]:
train2 = train2.set_index('date')
test2 = test2.set_index('date')

df, y, nas, mapper = proc_df(train2, 'sales', do_scale=True)
yl = np.log(y+1)

In [62]:
test2['sales'] = 0
df_test, _, nas, mapper = proc_df(test2, 'sales', do_scale=True, skip_flds=['id'], mapper=mapper, na_dict=nas)

Time-based validation, as that's the goal with the test set.

In [49]:
val_idx = np.flatnonzero((df.index<datetime.datetime(2018,1,1)) & (df.index>=datetime.datetime(2017,10,1)))

0.050383351588170866

# Model

First we need to ensure our target metric matches the competition

In [71]:
def inv_y(a): return np.exp(a) - 1

def smape(y_pred, targ):
    targ = inv_y(targ)
    pred = inv_y(y_pred)
    ape = 2 * np.abs(pred - targ) / (np.abs(pred) + np.abs(targ))
    return np.mean(ape)

max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

In [64]:
md = ColumnarModelData.from_data_frame('.', val_idx, df, yl.astype(np.float32), cat_flds=cat_vars, bs=128, test_df=df_test)

Determine embedding levels for categorical variables

In [65]:
cat_sz = [(c, len(train2[c].cat.categories)+1) for c in cat_vars]
cat_sz

[('store', 11),
 ('item', 51),
 ('Year', 6),
 ('Month', 13),
 ('Week', 54),
 ('Day', 32),
 ('Dayofweek', 8),
 ('Dayofyear', 367),
 ('Is_month_end', 3),
 ('Is_month_start', 3),
 ('Is_quarter_end', 3),
 ('Is_quarter_start', 3),
 ('Is_year_end', 3),
 ('Is_year_start', 3)]

In [66]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
emb_szs

[(11, 6),
 (51, 26),
 (6, 3),
 (13, 7),
 (54, 27),
 (32, 16),
 (8, 4),
 (367, 50),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2)]

In [72]:
m  =  md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
m.summary()

TypeError: torch.index_select received an invalid combination of arguments - got ([32;1mtorch.FloatTensor[0m, [32;1mint[0m, [31;1mtorch.FloatTensor[0m), but expected (torch.FloatTensor source, int dim, torch.LongTensor index)