In [1]:
#|default_exp app

In [2]:
#| export
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from treeinterpreter import treeinterpreter as ti
import waterfall_chart

from fastprogress import master_bar, progress_bar
from fastprogress.fastprogress import force_console_behavior


In [3]:
master_bar, progress_bar = force_console_behavior()

In [4]:
#| export
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''

In [5]:
#| export
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [6]:
#| export
path = Path('store-sales-time-series-forecasting')

In [7]:
#| export
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))    
    zipfile.ZipFile(f'{path}.zip').extractall(path)


In [8]:
#| export
if iskaggle:
    path = Path('../input/store-sales-time-series-forecasting')
    ! pip install -q dataset

Import CSVs as dataframes

In [9]:
#| export
train_df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
sub_df = pd.read_csv(path/'sample_submission.csv', low_memory=False)
stores_df = pd.read_csv(path/'stores.csv', low_memory=False)
oil_df = pd.read_csv(path/'oil.csv', low_memory=False)
hol_events_df = pd.read_csv(path/'holidays_events.csv', low_memory=False)
transactions_df = pd.read_csv(path/'transactions.csv', low_memory=False)

Combine training and test set for parrallel data transforms - to be split later before training

In [21]:
#| export
combined_df = pd.concat([train_df, test_df]).reset_index()

Create `test_indexes` to split the dataframe later

In [27]:
#| export
test_idxs = combined_df.index[(combined_df.index > train_df.index.max())] 

Create Index values for training and validation sets

In [28]:
#| export
train_idxs = combined_df.index[(combined_df.index <= round(len(train_df) * 0.8))]

In [29]:
#| export
valid_idxs = combined_df.index[(combined_df.index > train_idxs.max()) & (combined_df.index < test_idxs.min())]

Merge Oil Prices

In [30]:
#| export
combined_df = combined_df.merge(oil_df, on='date', how='left')

Merge Store Metadata

In [31]:
#| export
combined_df = combined_df.merge(stores_df, on='store_nbr', how='left')

Rename `type` column and Merge Observed Holidays

In [32]:
#| export
hol_events_df.rename(columns={'type': 'hol_type'}, inplace=True)

In [33]:
#| export
# combined_df = combined_df.merge(hol_events_df, on='date', how='left')

Convert `date` to datetime

In [34]:
#| export
combined_df['date'] = pd.to_datetime(combined_df['date'])

Block out a month for data affected by earthquake

In [35]:
#| export
eq_start_date = pd.to_datetime("2016-04-16")
eq_end_date = pd.to_datetime("2016-05-16")

In [36]:
#| export
earthquake_cond = (combined_df.date >= eq_start_date) & (combined_df.date < eq_end_date)

Get indexes of items at earthquake dates - to be removed or transformed depending on model performance

In [37]:
#| export
earthquake_indexes = combined_df.index[earthquake_cond]

Convert `date` to mulitple date partition columns

In [38]:
#| export
combined_df = add_datepart(combined_df, 'date')

Drop rows at earthquake timeframe

In [39]:
#| export
combined_df.drop(earthquake_indexes, inplace=True)

In [40]:
dep_var = 'sales'

In [41]:
combined_df[dep_var] = np.log(combined_df[dep_var] + 1e-5)

In [42]:
procs = [Categorify, FillMissing, Normalize]

In [43]:
cont, cat = cont_cat_split(combined_df, 1, dep_var=dep_var)

In [49]:
len(list(train_idxs)) + len(list(valid_idxs))

3000888

In [44]:
train_val_splits = (list(train_idxs), list(valid_idxs))

In [45]:
to = TabularPandas(combined_df, procs, cat, cont, y_names=dep_var, splits=train_val_splits)

IndexError: positional indexers are out-of-bounds

In [None]:
to.train.xs

In [None]:
to.valid.xs

In [None]:
import nbdev
nbdev.export.nb_export('store_sales_2.ipynb', 'app')
print('export successful')