In [1]:
#|default_exp app

In [2]:
#| export
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from treeinterpreter import treeinterpreter as ti
import waterfall_chart

from fastprogress import master_bar, progress_bar
from fastprogress.fastprogress import force_console_behavior


In [3]:
master_bar, progress_bar = force_console_behavior()

In [4]:
#| export
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''

In [5]:
#| export
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [6]:
#| export
path = Path('store-sales-time-series-forecasting')

In [7]:
#| export
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))    
    zipfile.ZipFile(f'{path}.zip').extractall(path)


In [8]:
#| export
if iskaggle:
    path = Path('../input/store-sales-time-series-forecasting')
    ! pip install -q dataset

Import CSVs as dataframes

In [9]:
#| export
train_df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
sub_df = pd.read_csv(path/'sample_submission.csv', low_memory=False)
stores_df = pd.read_csv(path/'stores.csv', low_memory=False)
oil_df = pd.read_csv(path/'oil.csv', low_memory=False)
hol_events_df = pd.read_csv(path/'holidays_events.csv', low_memory=False)
transactions_df = pd.read_csv(path/'transactions.csv', low_memory=False)

Combine training and test set for parrallel data transforms - to be split later before training

In [10]:
#| export
combined_df = pd.concat([train_df, test_df]).reset_index()

In [11]:
combined_df.shape

(3029400, 7)

Create `test_indexes` to split the dataframe later

In [12]:
#| export
test_idxs = combined_df.index[(combined_df.index > train_df.index.max())] 

Create Index values for training and validation sets

In [13]:
#| export
train_idxs = combined_df.index[(combined_df.index < round(len(train_df) * 0.8))]

In [14]:
#| export
valid_idxs = combined_df.index[(combined_df.index > len(train_idxs)) & (combined_df.index < test_idxs.min())]

In [15]:
combined_df.shape

(3029400, 7)

Merge Oil Prices

In [16]:
#| export
combined_df = combined_df.merge(oil_df, on='date', how='left')

Merge Store Metadata

In [17]:
#| export
combined_df = combined_df.merge(stores_df, on='store_nbr', how='left')

Rename `type` column and Merge Observed Holidays

In [18]:
#| export
hol_events_df.rename(columns={'type': 'hol_type'}, inplace=True)

In [19]:
# | export
# combined_df = combined_df.merge(hol_events_df, on='date', how='left')

Convert `date` to datetime

In [20]:
#| export
combined_df['date'] = pd.to_datetime(combined_df['date'])

Block out a month for data affected by earthquake

In [21]:
#| export
eq_start_date = pd.to_datetime("2016-04-16")
eq_end_date = pd.to_datetime("2016-05-16")

In [22]:
#| export
earthquake_cond = (combined_df.date >= eq_start_date) & (combined_df.date < eq_end_date)

Get indexes of items at earthquake dates - to be removed or transformed depending on model performance

In [23]:
#| export
earthquake_indexes = combined_df.index[earthquake_cond]

Convert `date` to mulitple date partition columns

In [24]:
#| export
combined_df = add_datepart(combined_df, 'date')

Drop rows at earthquake timeframe

In [25]:
# #| export
# combined_df.drop(earthquake_indexes, inplace=True)

In [26]:
dep_var = 'sales'

In [27]:
#| export
combined_df[dep_var] = np.log(combined_df[dep_var] + 1e-5)

In [28]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [29]:
#| export
cont, cat = cont_cat_split(combined_df, 1, dep_var=dep_var)

In [30]:
#| export
train_val_splits = (list(train_idxs), list(valid_idxs))

In [31]:
#| export
to = TabularPandas(combined_df, procs, cat, cont, y_names=dep_var, splits=train_val_splits)

In [35]:
len(to.train.xs) + len(to.valid.xs)

3000887

In [33]:
to.valid.xs

Unnamed: 0,family,city,state,type,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,dcoilwtico_na,index,id,store_nbr,onpromotion,dcoilwtico,cluster,Year,Month,Week,Day,Dayofweek,Dayofyear,Elapsed
2400711,28,8,2,3,1,1,1,1,1,1,1,1.732053,1.732053,-0.545235,-0.154124,-1.265877,1.401758,1.508730,0.845244,0.812129,-0.411672,-1.500057,0.820176,1.732327
2400712,29,8,2,3,1,1,1,1,1,1,1,1.732054,1.732054,-0.545235,-0.154124,-1.265877,1.401758,1.508730,0.845244,0.812129,-0.411672,-1.500057,0.820176,1.732327
2400713,30,8,2,3,1,1,1,1,1,1,1,1.732056,1.732056,-0.545235,-0.049185,-1.265877,1.401758,1.508730,0.845244,0.812129,-0.411672,-1.500057,0.820176,1.732327
2400714,31,8,2,3,1,1,1,1,1,1,1,1.732057,1.732057,-0.545235,-0.049185,-1.265877,1.401758,1.508730,0.845244,0.812129,-0.411672,-1.500057,0.820176,1.732327
2400715,32,8,2,3,1,1,1,1,1,1,1,1.732059,1.732059,-0.545235,-0.154124,-1.265877,1.401758,1.508730,0.845244,0.812129,-0.411672,-1.500057,0.820176,1.732327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,29,19,13,2,1,1,1,1,1,1,1,2.598070,2.598070,-1.186848,-0.154124,-1.208063,-0.533776,2.439389,0.546526,0.539031,-0.071145,-1.000335,0.536316,2.596557
3000884,30,19,13,2,1,1,1,1,1,1,1,2.598072,2.598072,-1.186848,-0.049185,-1.208063,-0.533776,2.439389,0.546526,0.539031,-0.071145,-1.000335,0.536316,2.596557
3000885,31,19,13,2,1,1,1,1,1,1,1,2.598073,2.598073,-1.186848,15.376781,-1.208063,-0.533776,2.439389,0.546526,0.539031,-0.071145,-1.000335,0.536316,2.596557
3000886,32,19,13,2,1,1,1,1,1,1,1,2.598075,2.598075,-1.186848,0.685385,-1.208063,-0.533776,2.439389,0.546526,0.539031,-0.071145,-1.000335,0.536316,2.596557


In [34]:
import nbdev
nbdev.export.nb_export('store_sales_2.ipynb', 'app')
print('export successful')

export successful
