In [1]:
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from treeinterpreter import treeinterpreter as ti
import waterfall_chart

from fastprogress import master_bar, progress_bar
from fastprogress.fastprogress import force_console_behavior


In [2]:
# Rebind the two progress-bar names imported above to the pair returned by
# force_console_behavior(); any later use of master_bar / progress_bar in this
# notebook resolves to these returned objects.
# NOTE(review): presumably this switches fastprogress to plain-text console
# output — confirm against the fastprogress docs.
master_bar, progress_bar = force_console_behavior()

In [3]:
#| export
# Kaggle API credentials as a JSON string. Left empty here; the credentials
# cell below is what consumes it.
creds = ''
# Empty string (falsy) unless KAGGLE_KERNEL_RUN_TYPE is set in the environment.
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [4]:
#| export
# File the credentials are stored in, under the current user's home directory.
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Only create the file when there is something to store. An empty kaggle.json
# is useless, and because this cell never overwrites an existing file, writing
# an empty one would block real credentials from being saved on a later run.
if creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    # Create the file owner-only *before* the secret is written into it.
    cred_path.touch(mode=0o600)
    cred_path.write_text(creds)
    # Re-assert the mode in case the file was created with different bits.
    cred_path.chmod(0o600)

In [5]:
#| export
# Competition slug. The download cell passes str(path) to the Kaggle API as the
# competition name and extracts the downloaded archive into this directory.
path = Path('store-sales-time-series-forecasting')

In [6]:
#| export
# Off Kaggle, fetch the competition data once; skipped when `path` already exists.
if not iskaggle and not path.exists():
    # Imported here so they are only required when a download actually happens.
    import zipfile, kaggle
    # Expected to leave `<path>.zip` in the working directory (opened just below).
    kaggle.api.competition_download_cli(str(path))
    # The context manager closes the archive handle once extraction finishes.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)


In [7]:
#| export
if iskaggle:
    path = Path('../input/store-sales-time-series-forecasting')
    ! pip install -q dataset

Import CSVs as dataframes

In [8]:
# Load every competition table. The names on the left line up one-to-one, in
# order, with the file names listed on the right.
(train_df, test_df, sub_df, stores_df,
 oil_df, hol_events_df, transactions_df) = (
    pd.read_csv(path/csv_name, low_memory=False)
    for csv_name in (
        'train.csv', 'test.csv', 'sample_submission.csv', 'stores.csv',
        'oil.csv', 'holidays_events.csv', 'transactions.csv',
    )
)

In [9]:
# Peek at the first five training rows to check the columns loaded as expected.
train_df.head(n=5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


Combine the training and test sets so the same data transforms are applied to both in parallel; they are split apart again before training

In [10]:
# Stack the training rows on top of the test rows. Both frames were read with
# the default 0..n-1 index, so ignore_index=True is needed to give the result
# one fresh, unique index instead of labels that repeat between the two parts.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

Merge Oil Prices

In [11]:
# Attach the oil_df columns to each row by date. `date` is still text in both
# frames at this point, so the keys compare as plain strings.
# how='left' keeps every combined_df row (dates without an oil row get NaN);
# validate='many_to_one' makes pandas raise MergeError if oil_df ever contains
# a repeated date, instead of silently duplicating sales rows.
combined_df = combined_df.merge(oil_df, on='date', how='left', validate='many_to_one')

Merge Store Metadata

In [12]:
# Attach the per-store metadata columns to each row by store number.
# how='left' keeps every combined_df row; validate='many_to_one' makes pandas
# raise MergeError if stores_df ever lists a store_nbr twice, instead of
# silently duplicating sales rows.
combined_df = combined_df.merge(stores_df, on='store_nbr', how='left', validate='many_to_one')

Rename `type` column and Merge Observed Holidays

In [13]:
# Give the holiday table's `type` column a distinct name.
# NOTE(review): presumably so it cannot collide with the `type` column that the
# store metadata contributes to combined_df — confirm.
hol_events_df = hol_events_df.rename(columns={'type': 'hol_type'})

In [14]:
# The holiday merge is currently disabled, so no holiday columns reach combined_df.
# NOTE(review): a left merge on `date` adds rows whenever hol_events_df has more
# than one event on the same date — check for that before re-enabling.
# combined_df = combined_df.merge(hol_events_df, on='date', how='left')

Convert `date` to datetime

In [15]:
# Parse the text dates into real datetimes so they can be compared against
# timestamps and expanded into date-part columns further down.
combined_df['date'] = pd.to_datetime(combined_df.date)

Block out a month for data affected by earthquake

In [16]:
# Bounds of the window treated as earthquake-affected. The mask built from
# them includes the start date and excludes the end date.
eq_start_date, eq_end_date = map(pd.to_datetime, ("2016-04-16", "2016-05-16"))

In [17]:
# Boolean mask, one entry per row: True where eq_start_date <= date < eq_end_date.
earthquake_cond = combined_df.date.ge(eq_start_date) & combined_df.date.lt(eq_end_date)

Get indexes of items at earthquake dates - to be removed or transformed depending on model performance

In [18]:
# Index labels of the rows flagged by the mask; the mask is applied
# positionally, as a plain boolean array.
earthquake_indexes = combined_df.index[earthquake_cond.to_numpy()]

Convert `date` to multiple date partition columns

In [19]:
# Expand `date` into separate date-part columns (Year, Month, Week, ...,
# Elapsed in the sample output below); the original `date` column is absent
# from the result.
combined_df = add_datepart(combined_df, field_name='date')

In [20]:
# Spot-check ten random rows of the fully merged frame. The fixed random_state
# makes a Restart & Run All display the same rows every time.
combined_df.sample(n=10, random_state=42)

Unnamed: 0,id,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type,cluster,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
151136,151136,49,PREPARED FOODS,330.521,0,95.99,Quito,Pichincha,A,11,2013,3,13,26,1,85,False,False,False,False,False,False,1364256000.0
379340,379340,52,BREAD/BAKERY,0.0,0,107.93,Manta,Manabi,A,11,2013,8,31,1,3,213,False,True,False,False,False,False,1375315000.0
1101737,1101737,21,SEAFOOD,0.0,0,92.18,Santo Domingo,Santo Domingo de los Tsachilas,B,6,2014,9,37,12,4,255,False,False,False,False,False,False,1410480000.0
2154512,2154512,11,DAIRY,525.0,106,45.29,Cayambe,Pichincha,B,6,2016,4,17,27,2,118,False,False,False,False,False,False,1461715000.0
1315229,1315229,12,HARDWARE,1.0,0,,Latacunga,Cotopaxi,C,15,2015,1,2,11,6,11,False,False,False,False,False,False,1420934000.0
2895924,2895924,14,DELI,215.0,1,,Riobamba,Chimborazo,C,7,2017,6,24,18,6,169,False,False,False,False,False,False,1497744000.0
2721722,2721722,26,HARDWARE,0.0,0,,Guayaquil,Guayas,D,10,2017,3,10,12,6,71,False,False,False,False,False,False,1489277000.0
2345563,2345563,21,"LIQUOR,WINE,BEER",327.0,1,44.47,Santo Domingo,Santo Domingo de los Tsachilas,B,6,2016,8,32,12,4,225,False,False,False,False,False,False,1470960000.0
221869,221869,34,EGGS,91.0,0,,Guayaquil,Guayas,B,6,2013,5,18,5,6,125,False,False,False,False,False,False,1367712000.0
447058,447058,52,CLEANING,0.0,0,,Manta,Manabi,A,11,2013,9,36,8,6,251,False,False,False,False,False,False,1378598000.0
