# Rossman_simple
* Rossman competition without external data
* See Lesson 12 of machine learning course for the full explanation of this notebook

In [113]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

from IPython.display import HTML, display

from functions_and_modules import *

In [109]:
# Project root. Set the ROSSMAN_MAIN environment variable to point at your own
# checkout; when it is unset the original author's local path is used.
import os

MAIN = os.environ.get('ROSSMAN_MAIN', '/Users/s6215054/Desktop/play/rossman_sales_forecasting')
# Directory that the CSV and feather files are read from / written to.
PATH = f'{MAIN}/data'

In [110]:
# Read each raw CSV under PATH into a DataFrame; `tables` keeps the same
# order as `table_names` (train, store, test).
table_names = ['train', 'store', 'test']
tables = []
for fname in table_names:
    tables.append(pd.read_csv(f'{PATH}/{fname}.csv', low_memory=False))

In [111]:
# Preview the first rows of the second loaded table (the 'store' CSV).
tables[1].head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


# Cleaning / Feature Engineering

In [5]:
# Give each loaded table its own name (same order as the files were read),
# then show the row counts of the train and test frames.
train = tables[0]
store = tables[1]
test = tables[2]
len(train), len(test)

(1017209, 41088)

In [6]:
# Turn StateHoliday into a boolean flag: True for any value other than
# the string '0'.
for frame in (train, test):
    frame['StateHoliday'] = frame['StateHoliday'] != '0'

In [7]:
# Expand "Date" into calendar features on both frames; drop=False keeps
# the original Date column because later cells still use it.
for frame in (train, test):
    add_datepart(frame, "Date", drop=False)

In [8]:
# Inspect the test frame after the date features were added.
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1,1,4,2015-09-17,1.0,1,False,0,2015,9,...,17,3,260,False,False,False,False,False,False,1442448000
1,2,3,4,2015-09-17,1.0,1,False,0,2015,9,...,17,3,260,False,False,False,False,False,False,1442448000
2,3,7,4,2015-09-17,1.0,1,False,0,2015,9,...,17,3,260,False,False,False,False,False,False,1442448000
3,4,8,4,2015-09-17,1.0,1,False,0,2015,9,...,17,3,260,False,False,False,False,False,False,1442448000
4,5,9,4,2015-09-17,1.0,1,False,0,2015,9,...,17,3,260,False,False,False,False,False,False,1442448000


## Join

In [9]:
# Attach the store metadata to every train / test row, keyed on "Store".
joined = join_df(train, store, "Store")
joined_test = join_df(test, store, "Store")

# Sanity check: count rows for which no store metadata was matched
# (StoreType is null). Both counts are expected to be 0.
unmatched_train = joined[joined["StoreType"].isnull()]
unmatched_test = joined_test[joined_test["StoreType"].isnull()]
display(len(unmatched_train), len(unmatched_test))

0

0

In [10]:
# Remove the right-hand duplicates produced by the merge: every column
# whose name ends in '_y' is dropped from both frames, in place.
for df in (joined, joined_test):
    dup_cols = [name for name in df.columns if name.endswith('_y')]
    df.drop(dup_cols, inplace=True, axis=1)

## Clean NAs

In [11]:
# Sentinel used for each "since" field when the value is missing.
# After filling, every one of these columns is stored as int32.
since_fill = {
    'CompetitionOpenSinceYear': 1900,
    'CompetitionOpenSinceMonth': 1,
    'Promo2SinceYear': 1900,
    'Promo2SinceWeek': 1,
}
for df in (joined, joined_test):
    for col, fill_value in since_fill.items():
        df[col] = df[col].fillna(fill_value).astype(np.int32)

## Limit CompetitionOpen Feature

In [12]:
for df in (joined, joined_test):
    # Assemble a datetime from the year and month columns; the day of the
    # month is fixed at 15 because the data carries no day.
    opened_parts = {
        'year': df["CompetitionOpenSinceYear"],
        'month': df["CompetitionOpenSinceMonth"],
        'day': 15,
    }
    df["CompetitionOpenSince"] = pd.to_datetime(opened_parts)

    # Whole days between the row's Date and the competitor's opening date
    # (negative when the competitor opens after the row's Date).
    days_open = df["Date"] - df["CompetitionOpenSince"]
    df["CompetitionDaysOpen"] = days_open.dt.days

In [13]:
# CompetitionDaysOpen is reset to zero when it is negative or when the
# competition's opening year is earlier than 1990.
for df in (joined, joined_test):
    reset_rows = (df["CompetitionDaysOpen"] < 0) | (df["CompetitionOpenSinceYear"] < 1990)
    df.loc[reset_rows, "CompetitionDaysOpen"] = 0

In [14]:
# Months since the competition opened, approximated as days // 30 and
# top-coded at 24 (anything open for more than two years counts as 24).
for df in (joined, joined_test):
    months_open = df["CompetitionDaysOpen"] // 30
    df["CompetitionMonthsOpen"] = months_open.clip(upper=24)

## Limit Promo Feature
Apply same approach to Promo feature

In [15]:
for df in (joined, joined_test):
    # Build a date per row from Promo2SinceYear / Promo2SinceWeek by taking
    # the Monday of that week.
    # NOTE(review): `Week` comes from the star import; presumably
    # isoweek.Week, whose .monday() returns a datetime.date - confirm.
    promo2_start = df.apply(
        lambda row: Week(row.Promo2SinceYear, row.Promo2SinceWeek).monday(),
        axis=1)

    # pd.to_datetime converts the date objects directly. The former
    # `.astype(pd.datetime)` step was redundant and relied on an alias
    # that was deprecated in pandas 1.0 and removed in 2.0.
    df["Promo2Since"] = pd.to_datetime(promo2_start)

    # Whole days between the row's Date and the start of Promo2
    df["Promo2Days"] = df.Date.subtract(df["Promo2Since"]).dt.days

In [16]:
for df in (joined, joined_test):

    # Promo2Days is reset to 0 when it is negative or when the Promo2
    # start year is earlier than 1990.
    df.loc[df.Promo2Days < 0, "Promo2Days"] = 0
    df.loc[df.Promo2SinceYear < 1990, "Promo2Days"] = 0

    # Whole weeks since Promo2 started, bounded to the range [0, 25].
    # (The previous version also evaluated `df.Promo2Weeks.unique()` here
    # and discarded the result; that dead call has been removed.)
    df["Promo2Weeks"] = (df["Promo2Days"] // 7).clip(lower=0, upper=25)

# Featherize

In [17]:
# Checkpoint both joined frames to feather files under PATH.
feather_targets = [
    (joined, f'{PATH}/joined_simple'),
    (joined_test, f'{PATH}/joined_test_simple'),
]
for frame, target in feather_targets:
    frame.to_feather(target)

# Durations Feature

In [185]:
# Reload the checkpointed frames written by the "Featherize" step.
joined, joined_test = (
    feather.read_dataframe(source)
    for source in (f'{PATH}/joined_simple', f'{PATH}/joined_test_simple')
)

## Periods elapsed
* Period elapsed computes the # of days before a holiday takes place and the # of days since the last holiday

In [186]:
# Columns carried into the elapsed-time computation: the Store / Date keys
# plus the three event flags that get_elapsed is applied to.
columns = [
    "Date",
    "Store",
    "Promo",
    "StateHoliday",
    "SchoolHoliday",
]

In [187]:
# Stack the test rows underneath the train rows (selected columns only).
# The elapsed / rolling features need the training dates so the window
# rolls continuously into the test period.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# with default arguments the result is the same (original index labels kept).
df = pd.concat([train[columns], test[columns]])

In [188]:
# Get time elapsed before and after each event (see explanation below).
# For every event flag the frame is sorted per store in ascending date
# order for the 'After' pass and in descending date order for the 'Before'
# pass. The same sequence used to be copy-pasted once per flag.
# NOTE(review): CompFuncs.get_elapsed is defined outside this notebook; it
# presumably adds a '<After|Before><fld>' column to `df` in place - confirm.
for fld in ('SchoolHoliday', 'StateHoliday', 'Promo'):
    print(fld + " get elapsed...")

    df = df.sort_values(['Store', 'Date'])
    CompFuncs.get_elapsed(df, fld, 'After')

    df = df.sort_values(['Store', 'Date'], ascending=[True, False])
    CompFuncs.get_elapsed(df, fld, 'Before')

SchoolHoliday get elapsed...
StateHoliday get elapsed...
Promo get elapsed...


In [189]:
# Compare the shapes of the raw train / test frames with the stacked frame.
train.shape, test.shape, df.shape

((1017209, 22), (41088, 21), (1058297, 11))

In [190]:
# Index by Date so the rolling windows below can sort on the index.
df = df.set_index("Date")
columns = ['SchoolHoliday', 'StateHoliday', 'Promo']

# Before<flag> / After<flag> columns: missing values become 0 and the
# columns are stored as integers.
elapsed_cols = [prefix + flag for prefix in ['Before', 'After'] for flag in columns]
for col in elapsed_cols:
    df[col] = df[col].fillna(0).astype(int)

## Rolling quantities
* Compute the forward and backward window rolling sum
* For a given row, bwd computes the sum total of days a holiday has been in effect in the past 7 days
* Similarly, for a given row, fwd computes the sum total of days before a holiday ends

In [191]:
# Backward-looking window: with rows in ascending date order, sum each
# flag over the current row and up to six preceding rows of the same store
# (min_periods=1 lets the first rows use a shorter window).
oldest_first = df[['Store'] + columns].sort_index()
bwd = oldest_first.groupby("Store").rolling(7, min_periods=1).sum()

In [192]:
# Forward-looking window: the same 7-row rolling sum per store, computed
# on rows in descending date order so it covers the rows that follow.
newest_first = df[['Store'] + columns].sort_index(ascending=False)
fwd = newest_first.groupby("Store").rolling(7, min_periods=1).sum()

In [193]:
# Remove the summed 'Store' column from the rolling results, then move the
# index levels back into ordinary columns so the frames can be merged.
# `columns=` replaces the positional axis argument (`drop('Store', 1)`),
# which pandas 2.0 no longer accepts.
# NOTE(review): errors='ignore' is there because some pandas versions
# appear to leave the group key out of groupby-rolling output - confirm.
bwd = bwd.drop(columns='Store', errors='ignore').reset_index()
fwd = fwd.drop(columns='Store', errors='ignore').reset_index()
df = df.reset_index()

In [194]:
# Attach the rolling sums to the stacked frame. Columns that clash with the
# existing flags are suffixed '_bw' (backward) and '_fw' (forward).
merge_keys = ['Date', 'Store']
df = df.merge(bwd, how='left', on=merge_keys, suffixes=['', '_bw'])
df = df.merge(fwd, how='left', on=merge_keys, suffixes=['', '_fw'])

In [195]:
# Drop the original flag columns listed in `columns`; the frames this is
# joined back onto already carry them.
# Keyword form replaces `df.drop(columns, 1, ...)`: the positional axis
# argument was removed in pandas 2.0.
df = df.drop(columns=columns)

In [196]:
# Understanding intuition for periods elapsed and rolling quantities  features
# Comment out drop of orig. columns in previous cell to see original columns

# school_cols = ['Date',
#  'Store',
#  'SchoolHoliday_bw',
#  'SchoolHoliday',
#  'SchoolHoliday_fw']

# school_cols = ['Date',
#  'Store',
#  'AfterSchoolHoliday',
#  'SchoolHoliday',
#  'BeforeSchoolHoliday']

# df[school_cols].sort_values(['Store', 'Date'], ascending=[True, False]).head(100)

## Join back with joined df

In [197]:
# Bring the elapsed / rolling features back onto the train and test frames,
# matching rows on the (Store, Date) pair.
feature_keys = ['Store', 'Date']
joined = join_df(joined, df, feature_keys)
joined_test = join_df(joined_test, df, feature_keys)

In [198]:
# Keep only the training rows whose Sales value is non-zero
# (yl = np.log(y) is taken later, so zero sales must not remain).
has_sales = joined["Sales"].ne(0)
joined = joined[has_sales]

# Featherize

In [199]:
# Feather save. reset_index() turns the current index into a column before
# writing, which is why an 'index' column exists after reloading.
for frame, target in ((joined, f'{PATH}/joined'), (joined_test, f'{PATH}/joined_test')):
    frame.reset_index().to_feather(target)

# Create Features

In [200]:
# Feather load: read back the frames written by the save step.
joined, joined_test = (
    feather.read_dataframe(source)
    for source in (f'{PATH}/joined', f'{PATH}/joined_test')
)

In [201]:
# Columns that are later cast to ordered categoricals.
cat_vars = [
    'Store',
    'DayOfWeek',
    'Year',
    'Month',
    'Day',
    'StateHoliday',
    'CompetitionMonthsOpen',
    'Promo2Weeks',
    'StoreType',
    'Assortment',
    'PromoInterval',
    'CompetitionOpenSinceYear',
    'Promo2SinceYear',
    'Week',
    'Promo_fw',
    'Promo_bw',
    'StateHoliday_fw',
    'StateHoliday_bw',
    'SchoolHoliday_fw',
    'SchoolHoliday_bw',
]

# Columns that are later filled with 0 and cast to float32.
contin_vars = [
    'CompetitionDistance',
    'AfterStateHoliday',
    'BeforeStateHoliday',
    'Promo',
    'SchoolHoliday',
]

In [202]:
# Dependent variable, and the training frame cut down to the model
# features plus the target and Date. copy() detaches it from the full frame.
dep = 'Sales'
train_columns = cat_vars + contin_vars + [dep, 'Date']
joined = joined[train_columns].copy()

In [203]:
# The test set has no target; add a placeholder column of zeros so both
# frames share the same layout, and keep 'Id' as well.
joined_test[dep] = 0
test_columns = cat_vars + contin_vars + [dep, 'Date', 'Id']
joined_test = joined_test[test_columns].copy()

In [204]:
# Every categorical feature becomes an ordered pandas category.
for v in cat_vars:
    joined[v] = joined[v].astype('category').cat.as_ordered()

# Reuse the training categories for the test frame so that a given value
# gets the same code in both (e.g. if Saturday is encoded as 6 in
# training, it is 6 in test too).
apply_cats(joined_test, joined)

# Continuous features: missing values become 0 and the dtype is float32
# (requirement by Pytorch).
for frame in (joined, joined_test):
    for v in contin_vars:
        frame[v] = frame[v].fillna(0).astype('float32')

# Featherize

In [206]:
# Feather Save
# joined.reset_index().to_feather(f'{PATH}/joined')
# joined_test.reset_index().to_feather(f'{PATH}/joined_test')

## Sampling and Final Data Prep
* Scaling important for neural networks
* `mapper` keeps track of mean, median used in normalization for application to test set
* `nas` keeps track of missing (cat: missing become zero, continuous: missing becomes median and creates new boolean column if col missing in row)
* See ML course for details

In [207]:
# Feather Load
# joined = feather.read_dataframe(f'{PATH}/joined')
# joined_test = feather.read_dataframe(f'{PATH}/joined_test')

In [208]:
# Work on a sub-sample of the training rows to keep iteration fast.
n = len(joined)

# Ask get_cv_idxs for a fraction of the n rows equal to 150000 / n,
# i.e. 150,000 rows.
# NOTE(review): get_cv_idxs is defined outside this notebook; presumably it
# returns randomly chosen row positions - confirm how it is seeded.
sample_fraction = 150000 / n
idxs = get_cv_idxs(n, val_pct=sample_fraction)
joined_samp = joined.iloc[idxs].set_index("Date")
samp_size = len(joined_samp)
samp_size

# To run on the full dataset, use this instead:
# samp_size = n
# joined_samp = joined.set_index("Date")

150000

In [209]:
# proc_df separates the 'Sales' target from the features and returns a
# numeric feature frame (do_scale=True), together with the `nas` and
# `mapper` objects that are reused for the test set.
processed = proc_df(joined_samp, 'Sales', do_scale=True)
df, y, nas, mapper = processed

# The models are fitted on log(Sales).
yl = np.log(y)

The mapper object contains the column names and the corresponding transformation applied to them (such as the StandardScaler for continuous variables)

In [210]:
# Index the test frame by Date, mirroring what was done for joined_samp.
joined_test = joined_test.set_index("Date")

In [211]:
# Run the test frame through proc_df as well, passing in the `nas` and
# `mapper` obtained from the training sample so that missing values and
# scaling are treated the same way. 'Id' is passed in skip_flds.
test_options = dict(do_scale=True, skip_flds=['Id'],
                    mapper=mapper, na_dict=nas)
df_test, _, nas, mapper = proc_df(joined_test, 'Sales', **test_options)

# Validation sample

* `cond` below is an array containing booleans that is true if the
element in df.index satisfies the condition
* `np.flatnonzero` returns an array of indices where the corresponding
element in cond is true

In [212]:
# Positions of the rows to hold out for validation.
#
# The validation window is 2014-08-01 through 2014-09-17, inclusive on both
# ends: the same calendar interval as the test data (2015-08-01 to
# 2015-09-17), shifted back one year. Only 2014 rows are selected here.
val_start = datetime.datetime(2014, 8, 1)
val_end = datetime.datetime(2014, 9, 17)

# Boolean mask over df.index, True where the date lies inside the window
on_or_after_start = df.index >= val_start
on_or_before_end = df.index <= val_end
cond = on_or_after_start & on_or_before_end

# Integer positions at which the mask is True
val_idx = np.flatnonzero(cond)

In [213]:
# See ML video, lesson 12 00:30:00 mark
# This is for full training on the entire dataset
# for the final model

# Do this after settling on a given model and hyperparameters

# val_idx = [0] 

# Modelling
* oob_score: 

## Split validation / training

In [214]:
from sklearn.ensemble import RandomForestRegressor

# Split features and log-target into validation / training parts using the
# row positions in val_idx. In each returned pair the validation part is
# unpacked first.
feature_split, target_split = split_by_idx(val_idx, df.values, yl)
val, trn = feature_split
y_val, y_trn = target_split

## RF

In [215]:
# Random forest that considers a third of the features at each split.
# The dict literal previously used `=` between key and value, which is a
# SyntaxError; dict entries need `:`.
params = {
    'n_estimators': 40,
    'max_features': 0.33,
    'min_samples_leaf': 2,
    'n_jobs': -1,
    'oob_score': False
}
m = RandomForestRegressor(**params)
m.fit(trn, y_trn)
preds = m.predict(val)
# Train score, validation score, and RMSPE computed by CompFuncs.exp_rmspe
# from the log-scale predictions and targets.
m.score(trn, y_trn), m.score(val, y_val), CompFuncs.exp_rmspe(preds, y_val)

(0.9350441970942147, 0.842723494322492, 0.16456469688910202)

In [216]:
# Same forest, but considering nearly all features (0.99) at each split.
# The dict literal previously used `=` between key and value, which is a
# SyntaxError; dict entries need `:`.
params = {
    'n_estimators': 40,
    'max_features': 0.99,
    'min_samples_leaf': 2,
    'n_jobs': -1,
    'oob_score': False
}
m = RandomForestRegressor(**params)
m.fit(trn, y_trn);
preds = m.predict(val)
# Train score, validation score, and RMSPE computed by CompFuncs.exp_rmspe
# from the log-scale predictions and targets.
m.score(trn, y_trn), m.score(val, y_val), CompFuncs.exp_rmspe(preds, y_val)

(0.9676962609517541, 0.9105141238751029, 0.12250462415303369)

## Xgboost

In [223]:
from xgboost import XGBRegressor

In [234]:
# Keyword arguments passed to XGBRegressor.
# NOTE(review): newer xgboost releases spell the "reg:linear" objective
# "reg:squarederror" - confirm against the installed version.
params = dict(
    objective="reg:linear",
    n_estimators=300,
    max_depth=8,
    colsample_bytree=0.7,
    subsample=0.7,
)

In [235]:
# Fit the gradient-boosted model on the training split and predict the
# validation split.
m = XGBRegressor(**params)
m.fit(trn, y_trn)
preds = m.predict(val)

# Train score, validation score, and RMSPE computed by CompFuncs.exp_rmspe
# from the log-scale predictions and targets.
train_score = m.score(trn, y_trn)
valid_score = m.score(val, y_val)
train_score, valid_score, CompFuncs.exp_rmspe(preds, y_val)

(0.926170871417617, 0.8983104735215454, 0.13236952552405945)