March 27, 2019.
Luis Da Silva.

This notebook applies Facebook's Prophet package (https://facebook.github.io/prophet/) to Walmart data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fbprophet import Prophet

In [None]:
def wmae(holiday, y, y_pred):
    """Return the negated weighted mean absolute error.

    Observations flagged in ``holiday`` get weight 5, all others weight 1.
    The result is multiplied by -1, so values closer to zero are better.
    """
    weights = 1 + 4 * holiday
    abs_errors = abs(y - y_pred)
    inverse_total_weight = 1 / weights.sum()
    return -inverse_total_weight * (weights @ abs_errors)

In [None]:
def read_clean_df(train=True):
    """Load one of the merged CSV files and prepare it for modelling.

    Args:
        train: If True, read ``../data/merged_train_data.csv`` and apply the
            training-only cleaning steps to ``Weekly_Sales``; otherwise read
            ``../data/merged_test_data.csv``.

    Returns:
        A DataFrame sorted by ``Date``, ``Store`` and ``AllDept``, with
        ``Type`` and ``Dept`` one-hot encoded, the original department kept
        in ``AllDept`` and a copy of ``IsHoliday`` in ``IsHoliday_weight``.
    """
    if train:
        path = '../data/merged_train_data.csv'
    else:
        path = '../data/merged_test_data.csv'

    # The first CSV column is discarded (presumably a saved index -- confirm).
    df = pd.read_csv(path).iloc[:, 1:]
    # index=str converts the row labels to strings, as the original code did.
    df.rename(index=str, columns={'Size (sq ft)': 'Size'}, inplace=True)
    df.drop(['Date-1', 'Date-2', 'Promotion17', 'Promotion114',
             'Promotion121', 'Year', 'HighPromoter1', 'LowPromoter1',
             'HighPromoter2', 'LowPromoter2', 'HighPromoter3',
             'LowPromoter3', 'HighPromoter4', 'LowPromoter4',
             'HighPromoter5', 'LowPromoter5'], axis=1, inplace=True)
    if train:
        df.drop(['ImportantHoliday'], axis=1, inplace=True)
        # Negative weekly sales are clipped to zero.
        df.loc[df['Weekly_Sales'] < 0, 'Weekly_Sales'] = 0

    df['Date'] = pd.to_datetime(df['Date'])
    df['IsHoliday_weight'] = df['IsHoliday']
    # Keep the raw department id; 'Dept' itself is consumed by get_dummies.
    df['AllDept'] = df['Dept']
    df = pd.get_dummies(df, columns=['Type', 'Dept'])
    df.sort_values(['Date', 'Store', 'AllDept'], inplace=True)

    if train:
        # Very low weekly sales (at most 1/50 of the mean of their own
        # store/department series) are replaced by 0s. ``transform``
        # broadcasts each group's mean back onto its rows, so one vectorised
        # comparison replaces a full-frame scan per (Store, Dept) pair.
        threshold = (
            df.groupby(['Store', 'AllDept'])['Weekly_Sales'].transform('mean')
            / 50
        )
        df.loc[df['Weekly_Sales'] <= threshold, 'Weekly_Sales'] = 0
    return df

In [None]:
def get_cut_date(dates, n):
    """Return the date at which the last ``1/n`` of the unique dates begins.

    The unique values of ``dates`` are taken in ascending order and the
    element ``int(count / n)`` positions from the end is returned. When
    ``int(count / n)`` is 0 the index is ``-0``, i.e. the earliest date.
    """
    # np.unique already returns its result sorted in ascending order.
    unique_dates = np.unique(dates)
    holdout_size = int(len(unique_dates) / n)
    return unique_dates[-holdout_size]

In [None]:
# Cleaned training data; later cells read it through this module-level name.
df = read_clean_df(train=True)
df.head()

In [None]:
# Cleaned test data; later cells read it through this module-level name.
tdf = read_clean_df(train=False)

In [None]:
class Model:
    """Prophet model for the weekly sales of one department in one store.

    As Prophet is a time-series framework and panel data is being handled,
    one needs to model one department at a time. The most recent fifth of
    the series' unique dates is held out for validation; the rest is used
    for fitting.

    Attributes set by ``__init__``:
        validation_mask: boolean mask over ``df`` selecting the held-out rows.
        tsdf: training frame with Prophet's ``ds`` / ``y`` columns.
        holidays: holiday frame passed to Prophet.
        future_df: unique test dates (column ``ds``) to forecast.

    Attributes set later:
        prophet, past_fut, forecast: set by ``fit``.
        score: set by ``validate``.
    """

    def __init__(self, df, store, dept, test_df=None):
        """Split the series for ``store`` / ``dept`` and prepare Prophet inputs.

        Args:
            df: training frame with ``Store``, ``AllDept``, ``Date``,
                ``Weekly_Sales`` and ``IsHoliday`` columns.
            store: value of ``Store`` to select.
            dept: value of ``AllDept`` to select.
            test_df: frame whose ``Date`` column lists the dates to forecast.
                Defaults to the module-level ``tdf``.
        """
        if test_df is None:
            test_df = tdf

        # Keep the frame the masks were built from so that ``validate`` uses
        # the same rows rather than whatever a global ``df`` happens to be.
        self._df = df

        series_mask = np.logical_and(df['Store'] == store,
                                     df['AllDept'] == dept)

        # Cut date for validation
        cut_date = get_cut_date(df[series_mask]['Date'], 5)
        self.validation_mask = np.logical_and(series_mask,
                                              df['Date'] >= cut_date)
        train_mask = np.logical_and(series_mask, df['Date'] < cut_date)

        # Main dataframe, renamed to the column names Prophet expects
        self.tsdf = df.loc[train_mask, ['Date', 'Weekly_Sales']].rename(
            columns={'Date': 'ds', 'Weekly_Sales': 'y'})

        # Holidays
        self.holidays = self._build_holidays()

        # Future dates to be predicted
        self.future_df = test_df[['Date']].drop_duplicates().rename(
            columns={'Date': 'ds'})

    @staticmethod
    def _build_holidays():
        """Return the holiday frame (holiday, ds, lower_window, upper_window)."""
        superbowl = pd.DataFrame({
            'holiday': 'superbowl',
            'ds': pd.to_datetime(['2010-02-12', '2011-02-11',
                                  '2012-02-10', '2013-02-08']),
            'lower_window': 0,
            'upper_window': 1,
        })
        labor = pd.DataFrame({
            'holiday': 'labor',
            # 2013 previously repeated the Thanksgiving date (2013-11-29).
            'ds': pd.to_datetime(['2010-09-10', '2011-09-09',
                                  '2012-09-07', '2013-09-06']),
            'lower_window': 0,
            'upper_window': 0,
        })
        thanks = pd.DataFrame({
            'holiday': 'thanks',
            'ds': pd.to_datetime(['2010-11-26', '2011-11-25',
                                  '2012-11-23', '2013-11-29']),
            'lower_window': -1,
            'upper_window': 0,
        })
        christmas = pd.DataFrame({
            'holiday': 'christmas',
            # 2013 previously fell in November (2013-11-27).
            'ds': pd.to_datetime(['2010-12-31', '2011-12-30',
                                  '2012-12-28', '2013-12-27']),
            'lower_window': -1,
            'upper_window': 0,
        })
        return pd.concat((superbowl, labor, thanks, christmas))

    def fit(self, **kwargs):
        """Fit Prophet on the training rows and forecast past + future dates.

        Args:
            **kwargs: forwarded unchanged to the ``Prophet`` constructor.
        """
        self.prophet = Prophet(holidays=self.holidays, **kwargs)
        self.prophet.fit(self.tsdf)
        self.past_fut = pd.concat((self.tsdf[['ds']], self.future_df))
        self.forecast = self.prophet.predict(self.past_fut)

    def plot(self):
        """Plot the components of the stored forecast."""
        self.prophet.plot_components(self.forecast)

    def validate(self):
        """Score the fitted model on the held-out rows; store and print it."""
        held_out = self._df[self.validation_mask]
        # Indexes are reset so the series line up positionally with Prophet's
        # output.
        holi = held_out['IsHoliday'].reset_index(drop=True)
        val_dates = held_out[['Date']].rename(columns={'Date': 'ds'})
        y = held_out['Weekly_Sales'].reset_index(drop=True)
        y_pred = self.prophet.predict(val_dates)['yhat']

        self.score = wmae(holi, y, y_pred)
        print(self.score)

In [None]:
# Smoke-test the class on a single series (store 1, department 1) to check
# that it behaves as expected
s1d1 = Model(df, store=1, dept=1)
s1d1.fit(daily_seasonality=False, weekly_seasonality=True)
s1d1.plot()
s1d1.validate()

In [None]:
%%time
# Fit one model per (store, department) pair and collect its validation
# score and forecast
scores = []
preds = []
for store in df['Store'].unique():
    mask = df['Store'] == store
    for dept in df[mask]['AllDept'].unique():
        model = Model(df, store, dept)
        # No training rows are left for this series: nothing to fit
        if model.tsdf.empty:
            continue

        model.fit()
        model.validate()

        scores.append(model.score)
        preds.append(model.forecast)

In [None]:
# Summary of the collected validation scores
percentile_levels = (5, 10, 25, 50)
score_percentiles = {
    level: np.percentile(scores, level) for level in percentile_levels
}
print('Percentiles: ', score_percentiles)
print('Mean: ', np.mean(scores))
print('Number of scores: ', len(scores))