# ROSSMAN SALES PREDICTION

# 1. Problem Description

* Id - an Id that represents a (Store, Date) duple within the test set
* Store - a unique Id for each store
* Sales - the turnover for any given day (this is what you are predicting)
* Customers - the number of customers on a given day
* Open - an indicator for whether the store was open: 0 = closed, 1 = open
* StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
* SchoolHoliday - indicates if the (Store, Date) was affected by the closure of public schools
* StoreType - differentiates between 4 different store models: a, b, c, d
* Assortment - describes an assortment level: a = basic, b = extra, c = extended
* CompetitionDistance - distance in meters to the nearest competitor store
* CompetitionOpenSince[Month/Year] - gives the approximate year and month of the time the nearest competitor was opened
* Promo - indicates whether a store is running a promo on that day
* Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
* Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
* PromoInterval - describes the consecutive intervals Promo2 is started, naming the months the promotion is started anew. E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store

Ordinal encoder on STORE column is OK for Tree Based Models but not so good for Linear Models.

Show a graph for TimeSeriesKFold showing the folds.

In [1]:
import math
import numpy  as np
import pandas as pd
import random
import warnings
import inflection
import seaborn as sns
import xgboost as xgb
import datetime
import missingno as msno

from matplotlib            import pyplot as plt
from IPython.core.display  import HTML


from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, FunctionTransformer, OneHotEncoder
from sklearn.impute        import SimpleImputer
from sklearn.pipeline      import Pipeline, make_pipeline 
from sklearn.compose       import ColumnTransformer, TransformedTargetRegressor, make_column_transformer
from sklearn.linear_model  import LinearRegression
from sklearn.base          import  BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, RFECV
from sklearn.svm import SVR

from boruta import BorutaPy

from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_predict

warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


# 2. Imports and Helper Functions

In [2]:
def mean_absolute_percentage_error(y, y_pred):
    return np.mean(np.abs((y - y_pred)/y))

    
def ml_error(model_name, y, y_pred):
    mae = mean_absolute_error(y, y_pred)
    mape = mean_absolute_percentage_error(y, y_pred)
    rmse = np.sqrt( mean_squared_error(y, y_pred))
    return pd.DataFrame( { 'Model Name': model_name, 
                           'MAE': mae, 
                           'MAPE': mape,
                           'RMSE': rmse }, index=[0] )

def cross_validation(df, kfold, model_name, model, verbose=False):
    mae_list = []
    mape_list = []
    rmse_list = []
    for k in reversed(range(1, kfold +1)):
        if verbose:
            print( '\nKFold Number: {}'.format( k ) )
        # start and end date for validation 
        validation_start_date = df['Date'].max() - datetime.timedelta(days=k*6*7)
        validation_end_date = df['Date'].max() - datetime.timedelta(days=(k-1)*6*7)

        # filtering dataset
        train = df[df['Date'] < validation_start_date]
        test = df[(df['Date'] >= validation_start_date) & (df['Date'] <= validation_end_date)]

        # train
        X_train = train.drop(['Customers', 'Sales'], axis=1)
        y_train = train['Sales']

        # validation
        X_test = test.drop(['Customers', 'Sales'], axis=1)
        y_test = test['Sales']

        # model
        m = model.fit(X_train, y_train)

        # prediction
        y_pred = m.predict(X_test)

        # performance
        m_result = ml_error(model_name, y_test, y_pred)

        # store performance of each kfold iteration
        mae_list.append(m_result['MAE'])
        mape_list.append(m_result['MAPE'])
        rmse_list.append(m_result['RMSE'])

    return pd.DataFrame( {'Model Name': model_name,
                          'MAE CV': np.round( np.mean( mae_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( mae_list ), 2 ).astype( str ),
                          'MAPE CV': np.round( np.mean( mape_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( mape_list ), 2 ).astype( str ),
                          'RMSE CV': np.round( np.mean( rmse_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( rmse_list ), 2 ).astype( str ) }, index=[0] )


def jupyter_settings():
    %matplotlib inline
    
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display(HTML('<style>.container { width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)

    sns.set_theme()

In [3]:
jupyter_settings()

In [4]:
class TimeSeriesKFold:
    """ A cross-validation generator specifically created for time series 
    containing a 'Date' column with frequency in days.

    Provides train/test indices to split time series data samples in train/test
    sets. Samples can be observed at irregular time intervals. Multiple samples per
    timestamp are allowed.

    This cross-validation object is a variation of class 'KFold'.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set. Note that unlike standard cross-validation methods,
    successive training sets are supersets of those that come before them.

    It should be used inside a pipeline that drops the 'Date' column during 
    preprocessing.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.

    test_size : int, default=None
        Used to limit the size of the test set. Default is 42 days, or 6 weeks. 
        
    gap : int, default=0
        Number of DAYS to exclude from the end of each train set before
        the test set.
    """

    def __init__(self, n_splits=5, test_size=42, gap=0):
        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap

    def split(self, X, y=None, groups=None):

        n_splits = self.n_splits
        n_folds = n_splits + 1
        test_size = self.test_size
        gap = self.gap

        X.index = np.arange(X.shape[0])

        for k in range(1, n_folds):

            test_start_date = X['Date'].max() - datetime.timedelta(days= k * test_size)
            test_end_date = X['Date'].max() - datetime.timedelta(days= (k-1) * test_size)

            train_end_date = test_start_date - datetime.timedelta(days=gap)

            train_index = X[X['Date'] < train_end_date].index.values
            test_index = X[(X['Date'] >= test_start_date) & (X['Date'] <= test_end_date)].index.values

            yield train_index, test_index

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits

# 3. Data Preparation

## 3.1 Data Import

In [29]:
df_sales_raw = pd.read_csv('/Users/lucasstelmastchuk/Documents/repos/rossman_sales/data/train.csv', low_memory=False)
df_store_raw = pd.read_csv('/Users/lucasstelmastchuk/Documents/repos/rossman_sales/data/store.csv', low_memory=False)

df_raw = pd.merge(df_sales_raw, df_store_raw, how='left', on='Store')

df = df_raw.copy()
df = df[df['Open'] == 1]
df = df[df['Sales'] > 0]
df['Date'] = pd.to_datetime(df['Date'])

X_train = df.drop(['Sales', 'Customers'], axis=1).copy()
y_train = df['Sales'].copy()

print(f'Number of rows: {df.shape[0]}')
print(f'Number of columns: {df.shape[1]}')
df.sample()

Number of rows: 844338
Number of columns: 18


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
542033,929,1,2014-03-03,8709,954,1,1,0,0,a,c,4820.0,9.0,2013.0,0,,,


In [30]:
df.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [31]:
df_sales_test = pd.read_csv('/Users/lucasstelmastchuk/Documents/repos/rossman_sales/data/test.csv', low_memory=False)
df_raw_test = pd.merge(df_sales_test, df_store_raw, how='left', on='Store')
X_test = df_raw_test.drop(['Id'], axis=1).copy()
X_test['Date'] = pd.to_datetime(X_test['Date'])
X_test.head()

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,3,4,2015-09-17,1.0,1,0,0,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
2,7,4,2015-09-17,1.0,1,0,0,a,c,24000.0,4.0,2013.0,0,,,
3,8,4,2015-09-17,1.0,1,0,0,a,a,7520.0,10.0,2014.0,0,,,
4,9,4,2015-09-17,1.0,1,0,0,a,c,2030.0,8.0,2000.0,0,,,


## 3.2 Data Cleaning

In [None]:
msno.bar(df)

In [None]:
msno.matrix(df)

In [None]:
msno.heatmap(df)

In [14]:
class DataFrameCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        
        self.competition_open_since_month_imputer = SimpleImputer(strategy='median').fit(X[['CompetitionOpenSinceMonth']])
        self.competition_open_since_year_imputer = SimpleImputer(strategy='median').fit(X[['CompetitionOpenSinceYear']])
        
        return self
    
    def transform(self, X):
        
        X = X.copy()

        # CHANGING COLUMN NAMES TO SNAKE_CASE
        cols_old = X.columns
        snakecase = lambda x: inflection.underscore(x)
        cols_new = list(map(snakecase, cols_old))
        X.columns = cols_new

        # FILLING IN MISSING VALUES
        # Maximum competition distance is 75,860 meters. Considering that missing values are probably related to stores with
        # no competition nearby, these stores will get the value 200,000 meters to indicate that competitors are far away.      
        X['competition_distance'] = X['competition_distance'].apply(lambda x: 200000.0 if math.isnan(x) else x)

        # Imputation of the median competition opening month and year for stores with no information about competition opening date.
        # Note that missing values here do not represent absence of competition, but absence of information about competition opening date,
        # as competion distance have very few missing values.
        X['competition_open_since_month'] = self.competition_open_since_month_imputer.transform(X[['competition_open_since_month']])
        X['competition_open_since_year'] = self.competition_open_since_year_imputer.transform(X[['competition_open_since_year']])

        # Missing values for start of promo2 are linked to stores that do not participate in this second type of promotion at any given year.
        # Missing values will be replaced with the current date so that the total elapsed time for these stores since the start of the 
        # promotion 2 will always be zero 0 (see feature engineering section).
        X['promo2_since_week'] = X.apply(lambda x: x['date'].week if math.isnan(x['promo2_since_week']) else x['promo2_since_week'], axis=1)
        X['promo2_since_year'] = X.apply(lambda x: x['date'].year if math.isnan(x['promo2_since_year']) else x['promo2_since_year'], axis=1)

        # CORRECTING DATA TYPES
        X['competition_open_since_month'] = X['competition_open_since_month'].astype(int)
        X['competition_open_since_year'] = X['competition_open_since_year'].astype(int)
        X['promo2_since_week'] = X['promo2_since_week'].astype(int)
        X['promo2_since_year'] = X['promo2_since_year'].astype(int)
        
        return X

# 4. Feature Engineering

In [15]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self,  cyclical_features=True):
        self.cyclical_features = cyclical_features
    
    def fit(self, X, y=None):
        return self
    
    def _transform_promo_interval(self, x):
            if x['promo_interval'] == 'Jan,Apr,Jul,Oct':
                if x['date'].month in [1, 4, 7, 10]:
                    return 1
                elif x['date'].month in [2, 5, 8, 11]:
                    return 2
                elif x['date'].month in [3, 6, 9, 12]:
                    return 3
            elif x['promo_interval'] == 'Feb,May,Aug,Nov':
                if x['date'].month in [2, 5, 8, 11]:
                    return 1
                elif x['date'].month in [3, 6, 9, 12]:
                    return 2
                elif x['date'].month in [1, 4, 7, 10]:
                    return 3
            elif x['promo_interval'] == 'Mar,Jun,Sept,Dec':
                if x['date'].month in [3, 6, 9, 12]:
                    return 1
                elif x['date'].month in [1, 4, 7, 10]:
                    return 2
                elif x['date'].month in [2, 5, 8, 11]:
                    return 3
            else:
                return 0

    def _sin_transform(self, col, period):
                return np.sin(2*np.pi*col/period)
            
    def _cos_transform(self, col, period):
                return np.cos(2*np.pi*col/period)

    def transform(self, X):
        
        X = X.copy()

        # Extract features from date column
        X['month'] = X['date'].dt.month
        X['day_of_month'] = X['date'].dt.day
        X['week_of_year'] = X['date'].dt.weekofyear
        X['is_weekend'] = X['date'].dt.weekday > 4
        X['is_weekend'] = X['is_weekend'].astype(int)
        
        # Combine year and month of competition opening and then calculate the elapsed time in months since competition opened.
        X['competition_since_date'] = X.apply(lambda x: datetime.datetime(year=x['competition_open_since_year'],
                                                                            month=x['competition_open_since_month'],
                                                                            day=1), axis=1)
        X['months_since_competition_opened'] = ((X['date'] - X['competition_since_date'])/pd.Timedelta('30 days')).astype(int)
        X['months_since_competition_opened'] = X['months_since_competition_opened'].apply(lambda x: 0 if x < 0 else x)

        # promo since
        X['promo2_since_date'] = X['promo2_since_year'].astype(str) + '-' + X['promo2_since_week'].astype(str)
        X['promo2_since_date'] = X['promo2_since_date'].apply(lambda x: datetime.datetime.strptime(x + '-0', '%Y-%W-%w'))
        X['weeks_since_promo2_started'] = ((X['date'] - X['promo2_since_date'])/pd.Timedelta('7 days')).astype(int)
        X['weeks_since_promo2_started'] = X['weeks_since_promo2_started'].apply(lambda x: 0 if x < 0 else x)

        # promo interval
        X['promo_interval'] = X[['date', 'promo_interval']].apply(lambda y: self._transform_promo_interval(y), axis=1)

        # Drop unnecessary columns
        X.drop(['date','open', 'competition_open_since_month', 'competition_open_since_year', 
                'promo2_since_week', 'promo2_since_year', 'competition_since_date', 'promo2_since_date'], 
                axis=1, inplace=True)

        # Add cyclical features
        if self.cyclical_features:
            
            X['sin_month'] = self._sin_transform(X['month'], 12)
            X['cos_month'] = self._cos_transform(X['month'], 12)
            
            X['sin_day_of_month'] = self._sin_transform(X['day_of_month'], 30)
            X['cos_day_of_month'] = self._cos_transform(X['day_of_month'], 30)

            X['sin_day_of_week'] = self._sin_transform(X['day_of_week'], 7)
            X['cos_day_of_week'] = self._cos_transform(X['day_of_week'], 7)


            X['sin_week_of_year'] = self._sin_transform(X['month'], 52)
            X['cos_week_of_year'] = self._cos_transform(X['month'], 52)
  
            X.drop(['month', 'day_of_month', 'day_of_week', 'week_of_year'], axis=1, inplace=True)
            
        return X

In [16]:
class Scaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.competition_distance_scaler = RobustScaler().fit(X[['competition_distance']])
        self.months_since_competition_opened_scaler = RobustScaler().fit(X[['months_since_competition_opened']])
        self.weeks_since_promo2_started_scaler = RobustScaler().fit(X[['weeks_since_promo2_started']])
        return self
    
    def transform(self, X):
        
        X = X.copy()

        # SCALING NUMERICAL FEATURES
        X['competition_distance'] = self.competition_distance_scaler.transform(X[['competition_distance']]).ravel()
        X['months_since_competition_opened'] = self.months_since_competition_opened_scaler.transform(X[['months_since_competition_opened']]).ravel()
        X['weeks_since_promo2_started'] = self.weeks_since_promo2_started_scaler.transform(X[['weeks_since_promo2_started']]).ravel()

        return X

In [17]:
class Encoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        
        self.state_holiday_encoder = OneHotEncoder(handle_unknown='ignore').fit(X[['state_holiday']])
        self.state_holiday_new_columns = self.state_holiday_encoder.get_feature_names_out()

        self.store_type_encoder = OneHotEncoder(handle_unknown='ignore').fit(X[['store_type']])
        self.store_type_new_columns = self.store_type_encoder.get_feature_names_out()

        return self
    
    def transform(self, X):
        
        X = X.copy()

        # ENCODING CATEGORICAL FEATURES
        X['assortment'] = X['assortment'].map({'a': 1, 'b': 2, 'c': 3})
        X['state_holiday'] = X['state_holiday'].map({'a': 'public_holiday', 'b': 'easter', 'c': 'christmas', '0': 'not_holiday'})

        state_holiday_encoded = pd.DataFrame(self.state_holiday_encoder.transform(X[['state_holiday']]).toarray(), columns=self.state_holiday_new_columns, index=X.index)

        store_type_encoded = pd.DataFrame(self.store_type_encoder.transform(X[['store_type']]).toarray(), columns=self.store_type_new_columns, index=X.index)

        X = pd.concat([X, state_holiday_encoded, store_type_encoded], axis=1).drop(['store_type', 'state_holiday'], axis=1)

        return X

## FEATURE SELECTION

In [None]:
pipeline = Pipeline(steps=[('data_frame_cleaner', DataFrameCleaner()),
                             ('attributes_adder', AttributesAdder()),
                             ('scaler', Scaler()),
                             ('encoder', Encoder()),
                             ('feature_selection', SelectPercentile(mutual_info_regression, percentile=65)),
                             ])

In [None]:
df_selected = pipeline.fit_transform(X_train, y_train)
df_selected

In [None]:
df_selected = pd.DataFrame(df_selected, columns=pipeline[-1].get_feature_names_out())
df_selected.head()

In [None]:
df_selected.shape

# 7. Machine Learning Models

### Linear Regression

In [18]:
linear_regression_pipeline = Pipeline(steps=[('data_frame_cleaner', DataFrameCleaner()),
                                             ('attributes_adder', AttributesAdder()),
                                             ('scaler', Scaler()),
                                             ('encoder', Encoder()),
                                             ('feature_selection', SelectPercentile(mutual_info_regression, percentile=65)),
                                             ('model', LinearRegression())
                                             ])
linear_regression_pipeline

In [None]:
linear_regression_pipeline.fit(X_train, y_train).predict(X_test)

In [19]:
scores = cross_val_score(linear_regression_pipeline, X_train, y_train, scoring='neg_root_mean_squared_error', cv=TimeSeriesKFold(n_splits=5), error_score='raise')

In [21]:
scores.mean()

-2680.3013447118474

In [32]:
cv_results = cross_validate(linear_regression_pipeline, X_train, y_train,
                            scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error','neg_root_mean_squared_error'],
                            cv=TimeSeriesKFold(n_splits=5), error_score='raise', n_jobs=-1
                            )

Feature names unseen at fit time:
- competition_open_since_month
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceMonth

Feature names unseen at fit time:
- competition_open_since_year
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceYear

Feature names unseen at fit time:
- competition_open_since_month
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceMonth

Feature names unseen at fit time:
- competition_open_since_month
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceMonth

Feature names unseen at fit time:
- competition_open_since_year
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceYear

Feature names unseen at fit time:
- competition_open_since_month
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceMonth

Feature names unseen at fit time:
- competition_open_since_year
Feature names seen at fit time, yet now missing:
- CompetitionOpenSinceYear

Featu

In [33]:
print(cv_results['test_neg_mean_absolute_error'].mean())
print(cv_results['test_neg_mean_absolute_percentage_error'].mean())
print(cv_results['test_neg_root_mean_squared_error'].mean())

-1930.4877876671515
-0.30654669990754435
-2676.4187551911664


In [22]:
# performance
lr_result_cv = cross_validation(df, 5, 'LinearRegression', linear_regression_pipeline, verbose=False)
lr_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,LinearRegression,1937.25 +/- 103.32,0.31 +/- 0.01,2681.29 +/- 174.77


### Lasso

In [None]:
# model
lrr = Lasso(alpha=0.01)
tlrr = TransformedTargetRegressor(lrr, func=np.log1p, inverse_func=np.expm1) 
lrr_pipeline = make_pipeline(clean_df_transformer, add_features_transformer, features_transformer, tlrr)

# performance
lrr_result_cv = cross_validation(df, 5, 'Lasso', tlrr_pipeline, verbose=False )
lrr_result_cv

### Random Forest

In [None]:
# model
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
trf = TransformedTargetRegressor(rf, func=np.log1p, inverse_func=np.expm1) 
trf_pipeline = make_pipeline(clean_df_transformer, add_features_transformer, features_transformer, trf)

# performance
rf_result_cv = cross_validation(df, 5, 'Random Forest', trf_pipeline, verbose=False)
rf_result_cv

In [None]:
trf_pipeline[-1]

### XGBoost

In [None]:
# model
xgb_regressor = xgb.XGBRegressor( objective='reg:squarederror',
                                  n_estimators=1000, 
                                  eta=0.01, 
                                  max_depth=10, 
                                  subsample=0.7,
                                  colsample_bytree=0.9 )
t_xgb_regressor = TransformedTargetRegressor(xgb_regressor, func=np.log1p, inverse_func=np.expm1) 
xgb_pipeline = make_pipeline(clean_df_transformer, add_features_transformer, features_transformer, t_xgb_regressor)

# performance
xgb_result_cv = cross_validation(df, 5, 'XGBoost', xgb_pipeline, verbose=False)
xgb_result_cv

### Model Performance Comparison

In [None]:
models_result_cv = pd.concat([lr_result_cv, lrr_result_cv, rf_result_cv, xgb_result_cv])
models_result_cv

### Hyperparameter Fine Tuning

In [None]:
param = {
   'n_estimators': [1500, 1700, 2500, 3000, 3500],
   'eta': [0.01, 0.03],
   'max_depth': [3, 5, 9],
   'subsample': [0.1, 0.5, 0.7],
   'colsample_bytree': [0.3, 0.7, 0.9],
   'min_child_weight': [3, 8, 15]
       }

MAX_EVAL = 10

In [None]:
final_result = pd.DataFrame()

for i in range(MAX_EVAL):
   # choose values for parameters randomly
   hp = {k:random.sample(v,1)[0] for k, v in param.items()}
   print(hp)
   
   # model
   model_xgb = xgb.XGBRegressor( objective='reg:squarederror',
                                 n_estimators=hp['n_estimators'], 
                                 eta=hp['eta'], 
                                 max_depth=hp['max_depth'], 
                                 subsample=hp['subsample'],
                                 colsample_bytree=hp['colsample_bytree'],
                                 min_child_weight=hp['min_child_weight'] )
   transformed_xgb_regressor = TransformedTargetRegressor(model_xgb, func=np.log1p, inverse_func=np.expm1) 
   xgb_pipeline_tuning = make_pipeline(clean_df_transformer, add_features_transformer, features_transformer, transformed_xgb_regressor)
   
   # performance
   result = cross_validation(df, 5, 'XGBoost Regressor', xgb_pipeline_tuning, verbose=False)
   final_result = pd.concat([final_result, result])
       
final_result

### Final Model

In [None]:
param_tuned = {
    'n_estimators': 1700,
    'eta': 0.03,
    'max_depth': 9,
    'subsample': 0.5,
    'colsample_bytree': 0.7,
    'min_child_weight': 8 
        }

In [None]:
# model
xgb_tuned = xgb.XGBRegressor( objective='reg:squarederror',
                                    n_estimators=param_tuned['n_estimators'], 
                                    eta=param_tuned['eta'], 
                                    max_depth=param_tuned['max_depth'], 
                                    subsample=param_tuned['subsample'],
                                    colsample_bytree=param_tuned['colsample_bytree'],
                                    min_child_weight=param_tuned['min_child_weight'])
model_xgb_tuned = TransformedTargetRegressor(xgb_tuned, func=np.log1p, inverse_func=np.expm1) 
xgb_pipeline_tuned = make_pipeline(clean_df_transformer, add_features_transformer, features_transformer, model_xgb_tuned)

## Error Interpretation

In [None]:
train = df[df['Date'] < '2015-06-19']
test = df[df['Date'] >= '2015-06-19']

# train
X_train = train.drop(['Customers', 'Sales'], axis=1)
y_train = train['Sales']

# validation
X_test = test.drop(['Customers', 'Sales'], axis=1)
y_test = test['Sales']

In [None]:
xgb_pipeline_tuned.fit(X_train, y_train)

In [None]:
y_pred = xgb_pipeline_tuned.predict(X_test)

xgb_result_tuned = ml_error('XGBoost Regressor', y_test, y_pred)
xgb_result_tuned

In [None]:
sales_predictions_next_six_weeks = pd.DataFrame({'store':X_test['store'], 'sales':y_test, 'sales_prediction':y_pred})
total_sales_predictions_per_store = sales_predictions_next_six_weeks.groupby('store').sum()
mae = sales_predictions_next_six_weeks.groupby('store').apply(lambda x: mean_absolute_error( x['sales'], x['sales_prediction'])).reset_index().rename(columns={0:'MAE'})
mape = sales_predictions_next_six_weeks.groupby('store').apply(lambda x: mean_absolute_percentage_error( x['sales'], x['sales_prediction'])).reset_index().rename(columns={0:'MAPE'})

# Merge
aux1 = pd.merge(mae, mape, how='inner', on='store')
aux2 = pd.merge(aux1, total_sales_predictions_per_store, how='inner', on='store')

plt.subplot(1,2,1)
sns.scatterplot(x='store', y='MAPE', data=aux2)

plt.subplot(1,2,2)
sns.scatterplot(x='store', y='MAE', data=aux2)

### Total Performance

In [None]:
total_sales_prediction = sales_predictions_next_six_weeks['sales_prediction'].sum()
best_scenario = total_sales_prediction + xgb_result_tuned['MAE'].values[0]
worst_scenario = total_sales_prediction - xgb_result_tuned['MAE'].values[0]

scenarios = pd.DataFrame({'Scenario':['Total Sales Prediction', 'Worst Scenario', 'Best Scenario'],
                'Value':[total_sales_prediction, worst_scenario, best_scenario]})

scenarios['Value'] = scenarios['Value'].map('R${:,.2f}'.format)
scenarios

### Machine Learning Performance

In [None]:
df_results = pd.DataFrame({'date':X_test['date'], 'store':X_test['store'], 'sales':y_test, 'sales_prediction':y_pred})
df_results['residual'] = df_results['sales'] - df_results['sales_prediction']
df_results['error_rate'] = df_results['sales_prediction']/df_results['sales']
df_results.head()

In [None]:
plt.subplot(2, 2, 1)
sns.lineplot( x='date', y='sales', data=df_results, label='SALES')
sns.lineplot( x='date', y='sales_prediction', data=df_results, label='PREDICTIONS')

plt.subplot(2, 2, 2)
sns.lineplot(x='date', y='error_rate', data=df_results)
plt.axhline(1, linestyle='--')

plt.subplot(2, 2, 3)
sns.distplot(df_results['residual'])

plt.subplot(2, 2, 4)
sns.scatterplot(df_results['sales_prediction'], df_results['residual'])

# Kaggle Submission