# 0.0 General Section

## 0.1 Import Libraries

In [15]:
import pandas  as pd
import numpy   as np
import xgboost as xgb

import datetime
import warnings

from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.metrics       import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

warnings.filterwarnings( 'ignore' )

## 0.2 Load Data

In [2]:
X_train = pd.read_csv( '../data/train_feat.csv',  low_memory= False )

## 0.3 Change Types

In [3]:
X_train['date'] = pd.to_datetime( X_train['date'] )

## 0.3 Functions

In [4]:
def ml_error( model_name, y, yhat ):
    mae  = mean_absolute_error( y, yhat )
    mape = mean_absolute_percentage_error( y, yhat )
    rmse = np.sqrt( mean_squared_error( y, yhat ) )

    return pd.DataFrame( { 'Model Name' : model_name,
                           'MAE'        : mae,
                           'MAPE'       : mape,
                           'RMSE'       : rmse
                           }, index=[0] )

def cross_validation( x_training, kfold, model_name, model, verbose=False ):
    mae_list =  []
    mape_list = []
    rmse_list = []

    for k in reversed( range( 1, kfold+1 ) ):
        if verbose:
            print( '\nKFold Number: {}'.format( k ) )

        # start and end date for validation
        validation_start_date = x_training['date'].max() - datetime.timedelta( days=k*6*7 )
        validation_end_date   = x_training['date'].max() - datetime.timedelta( days=( k-1 )*6*7 )
                                                                            
        # filtering dataset
        training   = x_training[ x_training['date'] < validation_start_date]
        validation = x_training[ (x_training['date'] >= validation_start_date) & (x_training['date'] <= validation_end_date) ]

        # training dataset
        xtraining = training.drop( ['date', 'sales'], axis=1 )
        ytraining = training['sales']

        # validation
        xvalidation = validation.drop( ['date', 'sales'], axis=1 )
        yvalidation = validation['sales']

        # model
        m = model.fit( xtraining, ytraining )

        # prediction
        yhat = m.predict( xvalidation )

        # performance
        m_result = ml_error( model_name, np.expm1( yvalidation ), np.expm1( yhat ) )

        # store performance of each kfold iteration
        mae_list.append( m_result['MAE'] )
        mape_list.append( m_result['MAPE'] )
        rmse_list.append( m_result['RMSE'] )

        

    return pd.DataFrame( {  'Model Name': model_name,
                            'MAE CV'  : np.round( np.mean( mae_list ),  2 ).astype( str ) + ' +/- ' + np.round( np.std( mae_list ), 2 ).astype( str ),
                            'MAPE CV' : np.round( np.mean( mape_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( mape_list ), 2 ).astype( str ),
                            'RMSE CV' : np.round( np.mean( rmse_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( rmse_list ), 2 ).astype( str )
                        }, index=[0] )

# def mean_percentage_error( y, yhat ):
#    return np.mean( ( y - yhat ) / y )

# 1.0 Machine Learning Modelling

In [6]:
# calculate period - last 6  weeks
last_date  = X_train[['store', 'date']].groupby( 'store' ).max().reset_index().loc[0, 'date']
first_date = last_date - datetime.timedelta( days=6*7 )

# creating training dataset
x_train = X_train.copy() 
x_train = x_train[ x_train['date'] < first_date ]
y_train = x_train['sales']
x_train = x_train.drop( ['date', 'sales'], axis=1 )

# creating validation dataset
x_validation = X_train.copy()
x_validation = x_validation[ x_validation['date'] >= first_date ]
y_validation = x_validation['sales']
x_validation = x_validation.drop( ['date', 'sales'], axis=1 )


## 1.1 Average Model

In [7]:
# create aux_df with database test + response variable (sales)
aux1          = x_validation.copy()
aux1['sales'] = y_validation.copy()

# prediction
aux2 = aux1[['store', 'sales']].groupby( 'store' ).mean().reset_index().rename( columns = {'sales' :  'sales_predictions' } )
aux1 = pd.merge( aux1, aux2, how='left', on='store' )
yhat_avg = aux1['sales_predictions']

# calculate performance with response variable "sales" on original scale 
avg_model_result = ml_error( 'Average Model', np.expm1( y_validation ), np.expm1( yhat_avg ) )
avg_model_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Average Model,1354.800353,0.2064,1835.135542


## 1.2 Linear Regression Model - Single Performance

In [8]:
# define model
lr_model = LinearRegression().fit( x_train, y_train )

# create predictions
yhat_lr = lr_model.predict( x_validation )

# calculate performance
lr_model_result = ml_error( 'Linear Regression Model', np.expm1( y_validation ), np.expm1( yhat_lr) )
lr_model_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Linear Regression Model,1867.083196,0.292677,2671.107227


### 1.2.1 Linear Regression Model - Cross Validation

In [11]:
lr_result_cv = cross_validation( X_train, 5, 'Linear Regression', lr_model, verbose=False )
lr_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression,1940.09 +/- 97.64,0.3 +/- 0.02,2734.59 +/- 179.31


## 1.3 Linear Regression Regularized Model (Lasso) - Single Performance

In [9]:
# define model
lasso_model = Lasso( alpha=0.01 ).fit( x_train, y_train )

# create predictions
yhat_lasso = lasso_model.predict( x_validation )

# calculate performance
lasso_model_result = ml_error( 'Linear Regression Regularized Model - Lasso', np.expm1( y_validation), np.expm1( yhat_lasso) )
lasso_model_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Linear Regression Regularized Model - Lasso,1896.749531,0.303882,2696.642706


### 1.3.1 Linear Regression Regularized Model (Lasso) - Cross Validation

In [12]:
lasso_result_cv = cross_validation( X_train, 5, 'Linear Regression Regularized - Lasso', lasso_model, verbose=False )
lasso_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression Regularized - Lasso,1939.08 +/- 113.04,0.3 +/- 0.01,2759.46 +/- 199.29


## 1.4 Random Forest Model - Single Performance

In [9]:
# define model
rf_model = RandomForestRegressor( n_estimators=100, n_jobs=-1, random_state=42 ).fit( x_train, y_train )

# create predictions
yhat_rf = rf_model.predict( x_validation )

# calculate performance
rf_model_result = ml_error( 'Random Forest Model', np.expm1( y_validation ), np.expm1( yhat_rf) )
rf_model_result

### 1.4.1 Random Forest Model - Cross Validation

In [None]:
rf_result_cv = cross_validation( X_train, 5, 'Random Forest Model', rf_model, verbose=True )
rf_result_cv

## 1.5 XGBoost Model - Single Performance

In [16]:
# define model
xgb_model = xgb.XGBRegressor( objective='reg:squarederror', 
                              n_estimators=100, 
                              #eta=0.01, 
                              #max_depth=10,  
                              subsample=0.7,
                              colsample_bytree=0.9).fit( x_train, y_train )

# create predictions
yhat_xgb = xgb_model.predict( x_validation )

# calculate performance
xgb_result = ml_error( 'XGBoost Regressor', np.expm1( y_validation ), np.expm1( yhat_xgb) )
xgb_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,XGBoost Regressor,924.01982,0.141888,1298.391107


### 1.5.1 XGBoost Model - Cross Validation

In [17]:
xgb_result_cv = cross_validation( X_train, 5, 'XGBoost Regressor', xgb_model, verbose=True )
xgb_result_cv


KFold Number: 5

KFold Number: 4

KFold Number: 3

KFold Number: 2

KFold Number: 1


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBoost Regressor,1004.2 +/- 92.18,0.14 +/- 0.01,1434.7 +/- 138.51
