# 0.0 General Section

## 0.1 Import Libraries

In [1]:
import pandas  as pd
import numpy   as np
import xgboost as xgb

import datetime
import warnings

from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.metrics       import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

warnings.filterwarnings( 'ignore' )

## 0.2 Load Data

In [2]:
# Read the training feature table from the project's data folder.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
train_feat_path = '../data/train_feat.csv'
df_train_feat = pd.read_csv( train_feat_path, low_memory=False )

## 0.3 Change Types

In [3]:
# Convert the 'date' column with pd.to_datetime so the date comparisons and
# timedelta arithmetic used further down operate on datetime values.
parsed_dates = pd.to_datetime( df_train_feat['date'] )
df_train_feat['date'] = parsed_dates

## 0.4 Functions

In [4]:
def ml_error( model_name, y, yhat ):
    """Build a one-row table with the error metrics of a set of predictions.

    Parameters
    ----------
    model_name : label stored in the 'Model Name' column.
    y          : observed target values.
    yhat       : predicted values, in the same order as `y`.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns 'Model Name', 'MAE', 'MAPE'
        and 'RMSE'. RMSE is the square root of the mean squared error.
    """
    metrics = {
        'Model Name' : model_name,
        'MAE'        : mean_absolute_error( y, yhat ),
        'MAPE'       : mean_absolute_percentage_error( y, yhat ),
        'RMSE'       : np.sqrt( mean_squared_error( y, yhat ) ),
    }

    return pd.DataFrame( metrics, index=[0] )

def cross_validation( x_training, kfold, model_name, model, verbose=False, window_days=6*7 ):
    """Score `model` with time-ordered cross-validation over trailing date windows.

    The last `kfold * window_days` days of `x_training` are cut into `kfold`
    consecutive validation blocks, processed from the oldest to the most recent.
    For each block an independent copy of `model` is fitted on every row dated on
    or before the block start and evaluated on the rows dated after the block
    start up to and including the block end.

    Metrics are computed by `ml_error` after applying `np.expm1` to both the
    'sales' column and the predictions.
    NOTE(review): this presumes 'sales' was log1p-transformed upstream - confirm.

    Parameters
    ----------
    x_training  : pandas.DataFrame with a 'date' column, a 'sales' column and the
                  feature columns the model is trained on.
    kfold       : number of validation blocks.
    model_name  : label written to the 'Model Name' column of the result.
    model       : estimator exposing `fit( X, y )` and `predict( X )`. It is deep-copied
                  for every fold, so the object passed by the caller is never
                  refitted or otherwise modified.
    verbose     : when True, print the date range of each validation block.
    window_days : length of each validation block in days (default 6 weeks).

    Returns
    -------
    pandas.DataFrame
        One row with 'Model Name' and the 'MAE CV', 'MAPE CV', 'RMSE CV' columns,
        each formatted as '<mean> +/- <std>' rounded to two decimals.

    Raises
    ------
    ValueError
        If a fold ends up with no training rows or no validation rows, i.e. the
        data does not span `kfold * window_days` days.
    """
    import copy

    def _mean_std( values ):
        # '<mean> +/- <std>' across folds, rounded to two decimals
        return np.round( np.mean( values ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( values ), 2 ).astype( str )

    mae_list  = []
    mape_list = []
    rmse_list = []

    # the reference date is the same for every fold, so compute it once
    last_date = x_training['date'].max()

    for k in reversed( range( 1, kfold+1 ) ):
        # start and end date for validation
        validation_start_date = last_date - datetime.timedelta( days=k*window_days )
        validation_end_date   = last_date - datetime.timedelta( days=( k-1 )*window_days )

        if verbose:
            print(f'\nValidation Block: { k }. Dates: { validation_start_date } - { validation_end_date }' )

        # filtering dataset
        in_training   = x_training['date'] <= validation_start_date
        in_validation = ( x_training['date'] > validation_start_date ) & ( x_training['date'] <= validation_end_date )
        training      = x_training[ in_training ]
        validation    = x_training[ in_validation ]

        if training.empty or validation.empty:
            raise ValueError(
                f'Validation block { k } has no training or no validation rows; '
                f'the data does not cover { kfold } blocks of { window_days } days.' )

        # training dataset
        xtraining = training.drop( ['date', 'sales'], axis=1 )
        ytraining = training['sales']

        # validation
        xvalidation = validation.drop( ['date', 'sales'], axis=1 )
        yvalidation = validation['sales']

        # model: fit a private copy so the caller's (possibly already fitted) estimator is left untouched
        m = copy.deepcopy( model ).fit( xtraining, ytraining )

        # prediction
        yhat = m.predict( xvalidation )

        # performance
        m_result = ml_error( model_name, np.expm1( yvalidation ), np.expm1( yhat ) )

        # store performance of each kfold iteration as plain scalars
        mae_list.append( m_result['MAE'].iloc[0] )
        mape_list.append( m_result['MAPE'].iloc[0] )
        rmse_list.append( m_result['RMSE'].iloc[0] )

    return pd.DataFrame( {  'Model Name': model_name,
                            'MAE CV'  : _mean_std( mae_list ),
                            'MAPE CV' : _mean_std( mape_list ),
                            'RMSE CV' : _mean_std( rmse_list )
                        }, index=[0] )

# def mean_percentage_error( y, yhat ):
#    return np.mean( ( y - yhat ) / y )

## 0.5 Split df_train_feat into two datasets: X_train (full) and X_validation (last 36 weeks)

In [5]:
# calculate period - last 36 weeks
# Use the latest date of the whole table instead of the latest date of whichever
# store sorts first, so the cutoff does not depend on a single store's history.
last_date  = df_train_feat['date'].max()
first_date = last_date - datetime.timedelta( days=36*7 )

last_date_period = last_date

print(f'First date: {first_date}')
print(f'Last date: {last_date}')

# creating full datasets: X_train and X_validation
X_train      = df_train_feat.copy()
X_validation = X_train[ X_train['date'] > first_date ]

# create x_train, y_train variables: every row up to and including the cutoff date
train_rows = X_train[ X_train['date'] <= first_date ]
y_train    = train_rows['sales']
x_train    = train_rows.drop( ['date', 'sales'], axis=1 )

# calculate period - first 6 weeks from validation dataset
# (earliest date over all validation rows, again independent of store order)
first_date = X_validation['date'].min()
last_date  = first_date + datetime.timedelta( days=6*7 )

print(f'\nFirst date Validation: {first_date}')
print(f'Last date Validation: {last_date}')

# create x_validation, y_validation variables: rows strictly before the end of the 6-week window
validation_rows = X_validation[ X_validation['date'] < last_date ]
y_validation    = validation_rows['sales']
x_validation    = validation_rows.drop( ['date', 'sales'], axis=1 )

print(f'\n { last_date - first_date }' )


First date: 2014-11-21 00:00:00
Last date: 2015-07-31 00:00:00

First date Validation: 2014-11-22 00:00:00
Last date Validation: 2015-01-03 00:00:00

 42 days 00:00:00


# 1.0 Machine Learning Modelling

## 1.1 Average Model

In [6]:
# Put the validation features and the response variable (sales) back into one frame
aux1 = x_validation.assign( sales=y_validation )

# prediction: the per-store mean of 'sales' in this frame, joined back onto every row
aux2 = aux1[['store', 'sales']].groupby( 'store', as_index=False ).mean()
aux2 = aux2.rename( columns={ 'sales': 'sales_predictions' } )
aux1 = aux1.merge( aux2, how='left', on='store' )
yhat_avg = aux1['sales_predictions']

# calculate performance with response variable "sales" on original scale
avg_model_result = ml_error( 'Average Model', np.expm1( y_validation ), np.expm1( yhat_avg ) )
avg_model_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Average Model,1892.237342,0.259841,2476.199792


## 1.2 Linear Regression Model - Single Performance

In [7]:
# Instantiate the linear regression and fit it on the training split
lr_model = LinearRegression()
lr_model.fit( x_train, y_train )

# Predict the validation window
yhat_lr = lr_model.predict( x_validation )

# Score after np.expm1 on both the target and the predictions
lr_model_result = ml_error( 'Linear Regression Model', np.expm1( y_validation ), np.expm1( yhat_lr ) )
lr_model_result


Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Linear Regression Model,2747.082489,0.313411,3955.414335


### 1.2.1 Linear Regression Model - Cross Validation

In [8]:
# 5-block time-ordered cross-validation of the linear regression over X_train
lr_model_result_cv = cross_validation( x_training=X_train,
                                       kfold=5,
                                       model_name='Linear Regression',
                                       model=lr_model,
                                       verbose=True )
lr_model_result_cv


Validation Block: 5. Dates: 2015-01-02 00:00:00 - 2015-02-13 00:00:00

Validation Block: 4. Dates: 2015-02-13 00:00:00 - 2015-03-27 00:00:00

Validation Block: 3. Dates: 2015-03-27 00:00:00 - 2015-05-08 00:00:00

Validation Block: 2. Dates: 2015-05-08 00:00:00 - 2015-06-19 00:00:00

Validation Block: 1. Dates: 2015-06-19 00:00:00 - 2015-07-31 00:00:00


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression,1946.31 +/- 100.86,0.3 +/- 0.02,2740.6 +/- 184.58


## 1.3 Linear Regression Regularized Model (Lasso) - Single Performance

In [9]:
# Instantiate the Lasso regression (alpha=0.01) and fit it on the training split
lasso_model = Lasso( alpha=0.01 )
lasso_model.fit( x_train, y_train )

# Predict the validation window
yhat_lasso = lasso_model.predict( x_validation )

# Score after np.expm1 on both the target and the predictions
lasso_model_result = ml_error( 'Linear Regression Regularized Model - Lasso',
                               np.expm1( y_validation ),
                               np.expm1( yhat_lasso ) )
lasso_model_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Linear Regression Regularized Model - Lasso,2747.164557,0.316044,3957.73905


### 1.3.1 Linear Regression Regularized Model (Lasso) - Cross Validation

In [10]:
# 5-block time-ordered cross-validation of the Lasso regression over X_train
lasso_model_result_cv = cross_validation( x_training=X_train,
                                          kfold=5,
                                          model_name='Linear Regression Regularized - Lasso',
                                          model=lasso_model,
                                          verbose=True )
lasso_model_result_cv


Validation Block: 5. Dates: 2015-01-02 00:00:00 - 2015-02-13 00:00:00

Validation Block: 4. Dates: 2015-02-13 00:00:00 - 2015-03-27 00:00:00

Validation Block: 3. Dates: 2015-03-27 00:00:00 - 2015-05-08 00:00:00

Validation Block: 2. Dates: 2015-05-08 00:00:00 - 2015-06-19 00:00:00

Validation Block: 1. Dates: 2015-06-19 00:00:00 - 2015-07-31 00:00:00


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression Regularized - Lasso,1943.94 +/- 117.43,0.3 +/- 0.01,2764.83 +/- 206.7


## 1.4 Random Forest Model - Single Performance

In [11]:
# define model
# rf_model = RandomForestRegressor( n_estimators=100, n_jobs=-1, random_state=42 ).fit( x_train, y_train )

# create predictions
# yhat_rf = rf_model.predict( x_validation )

# calculate performance
# rf_model_result = ml_error( 'Random Forest Model', np.expm1( y_validation ), np.expm1( yhat_rf) )
# rf_model_result

SyntaxError: incomplete input (1035288268.py, line 1)

### 1.4.1 Random Forest Model - Cross Validation

In [None]:
# rf_result_cv = cross_validation( X_train, 5, 'Random Forest Model', rf_model, verbose=True )
# rf_result_cv

## 1.5 XGBoost Model - Single Performance

In [12]:
# Instantiate the XGBoost regressor; the commented keyword arguments are currently disabled
xgb_model = xgb.XGBRegressor( objective='reg:squarederror',
                              n_estimators=100
                              #eta=0.01,
                              #max_depth=10,
                              #subsample=0.7,
                              #colsample_bytree=0.9
                            )
xgb_model.fit( x_train, y_train )

# Predict the validation window
yhat_xgb = xgb_model.predict( x_validation )

# Score after np.expm1 on both the target and the predictions
xgb_model_result = ml_error( 'XGBoost Regressor', np.expm1( y_validation ), np.expm1( yhat_xgb ) )
xgb_model_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,XGBoost Regressor,1409.380561,0.190944,1957.450144


### 1.5.1 XGBoost Model - Cross Validation

In [13]:
# 5-block time-ordered cross-validation of the XGBoost regressor over X_train
xgb_model_result_cv = cross_validation( x_training=X_train,
                                        kfold=5,
                                        model_name='XGBoost Regressor',
                                        model=xgb_model,
                                        verbose=True )
xgb_model_result_cv


Validation Block: 5. Dates: 2015-01-02 00:00:00 - 2015-02-13 00:00:00

Validation Block: 4. Dates: 2015-02-13 00:00:00 - 2015-03-27 00:00:00

Validation Block: 3. Dates: 2015-03-27 00:00:00 - 2015-05-08 00:00:00

Validation Block: 2. Dates: 2015-05-08 00:00:00 - 2015-06-19 00:00:00

Validation Block: 1. Dates: 2015-06-19 00:00:00 - 2015-07-31 00:00:00


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBoost Regressor,985.44 +/- 81.38,0.14 +/- 0.01,1408.7 +/- 124.15


## 1.6 Models Performance

### 1.6.1 Single Performance

In [15]:
# Stack the single-split score tables of every evaluated model into one comparison table
single_split_results = [
    avg_model_result,
    lr_model_result,
    lasso_model_result,
    xgb_model_result,
]
models_result = pd.concat( single_split_results )
models_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Average Model,1892.237342,0.259841,2476.199792
0,Linear Regression Model,2747.082489,0.313411,3955.414335
0,Linear Regression Regularized Model - Lasso,2747.164557,0.316044,3957.73905
0,XGBoost Regressor,1409.380561,0.190944,1957.450144


### 1.6.2 Cross Validation Performance

In [16]:
# Stack the cross-validation score tables of every evaluated model into one comparison table
cross_validation_results = [
    lr_model_result_cv,
    lasso_model_result_cv,
    xgb_model_result_cv,
]
models_result_cv = pd.concat( cross_validation_results )
models_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression,1946.31 +/- 100.86,0.3 +/- 0.02,2740.6 +/- 184.58
0,Linear Regression Regularized - Lasso,1943.94 +/- 117.43,0.3 +/- 0.01,2764.83 +/- 206.7
0,XGBoost Regressor,985.44 +/- 81.38,0.14 +/- 0.01,1408.7 +/- 124.15
