# Objective


*   Here we apply transformations to the variables, such as encoding and scaling. This makes it easier for a model to learn and understand the problem
*   Expected outputs are the initial features to be used 



# 1. Library imports

In [None]:
import math
import numpy as np
import pandas as pd
import random
import pickle
import warnings
import inflection
import seaborn as sns
import xgboost as xgb

from scipy import stats as ss
from boruta import BorutaPy
from matplotlib import pyplot as plt


from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder


#Helper Functions

For this section, some helper functions have been created.



*   **cross_validation**: Through this technique, the data are divided into blocks (Folds) and different training/validation combinations are created with these blocks, in order to reduce the chances of getting only one biased combination and, thus, allowing a greater capacity for generalization to our model

*   **MAE:** In summary, it represents the difference between predicted and actual value. Assigns equal weight to all errors, so it is considered robust in the presence of outliers
*   **MAPE**: How far is the prediction from the actual value on average in percentage
*   **RMSE:** Used as an improvement metric as it assigns greater weight to larger errors



In [None]:
def cross_validation(x_training, kfold, model_name, model, verbose=False):
    """Score ``model`` with a time-ordered cross-validation over six-week windows.

    For ``k = kfold, ..., 1`` the validation window runs from ``k * 6`` weeks
    to ``(k - 1) * 6`` weeks before the latest date in ``x_training`` (both
    ends inclusive), and the model is fitted on every row dated strictly
    before that window. Targets and predictions are passed through
    ``np.expm1`` before scoring, so ``sales`` is expected on the ``log1p``
    scale.

    Args:
        x_training: DataFrame containing a ``date`` column, a ``sales``
            column and the feature columns.
        kfold: Number of validation windows to evaluate.
        model_name: Label stored in the ``Model Name`` column of the result.
        model: Estimator whose ``fit(X, y)`` returns an object exposing
            ``predict(X)``.
        verbose: When true, print the fold number before each iteration.

    Returns:
        A one-row DataFrame with ``Model Name`` and the ``MAE CV``,
        ``MAPE CV`` and ``RMSE CV`` columns formatted as ``"mean +/- std"``.
    """
    # Imported here because ``datetime`` is not among the file's top-level
    # imports, and the fold boundaries below need ``datetime.timedelta``.
    import datetime

    def _mean_std(values):
        # Render per-fold scores as "mean +/- std", rounded to 2 decimals.
        return (np.round(np.mean(values), 2).astype(str)
                + ' +/- '
                + np.round(np.std(values), 2).astype(str))

    mae_list = []
    mape_list = []
    rmse_list = []

    last_date = x_training['date'].max()

    # Walk from the oldest validation window to the most recent one.
    for k in reversed(range(1, kfold + 1)):
        if verbose:
            print('\nKFold Number: {}'.format(k))

        # start and end date for validation (6 weeks = 6 * 7 days per fold)
        validation_start_date = last_date - datetime.timedelta(days=k * 6 * 7)
        validation_end_date = last_date - datetime.timedelta(days=(k - 1) * 6 * 7)

        # train on the past only, validate on the current window
        is_past = x_training['date'] < validation_start_date
        in_window = ((x_training['date'] >= validation_start_date)
                     & (x_training['date'] <= validation_end_date))
        training = x_training[is_past]
        validation = x_training[in_window]

        # split features / target
        xtraining = training.drop(['date', 'sales'], axis=1)
        ytraining = training['sales']
        xvalidation = validation.drop(['date', 'sales'], axis=1)
        yvalidation = validation['sales']

        # fit and predict
        m = model.fit(xtraining, ytraining)
        yhat = m.predict(xvalidation)

        # score on the original sales scale
        m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))

        # store performance of each kfold iteration
        mae_list.append(m_result['MAE'])
        mape_list.append(m_result['MAPE'])
        rmse_list.append(m_result['RMSE'])

    return pd.DataFrame(
        {
            'Model Name': model_name,
            'MAE CV': _mean_std(mae_list),
            'MAPE CV': _mean_std(mape_list),
            'RMSE CV': _mean_std(rmse_list),
        },
        index=[0],
    )

def mean_percentage_error(y, yhat):
    """Return the mean signed relative error ``(y - yhat) / y``.

    The sign is kept, so over- and under-predictions can cancel out.
    The value is a fraction of ``y``, not multiplied by 100.
    """
    relative_error = (y - yhat) / y
    return np.mean(relative_error)

def mean_absolute_percentage_error(y, yhat):
    """Return the mean of ``|y - yhat| / |y|`` (MAPE as a fraction, not x100)."""
    relative_error = (y - yhat) / y
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error)

def ml_error(model_name, y, yhat):
    """Build a one-row DataFrame with the MAE, MAPE and RMSE of ``yhat`` vs ``y``.

    Args:
        model_name: Label stored in the ``Model Name`` column.
        y: Actual values.
        yhat: Predicted values.

    Returns:
        DataFrame indexed ``[0]`` with columns ``Model Name``, ``MAE``,
        ``MAPE`` and ``RMSE`` (RMSE is the square root of the MSE).
    """
    metrics = {
        'Model Name': model_name,
        'MAE': mean_absolute_error(y, yhat),
        'MAPE': mean_absolute_percentage_error(y, yhat),
        'RMSE': np.sqrt(mean_squared_error(y, yhat)),
    }
    return pd.DataFrame(metrics, index=[0])

def cramer_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    A contingency table of ``x`` against ``y`` is built with ``pd.crosstab``
    and its chi-squared statistic is taken from ``ss.chi2_contingency``.

    Args:
        x: Categorical values for the rows of the contingency table.
        y: Categorical values for the columns; same length as ``x``.

    Returns:
        The corrected Cramér's V: 0 means no association, 1 means perfect
        association.
    """
    cm = pd.crosstab(x, y).values
    n = cm.sum()
    r, k = cm.shape

    chi2 = ss.chi2_contingency(cm)[0]

    # The bias correction is defined on phi^2 = chi2 / n. Subtracting it from
    # chi2 itself under-corrects by a factor of n and lets V exceed 1.
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))

    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)

    return np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))

In [None]:
# NOTE(review): `jupyter_settings` is not defined in the cells visible here;
# presumably a helper from an earlier cell that configures the notebook's
# plotting/display options -- confirm it is defined before this runs.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


#Loading Data

In [None]:
 # Work on a copy so the transformations below leave `df4` untouched.
 # NOTE(review): `df4` is not created in the cells visible here -- confirm it
 # is loaded earlier.
 df5 = df4.copy()

# 2. Data Preparation

### ENCODING - Machine learning algorithms work with optimization methods, based on numerical variables. Therefore, we need to transform our categorical variables without losing the content.



### NORMALIZATION / RESCALING - Optimization methods tend to bias the model, giving greater importance to variables with a greater range. So we need to normalize/change the scale to avoid model bias

## 2.1 Normalization

Shifts the center to "0", with a standard deviation of 1.

In [1]:
#look at the univariate analysis.. we don't have variables with behavior similar to the normal distribution, so we will use rescaling

## 2.2 Rescaling


*   Rescale for range between 0 and 1 in non-Gaussian distributions
*   Output: data with new range and equivalent importance

In [None]:
# Keep only the int64/float64 columns, i.e. the candidates for rescaling.
a = df5.select_dtypes(include=['int64', 'float64'])

In [None]:
# Preview the first rows of the numeric columns selected into `a`.
a.head()

Unnamed: 0,store,day_of_week,sales,promo,school_holiday,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,is_promo,year,month,day,week_of_year,competition_time_month,promo_time_week
0,1,5,5263,1,1,1270.0,9,2008,0,31,2015,0,2015,7,31,31,84,0
1,2,5,6064,1,1,570.0,11,2007,1,13,2010,1,2015,7,31,31,94,279
2,3,5,8314,1,1,14130.0,12,2006,1,14,2011,1,2015,7,31,31,105,226
3,4,5,13995,1,1,620.0,9,2009,0,31,2015,0,2015,7,31,31,71,0
4,5,5,4822,1,1,29910.0,4,2015,0,31,2015,0,2015,7,31,31,4,0


In [None]:
# Choosing the scaler: look at each feature's boxplot, e.g.
# sns.boxplot(df5['competition_distance']). MinMaxScaler is sensitive to
# outliers, so features with far-away outliers use RobustScaler instead.
#
# Each feature gets its own scaler instance. Re-fitting one shared instance on
# a second column overwrites the parameters learned for the first, so the
# first column's scaling could no longer be persisted or reapplied.
scalers = {
    'competition_distance': RobustScaler(),
    'competition_time_month': RobustScaler(),
    'promo_time_week': MinMaxScaler(),
    'year': MinMaxScaler(),
}

for column, scaler in scalers.items():
    df5[column] = scaler.fit_transform(df5[[column]].values)

# Backward-compatible names: as before, `rs` and `mms` end up fitted on the
# last feature each scaler type was applied to.
rs = scalers['competition_time_month']
mms = scalers['year']

# NOTE: to reuse the scaling at inference time, pickle each fitted scaler to
# its own file (competition_distance_scaler.pkl, competition_time_month_scaler.pkl,
# promo_time_week_scaler.pkl, year_scaler.pkl). `promo_time_week` is scaled
# with a MinMaxScaler, so that is the object to dump for it.
# NOTE(review): the scalers are fitted on the full frame, before the
# train/test split further down -- consider fitting on the training rows only.


## 2.3 Transformation

###  2.3.1. Encoding

- One Hot Encoding - This creates a binary column for each category and returns a sparse matrix or dense array 
- Well suited to variables that represent a state, such as `state_holiday` below

In [None]:
# state_holiday - One Hot Encoding
# dtype is pinned: the default changed from uint8 to bool in pandas 2.0, and
# bool columns would make `DataFrame.values` fall back to an object array
# when mixed with the numeric features. uint8 keeps the 0/1 indicators.
df5 = pd.get_dummies(
    df5,
    prefix=['state_holiday'],
    columns=['state_holiday'],
    dtype='uint8',
)

 - Label Encoding -  it involves converting each value in a column to a number.

In [None]:
# store_type - Label Encoding
# Learn the category -> integer mapping first, then apply it to the column.
le = LabelEncoder().fit(df5['store_type'])
df5['store_type'] = le.transform(df5['store_type'])

# saving (disabled): persist the fitted encoder for reuse
#pickle.dump( le, open( '/content/drive/MyDrive/Colab Notebooks/Projetos/rossmann/store_type_scaler.pkl', 'wb') )

- Ordinal Encoding -  As the name suggests, it is suitable for hierarchies on categorical variables

In [None]:
# assortment - Ordinal Encoding
# The integer codes give the categories an explicit order.
assortment_dict = {
    'basic': 1,
    'extra': 2,
    'extended': 3,
}
df5['assortment'] = df5['assortment'].map(assortment_dict)

###  2.3.2. Response Variable Transformation

- The goal here is to bring the distribution of our sales column closer to a normal distribution. We do this with a log transformation; it is important to remember to apply the inverse operation (exp) later, when reading the predictions.

In [None]:
# Brings the target closer to a normal distribution, which ML models take as a
# premise and are optimized for. log1p is log(1 + x); its inverse is np.expm1.
df5['sales'] = np.log1p( df5['sales'] )


### 2.3.3. Nature Transformation

In the nature transformation, we encode the cyclic nature of some variables by projecting them onto the trigonometric circle (sine and cosine). For the months of the year, for example, this lets us represent the "distance" between January and July while keeping December close to January.

In [None]:
# Cycle length used for each calendar feature. Every feature is mapped to an
# angle on the unit circle and stored as a <feature>_sin / <feature>_cos pair.
# NOTE(review): with a period of 30, day 31 lands on the same point as day 1,
# and with a period of 52 an ISO week 53 would land on week 1 -- confirm these
# periods are intended.
cyclic_periods = {
    'day_of_week': 7,
    'month': 12,
    'day': 30,
    'week_of_year': 52,
}

for feature, period in cyclic_periods.items():
    # Vectorized over the whole column instead of a Python call per row.
    angle = df5[feature] * (2. * np.pi / period)
    df5[feature + '_sin'] = np.sin(angle)
    df5[feature + '_cos'] = np.cos(angle)

In [None]:
# Preview the first rows of `df5` after the transformations applied above.
df5.head()

Unnamed: 0,store,day_of_week,date,sales,promo,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,is_promo,year,month,day,week_of_year,year_week,competition_since,competition_time_month,promo_since,promo_time_week,state_holiday_christmas,state_holiday_easter_holiday,state_holiday_public_holiday,state_holiday_regular_day,day_of_week_sin,day_of_week_cos,month_sin,month_cos,day_sin,day_cos,week_of_year_sin,week_of_year_cos
0,1,5,2015-07-31,8.568646,1,1,2,1,-0.170968,9,2008,0,31,2015,0,1.0,7,31,31,2015-30,2008-09-01,0.918919,2015-07-27,0.287016,0,0,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
1,2,5,2015-07-31,8.71029,1,1,0,1,-0.283871,11,2007,1,13,2010,1,1.0,7,31,31,2015-30,2007-11-01,1.054054,2010-03-22,0.922551,0,0,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
2,3,5,2015-07-31,9.025816,1,1,0,1,1.903226,12,2006,1,14,2011,1,1.0,7,31,31,2015-30,2006-12-01,1.202703,2011-03-28,0.801822,0,0,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
3,4,5,2015-07-31,9.546527,1,1,2,3,-0.275806,9,2009,0,31,2015,0,1.0,7,31,31,2015-30,2009-09-01,0.743243,2015-07-27,0.287016,0,0,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
4,5,5,2015-07-31,8.481151,1,1,0,1,4.448387,4,2015,0,31,2015,0,1.0,7,31,31,2015-30,2015-04-01,-0.162162,2015-07-27,0.287016,0,0,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984


# 3. Feature Selection

Feature selection improves the machine learning process and increases the predictive power of machine learning algorithms by selecting the most important variables and eliminating redundant and irrelevant features.

In [None]:
# Feature selection works on its own copy, leaving `df5` as prepared above.
df6 = df5.copy()


## Split dataframe into training and test

In [None]:
# Columns removed before modelling. The first four already have sin/cos
# encodings in the frame; the rest are presumably helper columns that other
# features were derived from -- confirm.
cols_drop = [
    'week_of_year',
    'day',
    'month',
    'day_of_week',
    'promo_since',
    'competition_since',
    'year_week',
]
df6 = df6.drop(columns=cols_drop)


As we are working on a temporal problem, we cannot split the data randomly - there is a risk of mixing future and past data in training, which biases the model and causes overfitting

In [None]:
# Chronological hold-out split on the `date` column.
# Both frames keep their `date` and `sales` columns; the targets are also
# exposed separately as y_train / y_test.

# training dataset: every row strictly before the cut-off date
X_train = df6.loc[df6['date'] < '2015-06-19']
y_train = X_train['sales']

# test dataset: the cut-off date onwards
X_test = df6.loc[df6['date'] >= '2015-06-19']
y_test = X_test['sales']

## Boruta as Feature Selector

The Boruta algorithm is a wrapper built around the random forest classification algorithm. It tries to capture all the important features in the dataset with respect to our outcome variable (sales)

In [None]:
# Training arrays for Boruta: features without `date`/`sales`, and the target
# flattened to one dimension.
X_train_n = X_train.drop(columns=['date', 'sales']).to_numpy()
y_train_n = y_train.to_numpy().ravel()


# The Boruta run itself is left disabled below.
## define RandomForestRegressor
#rf = RandomForestRegressor( n_jobs=-1 )


## define Boruta
#boruta = BorutaPy( rf, n_estimators='auto', verbose=2, random_state=42 ).fit(X_train_n, y_train_n )

## Best Features from boruta

In [None]:
# Feature frame in the same column order as the array given to Boruta.
X_train_fs = X_train.drop(['date', 'sales'], axis=1)

# `boruta` only exists when the BorutaPy fit has actually been executed. That
# fit is commented out in this notebook, so reading `boruta.support_`
# unconditionally raises NameError and stops a top-to-bottom run.
try:
    cols_selected = boruta.support_.tolist()
except NameError:
    cols_selected = None
    warnings.warn(
        'BorutaPy has not been fitted; skipping extraction of the selected '
        'columns and relying on the manually listed features instead.'
    )

if cols_selected is not None:
    ## best features
    cols_selected_boruta = X_train_fs.iloc[:, cols_selected].columns.to_list()

    ## not selected boruta
    cols_not_selected_boruta = list(
        np.setdiff1d(X_train_fs.columns, cols_selected_boruta)
    )


##Manual feature Selection

In [None]:
# Hard-coded list of the features kept for modelling.
cols_selected_boruta = [
    'store',
    'promo',
    'store_type',
    'assortment',
    'competition_distance',
    'competition_open_since_month',
    'competition_open_since_year',
    'promo2',
    'promo2_since_week',
    'promo2_since_year',
    'competition_time_month',
    'promo_time_week',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'week_of_year_sin',
    'week_of_year_cos',
]

# columns to add: the selected features plus the date and the target,
# built as a new list so `cols_selected_boruta` itself is left unchanged
feat_to_add = ['date', 'sales']
cols_selected_boruta_full = [*cols_selected_boruta, *feat_to_add]