# 0. IMPORTS

## 0.1. Importing libraries

In [4]:
# !pip install xgboost
# !pip install boruta

Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
Installing collected packages: boruta
Successfully installed boruta-0.3


In [5]:
import math
import numpy  as np
import pandas as pd
import random
import pickle
import warnings
import inflection
import seaborn as sns
import xgboost as xgb

from scipy                 import stats  as ss
from boruta                import BorutaPy
from matplotlib            import pyplot as plt
from IPython.display       import Image
from IPython.core.display  import HTML

from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

warnings.filterwarnings( 'ignore' )

## 0.2. Loading data

In [15]:
# Load the raw sales (train.csv) and store (store.csv) tables
df_sales_raw = pd.read_csv('../datasets/train.csv', low_memory=False)
df_store_raw = pd.read_csv('../datasets/store.csv', low_memory=False)

# Inner-join both tables on the shared 'Store' key, store columns first
df_raw = pd.merge(df_store_raw, df_sales_raw, how='inner', on='Store')

## 0.3. Helper functions

In [43]:
# Helper that converts the dataframe's column names to snake_case
def rename_columns(df_aux):
    """Rename the columns of ``df_aux`` in place and return the same object.

    Every column name goes through three steps, in this order:
    ``inflection.titleize``, removal of all space characters, and
    ``inflection.underscore``.

    Parameters
    ----------
    df_aux
        Object exposing a ``columns`` attribute (a pandas DataFrame in this
        notebook). Its ``columns`` attribute is overwritten with the
        converted names.

    Returns
    -------
    The very object that was passed in, after its columns were reassigned.
    """
    def to_snake(name):
        # titleize -> drop spaces -> underscore, applied to a single name
        titled = inflection.titleize(name)
        squeezed = titled.replace(" ", "")
        return inflection.underscore(squeezed)

    df_aux.columns = [to_snake(name) for name in df_aux.columns]
    return df_aux

# 1. PASSO 01 - DATA DESCRIPTION

In [54]:
# Work on a copy so later steps do not modify df_raw
df1 = df_raw.copy()

## 1.1. Rename Columns

In [55]:
# Convert the column names with the rename_columns helper, then display them
df1 = rename_columns(df1)
df1.columns

Index(['store', 'store_type', 'assortment', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year', 'promo2',
       'promo2_since_week', 'promo2_since_year', 'promo_interval',
       'day_of_week', 'date', 'sales', 'customers', 'open', 'promo',
       'state_holiday', 'school_holiday'],
      dtype='object')

## 1.2. Data Dimensions

In [56]:
# Report row and column counts with thousands separators
print(f'Number of rows: {df1.shape[0]:,}')
print(f'Number of columns: {df1.shape[1]:,}')

Number of rows: 1,017,209
Number of columns: 18


## 1.3. Data Types

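Feature | Description
-- | --
`store` | _to be documented_
`store_type` | _to be documented_
`assortment` | _to be documented_
`competition_distance` | _to be documented_
`competition_open_since_month` | _to be documented_
`competition_open_since_year` | _to be documented_
`promo2` | _to be documented_
`promo2_since_week` | _to be documented_
`promo2_since_year` | _to be documented_
`promo_interval` | _to be documented_
`day_of_week` | _to be documented_
`date` | _to be documented_
`sales` | _to be documented_
`customers` | _to be documented_
`open` | _to be documented_
`promo` | _to be documented_
`state_holiday` | _to be documented_
`school_holiday` | _to be documented_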

In [63]:
df1.columns

Index(['store', 'store_type', 'assortment', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year', 'promo2',
       'promo2_since_week', 'promo2_since_year', 'promo_interval',
       'day_of_week', 'date', 'sales', 'customers', 'open', 'promo',
       'state_holiday', 'school_holiday'],
      dtype='object')

In [57]:
df1.dtypes

store                             int64
store_type                       object
assortment                       object
competition_distance            float64
competition_open_since_month    float64
competition_open_since_year     float64
promo2                            int64
promo2_since_week               float64
promo2_since_year               float64
promo_interval                   object
day_of_week                       int64
date                             object
sales                             int64
customers                         int64
open                              int64
promo                             int64
state_holiday                    object
school_holiday                    int64
dtype: object

## 1.4. Check  NA's

In [61]:
df1.isna().sum()

store                                0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
dtype: int64

## 1.5. Fill Out NA's

In [62]:
df1.sample()

Unnamed: 0,store,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday
668024,734,a,a,220.0,,,1,36.0,2013.0,"Mar,Jun,Sept,Dec",1,2014-08-11,5259,525,1,0,0,1


In [None]:
# competition_distance              2642
# competition_open_since_month    323348
# competition_open_since_year     323348
# promo2_since_week               508031
# promo2_since_year               508031
# promo_interval                  508031

## 1.6. Change Data Types

In [58]:
# Change Date types
df1['date'] = pd.to_datetime(df1['date'])
df1.dtypes

store                                    int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
dtype: object

## 1.7. Descriptive Statistics

# 2. PASSO 02 - FEATURE ENGINEERING

# 3. PASSO 03 - FILTRAGEM DE VARIÁVEIS

# 4. PASSO 04 - EXPLORATORY DATA ANALYSIS

# 5. PASSO 05  - DATA PREPARATION

# 6. PASSO 06 - FEATURE SELECTION

# 7. PASSO 07  - MACHINE LEARNING MODELING

# 8. PASSO 08 - HYPERPARAMETER FINE TUNING

# 9. PASSO 09 - ERROR TRANSLATION AND INTERPRETATION