# Análise comparativa de dados

In [14]:
from IPython.display import Image, display, Markdown
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import joblib

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

## 1. Leitura do conjunto de dados

In [5]:
# Load the data dictionary (one row per variable: description, type, subtype)
# and the raw dataset. Paths are relative to the notebook's directory.
dicionario = pd.read_csv('../data/external/dictionary.csv')
df = pd.read_csv('../data/raw/data.csv')

# Display the dictionary as the cell output.
dicionario

Unnamed: 0,variavel,descricao,tipo,subtipo
0,total_bill,Valor da conta em dólares,quantitativa,continua
1,tip,Valor da gorjeta em dólares,quantitativa,continua
2,sex,Sexo dos clientes,qualitativa,nominal
3,smoker,Se os clientes eram fumantes ou não,qualitativa,nominal
4,day,Dia da semana,qualitativa,ordinal
5,time,Período do dia,qualitativa,ordinal
6,size,Quantidade de pessoas na mesa,quantitativa,discreta


## 2. Limpeza de dados:

Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.

### 2.1. Tratamento de dados faltantes:

In [6]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

A partir da informação dada pela saída do código acima, é possível perceber que não existem dados faltantes na tabela.

### 2.2. Tratamento de dados discrepantes:

In [7]:
# Group the dataset columns by role/type using the data dictionary, then
# split the data into features (X) and target (y).

target_column = 'tip'

# Nominal (unordered categorical) variables.
nominal_columns = dicionario.loc[
    dicionario['subtipo'] == 'nominal', 'variavel'
].to_list()

# Continuous variables, leaving the target out of the feature list.
continuous_columns = dicionario.loc[
    (dicionario['subtipo'] == 'continua') & (dicionario['variavel'] != target_column),
    'variavel',
].to_list()

# 'day' and 'time' are each selected by name into their own group.
ordinal_day_columns = dicionario.loc[
    dicionario['variavel'].isin(['day']), 'variavel'
].to_list()
ordinal_time_columns = dicionario.loc[
    dicionario['variavel'].isin(['time']), 'variavel'
].to_list()

# Discrete numeric variables.
discrete_columns = dicionario.loc[
    dicionario['subtipo'] == 'discreta', 'variavel'
].to_list()

# Features are every column except the target.
X = df.drop(columns=[target_column])
y = df[target_column]
nominal_columns, continuous_columns, ordinal_day_columns, ordinal_time_columns, discrete_columns

(['sex', 'smoker'], ['total_bill'], ['day'], ['time'], ['size'])

In [8]:
# Independent variables (features): preview the first rows of X
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [9]:
# Dimensions of the feature frame as (rows, columns)
X.shape

(244, 6)

In [12]:
# Dependent variable (target)
y

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [15]:
def _categorical_pipeline():
    """Return a new, unfitted pipeline for a categorical column group.

    Steps: impute with the most frequent value, one-hot encode (dropping the
    first level), then standardize the resulting indicator columns.
    """
    return Pipeline([
        ('missing', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(sparse_output=False, drop='first')),
        ('normalization', StandardScaler()),
    ])


def _numeric_pipeline():
    """Return a new, unfitted pipeline for a numeric column group.

    Steps: impute with the column mean, then standardize.
    """
    return Pipeline([
        ('missing', SimpleImputer(strategy='mean')),
        ('normalization', StandardScaler()),
    ])


# Each column group gets its own independent pipeline instance.
nominal_preprocessor = _categorical_pipeline()
continuous_preprocessor = _numeric_pipeline()
ordinal_day_preprocessor = _categorical_pipeline()
ordinal_time_preprocessor = _categorical_pipeline()
discrete_preprocessor = _numeric_pipeline()

# Route every column group to its pipeline; the output columns follow this order.
preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns),
    ('ordinal_day', ordinal_day_preprocessor, ordinal_day_columns),
    ('ordinal_time', ordinal_time_preprocessor, ordinal_time_columns),
    ('discrete', discrete_preprocessor, discrete_columns),
])

# NOTE(review): `model` is not referenced by any other visible cell, and
# LogisticRegression is a classifier while the target `tip` is continuous.
# Confirm whether this assignment can be removed.
model = LogisticRegression()

In [30]:
# Fit the preprocessor on X and transform it in a single pass.
# The previous version called fit_transform, discarded its result, and then
# transformed X a second time; fit_transform already returns the transformed data.
X_transform = preprocessor.fit_transform(X)
X_transform

array([[-1.34335316, -0.78478917, -0.31471131, ..., -0.5836603 ,
        -0.62158156, -0.60019263],
       [ 0.74440589, -0.78478917, -1.06323531, ..., -0.5836603 ,
        -0.62158156,  0.45338292],
       [ 0.74440589, -0.78478917,  0.1377799 , ..., -0.5836603 ,
        -0.62158156,  0.45338292],
       ...,
       [ 0.74440589,  1.27422758,  0.3246295 , ..., -0.5836603 ,
        -0.62158156, -0.60019263],
       [ 0.74440589, -0.78478917, -0.2212865 , ..., -0.5836603 ,
        -0.62158156, -0.60019263],
       [-1.34335316, -0.78478917, -0.1132289 , ...,  1.71332538,
        -0.62158156, -0.60019263]])

## 3. Seleção de modelos

Iremos analisar três modelos, que serão comparados por meio de validação cruzada, a saber:

- K-Nearest Neighbors
- Decision Tree
- Random Forest


In [31]:
# --- Experiment settings ---------------------------------------------------
random_state = 0                     # seed shared by the splitters and tree-based models
test_size = .3                       # hold-out fraction for each comparative split
n_folds_grid_search = 5              # folds for the hyper-parameter search
n_splits_comparative_analysis = 10   # repeated random splits used to compare models

# Scorers reported by cross_validate. These are plain (positive) errors,
# so for MAE and MSE lower values are better; for R2 higher is better.
metrics = dict(
    MAE=make_scorer(mean_absolute_error),
    MSE=make_scorer(mean_squared_error),
    R2=make_scorer(r2_score),
)

# --- Model settings --------------------------------------------------------
max_iter = 1000000  # NOTE(review): not referenced by any visible cell — confirm it is still needed

# Each entry is (display name, unfitted estimator, hyper-parameter grid).
models = [
    (
        'K-Nearest Neighbors',
        KNeighborsRegressor(),
        {
            "n_neighbors": range(3, 20, 2),
            'weights': ['uniform', 'distance'],
        },
    ),
    (
        'Decision Tree',
        DecisionTreeRegressor(random_state=random_state),
        {
            'criterion': ['squared_error', 'friedman_mse'],
            'max_depth': [3, 25, 40],
        },
    ),
    (
        'Random Forest',
        RandomForestRegressor(random_state=random_state),
        {
            'criterion': ['squared_error', 'friedman_mse'],
            'max_depth': [3, 25, 40],
            'n_estimators': [10, 50],
        },
    ),
]

In [33]:
# Compare the candidate models with nested validation:
#   - inner loop: GridSearchCV picks hyper-parameters by negative MAE (KFold);
#   - outer loop: cross_validate scores the tuned pipeline on repeated random splits.
# The preprocessor lives inside the pipeline, so it is re-fitted on each training split.
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(
    n_splits=n_splits_comparative_analysis,
    test_size=test_size,
    random_state=random_state,
)

# Collect one score frame per model and concatenate once at the end,
# instead of growing `results` with pd.concat on every iteration.
score_frames = []
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring='neg_mean_absolute_error',
        n_jobs=2,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=2,
        scoring=metrics,
        return_train_score=False
    )
    df_scores = pd.DataFrame(scores)
    # Per-model summary: mean and standard deviation across the outer splits.
    display(df_scores.agg(['mean', 'std']))
    # Tag each split's row with the model name (appended as the last column).
    score_frames.append(df_scores.assign(model_name=model_name))

# One row per (model, split); falls back to an empty frame when no model was run.
results = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame({})

running K-Nearest Neighbors...


Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_R2
mean,0.329966,0.010307,0.859733,1.425904,0.308659
std,0.007102,0.000586,0.066036,0.335827,0.104049


running Decision Tree...


Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_R2
mean,0.129111,0.009245,0.799767,1.27565,0.38453
std,0.00416,0.000242,0.059563,0.386195,0.107518


running Random Forest...


Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_R2
mean,2.635647,0.011964,0.790863,1.203103,0.406043
std,0.063757,0.001222,0.058671,0.233691,0.134559


In [34]:
def highlight_best(s, props=''):
    """Return per-cell CSS marking the best model in one row of the summary table.

    Parameters
    ----------
    s : pandas.Series
        One row of the transposed summary; ``s.name`` is a
        ``(metric, statistic)`` tuple such as ``('test_MAE', 'mean')``.
    props : str
        CSS declarations applied to the best cell(s).

    Returns
    -------
    numpy.ndarray
        Array with the same length as ``s`` holding ``props`` for the best
        value(s) and ``''`` elsewhere. Rows whose statistic is ``'std'`` are
        never highlighted.

    Notes
    -----
    Timings and error metrics (names ending in ``time``, ``MAE`` or ``MSE``)
    are best when smallest; every other metric (e.g. R2) is best when largest.
    """
    metric, statistic = s.name[0], s.name[1]
    if statistic == 'std':
        # Explicit "no style" result instead of implicitly returning None.
        return np.full(s.shape, '')
    lower_is_better = metric.endswith(('time', 'MAE', 'MSE'))
    best = np.nanmin(s.values) if lower_is_better else np.nanmax(s.values)
    return np.where(s == best, props, '')

display(Markdown("### 3.1 Resultados gerais"))

# Mean and standard deviation of every score per model, transposed so that
# rows are (metric, statistic) pairs and columns are the models.
summary_by_model = results.groupby('model_name').agg(['mean', 'std']).T

# Style row by row (axis=1) so highlight_best compares the models within each metric.
(
    summary_by_model
    .style
    .apply(
        highlight_best,
        props='color:white;background-color:gray;font-weight: bold;',
        axis=1,
    )
    .set_table_styles([{'selector': 'td', 'props': 'text-align: center;'}])
)

### 3.1 Resultados gerais

Unnamed: 0,model_name,Decision Tree,K-Nearest Neighbors,Random Forest
fit_time,mean,0.129111,0.329966,2.635647
fit_time,std,0.00416,0.007102,0.063757
score_time,mean,0.009245,0.010307,0.011964
score_time,std,0.000242,0.000586,0.001222
test_MAE,mean,0.799767,0.859733,0.790863
test_MAE,std,0.059563,0.066036,0.058671
test_MSE,mean,1.27565,1.425904,1.203103
test_MSE,std,0.386195,0.335827,0.233691
test_R2,mean,0.38453,0.308659,0.406043
test_R2,std,0.107518,0.104049,0.134559


In [35]:
# Fetch the estimator and parameter grid of the selected model.
# NOTE(review): the selection is hard-coded to "Decision Tree", while the
# summary table reports a lower mean test_MAE for Random Forest — confirm
# that choosing the Decision Tree is intentional.
selected_entries = [entry for entry in models if entry[0] == "Decision Tree"]
model_name, model_object, model_parameters = selected_entries[0]

# Same hyper-parameter search used in the comparison, now run in-process.
model_grid_search = GridSearchCV(
    estimator=model_object,
    param_grid=model_parameters,
    scoring='neg_mean_absolute_error',
    n_jobs=None,
    cv=cross_validate_grid_search,
)

approach = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model_grid_search),
])

# Fit preprocessing and the grid search on the full dataset.
approach.fit(X, y)

# The fitted GridSearchCV is the pipeline step named "model".
fitted_search = approach.named_steps["model"]
print(f"Hiper parâmetros do modelo: {fitted_search.best_params_}")

Hiper parâmetros do modelo: {'criterion': 'squared_error', 'max_depth': 3}


In [22]:
# Persist the fitted pipeline (preprocessing + tuned model) to disk.
joblib.dump(value=approach, filename='../models/model.joblib')

['../models/model.joblib']