# Análise comparativa de modelos

In [1]:
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Obtenção de dados
Nessa etapa obtemos novamente os arquivos brutos de dados e o dicionário de dados antes de iniciar o pré-processamento.

In [2]:
# Locations of the raw measurements and of the data dictionary that describes
# each variable (name, description, type and subtype).
raw_data_path = "../data/raw/data.csv"
dictionary_path = "../data/external/dictionary.csv"

df = pd.read_csv(raw_data_path)
df_dict = pd.read_csv(dictionary_path)

# Show the dictionary as the cell output.
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,species,Nome da espécie,qualitativa,nominal
1,island,Nome da ilha onde foi feita a medição,qualitativa,nominal
2,bill_length_mm,Comprimento de bico,quantitativa,continua
3,bill_depth_mm,Profundidade (altura) de bico,quantitativa,continua
4,flipper_length_mm,Comprimento de nadadeira,quantitativa,continua
5,body_mass_g,Massa corporal (peso),quantitativa,continua
6,sex,Sexo dos indivíduos,qualitativa,nominal


## 2. Preparação de dados
Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.

In [3]:
# Column the classifiers must predict.
target_column = 'species'

# Row masks over the data dictionary, one per variable subtype.
is_nominal = df_dict['subtipo'] == 'nominal'
is_continuous = df_dict['subtipo'] == 'continua'
is_feature = df_dict['variavel'] != target_column

# Feature names grouped by subtype; the target is left out of the nominal group.
nominal_columns = df_dict.loc[is_nominal & is_feature, 'variavel'].to_list()
continuous_columns = df_dict.loc[is_continuous, 'variavel'].to_list()

# Feature matrix (everything except the target) and target vector.
X = df.drop(columns=target_column)
y = df[target_column]

In [4]:
# tratamento de dados discrepantes
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), # codificação de variáveis
    ('normalization', StandardScaler()) # normalização de dados
])
continuous_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')), # tratamento de dados faltantes
    ('normalization', StandardScaler()) # normalização de dados
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns)
])

model = LogisticRegression()

## 3. Seleção de modelos

Iremos analisar quatro modelos, que serão comparados por meio de validação cruzada, a saber:

 - K-Nearest-Neighbors
 - Support Vector Machine
 - Decision Tree
 - Random Forest
  
Além disso, cada um desses algoritmos será testado com diferentes hiperparâmetros, para que possamos encontrar o melhor modelo e a melhor configuração possível para esse modelo.

Utilizaremos as seguintes métricas para análise:
 - **Acurácia (accuracy)**: proporção entre os dados que foram corretamente previstos (como positivos ou negativos) com o total de dados observados;
 - **Precisão (precision)**: proporção entre os dados corretamente previstos como positivos e o total de dados previstos como positivos;
 - **Recall**: proporção entre os dados corretamente previstos como positivos e o total de dados que são realmente positivos;
 - **F1-score**: média harmônica entre precision e recall, portanto levando em conta tanto falsos positivos quanto falsos negativos.

In [5]:
# --- Experiment settings -----------------------------------------------------
n_splits_comparative_analysis = 10  # outer ShuffleSplit repetitions
n_folds_grid_search = 5             # inner K-fold used by the grid search
test_size = 0.2                     # share of rows held out in each outer split
random_state = 42                   # seed shared by splitters and estimators
scoring = 'accuracy'                # metric optimized by the grid search
metrics = [                         # metrics reported in the comparison
    'accuracy',
    'precision_macro',
    'recall_macro',
    'f1_macro',
]

# --- Model settings ----------------------------------------------------------
max_iter = 1000  # iteration cap passed to the SVC solver

# Each entry is (display name, estimator, hyper-parameter grid).
models = [
    (
        'K-Nearest Neighbors',
        KNeighborsClassifier(),
        {
            "n_neighbors": range(3, 20, 2),
            'weights': ['uniform', 'distance'],
        },
    ),
    (
        'Suport Vector Machines',
        SVC(random_state=random_state, max_iter=max_iter),
        {
            "kernel": ["linear", "rbf"],
            'C': [1, 10, 100, 1000],
            'gamma': [0.0001, 0.001, 0.1, 1],
        },
    ),
    (
        'Decision Tree',
        DecisionTreeClassifier(random_state=random_state),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 6, 8],
        },
    ),
    (
        'Random Forest',
        RandomForestClassifier(random_state=random_state),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 6, 8],
            'n_estimators': [10, 30],
        },
    ),
]

In [6]:
# Inner splitter: used by GridSearchCV to choose hyper-parameters.
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
# Outer splitter: repeated random train/test splits used to compare the models.
cross_validate_comparative_analysis = ShuffleSplit(
    n_splits=n_splits_comparative_analysis,
    test_size=test_size,
    random_state=random_state,
)

# One score table per model; concatenated once after the loop instead of
# growing `results` with pd.concat on every iteration.
score_frames = []
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )
    # Preprocessing followed by the hyper-parameter search, evaluated as a
    # single estimator on every outer split.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=-1,
        scoring=metrics
    )
    model_scores = pd.DataFrame(scores)
    # Summarize while the table is still purely numeric: aggregating after the
    # string `model_name` column is added fails on pandas versions that no
    # longer drop non-numeric columns silently.
    display(model_scores.agg(['mean', 'std']))
    score_frames.append(model_scores.assign(model_name=model_name))

# Same content and column order as before: one row per (model, outer split).
results = (
    pd.concat(score_frames, ignore_index=True)
    if score_frames
    else pd.DataFrame({})
)

running K-Nearest Neighbors...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,1.295083,0.050584,0.994203,0.994947,0.994932,0.994838
std,0.147834,0.033302,0.007484,0.006914,0.00658,0.00673


running Suport Vector Machines...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,1.46562,0.042467,0.991304,0.993603,0.989849,0.991511
std,0.330958,0.027095,0.010133,0.007229,0.013403,0.010489


running Decision Tree...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,0.346543,0.044282,0.963768,0.961011,0.966074,0.96256
std,0.100382,0.03513,0.018396,0.022401,0.018638,0.020508


running Random Forest...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,5.619672,0.034374,0.988406,0.989317,0.987696,0.988348
std,1.080261,0.010721,0.013318,0.014399,0.015712,0.015035


In [7]:
def highlight_best(s, props=''):
    """Return per-cell CSS that highlights the best value of one summary row.

    Intended for ``Styler.apply(..., axis=1)`` on a table whose row labels are
    ``(metric, statistic)`` tuples.

    Parameters
    ----------
    s : pandas.Series
        One row of the table; ``s.name`` is the ``(metric, statistic)`` label.
    props : str
        CSS declarations applied to the best cell(s).

    Returns
    -------
    numpy.ndarray
        One string per cell: ``props`` for the best value(s), ``''`` elsewhere.
        Rows whose statistic is ``'std'`` are never highlighted. For metrics
        whose name ends with ``'time'`` the smallest value is the best one;
        for every other metric the largest value is.
    """
    if s.name[1] == 'std':
        # Return an explicit "no style" array rather than falling through and
        # returning None, so every row yields the same kind of result.
        return np.full(len(s), '')
    if s.name[0].endswith('time'):
        return np.where(s == np.nanmin(s.values), props, '')
    return np.where(s == np.nanmax(s.values), props, '')

display(Markdown("### 3.1 Resultados gerais"))

# Mean and standard deviation of every score per model, transposed so that
# models become columns and (metric, statistic) pairs become rows.
summary_by_model = results.groupby('model_name').agg(['mean', 'std']).T

# Highlight the best model in each row and center the cell text.
best_cell_css = 'color:white;background-color:gray;font-weight: bold;'
centered_cells = [{'selector': 'td', 'props': 'text-align: center;'}]
styled_summary = (
    summary_by_model.style
    .apply(highlight_best, props=best_cell_css, axis=1)
    .set_table_styles(centered_cells)
)
styled_summary

### 3.1 Resultados gerais

Unnamed: 0,model_name,Decision Tree,K-Nearest Neighbors,Random Forest,Suport Vector Machines
fit_time,mean,0.346543,1.295083,5.619672,1.46562
fit_time,std,0.100382,0.147834,1.080261,0.330958
score_time,mean,0.044282,0.050584,0.034374,0.042467
score_time,std,0.03513,0.033302,0.010721,0.027095
test_accuracy,mean,0.963768,0.994203,0.988406,0.991304
test_accuracy,std,0.018396,0.007484,0.013318,0.010133
test_precision_macro,mean,0.961011,0.994947,0.989317,0.993603
test_precision_macro,std,0.022401,0.006914,0.014399,0.007229
test_recall_macro,mean,0.966074,0.994932,0.987696,0.989849
test_recall_macro,std,0.018638,0.00658,0.015712,0.013403


Como pode ser visto, o classificador K-Nearest Neighbors obteve os melhores resultados em todas as métricas de classificação; portanto, podemos obter os melhores parâmetros deste modelo e salvá-lo em disco para utilização em uma próxima etapa.

### 3.2 Persistência do modelo



In [8]:
# Look up the estimator and hyper-parameter grid of the winning model.
winner_name = "K-Nearest Neighbors"
model_name, model_object, model_parameters = next(
    entry for entry in models if entry[0] == winner_name
)

# During the comparative analysis the grid search only ever received subsets
# drawn by ShuffleSplit. Here it receives the full dataset in file order, so
# the folds are shuffled (with the experiment seed) to keep the final
# hyper-parameter selection independent of how the rows are ordered on disk.
final_grid_search_cv = KFold(
    n_splits=n_folds_grid_search,
    shuffle=True,
    random_state=random_state,
)

model_grid_search = GridSearchCV(
    estimator=model_object,
    param_grid=model_parameters,
    scoring=scoring,
    n_jobs=-1,
    cv=final_grid_search_cv
)

# Same two-step approach that was evaluated: preprocessing + grid search.
approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model_grid_search)
])

# Fit on all available data; the grid search refits the best configuration.
approach.fit(X, y)

# Access the search step by name instead of by position in the pipeline.
print(f"Hiper parâmetros do modelo: {approach.named_steps['model'].best_params_}")

Hiper parâmetros do modelo: {'n_neighbors': 5, 'weights': 'uniform'}


In [9]:
# Persist the fitted pipeline (preprocessing + tuned model) to disk.
joblib.dump(value=approach, filename='../models/model.joblib')

['../models/model.joblib']