## Importações

In [1]:
# Utils
import pandas as pd
from copy import deepcopy
from sklearn.preprocessing import Normalizer, StandardScaler
import warnings
from pathlib import Path

# Avaliação, Pontuação e Divisão
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score

# Salvando e carregando o modelo
import pickle

# Visualização de Dados
import matplotlib.pyplot as plt
import seaborn as sns

# Modelos
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

##
warnings.filterwarnings("ignore")

## Configurações

In [2]:
# Model settings
seed = 5          # random_state for the K-fold splitter and the bagging ensembles
num_folds = 4     # number of cross-validation splits
num_trees = 20    # number of estimators in each BaggingClassifier
test_size = 0.2   # not referenced by the cells shown here

# Paths
save_model_path = Path('../models')     # directory the pickled model is written to
data_path = Path('../data/02_outputs')  # directory the modelling CSV is read from

## Modelagem

In [3]:
# Load the dataset used for modelling
df = pd.read_csv(data_path / '02_to_model_v1.csv')

# Split the array into inputs (every column but the last) and output (last column)
array = df.values
X = array[:,:-1]
Y = array[:,-1]

# Standardise the features.
# NOTE(review): the scaler is fitted on the full dataset before cross-validation,
# so each validation fold contributes to the statistics used to scale it.
# Consider fitting the scaler inside each fold (e.g. with a Pipeline); kept as-is
# here because later cells consume the scaled `X`.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Candidate models. Estimators that accept `random_state` are seeded so that
# re-running the notebook reproduces the same scores.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('DTC', DecisionTreeClassifier(random_state = seed)),
    ('SVM', SVC()),
    ('RFC', RandomForestClassifier(random_state = seed)),
]

# Per-model cross-validation scores and the matching model names
results = []
names = []

# The splitter does not depend on the model, so build it once; with a fixed
# integer seed every model is scored on the same folds.
kfold = KFold(n_splits = num_folds, random_state = seed, shuffle=True)

# Evaluate each model in turn and report mean accuracy (std) across folds
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv = kfold, scoring = 'accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %4f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LDA: 0.662927 (0.039841)
NB: 0.640675 (0.026346)
KNN: 0.618437 (0.034076)
DTC: 0.543943 (0.011287)
SVM: 0.671820 (0.035562)
RFC: 0.637356 (0.031721)


O Decision Tree Classifier foi o modelo menos performático, logo será excluído da próxima etapa.

## Otimização

In [4]:
# Seeded K-fold splitter shared by every ensemble evaluated below
kfold = KFold(n_splits=num_folds, shuffle=True, random_state = seed)


# Best models from the previous evaluation step
best_models = [('LDA', LinearDiscriminantAnalysis()), 
               ('NB', GaussianNB()),
               ('KNN', KNeighborsClassifier()),
               ('SVM', SVC()),
               ('RFC', RandomForestClassifier())]

# Maps each model name to the bagging ensemble built around it
results = dict()

for name, model_ in best_models:
    
    # The wrapped estimator is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, while the first positional slot is the same in both.
    model = BaggingClassifier(model_, n_estimators = num_trees, random_state = seed)
    results[name] = model
    # cross_val_score fits clones of `model`; the instance stored in `results`
    # is left unfitted.
    result = cross_val_score(model, X, Y, cv = kfold)
    
    print(name, "- Accuracy: %.2f%% (%.4f)" % (result.mean() * 100,  result.std()))

LDA - Accuracy: 66.63% (0.0381)
NB - Accuracy: 64.07% (0.0326)
KNN - Accuracy: 62.18% (0.0356)
SVM - Accuracy: 67.40% (0.0327)
RFC - Accuracy: 65.74% (0.0300)


O modelo selecionado será o Support Vector Machine (SVM) com Bagging, que obteve a maior acurácia média na validação cruzada.

In [5]:
selected_model = results['SVM']

# cross_val_score only fits clones of the estimator, so the instance held in
# `results` has never been trained. Fit it on the full dataset before saving,
# otherwise the pickle contains a model that cannot predict.
selected_model.fit(X, Y)

# NOTE(review): `X` was standardised with `scaler` before training, and the
# scaler is not persisted here. Inputs must be transformed the same way at
# prediction time. TODO: save the scaler alongside the model.

# Make sure the output directory exists before writing
save_model_path.mkdir(parents = True, exist_ok = True)

# Saving model
with open(save_model_path / 'model_v1_SVM.pkl','wb') as f:
    pickle.dump(selected_model,f)