# Tarefa 06 - Otimização dos Modelos de Aprendizagem de Máquina

Sub Tarefas
- Montar um código experimental que irá procurar pelos melhores hiperparâmetros dos modelos utilizando validação cruzada.
- Utilizar a biblioteca Optuna para fazer uma busca otimizada.
- Implementar uma classe para salvar e carregar os modelos.
- Salvar os melhores modelos treinados.

Definição de Pronto
- Ter um conjunto de hiperparâmetros escolhido para cada modelo.
- Salvar os modelos treinados.

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from optuna import distributions
from optuna.integration import OptunaSearchCV

import tqdm as notebook_tqdm

from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.metrics import f1_score,roc_auc_score,classification_report
from sklearn.base import BaseEstimator

from data_access_handler import DataAccessHandler
from feature_selector import FeatureSelector
from model import Model

# Presumably the number of cross-validation folds for the Optuna search;
# 5 is also StratifiedKFold's default n_splits.
OPTUNA_CV = 5
# Number of trials per model (passed as n_trials to OptunaSearchCV).
OPTUNA_N_TRIALS = 10
# Seed shared by the estimators and the StratifiedKFold splitters.
RANDOM_STATE = 42
# Path handed to DataAccessHandler and FeatureSelector.load_best_features.
DATA_PATH = "./data/"
# Path handed to Model.save / Model.load.
MODEL_PATH = "./models/"

def f1_score_micro(estimator: BaseEstimator, X: pd.DataFrame, y: pd.DataFrame):
    """Score an estimator's predictions with the micro-averaged F1 metric.

    The ``(estimator, X, y)`` signature allows the function to be passed
    directly as the ``scoring`` callable of ``OptunaSearchCV``.

    Args:
        estimator: Object exposing ``predict(X)``.
        X: Features handed to ``estimator.predict``.
        y: True labels the predictions are compared against. Callers in this
            notebook pass a flattened 1-D array of labels.

    Returns:
        The result of ``f1_score`` computed with ``average='micro'``.
    """
    predictions = estimator.predict(X)
    return f1_score(y, predictions, average='micro')

Nesta etapa, a normalização dos dados será feita junto ao treinamento dos modelos, através da classe Pipeline do scikit-learn.

A otimização é feita com a biblioteca Optuna.

In [17]:
# Load the "train" dataset through the project's data-access helper.
access_handler = DataAccessHandler(main_path=DATA_PATH)
df_train = access_handler.load(dataset_type="train")

# Load the stored best-feature selection from DATA_PATH.
selector = FeatureSelector()
selector.load_best_features(path=DATA_PATH)
features = selector.get_selected_features  # read as an attribute, not called

# X holds the selected columns; y is the target column flattened to 1-D.
target = 'fetal_health'
X = df_train[features]
y = df_train[target].values.ravel()

In [5]:
# Min-max scaling followed by logistic regression. Keeping both steps in one
# Pipeline means the scaler is fitted together with the model on each training fold.
model_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('lr', LogisticRegression(class_weight='balanced',
                              penalty='l2',
                              max_iter=1000,
                              random_state=RANDOM_STATE)),
])

# Search space: the C parameter of LogisticRegression, sampled on a log scale.
# NOTE(review): LogUniformDistribution is deprecated since Optuna 3.0 in favour
# of FloatDistribution(low, high, log=True); migrate when the installed Optuna
# version allows it.
param_distributions = {
    'lr__C': distributions.LogUniformDistribution(1e-4, 100),
}

# random_state seeds the search so repeated runs sample the same trials;
# without it only the CV splits and the estimator were seeded.
# n_splits is wired to OPTUNA_CV so the fold count is controlled by the
# notebook-level constant instead of StratifiedKFold's implicit default.
lr_best = OptunaSearchCV(
    model_pipeline,
    param_distributions,
    scoring=f1_score_micro,
    cv=StratifiedKFold(n_splits=OPTUNA_CV,
                       shuffle=True,
                       random_state=RANDOM_STATE),
    n_trials=OPTUNA_N_TRIALS,
    random_state=RANDOM_STATE,
    verbose=0,
)

lr_best.fit(X, y)

  lr_best = OptunaSearchCV(model_pipeline,
[32m[I 2022-07-19 01:42:32,573][0m A new study created in memory with name: no-name-e9966e9f-9b18-4116-bd44-92292411c405[0m
[32m[I 2022-07-19 01:42:32,602][0m Trial 0 finished with value: 0.7666666666666666 and parameters: {'lr__C': 0.0029149796154619833}. Best is trial 0 with value: 0.7666666666666666.[0m
[32m[I 2022-07-19 01:42:32,625][0m Trial 1 finished with value: 0.7506666666666667 and parameters: {'lr__C': 0.0006402202027913462}. Best is trial 0 with value: 0.7666666666666666.[0m
[32m[I 2022-07-19 01:42:32,685][0m Trial 2 finished with value: 0.8433333333333334 and parameters: {'lr__C': 1.1469130792205986}. Best is trial 2 with value: 0.8433333333333334.[0m
[32m[I 2022-07-19 01:42:32,717][0m Trial 3 finished with value: 0.826 and parameters: {'lr__C': 0.03803558544979493}. Best is trial 2 with value: 0.8433333333333334.[0m
[32m[I 2022-07-19 01:42:32,764][0m Trial 4 finished with value: 0.8353333333333334 and parameters:

A classe Model está implementada no arquivo model.py

In [14]:
# Wrap the best estimator found by the search in the project's Model class and
# display its micro-F1 on X, y (the same training data the search was fitted on).
lr_model = Model(model=lr_best.best_estimator_)
f1_score_micro(estimator=lr_model, X=X, y=y)

0.85

In [19]:
# Save the model under the name "logistic_regression", then drop both objects
# so the following cell has to rebuild the model through Model.load.
lr_model.save(path=MODEL_PATH, model_name="logistic_regression")
del lr_best, lr_model

In [20]:
# Start from an empty wrapper, load the saved "logistic_regression" model and
# display its micro-F1 on X, y again to compare with the pre-save value.
lr_model = Model(model=None)
lr_model.load(path=MODEL_PATH, model_name="logistic_regression")
f1_score_micro(estimator=lr_model, X=X, y=y)

0.85

In [27]:
# Min-max scaling followed by a class-weighted random forest.
model_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(class_weight='balanced',
                                  random_state=RANDOM_STATE)),
])

# Search space over the forest size and the tree-growth limits.
# NOTE(review): IntUniformDistribution is deprecated since Optuna 3.0 in favour
# of IntDistribution(low, high); migrate when the installed Optuna version
# allows it.
param_distributions = {
    'rf__n_estimators': distributions.IntUniformDistribution(10, 400),
    'rf__max_depth': distributions.IntUniformDistribution(3, 100),
    'rf__min_samples_split': distributions.IntUniformDistribution(2, 20),
    'rf__min_samples_leaf': distributions.IntUniformDistribution(2, 40),
}

# random_state seeds the search so repeated runs sample the same trials;
# without it only the CV splits and the estimator were seeded.
# n_splits is wired to OPTUNA_CV so the fold count is controlled by the
# notebook-level constant instead of StratifiedKFold's implicit default.
rf_best = OptunaSearchCV(
    model_pipeline,
    param_distributions,
    scoring=f1_score_micro,
    cv=StratifiedKFold(n_splits=OPTUNA_CV,
                       shuffle=True,
                       random_state=RANDOM_STATE),
    n_trials=OPTUNA_N_TRIALS,
    random_state=RANDOM_STATE,
    verbose=0,
)

rf_best.fit(X, y)

  rf_best = OptunaSearchCV(model_pipeline,
[32m[I 2022-07-19 01:49:38,110][0m A new study created in memory with name: no-name-33e5cdaf-08ff-41d6-b933-a97b1838774e[0m
[32m[I 2022-07-19 01:49:38,559][0m Trial 0 finished with value: 0.8493333333333333 and parameters: {'rf__n_estimators': 106, 'rf__max_depth': 88, 'rf__min_samples_split': 18, 'rf__min_samples_leaf': 36}. Best is trial 0 with value: 0.8493333333333333.[0m
[32m[I 2022-07-19 01:49:39,498][0m Trial 1 finished with value: 0.874 and parameters: {'rf__n_estimators': 218, 'rf__max_depth': 84, 'rf__min_samples_split': 13, 'rf__min_samples_leaf': 21}. Best is trial 1 with value: 0.874.[0m
[32m[I 2022-07-19 01:49:40,023][0m Trial 2 finished with value: 0.874 and parameters: {'rf__n_estimators': 120, 'rf__max_depth': 87, 'rf__min_samples_split': 19, 'rf__min_samples_leaf': 21}. Best is trial 1 with value: 0.874.[0m
[32m[I 2022-07-19 01:49:41,578][0m Trial 3 finished with value: 0.8513333333333334 and parameters: {'rf__n

In [28]:
# Wrap the best estimator found by the search in the project's Model class and
# display its micro-F1 on X, y (the same training data the search was fitted on).
rf_model = Model(model=rf_best.best_estimator_)
f1_score_micro(estimator=rf_model, X=X, y=y)

0.944

In [29]:
# Save the model under the name "random_forest", then drop both objects so the
# following cell has to rebuild the model through Model.load.
rf_model.save(path=MODEL_PATH, model_name="random_forest")
del rf_best, rf_model

In [30]:
# Start from an empty wrapper, load the saved "random_forest" model and
# display its micro-F1 on X, y again to compare with the pre-save value.
rf_model = Model(model=None)
rf_model.load(path=MODEL_PATH, model_name="random_forest")
f1_score_micro(estimator=rf_model, X=X, y=y)

0.944

In [33]:
# Min-max scaling followed by a class-weighted LightGBM classifier.
model_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('lgbm', LGBMClassifier(class_weight='balanced',
                            random_state=RANDOM_STATE)),
])

# Search space; learning_rate and subsample_for_bin are sampled on a log scale.
# NOTE(review): these distribution classes are deprecated since Optuna 3.0 in
# favour of FloatDistribution(..., log=True) and IntDistribution(..., log=...);
# migrate when the installed Optuna version allows it.
param_distributions = {
    'lgbm__n_estimators': distributions.IntUniformDistribution(10, 400),
    'lgbm__max_depth': distributions.IntUniformDistribution(2, 100),
    'lgbm__learning_rate': distributions.LogUniformDistribution(5e-2, 0.5),
    'lgbm__num_leaves': distributions.IntUniformDistribution(2, 50),
    'lgbm__subsample_for_bin': distributions.IntLogUniformDistribution(10, 200000),
}

# random_state seeds the search so repeated runs sample the same trials;
# without it only the CV splits and the estimator were seeded.
# n_splits is wired to OPTUNA_CV so the fold count is controlled by the
# notebook-level constant instead of StratifiedKFold's implicit default.
lgbm_best = OptunaSearchCV(
    model_pipeline,
    param_distributions,
    scoring=f1_score_micro,
    cv=StratifiedKFold(n_splits=OPTUNA_CV,
                       shuffle=True,
                       random_state=RANDOM_STATE),
    n_trials=OPTUNA_N_TRIALS,
    random_state=RANDOM_STATE,
    verbose=0,
)

lgbm_best.fit(X, y)

  lgbm_best = OptunaSearchCV(model_pipeline,
[32m[I 2022-07-19 01:55:28,155][0m A new study created in memory with name: no-name-5850deb0-09bb-4700-bde7-60b437b98a4b[0m
[32m[I 2022-07-19 01:55:37,937][0m Trial 0 finished with value: 0.9393333333333332 and parameters: {'lgbm__n_estimators': 202, 'lgbm__max_depth': 95, 'lgbm__learning_rate': 0.3062803413099891, 'lgbm__num_leaves': 7, 'lgbm__subsample_for_bin': 4119}. Best is trial 0 with value: 0.9393333333333332.[0m
[32m[I 2022-07-19 01:56:10,764][0m Trial 1 finished with value: 0.9413333333333334 and parameters: {'lgbm__n_estimators': 247, 'lgbm__max_depth': 38, 'lgbm__learning_rate': 0.10017270744931389, 'lgbm__num_leaves': 32, 'lgbm__subsample_for_bin': 29799}. Best is trial 1 with value: 0.9413333333333334.[0m
[32m[I 2022-07-19 01:56:21,422][0m Trial 2 finished with value: 0.9453333333333334 and parameters: {'lgbm__n_estimators': 212, 'lgbm__max_depth': 48, 'lgbm__learning_rate': 0.3950345043343973, 'lgbm__num_leaves': 16

In [34]:
# Wrap the best estimator found by the search in the project's Model class and
# display its micro-F1 on X, y (the same training data the search was fitted on).
lgbm_model = Model(model=lgbm_best.best_estimator_)
f1_score_micro(estimator=lgbm_model, X=X, y=y)

0.9993333333333333

In [35]:
# Save the model under the name "light_gbm", then drop both objects so the
# following cell has to rebuild the model through Model.load.
lgbm_model.save(path=MODEL_PATH, model_name="light_gbm")
del lgbm_best, lgbm_model

In [36]:
# Start from an empty wrapper, load the saved "light_gbm" model and display
# its micro-F1 on X, y again to compare with the pre-save value.
lgbm_model = Model(model=None)
lgbm_model.load(path=MODEL_PATH, model_name="light_gbm")
f1_score_micro(estimator=lgbm_model, X=X, y=y)

0.9993333333333333

# Conclusões

- Todos os modelos otimizados obtiveram ótima performance através da avaliação com a validação cruzada.
- O modelo com maior performance de classificação é o LightGBM.

# Tarefa 07 - Escolha do Melhor Modelo

Sub Tarefas:
- Carregar os modelos salvos.
- Comparar os modelos no conjunto final de teste.
- Escolher o modelo vencedor (com maior performance no teste) que irá para produção.

Definição de Pronto:
- Ter escolhido um modelo vencedor.
- Salvar o modelo vencedor para ir para produção.

In [13]:
# Load the three models stored under MODEL_PATH, one wrapper per model name.
best_lr = Model()
best_lr.load(path=MODEL_PATH, model_name="logistic_regression")

best_rf = Model()
best_rf.load(path=MODEL_PATH, model_name="random_forest")

best_lgbm = Model()
best_lgbm.load(path=MODEL_PATH, model_name="light_gbm")

In [14]:
# Load the "test" dataset through the project's data-access helper.
access_handler = DataAccessHandler(main_path=DATA_PATH)
df_test = access_handler.load(dataset_type="test")

# Load the stored best-feature selection from DATA_PATH.
selector = FeatureSelector()
selector.load_best_features(path=DATA_PATH)
features = selector.get_selected_features  # read as an attribute, not called

# X and y are rebound here: from this point on they refer to the test data.
target = 'fetal_health'
X = df_test[features]
y = df_test[target].values.ravel()

In [15]:
# Micro-F1 of each loaded model on the test data currently bound to X, y.
score_lr = f1_score_micro(best_lr, X, y)
score_rf = f1_score_micro(best_rf, X, y)
score_lgbm = f1_score_micro(best_lgbm, X, y)

# Report the three scores with four decimal places.
print("LogisticRegression Test Score: %.4f" % score_lr)
print("RandomForest Test Score: %.4f" % score_rf)
print("LightGBM Test Score: %.4f" % score_lgbm)

LogisticRegression Test Score: 0.8003
RandomForest Test Score: 0.9105
LightGBM Test Score: 0.9521
