# Pacotes

In [1]:
# Bibliotecas padrão e manipulação de dados
import os
import pickle
import warnings
from datetime import datetime, date

import numpy as np
import pandas as pd
from pytz import timezone
from unidecode import unidecode

# Configurações e filtros
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

# Visualização de dados
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno

plt.style.use('ggplot')

# Machine Learning - Modelos e Pré-processamento
import shap
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split, cross_val_score, RepeatedStratifiedKFold, KFold, StratifiedKFold, GridSearchCV
)
from sklearn.metrics import (
    accuracy_score, average_precision_score, classification_report, confusion_matrix, f1_score,
    log_loss, precision_recall_curve, precision_score, recall_score, roc_auc_score, roc_curve, auc,
    balanced_accuracy_score, brier_score_loss, cohen_kappa_score, matthews_corrcoef
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import (
    VarianceThreshold, RFE, SelectFromModel, SequentialFeatureSelector, mutual_info_classif, mutual_info_regression
)
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.neighbors import KDTree
from sklearn.tree import DecisionTreeClassifier

# Estatística e testes de hipótese
from scipy.stats import (
    chi2_contingency, kruskal, ks_2samp, fisher_exact, mannwhitneyu, power_divergence
)
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Modelos avançados e otimização
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope
from skopt import forest_minimize

# Avaliação de modelos e explanação
from shap import Explainer

# Salvamento e carregamento de modelos com MLflow
import mlflow
from mlflow.models import infer_signature
import mlflow.lightgbm
import mlflow.catboost


import mlflow
import mlflow.xgboost
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score,
    f1_score, roc_auc_score, balanced_accuracy_score, average_precision_score,
    log_loss, brier_score_loss, cohen_kappa_score, matthews_corrcoef, roc_curve, precision_recall_curve
)

# Report the versions of the core libraries used in this notebook.
# (Version prints for scikit-learn, CatBoost, PPScore, missingno and MLflow stay disabled.)
for lib_name, lib_module in (
    ("Pandas", pd),
    ("NumPy", np),
    ("XGBoost", xgb),
    ("LightGBM", lgb),
    ("SHAP", shap),
):
    print(f"{lib_name} version: {lib_module.__version__}")

Pandas version: 2.2.3
NumPy version: 2.2.5
XGBoost version: 3.0.0
LightGBM version: 4.6.0
SHAP version: 0.48.0


# Criando ou carregando o experimento

In [2]:
# Name of the MLflow experiment this notebook logs to.
experiment_name = "Dados aviação Aula"

# Look the experiment up first so the message can tell "created" from "already there".
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is not None:
    print(f"O experimento '{experiment_name}' já existe.")
else:
    # set_experiment is only called when the lookup found nothing.
    mlflow.set_experiment(experiment_name)
    print(f"O experimento '{experiment_name}' foi criado.")

O experimento 'Dados aviação Aula' já existe.


In [3]:
# Fetch the experiment again so the handle is valid even when the previous cell just created it.
experiment = mlflow.get_experiment_by_name(experiment_name)

# Keep the id around: every start_run call below passes it explicitly.
experiment_id = experiment.experiment_id
id_message = f"O experimento id é:'{experiment_id}'"
print(id_message)

O experimento id é:'559262076987350823'


# XGBoost

## Carregando Dados

## Desenvolvimento

In [None]:
# CSV file with the historical flight records, read into a pandas DataFrame.
file_path = 'df_treinamento_2022_2023.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

## No MLflow

In [4]:
# Run the data load inside its own MLflow run so the step is recorded in the experiment.
extraction_tags = {"Extração": "origem_x", "objetivo": "alimentar o modelo_x", "Versão da etapa": "1.0"}
with mlflow.start_run(
    experiment_id=experiment_id,
    run_name='extração e tratamento dos dados',
    description='Extração e/ou tratamento de dados',
    tags=extraction_tags,
):
    # CSV file with the historical flight records, read into a pandas DataFrame.
    file_path = 'df_treinamento_2022_2023.csv'
    df = pd.read_csv(file_path)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



## Pré-processamento

### Desenvolvimento

In [None]:
# Drop the columns that are not used as predictors. errors='ignore' keeps this cell
# re-runnable when `df` was already trimmed by an earlier execution.
df = df.drop(columns=['codigo_di', 'codigo_tipo_linha'], errors='ignore')

# Categorical predictors: every text column except the target. Columns already cast to
# "category" by a previous run are included so a re-run selects the same set.
list_dummies = colunas_categ = (
    df.drop(columns='status_do_voo')
    .select_dtypes(include=['object', 'category'])
    .columns.tolist()
)

# Cast the categorical predictors to the pandas "category" dtype.
df[list_dummies] = df[list_dummies].astype("category")

# Predictors (X) and target (y).
dt_ax = df.drop(columns=["status_do_voo"])

# Encode the target as 0/1; any label missing from the mapping becomes NaN.
label_mapping = {'Pontual': 0, 'Atrasado': 1}
dt_ay = df['status_do_voo'].map(label_mapping)

# Integer-encode the categorical predictors, keeping each fitted encoder so the
# original labels can be restored after the split.
label_encoders = {}
for col in list_dummies:
    le = LabelEncoder()
    dt_ax[col] = le.fit_transform(dt_ax[col])
    label_encoders[col] = le

# Hold out 14.2% of the rows for the final test set (85.8% remain).
# NOTE(review): the split is not stratified on the target — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(dt_ax, dt_ay, random_state=33, test_size=0.142)

# Split the remainder into 83.5% training / 16.5% validation
# (about 71.6% and 14.2% of all rows respectively).
X_train_valid, X_test_valid, y_train_valid, y_test_valid = train_test_split(X_train, y_train, random_state=33, test_size=0.165)

# Reverter os valores transformados para o tipo "category" original
def revert_to_category(data, label_encoders, list_dummies):
    """
    Restore the original labels of integer-encoded categorical columns, in place.

    Each restored column is stored as a pandas Categorical whose categories are the
    full class list learned by its encoder. This gives every split (train /
    validation / test) the same category -> code mapping even when a label is absent
    from that split; inferring categories per split would shift the integer codes.

    Args:
        data (pd.DataFrame): Frame holding the encoded columns; modified in place.
        label_encoders (dict): Column name -> fitted encoder exposing
            ``inverse_transform`` and ``classes_``.
        list_dummies (list): Names of the encoded columns; names missing from
            ``data`` are skipped.

    Returns:
        pd.DataFrame: The same ``data`` object, for call chaining.
    """
    for col in list_dummies:
        if col not in data.columns:
            continue
        le = label_encoders[col]
        # Categories may not contain nulls, so drop any NaN class; rows decoding
        # to NaN simply become missing values in the categorical.
        known_labels = pd.Index(le.classes_)
        known_labels = known_labels[~known_labels.isna()]
        data[col] = pd.Categorical(le.inverse_transform(data[col]), categories=known_labels)
    return data

# Restore the original labels on the training, validation and test frames.
X_train_valid, X_test_valid, X_test = (
    revert_to_category(split_frame, label_encoders, list_dummies)
    for split_frame in (X_train_valid, X_test_valid, X_test)
)

# Make sure the restored columns carry the "category" dtype.
for split_frame in (X_train_valid, X_test_valid, X_test):
    split_frame[list_dummies] = split_frame[list_dummies].astype("category")

# Column names as a plain list of strings.
feature_names = X_test.columns.tolist()

# Wrap each split in a DMatrix; the three calls share the same options.
dmatrix_options = dict(enable_categorical=True, feature_names=feature_names, nthread=-1)
dtrain = xgb.DMatrix(X_train_valid, label=y_train_valid, **dmatrix_options)
dtest_valid = xgb.DMatrix(X_test_valid, label=y_test_valid, **dmatrix_options)
dtest = xgb.DMatrix(X_test, label=y_test, **dmatrix_options)

In [None]:
# Shapes of the full frame and of the train / test / validation splits, in that order.
for shaped_frame in (df, X_train_valid, X_test, X_test_valid):
    print(shaped_frame.shape)

### MLflow

In [5]:
# Função para calcular e registrar a distribuição de classes
def log_class_distribution(y, label):
    """
    Log the relative frequency of each class in ``y`` as a single MLflow parameter.

    The parameter is named ``"<label>_class_distribution"`` and its value is a dict
    mapping ``"Class <value>"`` to the class share formatted as a percentage with
    two decimals.

    Args:
        y: Array-like of class labels.
        label (str): Prefix used to build the parameter name.
    """
    classes, occurrences = np.unique(y, return_counts=True)
    n_obs = sum(occurrences)
    shares = {}
    for cls, cnt in zip(classes, occurrences):
        shares[f"Class {cls}"] = f"{cnt/n_obs:.2%}"
    mlflow.log_param(f"{label}_class_distribution", shares)

In [6]:
# Parent run for the whole pre-processing stage; each step below is logged as its own
# nested child run.
with mlflow.start_run(experiment_id=experiment_id, run_name='Pre-processamento',
                      nested=True,
                      description='Garantir o input correto dos modelos',
                      tags={"Pre-processamento": "preparação para treinamento", "objetivo": "garantir o input correto dos dados", "Versão da etapa": "1.0"}):

    # Step 1: drop the columns that are not used as predictors.
    with mlflow.start_run(experiment_id=experiment_id, run_name='drop columns', nested=True,
                          description='Exclusão de colunas desnecessárias',
                          tags={"Tratamento": "drop_columns"}):
        colunas_excluidas = ['codigo_di', 'codigo_tipo_linha']
        # errors='ignore' keeps the cell re-runnable when `df` was already trimmed.
        df = df.drop(columns=colunas_excluidas, errors='ignore')
        mlflow.log_param("colunas_excluidas", colunas_excluidas)

    # Step 2: cast the categorical predictors to the pandas "category" dtype.
    with mlflow.start_run(experiment_id=experiment_id, run_name='Transformar colunas categóricas', nested=True,
                          description='Converte colunas categóricas para o tipo category',
                          tags={"Tratamento": "category_conversion"}):
        # Columns already cast by a previous execution are included, so a re-run
        # selects the same set instead of an empty list.
        list_dummies = (
            df.drop(columns='status_do_voo')
            .select_dtypes(include=['object', 'category'])
            .columns.tolist()
        )
        df[list_dummies] = df[list_dummies].astype("category")
        mlflow.log_param("colunas_categoricas", list_dummies)

    # Step 3: separate predictors from the 0/1-encoded target.
    with mlflow.start_run(experiment_id=experiment_id, run_name='Seleção de features', nested=True,
                          description='Selecionar features preditoras e variável-alvo',
                          tags={"Tratamento": "feature_selection"}):
        target_mapping = {'Pontual': 0, 'Atrasado': 1}
        dt_ax = df.drop(columns=["status_do_voo"])
        dt_ay = df['status_do_voo'].map(target_mapping)
        mlflow.log_param("target_mapping", target_mapping)
        mlflow.log_param("n_features", dt_ax.shape[1])

    # Step 4: integer-encode the categorical predictors, keeping each fitted encoder.
    with mlflow.start_run(experiment_id=experiment_id, run_name='Codificação de colunas categóricas', nested=True,
                          description='Codificar colunas categóricas como inteiros',
                          tags={"Tratamento": "label_encoding"}):
        label_encoders = {}
        for col in list_dummies:
            le = LabelEncoder()
            dt_ax[col] = le.fit_transform(dt_ax[col])
            label_encoders[col] = le
        mlflow.log_param("n_label_encoded_columns", len(list_dummies))

    # Step 5: split into training, validation and test sets.
    with mlflow.start_run(experiment_id=experiment_id, run_name='Segmentação em treino/teste/validação', nested=True,
                          description='Segmentação dos dados em treino (71,64%), validação (14,15%) e teste (14,20%)',
                          tags={"Tratamento": "data_split"}):
        # 14.2% test first, then 16.5% of the remainder for validation.
        X_train, X_test, y_train, y_test = train_test_split(dt_ax, dt_ay, random_state=33, test_size=0.142)
        X_train_valid, X_test_valid, y_train_valid, y_test_valid = train_test_split(X_train, y_train, random_state=33, test_size=0.165)

        # Size of each set.
        mlflow.log_param("train_size", len(X_train_valid))
        mlflow.log_param("validation_size", len(X_test_valid))
        mlflow.log_param("test_size", len(X_test))

        # Class balance of each set. The training distribution comes from
        # y_train_valid (the rows actually trained on), matching "train_size" above;
        # y_train would also count the validation rows.
        log_class_distribution(y_train_valid, 'train_size')
        log_class_distribution(y_test_valid, 'validation_size')
        log_class_distribution(y_test, 'test_size')

    # Step 6: restore the original labels and finalise the categorical dtype.
    with mlflow.start_run(experiment_id=experiment_id, run_name='Reversão e preparação final', nested=True,
                          description='Reverter valores transformados para o tipo category original e preparação final',
                          tags={"Tratamento": "final_preparation"}):
        def revert_to_category(data, label_encoders, list_dummies):
            """Restore encoded columns in place as Categoricals sharing the encoder's full class list."""
            for col in list_dummies:
                if col in data.columns:
                    le = label_encoders[col]
                    # Use the same category list on every split so the integer codes
                    # stay aligned even when a label is missing from one of them.
                    # Categories may not contain nulls, hence the filter.
                    known_labels = pd.Index(le.classes_)
                    known_labels = known_labels[~known_labels.isna()]
                    data[col] = pd.Categorical(le.inverse_transform(data[col]), categories=known_labels)
            return data

        X_train_valid = revert_to_category(X_train_valid, label_encoders, list_dummies)
        X_test_valid = revert_to_category(X_test_valid, label_encoders, list_dummies)
        X_test = revert_to_category(X_test, label_encoders, list_dummies)

        # Already categorical after the revert, so this cast keeps the shared categories.
        X_train_valid[list_dummies] = X_train_valid[list_dummies].astype("category")
        X_test_valid[list_dummies] = X_test_valid[list_dummies].astype("category")
        X_test[list_dummies] = X_test[list_dummies].astype("category")

        mlflow.log_param("categorical_columns_finalized", list_dummies)

    # Step 7: wrap each split in a DMatrix for XGBoost.
    with mlflow.start_run(experiment_id=experiment_id, run_name='Conversão para DMatrix', nested=True,
                          description='Converter conjuntos de dados para DMatrix para treinamento com XGBoost',
                          tags={"Tratamento": "dmatrix_conversion"}):
        feature_names = list(X_test.columns)
        dtrain = xgb.DMatrix(X_train_valid, label=y_train_valid, enable_categorical=True, feature_names=feature_names, nthread=-1)
        dtest_valid = xgb.DMatrix(X_test_valid, label=y_test_valid, enable_categorical=True, feature_names=feature_names, nthread=-1)
        dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True, feature_names=feature_names, nthread=-1)
        mlflow.log_param("feature_names", feature_names)

## Ajuste de hiperparâmetros

### Ajuste de hiperparâmetros — desenvolvimento

In [None]:
import re
import xgboost as xgb
import pandas as pd
from hyperopt import STATUS_OK

# Callback customizada que extende TrainingCallback e tem a assinatura correta.
class FoldMetricsCallback(xgb.callback.TrainingCallback):
    """
    Training callback that collects per-fold test metrics at every boosting round.

    After each iteration it evaluates every fold found in ``model.cvfolds`` and stores
    the test values of aucpr / auc / logloss / error in ``self.fold_results``.
    """

    def __init__(self):
        # iteration number (int) -> {"fold<i>_<metric>": float} for that iteration
        self.fold_results = {}

    def after_iteration(self, model, epoch, evals):
        """
        Record this round's per-fold test metrics.

        Args:
            model: Object handed over by the training loop; only inspected for a
                ``cvfolds`` attribute. When it is absent nothing is collected and
                an empty dict is stored for the round.
            epoch (int): Current iteration number, used as the storage key.
            evals: Evaluation history supplied by XGBoost (not used here).

        Returns:
            bool: Always ``False``, so training is never stopped by this callback.
        """
        tracked_metrics = ('aucpr', 'auc', 'logloss', 'error')
        round_metrics = {}
        for fold_idx, cvpack in enumerate(getattr(model, 'cvfolds', ())):
            # Each fold returns its evaluation as one text line; the "test-<metric>:<value>"
            # pairs are pulled out of it with a regex.
            report = cvpack.eval(epoch, feval=None, output_margin=False)
            round_metrics.update(
                (f"fold{fold_idx}_{metric}", float(raw_value))
                for metric, raw_value in re.findall(r"test-([\w_]+):([\d\.]+)", report)
                if metric in tracked_metrics
            )
        self.fold_results[epoch] = round_metrics
        return False

In [None]:
def hipertunnig(space):
    """
    Hyperopt objective: cross-validate one XGBoost configuration on the global ``dtrain``.

    Runs 5-fold stratified ``xgb.cv`` with early stopping, summarises the fold-mean test
    metrics (AUCPR, AUC, logloss, error) over the retained boosting rounds, appends one
    row to the module-level ``global_results`` list and returns the dict Hyperopt expects.

    Args:
        space (dict): Hyperparameters sampled by Hyperopt; must contain every key read below.

    Returns:
        dict: ``loss`` (mean over the retained rounds of the fold-mean test logloss),
        ``status``, ``best_iteration`` (number of rounds kept by ``xgb.cv``),
        ``additional_metrics`` (mean / max / min of AUCPR, AUC and logloss) and
        ``df_results`` (DataFrame with one row per trial evaluated so far).

    Raises:
        NameError: If ``global_results`` or ``dtrain`` is not defined at module level.
        KeyError: If ``space`` lacks one of the expected hyperparameters.
    """
    global global_results

    # One dict with every hyperparameter of the trial; it is printed, stored in the
    # results table and (minus 'n_estimators') passed to the booster.
    params = {
        'max_depth': int(space['max_depth']),                    # Maximum tree depth
        'gamma': space['gamma'],                                 # Minimum loss reduction to split a node
        'reg_alpha': space['reg_alpha'],                         # L1 regularisation
        'reg_lambda': space['reg_lambda'],                       # L2 regularisation
        'min_child_weight': int(space['min_child_weight']),      # Minimum instance weight in a child
        'colsample_bytree': space['colsample_bytree'],           # Column subsampling per tree
        'colsample_bylevel': space['colsample_bylevel'],         # Column subsampling per level
        'colsample_bynode': space['colsample_bynode'],           # Column subsampling per node
        'n_estimators': space['n_estimators'],                   # Used only as num_boost_round below
        'learning_rate': space['learning_rate'],                 # Learning rate
        'max_delta_step': space['max_delta_step'],               # Cap on leaf updates
        'subsample': space['subsample'],                         # Row subsampling
        'sampling_method': space['sampling_method'],             # Sampling method
        'tree_method': space['tree_method'],                     # Tree construction method
        'device': space['device'],                               # Training device (e.g. 'cuda')
        'enable_categorical': space['enable_categorical'],       # Native categorical support flag
        'scale_pos_weight': space['scale_pos_weight'],           # Weight of the positive class
        'eval_metric': space['eval_metric'],                     # Evaluation metrics
        'objective': space['objective'],                         # Objective (e.g. 'binary:logistic')
        'seed': space['seed'],                                   # Seed for reproducibility
        'max_cat_to_onehot': int(space['max_cat_to_onehot']),    # Threshold for one-hot splits
        'max_cat_threshold': int(space['max_cat_threshold']),    # Max categories considered per split
        'max_leaves': int(space['max_leaves']),                  # Maximum leaves per tree
        'validate_parameters': space['validate_parameters'],     # Validate parameters before training
        'max_bin': space['max_bin'],                             # Maximum histogram bins
        'updater': space['updater']                              # Updater (e.g. 'grow_gpu_hist')
    }

    print("Hiperparâmetros utilizados:", params)
    print("Hiperparâmetros n_estimators:", space['n_estimators'])

    # Collects the per-fold metrics of every boosting round.
    fold_callback = FoldMetricsCallback()

    # 'n_estimators' is not consumed by the native booster (it drives num_boost_round),
    # so it is left out of the dict handed to xgb.cv.
    booster_params = {name: value for name, value in params.items() if name != 'n_estimators'}

    # NOTE(review): per the xgb.cv documentation early stopping monitors the last metric
    # listed ('error' here) while the Hyperopt loss below is logloss — confirm intended.
    cv_results = xgb.cv(
        params=booster_params,
        dtrain=dtrain,                    # DMatrix prepared at module level
        num_boost_round=int(space['n_estimators']),
        nfold=5,
        metrics=["aucpr", "auc", "logloss", "error"],
        as_pandas=True,
        seed=33,
        stratified=True,
        early_stopping_rounds=18 if params['max_depth'] <= 12 else 45,  # More patience for deep trees
        verbose_eval=False,
        callbacks=[fold_callback]
    )

    # Number of boosting rounds kept in the CV history.
    best_iteration = cv_results.shape[0]

    # mean / max / min over the retained rounds of each fold-mean test metric.
    metric_labels = {"aucpr": "AUCPR", "auc": "AUC", "logloss": "LogLoss", "error": "error"}
    stats = {}
    for metric in metric_labels:
        per_round = cv_results[f"test-{metric}-mean"]
        stats[f"mean_{metric}"] = per_round.mean()
        stats[f"max_{metric}"] = per_round.max()
        stats[f"min_{metric}"] = per_round.min()

    # NOTE(review): the objective is the logloss averaged over ALL retained rounds, not
    # the value at the final/best round; this favours configurations that converge
    # quickly. Kept as in the original — confirm this is the intended objective.
    mean_logloss = stats["mean_logloss"]

    print("Melhor iteração:", best_iteration)
    for metric, shown_name in metric_labels.items():
        # Each label is printed with the statistic it names (mean and min were
        # previously swapped for the error metric).
        print(f"Max {shown_name}: ", stats[f"max_{metric}"])
        print(f"Média {shown_name}: ", stats[f"mean_{metric}"])
        print(f"Min {shown_name}: ", stats[f"min_{metric}"])
    print("------------------------------------------------------------------------------------------")
    print("Novo modelo")

    # Per-fold metrics of the last retained round (callback keys are 0-based).
    selected_fold_results = fold_callback.fold_results.get(best_iteration - 1, {})

    # One row of the results table: aggregated metrics, per-fold metrics, hyperparameters.
    result_dict = {
        'loss': mean_logloss,
        'best_iteration': best_iteration,
        **stats,
        'fold_results': selected_fold_results,
        **params,
    }
    global_results.append(result_dict)

    # Show the latest trials; a bare expression inside a function displays nothing,
    # so the tail is printed explicitly.
    df_results = pd.DataFrame(global_results)
    print("Partial results (últimos 10):")
    print(df_results.tail(10))

    return {
        'loss': mean_logloss,  # Hyperopt minimises this value
        'status': STATUS_OK,
        'best_iteration': best_iteration,
        'additional_metrics': {
            'mean_aucpr': stats['mean_aucpr'],
            'mean_auc': stats['mean_auc'],
            'mean_logloss': stats['mean_logloss'],
            'max_aucpr': stats['max_aucpr'],
            'max_auc': stats['max_auc'],
            'max_logloss': stats['max_logloss'],
            'min_aucpr': stats['min_aucpr'],
            'min_auc': stats['min_auc'],
            'min_logloss': stats['min_logloss']
        },
        'df_results': df_results  # Accumulated results of all trials so far
    }

In [None]:
# Hyperopt search space for the XGBoost tuning run: sampled entries use hp.* distributions,
# the remaining entries are fixed values passed through unchanged.
space = {
    # Tree depth, sampled as an integer between 3 and 30
    'max_depth': scope.int(hp.quniform("max_depth", 3, 30, 1)),
    
    # Minimum gain required to make a split
    'gamma': hp.uniform('gamma', 0, 18),
    
    # L1 regularisation
    'reg_alpha': hp.uniform('reg_alpha', 0, 120),
    
    # L2 regularisation
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
    
    # Fraction of features sampled per tree / per level / per node
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.9),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.4, 1.0),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.4, 1.0),
    
    # Minimum weight required in a child node; avoids splits backed by few instances
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 24, 1)),
    
    # Learning rate, sampled on a log scale between 0.01 and 0.5
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    
    # Number of trees, from 100 to 1000 in steps of 10
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 10)),
    
    # Cap on the magnitude of node updates, for training stability
    'max_delta_step': hp.uniform('max_delta_step', 0, 10),
    
    # Row subsampling per tree; helps reduce overfitting
    'subsample': hp.uniform('subsample', 0.4, 1.0),
    
    # Fixed gradient-based sampling method (per the original author, already tested in this context)
    'sampling_method': 'gradient_based',
    
    # GPU tree construction.
    # NOTE(review): 'gpu_hist' was deprecated in XGBoost 2.0 in favour of
    # tree_method='hist' with device='cuda' — confirm it is still accepted by the installed version.
    'tree_method': 'gpu_hist',
    'device': 'cuda',
    
    # Enable native handling of categorical variables
    'enable_categorical': True,
    
    # Positive-class weight; the original author reports a class ratio of roughly 5:1
    'scale_pos_weight': hp.uniform('scale_pos_weight', 3, 9),
    
    # One-hot encode only low-cardinality categoricals (avoids expanding "numero_empresa_voo")
    'max_cat_to_onehot': scope.int(hp.quniform('max_cat_to_onehot', 3, 50, 1)),
    'max_cat_threshold': scope.int(hp.quniform('max_cat_threshold', 3, 50, 1)),
    
    # Maximum number of leaves per tree, to control complexity
    'max_leaves': scope.int(hp.quniform('max_leaves', 16, 256, 4)),
    
    # Validate the parameters before training starts
    'validate_parameters': True,
    
    # Seed for reproducibility
    'seed': 33,
    
    # Metrics evaluated during training
    'eval_metric': ["aucpr", "auc", "logloss", "error"],
    
    # GPU updater
    'updater': 'grow_gpu_hist',
    
    # Maximum number of histogram bins (affects split granularity)
    'max_bin': scope.int(hp.quniform('max_bin', 32, 320, 8)),
    
    # Objective for binary classification
    'objective': 'binary:logistic'
}


In [None]:
# Module-level accumulator: hipertunnig() appends one result row per trial.
global_results = []

# Run the TPE search; `trials` keeps the history of every evaluation.
trials = Trials()
raw_best = fmin(
    fn=hipertunnig,
    space=space,
    algo=tpe.suggest,
    max_evals=2,
    trials=trials,
    rstate=np.random.default_rng(42),  # fixed generator for reproducibility
)

# Map the raw fmin result back onto the search space to get the actual parameter values.
best_hyperparams = space_eval(space, raw_best)
print("Melhores hiperparâmetros:", best_hyperparams)

### Ajuste de hiperparâmetros — MLflow

In [7]:
# Hyperopt search space for the XGBoost tuning run: sampled entries use hp.* distributions,
# the remaining entries are fixed values passed through unchanged.
space = {
    # Tree depth, sampled as an integer between 3 and 30
    'max_depth': scope.int(hp.quniform("max_depth", 3, 30, 1)),
    
    # Minimum gain required to make a split
    'gamma': hp.uniform('gamma', 0, 18),
    
    # L1 regularisation
    'reg_alpha': hp.uniform('reg_alpha', 0, 120),
    
    # L2 regularisation
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
    
    # Fraction of features sampled per tree / per level / per node
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.9),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.4, 1.0),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.4, 1.0),
    
    # Minimum weight required in a child node; avoids splits backed by few instances
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 24, 1)),
    
    # Learning rate, sampled on a log scale between 0.01 and 0.5
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    
    # Number of trees, from 100 to 1000 in steps of 10
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 10)),
    
    # Cap on the magnitude of node updates, for training stability
    'max_delta_step': hp.uniform('max_delta_step', 0, 10),
    
    # Row subsampling per tree; helps reduce overfitting
    'subsample': hp.uniform('subsample', 0.4, 1.0),
    
    # Fixed gradient-based sampling method (per the original author, already tested in this context)
    'sampling_method': 'gradient_based',
    
    # GPU tree construction.
    # NOTE(review): 'gpu_hist' was deprecated in XGBoost 2.0 in favour of
    # tree_method='hist' with device='cuda' — confirm it is still accepted by the installed version.
    'tree_method': 'gpu_hist',
    'device': 'cuda',
    
    # Enable native handling of categorical variables
    'enable_categorical': True,
    
    # Positive-class weight; the original author reports a class ratio of roughly 5:1
    'scale_pos_weight': hp.uniform('scale_pos_weight', 3, 9),
    
    # One-hot encode only low-cardinality categoricals (avoids expanding "numero_empresa_voo")
    'max_cat_to_onehot': scope.int(hp.quniform('max_cat_to_onehot', 3, 50, 1)),
    'max_cat_threshold': scope.int(hp.quniform('max_cat_threshold', 3, 50, 1)),
    
    # Maximum number of leaves per tree, to control complexity
    'max_leaves': scope.int(hp.quniform('max_leaves', 16, 256, 4)),
    
    # Validate the parameters before training starts
    'validate_parameters': True,
    
    # Seed for reproducibility
    'seed': 33,
    
    # Metrics evaluated during training
    'eval_metric': ["aucpr", "auc", "logloss", "error"],
    
    # GPU updater
    'updater': 'grow_gpu_hist',
    
    # Maximum number of histogram bins (affects split granularity)
    'max_bin': scope.int(hp.quniform('max_bin', 32, 320, 8)),
    
    # Objective for binary classification
    'objective': 'binary:logistic'
}


In [8]:
# Turn on MLflow's XGBoost autologging, called with its default settings.
mlflow.xgboost.autolog()

In [9]:
import numpy as np
from typing import Dict, Optional, Union
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    balanced_accuracy_score, average_precision_score, log_loss, brier_score_loss,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix
)

def calcular_metricas_binarias(
    y_true: Union[np.ndarray, list],
    y_pred_proba: Optional[Union[np.ndarray, list]] = None,
    y_pred: Optional[Union[np.ndarray, list]] = None,
    *,
    threshold: float = 0.5,
    sample_weight: Optional[Union[np.ndarray, list]] = None
) -> Dict[str, float]:
    """
    Compute binary-classification metrics from true labels and predictions.

    Hard predictions come from ``y_pred`` when it is given; otherwise they are
    derived from ``y_pred_proba`` using ``threshold``. Probability-based metrics
    (auc, prauc, gini, logloss, brier) are only computed when ``y_pred_proba``
    is given and both classes occur in ``y_true``; otherwise they are NaN.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels with at most two distinct values. Labels that are
        already within {0, 1} are used as-is, including the case where only
        one class is present. Any other label set is re-encoded: the largest
        label becomes 1 and the remaining one becomes 0.
    y_pred_proba : array-like, optional
        Positive-class probabilities, shape (n,) or (n, 2); for (n, 2) column 1
        is used. Values are clipped to [0, 1].
    y_pred : array-like, optional
        Hard 0/1 predictions. When None they are computed as
        ``y_pred_proba >= threshold``.
    threshold : float, default 0.5
        Cut-off used to turn probabilities into classes. It is echoed in the
        result even when ``y_pred`` was supplied directly.
    sample_weight : array-like, optional
        Per-sample weights forwarded to every scikit-learn metric call.

    Returns
    -------
    dict
        Keys: threshold, TN, FP, FN, TP, accuracy, precision, recall, f1,
        balanced_accuracy, specificity, fpr, fnr, gmean, auc, prauc, gini,
        logloss, brier, mcc, kappa. All values are floats; a metric that
        cannot be computed is NaN.

    Raises
    ------
    ValueError
        If neither ``y_pred_proba`` nor ``y_pred`` is given, or if ``y_true``
        contains more than two distinct labels.
    """
    nan = float("nan")

    def _ou_nan(metrica, *args, **kwargs):
        # Best-effort evaluation: a metric that cannot be computed for this
        # input is reported as NaN instead of aborting the whole report.
        try:
            return float(metrica(*args, sample_weight=sample_weight, **kwargs))
        except Exception:
            return nan

    y_true = np.asarray(y_true)

    if y_pred_proba is None and y_pred is None:
        raise ValueError("Informe ao menos y_pred_proba ou y_pred.")

    # Probabilities: accept shape (n,) or (n, 2) and keep the positive column.
    if y_pred_proba is not None:
        y_pred_proba = np.asarray(y_pred_proba, dtype=float)
        if y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2:
            y_pred_proba = y_pred_proba[:, 1]
        # Clamp values that drifted slightly outside [0, 1].
        y_pred_proba = np.clip(y_pred_proba, 0.0, 1.0)

    # Hard predictions: given explicitly or derived from the threshold.
    if y_pred is None:
        y_pred = (y_pred_proba >= float(threshold)).astype(int)
    else:
        y_pred = np.asarray(y_pred).astype(int)

    # Normalise the ground truth to 0/1.
    classes = np.unique(y_true)
    if classes.size > 2:
        raise ValueError("A função suporta apenas classificação binária.")
    # Only re-encode when some label is outside {0, 1}. A sample containing
    # just the negative class (all zeros) must stay negative; building a
    # {first: 0, last: 1} mapping from a single label would flip it to 1.
    if not set(classes.tolist()) <= {0, 1}:
        # np.unique returns sorted labels, so classes[-1] is the largest one.
        # A single label outside {0, 1} is therefore treated as positive.
        y_true = (y_true == classes[-1]).astype(int)

    # labels=[0, 1] guarantees a 2x2 matrix even when a class is absent.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1], sample_weight=sample_weight)
    TN, FP, FN, TP = cm.ravel()

    # Class-based metrics.
    accuracy = accuracy_score(y_true, y_pred, sample_weight=sample_weight)
    precision = precision_score(y_true, y_pred, sample_weight=sample_weight, zero_division=0)
    recall = recall_score(y_true, y_pred, sample_weight=sample_weight, zero_division=0)
    f1 = f1_score(y_true, y_pred, sample_weight=sample_weight, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_true, y_pred, sample_weight=sample_weight)
    negatives = TN + FP
    positives = FN + TP
    specificity = TN / negatives if negatives > 0 else 0.0
    fpr = FP / negatives if negatives > 0 else 0.0
    fnr = FN / positives if positives > 0 else 0.0
    gmean = float(np.sqrt(recall * specificity)) if (recall > 0 and specificity > 0) else 0.0

    # Probability-based metrics need scores and both classes in y_true.
    if y_pred_proba is not None and np.unique(y_true).size == 2:
        auc = _ou_nan(roc_auc_score, y_true, y_pred_proba)
        prauc = _ou_nan(average_precision_score, y_true, y_pred_proba)
        ll = _ou_nan(log_loss, y_true, y_pred_proba, labels=[0, 1])
        brier = _ou_nan(brier_score_loss, y_true, y_pred_proba)
    else:
        auc = prauc = ll = brier = nan
    # NaN propagates through the arithmetic, so no explicit guard is needed.
    gini = 2 * auc - 1

    # Agreement metrics.
    mcc = _ou_nan(matthews_corrcoef, y_true, y_pred)
    kappa = _ou_nan(cohen_kappa_score, y_true, y_pred)

    return {
        "threshold": float(threshold),
        "TN": float(TN), "FP": float(FP), "FN": float(FN), "TP": float(TP),
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "balanced_accuracy": float(balanced_acc),
        "specificity": float(specificity),
        "fpr": float(fpr),
        "fnr": float(fnr),
        "gmean": float(gmean),
        "auc": auc,
        "prauc": prauc,
        "gini": gini,
        "logloss": ll,
        "brier": brier,
        "mcc": mcc,
        "kappa": kappa,
    }


In [10]:
from mlflow.models.signature import infer_signature

In [11]:
# -------- 2) hipertunnig corrigida ----------
import mlflow
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from hyperopt import STATUS_OK

from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix

def _to_eval_list(eval_metric):
    """Garante lista de strings para eval_metric."""
    if eval_metric is None:
        return ["logloss"]
    if isinstance(eval_metric, str):
        return [eval_metric]
    # tuple -> list
    return list(eval_metric)

def _logar_figura(fig, nome_arquivo):
    """Save ``fig`` as ``nome_arquivo``, log it to the active MLflow run and close it."""
    try:
        fig.tight_layout()
        fig.savefig(nome_arquivo)
        mlflow.log_artifact(nome_arquivo)
    finally:
        # Always release the figure: this runs several times per trial.
        plt.close(fig)


def hipertunnig(space):
    """
    Hyperopt objective: cross-validate one XGBoost configuration, refit it and
    log everything to MLflow.

    Steps:
      1. Build the booster parameters from ``space``.
      2. Run 5-fold stratified ``xgb.cv`` with early stopping and pick the
         boosting round with the lowest mean test logloss.
      3. Retrain on ``dtrain`` with that number of rounds.
      4. Compute test/train metrics with ``calcular_metricas_binarias`` and
         log them, together with diagnostic plots and the model, in a nested
         MLflow run.

    Besides ``space`` the function reads these notebook-level names:
    ``experiment_id``, ``dtrain``, ``dtest``, ``y_test``, ``X_train`` and,
    when it exists, ``dtest_valid``.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. It must contain every key read
        below; ``eval_metric`` is optional and defaults to ``['logloss']``.

    Returns
    -------
    dict
        ``loss`` (mean CV test logloss at the selected round) and ``status``
        for hyperopt, plus ``cv_metrics``, ``best_num_boost_round``,
        ``final_metrics_test`` and ``final_metrics_train``.
    """
    # -------------------------
    # Parameters
    # -------------------------
    params = {
        'max_depth': int(space['max_depth']),
        'gamma': space['gamma'],
        'reg_alpha': space['reg_alpha'],
        'reg_lambda': space['reg_lambda'],
        'min_child_weight': int(space['min_child_weight']),
        'colsample_bytree': space['colsample_bytree'],
        'colsample_bylevel': space['colsample_bylevel'],
        'colsample_bynode': space['colsample_bynode'],
        'learning_rate': space['learning_rate'],
        'max_delta_step': space['max_delta_step'],
        'subsample': space['subsample'],
        'sampling_method': space['sampling_method'],
        'tree_method': space['tree_method'],
        'device': space['device'],
        'enable_categorical': space['enable_categorical'],
        'scale_pos_weight': space['scale_pos_weight'],
        'eval_metric': _to_eval_list(space.get('eval_metric', ['logloss'])),
        'objective': space['objective'],
        'seed': space['seed'],
        'max_cat_to_onehot': int(space['max_cat_to_onehot']),
        'max_cat_threshold': int(space['max_cat_threshold']),
        'max_leaves': int(space['max_leaves']),
        'validate_parameters': space['validate_parameters'],
        # Cast like the other integer-valued parameters.
        'max_bin': int(space['max_bin']),
        'updater': space['updater']
    }
    n_estimators = int(space['n_estimators'])
    threshold = 0.5  # decision threshold used for every class-based metric/plot

    print("🔧 Hiperparâmetros utilizados:", params)
    print("🔢 n_estimators:", n_estimators)

    mlflow.xgboost.autolog()

    with mlflow.start_run(experiment_id=experiment_id,
                          run_name='XGBoost Model Training and Tuning',
                          nested=True):

        # -------------------------
        # Cross-validation
        # -------------------------
        # `metrics` is passed explicitly; logloss is last in the list.
        cv_results = xgb.cv(
            params=params,
            dtrain=dtrain,
            num_boost_round=n_estimators,
            nfold=5,
            metrics=["aucpr", "auc", "logloss"],
            as_pandas=True,
            seed=33,
            stratified=True,
            # Deeper trees get more patience before stopping.
            early_stopping_rounds=15 if params['max_depth'] <= 6 else 45,
        )

        # Select the round with the lowest mean test logloss and read every CV
        # metric at that same round, so the reported numbers describe the
        # model that is actually trained below. This also holds when early
        # stopping never fired and the history was not truncated.
        logloss_cv = cv_results["test-logloss-mean"].to_numpy()
        best_idx = int(logloss_cv.argmin())
        best_num_boost_round = best_idx + 1

        mean_logloss = float(logloss_cv[best_idx])
        mean_aucpr = float(cv_results["test-aucpr-mean"].iloc[best_idx])
        mean_auc = float(cv_results["test-auc-mean"].iloc[best_idx])

        mlflow.log_metrics({
            "mean_aucpr": mean_aucpr,
            "mean_auc": mean_auc,
            "mean_logloss": mean_logloss,
            "best_num_boost_round": best_num_boost_round,
        })

        # -------------------------
        # Final training with best_num_boost_round
        # -------------------------
        try:
            evals_list = [(dtest_valid, 'validation')]
        except NameError:
            # No validation DMatrix defined in the notebook: evaluate on train.
            evals_list = [(dtrain, 'train')]

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=best_num_boost_round,
            evals=evals_list,
            verbose_eval=False
        )

        # -------------------------
        # Predictions and metrics
        # -------------------------
        y_pred_proba = booster.predict(dtest)
        metrics_test = calcular_metricas_binarias(y_test, y_pred_proba, threshold=threshold)
        mlflow.log_metrics({f"test_{k}": v for k, v in metrics_test.items()})

        # Train-set predictions are reused below for the model signature.
        y_train_proba = booster.predict(dtrain)
        metrics_train = calcular_metricas_binarias(
            dtrain.get_label(), y_train_proba, threshold=threshold
        )
        mlflow.log_metrics({f"train_{k}": v for k, v in metrics_train.items()})

        # -------------------------
        # Artifacts (plots)
        # -------------------------
        # Confusion matrix (test)
        cm = confusion_matrix(y_test, (y_pred_proba >= threshold).astype(int))
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", ax=ax)
        ax.set_title('Matriz de Confusão (Test)')
        ax.set_xlabel('Predito')
        ax.set_ylabel('Real')
        _logar_figura(fig, 'confusion_matrix.png')

        # Feature importance. plot_importance opens its own figure unless an
        # axis is passed, and raises ValueError when no feature was used in
        # any split, so the plot is skipped for such a degenerate model
        # instead of aborting the whole search.
        if booster.get_score(importance_type='weight'):
            fig, ax = plt.subplots(figsize=(10, 7))
            xgb.plot_importance(booster, max_num_features=20, ax=ax)
            ax.set_title('Importância das Features')
            _logar_figura(fig, 'feature_importance.png')

        # ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(fpr, tpr, linestyle='--', label=f"AUC = {metrics_test.get('auc', float('nan')):.3f}")
        ax.set_title('Curva ROC (Test)')
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.legend()
        _logar_figura(fig, 'roc_curve.png')

        # Precision-recall curve
        prec_vals, rec_vals, _ = precision_recall_curve(y_test, y_pred_proba)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(rec_vals, prec_vals, marker='.', label=f"PRAUC = {metrics_test.get('prauc', float('nan')):.3f}")
        ax.set_title('Curva Precisão-Recall (Test)')
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precisão')
        ax.legend()
        _logar_figura(fig, 'precision_recall_curve.png')

        # -------------------------
        # Model logging
        # -------------------------
        # Function-scope import keeps this block usable on its own.
        from mlflow.models import infer_signature

        signature = infer_signature(X_train, y_train_proba)

        mlflow.xgboost.log_model(
            xgb_model=booster,
            name="modelo_xgboost",
            signature=signature,
            input_example=X_train[:5],
        )

        return {
            'loss': mean_logloss,
            'status': STATUS_OK,
            'cv_metrics': {'aucpr': mean_aucpr, 'auc': mean_auc, 'logloss': mean_logloss},
            'best_num_boost_round': best_num_boost_round,
            'final_metrics_test': metrics_test,
            'final_metrics_train': metrics_train
        }


In [None]:
# Hyperparameter search: run the hyperopt TPE optimisation inside one MLflow
# run; every call to hipertunnig() opens its own nested run.
with mlflow.start_run(
    experiment_id=experiment_id,
    run_name='Hipertunning XGBoost',
    nested=True,
    description='Busca pelos melhores parâmetros. Os modelos testados são armazenados, mesmo que não tenham os melhores parâmetros.',
    tags={"Hipertunning": "Melhores parâmetros", "objetivo": "garantir os melhores parâmetros para o modelo"},
):
    # Fresh trial store for this search.
    trials = Trials()
    # Module-level list meant to collect per-trial results
    # (nothing in this cell appends to it).
    global_results = []

    # fmin returns the raw best point; space_eval resolves it against the
    # search space into the actual parameter values.
    best_hyperparams = space_eval(
        space,
        fmin(
            fn=hipertunnig,
            space=space,
            algo=tpe.suggest,
            max_evals=60,
            trials=trials,
            rstate=np.random.default_rng(40),  # fixed generator for reproducibility
        ),
    )
    print("Melhores hiperparâmetros:", best_hyperparams)

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 8, 'gamma': 11.044241278530942, 'reg_alpha': 100.68540694740837, 'reg_lambda': 4.4204962722439225, 'min_child_weight': 15, 'colsample_bytree': 0.5209846933141805, 'colsample_bylevel': 0.9160740581284995, 'colsample_bynode': 0.8290919036015767, 'learning_rate': 0.018770532351419632, 'max_delta_step': 9.77163723980769, 'subsample': 0.7825629817802873, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 4.861562748338326, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 12, 'max_cat_threshold': 18, 'max_leaves': 168, 'validate_parameters': True, 'max_bin': 88, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
640      




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|########5                                                   | 1/7 [00:00<00:00, 996.98it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 999.71it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 999.52it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 999.95it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 833.10it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 857.12it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 874.91it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 10, 'gamma': 12.45293556443537, 'reg_alpha': 1.978885898505176, 'reg_lambda': 6.6220690987869055, 'min_child_weight': 18, 'colsample_bytree': 0.7132686256766255, 'colsample_bylevel': 0.9993658610997173, 'colsample_bynode': 0.6735222961715663, 'learning_rate': 0.04404362431826378, 'max_delta_step': 7.6413780978565065, 'subsample': 0.9840071027472885, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 3.6137530533652162, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 5, 'max_cat_threshold': 13, 'max_leaves': 252, 'validate_parameters': True, 'max_bin': 240, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
150     




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|########4                                                  | 1/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 799.56it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 833.06it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 856.94it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 874.88it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 12, 'gamma': 11.756685794606776, 'reg_alpha': 108.07600094808781, 'reg_lambda': 1.0634819074577218, 'min_child_weight': 10, 'colsample_bytree': 0.4963438901324335, 'colsample_bylevel': 0.7502326742069667, 'colsample_bynode': 0.7648021298814314, 'learning_rate': 0.2391116231755152, 'max_delta_step': 9.603142654610625, 'subsample': 0.5158091156735369, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 6.387601532324096, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 23, 'max_cat_threshold': 29, 'max_leaves': 36, 'validate_parameters': True, 'max_bin': 96, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
280       




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 1000.19it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 999.60it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 999.66it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 999.88it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 857.18it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 874.85it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 7, 'gamma': 12.854619950226688, 'reg_alpha': 100.4211711279482, 'reg_lambda': 5.100654065557717, 'min_child_weight': 4, 'colsample_bytree': 0.6819004216821365, 'colsample_bylevel': 0.8723354800081827, 'colsample_bynode': 0.9878452516644881, 'learning_rate': 0.03291994022432445, 'max_delta_step': 3.0105203852280003, 'subsample': 0.7795747125075339, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 6.125705466633934, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 34, 'max_cat_threshold': 36, 'max_leaves': 184, 'validate_parameters': True, 'max_bin': 216, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
240       




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 1998.72it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1499.57it/s]
[A
Downloading artifacts:  57%|#################################7                         | 4/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 999.98it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 999.95it/s]
[A
Downloading artifacts: 100%|###########################################################| 7/7 [00:00<00:00, 1000.00it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 28, 'gamma': 9.460262526728503, 'reg_alpha': 63.383861249390556, 'reg_lambda': 9.590325989774048, 'min_child_weight': 3, 'colsample_bytree': 0.8692009884906202, 'colsample_bylevel': 0.7068812826539119, 'colsample_bynode': 0.894596994047149, 'learning_rate': 0.084837248475267, 'max_delta_step': 9.866597575123262, 'subsample': 0.8905881848445374, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 3.5744746019568057, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 20, 'max_cat_threshold': 42, 'max_leaves': 56, 'validate_parameters': True, 'max_bin': 152, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
500          




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|########5                                                   | 1/7 [00:00<00:00, 998.41it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 998.52it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 999.83it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 999.83it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 833.13it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 856.94it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 874.93it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 13, 'gamma': 9.824908308441135, 'reg_alpha': 92.38232617733807, 'reg_lambda': 5.718960107664239, 'min_child_weight': 15, 'colsample_bytree': 0.6164474439742667, 'colsample_bylevel': 0.8918165893955093, 'colsample_bynode': 0.43494673175851145, 'learning_rate': 0.11764744601886551, 'max_delta_step': 0.20300035585466447, 'subsample': 0.5680448235172076, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 3.1535133394468833, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 23, 'max_cat_threshold': 35, 'max_leaves': 36, 'validate_parameters': True, 'max_bin': 216, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
800    




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 999.83it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 999.91it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 932.43it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 945.05it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 953.83it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 844.29it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 14, 'gamma': 1.430630150990322, 'reg_alpha': 113.66343267141534, 'reg_lambda': 5.751142835663084, 'min_child_weight': 7, 'colsample_bytree': 0.4320761802123466, 'colsample_bylevel': 0.539530580784046, 'colsample_bynode': 0.7626688451931545, 'learning_rate': 0.2874384092859929, 'max_delta_step': 0.08904153167743512, 'subsample': 0.4624320957600463, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 3.69773402321234, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 3, 'max_cat_threshold': 20, 'max_leaves': 56, 'validate_parameters': True, 'max_bin': 48, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
250           




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 2000.62it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1000.23it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 999.95it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 999.41it/s]
[A
Downloading artifacts:  86%|##################################################5        | 6/7 [00:00<00:00, 1000.03it/s]
[A
Downloading artifacts: 100%|###########################################################| 7/7 [00:00<00:00, 1000.00it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 15, 'gamma': 4.7290718673327845, 'reg_alpha': 5.542334515855245, 'reg_lambda': 8.266203077884922, 'min_child_weight': 17, 'colsample_bytree': 0.8234660474133497, 'colsample_bylevel': 0.6847895299711599, 'colsample_bynode': 0.5899316926411781, 'learning_rate': 0.03265256761091956, 'max_delta_step': 5.907831238561627, 'subsample': 0.4741920949826416, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 4.985790127723398, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 43, 'max_cat_threshold': 39, 'max_leaves': 176, 'validate_parameters': True, 'max_bin': 248, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
920      




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 2000.62it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 999.83it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 999.54it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 982.18it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 846.51it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 266.52it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 19, 'gamma': 4.968709241579504, 'reg_alpha': 89.27309696767857, 'reg_lambda': 7.860043935419244, 'min_child_weight': 24, 'colsample_bytree': 0.6680316528444721, 'colsample_bylevel': 0.8270883789266384, 'colsample_bynode': 0.43298486184671037, 'learning_rate': 0.07454840953628435, 'max_delta_step': 9.43438522340667, 'subsample': 0.9793391431278602, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 7.4598309776753045, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 37, 'max_cat_threshold': 25, 'max_leaves': 172, 'validate_parameters': True, 'max_bin': 64, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
560       




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 797.09it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 854.99it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 886.93it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 907.50it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 921.83it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 876.45it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 14, 'gamma': 17.50456378393796, 'reg_alpha': 116.86081926610929, 'reg_lambda': 5.832709577999802, 'min_child_weight': 3, 'colsample_bytree': 0.8392436041971018, 'colsample_bylevel': 0.5167244113984667, 'colsample_bynode': 0.8493306588245927, 'learning_rate': 0.2777149100052217, 'max_delta_step': 2.918091072786085, 'subsample': 0.4110635243411179, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 4.183081723849615, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 45, 'max_cat_threshold': 42, 'max_leaves': 92, 'validate_parameters': True, 'max_bin': 224, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
710         




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 999.60it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 854.82it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 886.46it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 906.92it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 921.22it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 822.21it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 7, 'gamma': 13.926721807580567, 'reg_alpha': 54.571715572949124, 'reg_lambda': 0.15264213142296756, 'min_child_weight': 14, 'colsample_bytree': 0.7663794262768111, 'colsample_bylevel': 0.6522365536212138, 'colsample_bynode': 0.756726730423994, 'learning_rate': 0.09698653117594778, 'max_delta_step': 7.256893367215822, 'subsample': 0.9721712130100555, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 7.138280408792858, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 5, 'max_cat_threshold': 35, 'max_leaves': 160, 'validate_parameters': True, 'max_bin': 232, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
140      




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 2003.49it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1499.93it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 886.89it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 906.95it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 921.62it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 822.37it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 17, 'gamma': 2.7443347797869775, 'reg_alpha': 96.23690887875082, 'reg_lambda': 6.294283269454221, 'min_child_weight': 15, 'colsample_bytree': 0.48424614647132286, 'colsample_bylevel': 0.8927764864535322, 'colsample_bynode': 0.5699658160278829, 'learning_rate': 0.19216404001291204, 'max_delta_step': 5.818118554198143, 'subsample': 0.6217048619057225, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 7.611580526691185, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 9, 'max_cat_threshold': 6, 'max_leaves': 252, 'validate_parameters': True, 'max_bin': 208, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
530       




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 1000.91it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1000.47it/s]
[A
Downloading artifacts:  57%|#################################7                         | 4/7 [00:00<00:00, 1000.19it/s]
[A
Downloading artifacts:  71%|##########################################1                | 5/7 [00:00<00:00, 1000.17it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 857.29it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 875.25it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 9, 'gamma': 11.68259904878954, 'reg_alpha': 70.21686414097826, 'reg_lambda': 7.333279413093128, 'min_child_weight': 6, 'colsample_bytree': 0.5980361700072745, 'colsample_bylevel': 0.42928221219450063, 'colsample_bynode': 0.5036835460711266, 'learning_rate': 0.19045653308276939, 'max_delta_step': 1.3515858397573566, 'subsample': 0.4064188876311967, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 6.843380726127015, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 26, 'max_cat_threshold': 34, 'max_leaves': 48, 'validate_parameters': True, 'max_bin': 64, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
610         




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 1000.31it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1000.23it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 911.21it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 927.70it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 939.02it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 947.22it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 24, 'gamma': 14.791857638700664, 'reg_alpha': 113.60292017174356, 'reg_lambda': 4.200690025333742, 'min_child_weight': 9, 'colsample_bytree': 0.6062837639446688, 'colsample_bylevel': 0.5614328812529829, 'colsample_bynode': 0.6614566195664692, 'learning_rate': 0.04822149726789656, 'max_delta_step': 3.58571763250888, 'subsample': 0.6672135351041404, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 7.572915053632915, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 21, 'max_cat_threshold': 20, 'max_leaves': 236, 'validate_parameters': True, 'max_bin': 176, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
870       




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|########4                                                  | 1/7 [00:00<00:00, 1000.79it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 798.31it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 855.40it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 887.40it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 907.78it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 922.06it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 932.39it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 21, 'gamma': 12.07934295496145, 'reg_alpha': 49.80995424043088, 'reg_lambda': 7.106801922263984, 'min_child_weight': 22, 'colsample_bytree': 0.4114712324431084, 'colsample_bylevel': 0.5074442365186821, 'colsample_bynode': 0.7741273949496197, 'learning_rate': 0.1564188512594234, 'max_delta_step': 7.83107885852388, 'subsample': 0.5248647429146472, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 3.210143358736889, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 25, 'max_cat_threshold': 13, 'max_leaves': 216, 'validate_parameters': True, 'max_bin': 56, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
670          




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|########4                                                  | 1/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  29%|################8                                          | 2/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  43%|#########################7                                  | 3/7 [00:00<00:00, 856.10it/s]
[A
Downloading artifacts:  57%|##################################2                         | 4/7 [00:00<00:00, 725.91it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 767.99it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 798.86it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 735.94it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 8, 'gamma': 13.03988525656461, 'reg_alpha': 7.3964262542294446, 'reg_lambda': 7.349689167615615, 'min_child_weight': 1, 'colsample_bytree': 0.6229575241374989, 'colsample_bylevel': 0.5282952213008865, 'colsample_bynode': 0.49039405438823735, 'learning_rate': 0.04401470213982398, 'max_delta_step': 8.708326093470784, 'subsample': 0.9121224164435766, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 6.9497717496821165, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 43, 'max_cat_threshold': 32, 'max_leaves': 196, 'validate_parameters': True, 'max_bin': 240, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
780      




Downloading artifacts:   0%|                                                                     | 0/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  14%|#########8                                                           | 1/7 [00:00<?, ?it/s]
[A
Downloading artifacts:  29%|#################1                                          | 2/7 [00:00<00:00, 998.41it/s]
[A
Downloading artifacts:  43%|#########################2                                 | 3/7 [00:00<00:00, 1000.07it/s]
[A
Downloading artifacts:  57%|#################################7                         | 4/7 [00:00<00:00, 1000.19it/s]
[A
Downloading artifacts:  71%|##########################################8                 | 5/7 [00:00<00:00, 828.88it/s]
[A
Downloading artifacts:  86%|###################################################4        | 6/7 [00:00<00:00, 853.28it/s]
[A
Downloading artifacts: 100%|############################################################| 7/7 [00:00<00:00, 871.56it/s]
[A
Downl

🔧 Hiperparâmetros utilizados:                                                                                         
{'max_depth': 10, 'gamma': 13.861334608067802, 'reg_alpha': 0.48486459670456394, 'reg_lambda': 8.731594991196527, 'min_child_weight': 19, 'colsample_bytree': 0.766425768810145, 'colsample_bylevel': 0.726583315980482, 'colsample_bynode': 0.9423716305938303, 'learning_rate': 0.02136707519851313, 'max_delta_step': 5.134966209012673, 'subsample': 0.7136050715155553, 'sampling_method': 'gradient_based', 'tree_method': 'gpu_hist', 'device': 'cuda', 'enable_categorical': True, 'scale_pos_weight': 3.3784463152221154, 'eval_metric': ['aucpr', 'auc', 'logloss', 'error'], 'objective': 'binary:logistic', 'seed': 33, 'max_cat_to_onehot': 28, 'max_cat_threshold': 19, 'max_leaves': 128, 'validate_parameters': True, 'max_bin': 176, 'updater': 'grow_gpu_hist'}
🔢 n_estimators:                                                                                                       
500     






## Treinamento final do modelo

In [None]:
# Final XGBoost configuration assembled from the tuned `best_hyperparams`.
# Every tuned value is cast to a plain int/float. Entries read with `[...]`
# are mandatory; entries read through `.get` fall back to the default
# written here when the key is missing.
best_params = dict(
    # Tree depth and ensemble size
    max_depth=int(best_hyperparams['max_depth']),
    n_estimators=int(best_hyperparams['n_estimators']),
    # Regularisation terms
    reg_lambda=float(best_hyperparams['reg_lambda']),
    reg_alpha=float(best_hyperparams['reg_alpha']),
    gamma=float(best_hyperparams['gamma']),
    min_child_weight=int(best_hyperparams['min_child_weight']),
    # Column sampling
    colsample_bytree=float(best_hyperparams['colsample_bytree']),
    colsample_bylevel=float(best_hyperparams['colsample_bylevel']),
    colsample_bynode=float(best_hyperparams['colsample_bynode']),
    # Step size and row sampling
    learning_rate=float(best_hyperparams['learning_rate']),
    max_delta_step=float(best_hyperparams.get('max_delta_step', 0.0)),
    subsample=float(best_hyperparams['subsample']),
    sampling_method=best_hyperparams.get('sampling_method', 'gradient_based'),
    tree_method=best_hyperparams.get('tree_method', 'hist'),
    # Class-imbalance weight
    scale_pos_weight=float(best_hyperparams['scale_pos_weight']),
    # Categorical handling and histogram settings
    max_cat_to_onehot=int(best_hyperparams.get('max_cat_to_onehot', 10)),
    max_cat_threshold=int(best_hyperparams.get('max_cat_threshold', 20)),
    max_leaves=int(best_hyperparams.get('max_leaves', 256)),
    max_bin=int(best_hyperparams.get('max_bin', 256)),
    updater=best_hyperparams.get('updater', 'grow_gpu_hist'),
    # Fixed settings (not taken from the tuning result)
    objective='binary:logistic',
    eval_metric=["aucpr", "auc"],
    enable_categorical=True,
    validate_parameters=True,
    # Reproducibility, device and logging verbosity
    seed=int(best_hyperparams.get('seed', 33)),
    device=best_hyperparams.get('device', 'cuda'),
    verbosity=1,
)


In [None]:
mlflow.xgboost.autolog()
with mlflow.start_run(
    experiment_id=experiment_id,
    run_name='Treinamento e avaliação XGBoost',
    description='Treinamento com melhores hiperparâmetros e avaliação do modelo final',
    tags={"Tipo": "Classificação", "Modelo": "XGBoost", "Etapa": "Treinamento final"}):

    # Local import: keeps this cell runnable without relying on another cell for the name.
    from mlflow.models import infer_signature

    # Record the final parameter set on the run.
    mlflow.log_params(best_params)

    # Train with the tuned number of rounds; early stopping watches the validation set.
    num_boost_round = int(best_hyperparams['n_estimators'])
    model_class = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtest_valid, 'validation')],
        early_stopping_rounds=20,
        verbose_eval=False)

    # Test-set predictions; probabilities are turned into labels at a 0.5 threshold.
    # NOTE(review): the XGBoost docs state that the native `Booster.predict` uses
    # every trained round, not `best_iteration`, after early stopping — confirm
    # whether `iteration_range=(0, model_class.best_iteration + 1)` is wanted here.
    y_pred_proba = model_class.predict(dtest)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    # Performance metrics on the test set.
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_pred_proba),
        "prauc": average_precision_score(y_test, y_pred_proba),
        "mcc": matthews_corrcoef(y_test, y_pred),
        "log_loss": log_loss(y_test, y_pred_proba),
        "brier_score": brier_score_loss(y_test, y_pred_proba),
        "cohen_kappa": cohen_kappa_score(y_test, y_pred)
    }

    # Log each metric on the run.
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Plots and artifacts. Each plot uses an explicit figure/axes pair so that the
    # figure that is saved and closed is exactly the one that was drawn on.

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", ax=ax)
    ax.set_title('Matriz de Confusão')
    ax.set_xlabel('Predito')
    ax.set_ylabel('Real')
    fig.savefig('confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')
    plt.close(fig)

    # Feature importance. `ax` is passed explicitly: without it `plot_importance`
    # opens its own figure, which ignores the requested size and leaves the
    # figure created here open.
    fig, ax = plt.subplots(figsize=(10, 7))
    xgb.plot_importance(model_class, max_num_features=20, ax=ax)
    ax.set_title('Importância das Features')
    fig.savefig('feature_importance.png')
    mlflow.log_artifact('feature_importance.png')
    plt.close(fig)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, linestyle='--', label='Curva ROC (AUC = {:.3f})'.format(metrics["auc"]))
    ax.set_title('Curva ROC')
    ax.set_xlabel('Taxa de Falsos Positivos')
    ax.set_ylabel('Taxa de Verdadeiros Positivos')
    ax.legend()
    fig.savefig('roc_curve.png')
    mlflow.log_artifact('roc_curve.png')
    plt.close(fig)

    # Precision-recall curve
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall_vals, precision_vals, marker='.', label='PRAUC = {:.3f}'.format(metrics["prauc"]))
    ax.set_title('Curva de Precisão-Recall')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precisão')
    ax.legend()
    fig.savefig('precision_recall_curve.png')
    mlflow.log_artifact('precision_recall_curve.png')
    plt.close(fig)

    # Training-set predictions serve as the output example for the model signature.
    y_train_proba = model_class.predict(dtrain)
    signature = infer_signature(X_train, y_train_proba)

    # Log the model together with its signature and a small input example.
    mlflow.xgboost.log_model(
        xgb_model=model_class,
        name="modelo_xgboost_final",
        signature=signature,
        input_example=X_train[:5],
        )

    print("Treinamento e logging concluídos.")

### Alternativa 2

In [None]:
# NOTE(review): the three runs below are opened with hard-coded `run_id` values,
# so they depend on those runs already existing in the tracking store — confirm
# this is intended before re-running on a different store.
with mlflow.start_run(experiment_id=experiment_id, run_name='Pre-processamento',
                      run_id='ba596ce0c0ff43228f85f3ef932a8310',
                      nested=True,
                      description = 'Garantir o input correto dos modelos',
                      tags = {"Pre-processamento": "preparação para treinamento", "objetivo": "garantir o input correto dos dados", "Versão da etapa": "1.0"}):

    # Work on a reproducible 30% sample of the selected columns.
    dft = df[chosen_columns].sample(frac=0.3, random_state=13)

    # Columns that go through one-hot encoding
    list_dummies = ['nome_empresas','codigo_tipo_linha','descricao_origem','descricao_destino','pais_origem','pais_destino','continente_origem',
                 'continente_destino','cidade_origem','cidade_destino','uf_origem','uf_destino','mes_partida',
                 'dia_semana_chegada']

    # Log parameters
    mlflow.log_param("Colunas escolhidas", chosen_columns)
    mlflow.log_param("Index", 'num_cpf')
    mlflow.log_param("Colunas para one-hot encoding", list_dummies)

    # Log metrics
    mlflow.log_metric("Quantidade de colunas", len(chosen_columns))
    mlflow.log_metric("Quantidade de colunas dummies", len(list_dummies))
    mlflow.log_metric("Quantidade de colunas não dummies", len(chosen_columns) - len(list_dummies) - 1)

    ### One-hot encoding
    with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name='One hot encoding', run_id='8836439277bc460e8767f9e6b7311883',
                      description = 'Transformação das colunas categoricas em númericas',
                      tags = {"One hot encoding": "Transformar categorica em númerica", "objetivo": "garantir o input correto dos dados", "Versão da etapa": "1.0"}):
        # Encode each column separately and collect the pieces; they are joined
        # once after the loop instead of re-copying a growing frame per column.
        encoded_frames = []
        for column in list_dummies:
            encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
            encoded = encoder.fit_transform(dft[[column]])

            # Log one parameter per processed column
            mlflow.log_param(f"Coluna_{column.lower()}", column.lower())

            enc_df = pd.DataFrame(encoded.toarray(),
                                  columns=encoder.get_feature_names_out([column]))
            encoded_frames.append(enc_df)

        final_data = pd.concat(encoded_frames, axis=1)

        # `.values` attaches the target by position, since the encoded frame has a
        # fresh 0..n-1 index while `dft` keeps the sampled row labels.
        final_data['status_do_voo'] = dft['status_do_voo'].values

        dt_ax = final_data.drop(columns=["status_do_voo"])
        dt_ay = final_data[['status_do_voo']].copy()

        # Encode the target as integers. LabelEncoder numbers the sorted class
        # labels, so with the labels 'Atrasado'/'Pontual': Atrasado = 0, Pontual = 1.
        # The 1-D column is passed because LabelEncoder expects a 1-D target.
        label_encoder = LabelEncoder()
        dt_ay_enc = label_encoder.fit_transform(dt_ay['status_do_voo'])
        dt_ay_df = pd.DataFrame(dt_ay_enc, columns=dt_ay.columns)

        # Build the old-name -> clean-name mapping and rename the feature columns.
        column_names = dt_ax.columns.tolist()
        name_map = clean_column_names(column_names)
        dt_ax.rename(columns=name_map, inplace=True)

    ### Scaling / train-test split / SMOTE
    with mlflow.start_run(experiment_id=experiment_id, run_name='Normalização e Smote', nested=True,run_id='a058314be1ff4283b8fafd1168611eba',
                      description = 'Implementação da etapa de normalização e SMOTE dos dados. Essas etapas são essenciais para evitar overfiting e underfitting',
                      tags = {"Normalização e SMOTE": "Normalização em range de 0 a 1 e criação de dados sinteticos para balencear", "objetivo": "garantir qualidade no correto dos dados", "Versão da etapa": "1.0"}):
        # Standardise the features with StandardScaler.
        # NOTE(review): the scaler is fitted on all rows before the split, so the
        # test rows influence the scaling statistics — confirm this is acceptable.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(dt_ax)
        X_scaled_df = pd.DataFrame(X_scaled, columns=dt_ax.columns)

        # Split into train (85%) and test (15%)
        X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, dt_ay_df, random_state=13, test_size=0.15)

        # Log the class distribution before SMOTE.
        # NOTE(review): this logs the test labels, while SMOTE below is applied to
        # the training split — confirm whether `y_train` was intended here.
        log_class_distribution(y_test, 'original')

        # Apply SMOTE to the training split only
        smote = SMOTE(random_state=13)
        X_smote_a, y_smote_a = smote.fit_resample(X_train, y_train)

        # Give the test split a fresh 0..n-1 index
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # Log the class distribution after SMOTE
        log_class_distribution(y_smote_a, 'SMOTE')

In [None]:
def unified_hyper_tuning(space):
    """
    Hyperopt objective: cross-validate, fit and log one XGBoost classifier with MLflow.

    Opens a nested MLflow run, scores the candidate with 5-fold stratified
    cross-validation (ROC AUC) on the notebook-level ``X_train``/``y_train``,
    refits it on the whole training split, saves precision-recall and ROC plots
    computed on ``X_test``/``y_test`` as run artifacts, logs the fitted model and
    runs ``mlflow.evaluate`` on the test split.

    Args:
        space (dict): Hyperparameters for this trial. The keys read are
            ``max_depth``, ``learning_rate``, ``reg_alpha``, ``reg_lambda``,
            ``min_child_weight``, ``subsample``, ``colsample_bytree``,
            ``gamma``, ``objective`` and ``seed``.

    Returns:
        dict: ``{'loss': -mean_auc, 'status': STATUS_OK}``, where ``mean_auc``
        is the mean cross-validated ROC AUC.
    """
    mlflow.xgboost.autolog()
    with mlflow.start_run(experiment_id=experiment_id, run_name='Unified Model Training and Tuning', nested=True):
        # Build the classifier from the sampled hyperparameters
        clf = xgb.XGBClassifier(max_depth = space['max_depth'],
                                  learning_rate = space['learning_rate'],
                                  reg_alpha = space['reg_alpha'],
                                  reg_lambda = space['reg_lambda'],
                                  min_child_weight = space['min_child_weight'],
                                  subsample = space['subsample'],
                                  colsample_bytree = space['colsample_bytree'],
                                  gamma = space['gamma'],
                                  objective = space['objective'],
                                  seed = space['seed'])

        # StratifiedKFold keeps the class proportions in every fold
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Cross-validated ROC AUC on the training split
        auc_scores = cross_val_score(clf, X_train, y_train, cv=skf, scoring='roc_auc')
        mean_auc = auc_scores.mean()

        # Log the mean AUC
        mlflow.log_metric('mean_auc', mean_auc)

        # Refit on the whole training split
        model = clf.fit(X_train, y_train)

        # Positive-class probabilities on the test split, computed once and
        # reused for both curves
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

        # Plot and save the precision-recall curve
        plt.figure(figsize=(8, 6))
        plt.plot(recall, precision, marker='.')
        plt.title('Curva de Precisão-Recall')
        plt.xlabel('Recall')
        plt.ylabel('Precisão')
        plt.savefig('precision_recall_curve.png')
        plt.close()

        # Plot and save the ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, linestyle='--')
        plt.title('Curva ROC')
        plt.xlabel('Taxa de Falso Positivo')
        plt.ylabel('Taxa de Verdadeiro Positivo')
        plt.savefig('roc_curve.png')
        plt.close()

        # Log the plots as artifacts
        mlflow.log_artifact('precision_recall_curve.png')
        mlflow.log_artifact('roc_curve.png')

        # Model signature inferred from test inputs and predicted labels
        signature = infer_signature(X_test, model.predict(X_test))

        # NOTE(review): the same fitted model is logged under two artifact names;
        # only "model_xgb" is used below — confirm "modelo_xgboost" is still needed.
        mlflow.xgboost.log_model(model, "modelo_xgboost", signature=signature)
        mlflow.xgboost.log_model(model, "model_xgb", signature=signature)
        model_uri = mlflow.get_artifact_uri("model_xgb")

        # Evaluation frame: test features plus the target column
        eval_data = pd.DataFrame(X_test, columns=dt_ax.columns)
        eval_data['atraso30_m3'] = y_test.reset_index(drop=True)

        mlflow.evaluate(model_uri,
                        eval_data,
                        targets="atraso30_m3",
                        model_type="classifier",
                        evaluators=["default"])

        # Hyperopt minimises the loss, so the negated mean AUC is returned
        return {'loss': -mean_auc, 'status': STATUS_OK}

# Hyperopt search space consumed by `unified_hyper_tuning`.
# `objective` and `seed` are constants; every other entry is sampled per trial.
space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    gamma=hp.quniform('gamma', 0.5, 1, 0.05),
    colsample_bytree=hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    subsample=hp.quniform('subsample', 0.5, 1, 0.05),
    min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
    objective='binary:logistic',
    seed=123,  # fixed seed for deterministic training
)

In [None]:
# Hyperparameter tuning stage
with mlflow.start_run(experiment_id=experiment_id, run_name='Hipertunnig', nested=True,
                      description = 'Busca pelos melhores parametros. Os modelos testados são armazenados, mesmo que não tenha os melhores parametros.',
                      tags = {"Hipertunnig": "Melhores parametros", "objetivo": "garantir os melhores parametros para o modelo", "Versão da etapa": "1.0"}):
    # Run the optimisation; `trials` keeps the history of evaluated points
    trials = Trials()
    best_raw = fmin(fn=unified_hyper_tuning,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=5,
                    trials=trials)

    # `fmin` returns only the raw sampled values, keyed by label. Resolve them
    # through the search space first, so the logged parameters are the full
    # configuration (including the constant entries and the `scope.int` cast).
    best_hyperparams = space_eval(space, best_raw)
    mlflow.log_params(best_hyperparams)
    print("Melhores hiperparâmetros:", best_hyperparams)

# Catboost

## Criando ou carregando o experimento

In [None]:
# Name of the MLflow experiment used by the CatBoost section
experiment_name = "CatBoost MLflow Aviação"

# Look the experiment up by name
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is not None:
    # Already registered: only report it
    print(f"O experimento '{experiment_name}' já existe.")
else:
    # Not found: `set_experiment` is called with the new name
    mlflow.set_experiment(experiment_name)
    print(f"O experimento '{experiment_name}' foi criado.")

In [None]:
# Fetch the experiment by name
experiment = mlflow.get_experiment_by_name(experiment_name)

# Experiment id, kept in `experiment_id` and printed for reference
experiment_id = experiment.experiment_id
print(f"O experimento id é:'{experiment_id}'")

## Pré-processamento

In [None]:
# Pre-processing pipeline: one parent MLflow run with one nested run per step.
with mlflow.start_run(experiment_id=experiment_id,
                      run_name="Pipeline de Pré-processamento CatBoost",
                      description="Pipeline completo para preparação de dados históricos de voos",
                      tags={"Etapa": "Pipeline de Pre-processamento", "versão": "1.0"}):

    # Step 1: load the treated dataset
    with mlflow.start_run(experiment_id=experiment_id, run_name="Carregando dataset tratado", nested=True):
        file_path = 'df_treinamento_2022_2023.csv'
        df = pd.read_csv(file_path)

    # Step 2: drop the columns that are not needed
    with mlflow.start_run(experiment_id=experiment_id, run_name="Exclusão de Colunas", nested=True):
        dropped_columns = ['codigo_di', 'codigo_tipo_linha']
        df = df.drop(columns=dropped_columns)
        mlflow.log_param("colunas_excluidas", dropped_columns)

    # Step 3: identify the categorical (object-dtype) feature columns, target excluded
    with mlflow.start_run(experiment_id=experiment_id, run_name="Identificação de Categóricas", nested=True):
        list_dummies = df.drop(columns='status_do_voo').select_dtypes(include=['object']).columns.tolist()
        mlflow.log_param("colunas_categoricas", list_dummies)

    # Step 4: separate features from the target and binarize the target
    with mlflow.start_run(experiment_id=experiment_id, run_name="Seleção de Features", nested=True):
        target_mapping = {'Pontual': 0, 'Atrasado': 1}
        dt_ax = df.drop(columns=["status_do_voo"])
        dt_ay = df['status_do_voo'].map(target_mapping)
        mlflow.log_param("target_mapping", target_mapping)
        mlflow.log_param("n_features", dt_ax.shape[1])

    # Step 5: label-encode the categorical columns, keeping one fitted encoder per column
    with mlflow.start_run(experiment_id=experiment_id, run_name="Codificação de Categóricas", nested=True):
        label_encoders = {}
        for col in list_dummies:
            le = LabelEncoder()
            dt_ax[col] = le.fit_transform(dt_ax[col])
            label_encoders[col] = le
        mlflow.log_param("n_label_encoded_columns", len(list_dummies))

    # Step 6: split into test (14.2% of all rows), then validation (16.5% of the remainder) and train
    with mlflow.start_run(experiment_id=experiment_id, run_name="Segmentação dos Dados", nested=True):
        X_train, X_test, y_train, y_test = train_test_split(dt_ax, dt_ay, random_state=33, test_size=0.142)
        X_train_valid, X_test_valid, y_train_valid, y_test_valid = train_test_split(
            X_train, y_train, random_state=33, test_size=0.165)

        mlflow.log_param("train_size", len(X_train_valid))
        mlflow.log_param("validation_size", len(X_test_valid))
        mlflow.log_param("test_size", len(X_test))

    # Step 7: map the encoded categorical columns back to their original values
    with mlflow.start_run(experiment_id=experiment_id, run_name="Reversão de Colunas Categóricas", nested=True):
        def revert_to_category(data, label_encoders, list_dummies):
            """Return a copy of `data` with the label-encoded columns decoded back to their original values.

            Args:
                data (pd.DataFrame): frame whose categorical columns hold encoder codes.
                label_encoders (dict): fitted LabelEncoder per column name.
                list_dummies (list): names of the categorical columns to decode.

            Returns:
                pd.DataFrame: decoded copy; the input frame is left untouched.
            """
            # Work on a copy: the frames come out of train_test_split and must not be written in place
            data = data.copy()
            for col in list_dummies:
                if col in data.columns:
                    data[col] = label_encoders[col].inverse_transform(data[col])
            return data

        # X_train is decoded as well so that every split shares the same categorical
        # representation; leaving it encoded would mismatch the decoded X_test.
        X_train = revert_to_category(X_train, label_encoders, list_dummies)
        X_train_valid = revert_to_category(X_train_valid, label_encoders, list_dummies)
        X_test_valid = revert_to_category(X_test_valid, label_encoders, list_dummies)
        X_test = revert_to_category(X_test, label_encoders, list_dummies)

        mlflow.log_param("categorical_columns_finalized", list_dummies)

    # Step 8: pipeline summary, logged on the parent run
    mlflow.log_param("pipeline_status", "Concluído")

## Treinamento sem hipertuning

### Desenvolvimento

In [None]:


# Baseline run: CatBoost with default hyperparameters (no tuning), evaluated on the test split.
# NOTE(review): the run description below still mentions "melhores hiperparâmetros";
# it is left untouched here, but this cell trains with CatBoost defaults.
with mlflow.start_run(
    experiment_id=experiment_id,
    run_name='Treinamento e avaliação CatBoost',
    nested=True,
    description='Treinamento com melhores hiperparâmetros e avaliação do modelo final',
    tags={"Tipo": "Classificação", "Modelo": "CatBoost", "Etapa": "Treinamento final"}):

    # Categorical columns are the object-dtype columns of the training frame
    cat_features = list(X_train_valid.select_dtypes(include=['object']))

    # Log only what is actually passed to the model. `best_params` is produced by the
    # tuning cells further down, so it does not exist yet on a fresh top-to-bottom run
    # and was never given to this model anyway.
    model_params = {"eval_metric": "AUC"}
    mlflow.log_params(model_params)
    mlflow.log_param("n_cat_features", len(cat_features))

    model = CatBoostClassifier(cat_features=cat_features, **model_params)

    # Fit on the training split and monitor the validation split
    model.fit(X_train_valid, y_train_valid,
              eval_set=(X_test_valid, y_test_valid),
              cat_features=cat_features,
              verbose=100,
              plot=True)

    # Predictions on the held-out test split (0.5 decision threshold)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba >= 0.5).astype(int)

    # Confusion-matrix based rates, guarded against empty classes
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
    fnr = FN / (FN + TP) if (FN + TP) > 0 else 0
    g_mean = np.sqrt(sensitivity * specificity)

    # Metric keys avoid parentheses (MLflow rejects them) and match the keys used
    # by the final-training cell so runs can be compared side by side.
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall_sensitivity": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "specificity": specificity,
        "auc": roc_auc_score(y_test, y_pred_proba),
        "prauc": average_precision_score(y_test, y_pred_proba),
        "mcc": matthews_corrcoef(y_test, y_pred),
        "log_loss": log_loss(y_test, y_pred_proba),
        "brier_score": brier_score_loss(y_test, y_pred_proba),
        "cohen_kappa": cohen_kappa_score(y_test, y_pred),
        "false_positive_rate_FPR": fpr,
        "false_negative_rate_FNR": fnr,
        "geometric_mean_GMean": g_mean
    }

    # Log each metric individually
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
    plt.title('Matriz de Confusão')
    plt.xlabel('Predito')
    plt.ylabel('Real')
    plt.savefig('confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')
    plt.close()

    # ROC curve (separate names so the scalar FPR above is not overwritten)
    fpr_vals, tpr_vals, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr_vals, tpr_vals, linestyle='--', label='Curva ROC (AUC = {:.3f})'.format(metrics["auc"]))
    plt.title('Curva ROC')
    plt.xlabel('Taxa de Falsos Positivos')
    plt.ylabel('Taxa de Verdadeiros Positivos')
    plt.legend()
    plt.savefig('roc_curve.png')
    mlflow.log_artifact('roc_curve.png')
    plt.close()

    # Precision-recall curve
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_vals, precision_vals, marker='.', label='PRAUC = {:.3f}'.format(metrics["prauc"]))
    plt.title('Curva de Precisão-Recall')
    plt.xlabel('Recall')
    plt.ylabel('Precisão')
    plt.legend()
    plt.savefig('precision_recall_curve.png')
    mlflow.log_artifact('precision_recall_curve.png')
    plt.close()

    # SHAP importance: mean absolute SHAP value per feature, sorted ascending for barh
    explainer = shap.Explainer(model)
    shap_values = explainer(X_test)
    shap_importance = np.abs(shap_values.values).mean(axis=0)
    sorted_idx = shap_importance.argsort()

    fig = plt.figure(figsize=(7, 7))
    plt.barh(range(len(sorted_idx)), shap_importance[sorted_idx], align='center')
    plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
    plt.title('SHAP Importance')
    plt.tight_layout()
    plt.savefig('shap_importance.png')
    mlflow.log_artifact('shap_importance.png')
    plt.close()

    # SHAP beeswarm (top 15 features)
    plt.figure(figsize=(12, 8))
    shap.plots.beeswarm(shap_values, max_display=15, show=False)
    plt.title('SHAP Beeswarm')
    plt.tight_layout()
    plt.savefig('shap_beeswarm.png')
    mlflow.log_artifact('shap_beeswarm.png')
    plt.close()

    # Register the model in MLflow with a signature inferred from test inputs/outputs
    signature = infer_signature(X_test, y_pred_proba)
    mlflow.catboost.log_model(
        model,
        artifact_path="model_catboost",
        signature=signature
    )

    print("Treinamento, logging e gráfico SHAP concluídos.")


## Hipertuning CatBoost

### Hipertuning desenvolvimento

In [None]:
# Hyperopt search space for the CatBoost hyperparameters.
# The last three entries are fixed values, not sampled.
space_catboost = {
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 50)),   # Number of trees: 100..1000 in steps of 50
    'learning_rate': hp.loguniform('learning_rate', -3, -0.3),           # Learning rate, log-uniform on [exp(-3), exp(-0.3)] (~0.05 to ~0.74)
    'depth': scope.int(hp.quniform('depth', 4, 12, 1)),                  # Tree depth: 4..12 (overfitting control)
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -3, 2),                  # L2 regularization, log-uniform on [exp(-3), exp(2)] (~0.05 to ~7.4)
    'random_strength': hp.uniform('random_strength', 0, 2),              # Randomness applied when scoring splits
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),      # Temperature for data sampling
    'scale_pos_weight': hp.uniform('scale_pos_weight', 4, 8),            # Weight for the imbalanced classes
    'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 10, 100, 10)),  # Minimum samples per leaf: 10..100 in steps of 10
    'max_bin': scope.int(hp.quniform('max_bin', 128, 256, 32)),          # Maximum number of bins: 128..256 in steps of 32
    'grow_policy': hp.choice('grow_policy', ['Depthwise', 'Lossguide']), # Tree growing policy (fmin reports the chosen index)
    'eval_metric': 'AUC',                                                # Evaluation metric
    'task_type': 'GPU',                                                  # Train on GPU
    'random_seed': 42                                                    # Reproducibility
}


# Objective function for Hyperopt
def objective(params):
    """Train one CatBoost candidate and return its negated validation AUC.

    The model is fitted on the training split (X_train_valid) with early stopping
    on the validation split (X_test_valid), and the validation AUC is what Hyperopt
    optimizes. The held-out test split is deliberately not touched here, so it is
    not used to pick hyperparameters.

    Args:
        params (dict): CatBoost hyperparameters sampled by Hyperopt.

    Returns:
        dict: {'loss': -validation AUC, 'status': STATUS_OK}.
    """
    # Copy so the dict handed over by Hyperopt is not mutated
    model_params = dict(params)
    model_params['loss_function'] = 'Logloss'  # Binary classification objective
    model_params['verbose'] = False            # Keep training output quiet

    # Categorical columns are derived here (same rule as objective_mlflow) instead
    # of relying on a global left behind by another cell.
    cat_cols = list(X_train_valid.select_dtypes(include=['object']))

    # Model initialization
    model = CatBoostClassifier(**model_params, cat_features=cat_cols)

    # Training with early stopping on the validation split
    model.fit(
        X_train_valid, y_train_valid,
        eval_set=(X_test_valid, y_test_valid),
        early_stopping_rounds=50,
        verbose=False
    )

    # Validation predictions and metric (local name avoids shadowing sklearn's `auc`)
    valid_scores = model.predict_proba(X_test_valid)[:, 1]
    valid_auc = roc_auc_score(y_test_valid, valid_scores)

    # Hyperopt minimizes, so return the negated AUC
    return {'loss': -valid_auc, 'status': STATUS_OK}



# Hyperopt search
trials = Trials()
best = fmin(
    fn=objective,                     # Objective function
    space=space_catboost,             # Search space
    algo=tpe.suggest,                 # Search algorithm (TPE)
    max_evals=2,                      # Only 2 evaluations here; the MLflow tuning cell uses 50
    trials=trials,                    # Stores the results of every trial
    rstate=np.random.default_rng(42)  # Reproducibility
)

# Raw result: hp.choice entries (grow_policy) are reported as indices
print("Melhores parâmetros:", best)


# Decode the raw result against the search space instead of re-typing the choice
# list and the fixed entries by hand; space_eval resolves the grow_policy index,
# applies the integer casts and carries over task_type / eval_metric / random_seed.
best_params = {
    **space_eval(space_catboost, best),
    'loss_function': 'Logloss',
    'verbose': False,
}


# Final model: fit on the training split with early stopping on the validation
# split, so the test split below stays untouched until the final evaluation.
final_model = CatBoostClassifier(**best_params, cat_features=cat_features)

final_model.fit(
    X_train_valid, y_train_valid,
    eval_set=(X_test_valid, y_test_valid),
    early_stopping_rounds=50,
    verbose=10,
    plot=True)

# Final evaluation on the held-out test split
final_preds = final_model.predict_proba(X_test)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print(f"AUC do modelo final: {final_auc:.4f}")

### Hipertuning MLflow

In [None]:
def objective_mlflow(params):
    """
    Train and evaluate one CatBoost candidate for Hyperopt, logging it to a nested MLflow run.

    The model is fitted on the training split with early stopping on the validation
    split. Test-set metrics, diagnostic plots, SHAP plots and the fitted model are
    logged to MLflow for inspection. The value returned to Hyperopt is the
    validation AUC, so the held-out test set does not drive hyperparameter selection.

    Args:
        params (dict): CatBoost hyperparameters sampled by Hyperopt.

    Returns:
        dict: {'loss': negative validation AUC, 'status': STATUS_OK}.
    """
    cat_features = list(X_train_valid.select_dtypes(include=['object']))

    # Folder where the figures are written before being logged as artifacts
    output_dir = "mlflow_artifacts"

    def _log_current_figure(filename):
        """Save the active matplotlib figure under output_dir, log it to MLflow and close it."""
        figure_path = os.path.join(output_dir, filename)
        plt.savefig(figure_path)
        mlflow.log_artifact(figure_path)
        plt.close()

    with mlflow.start_run(experiment_id=experiment_id, run_name='CatBoost Training and Tuning', nested=True,
                         tags = {"Hipertunnig": "Catboost"}):
        # Log the hyperparameters exactly as sampled
        mlflow.log_params(params)

        # Copy before adding the fixed settings so Hyperopt's dict is not mutated
        model_params = dict(params)
        model_params['loss_function'] = 'Logloss'  # Binary classification objective
        model_params['verbose'] = False            # Keep training output quiet

        # Model initialization
        model = CatBoostClassifier(
            **model_params,
            cat_features=cat_features,
        )

        # Training with early stopping on the validation split
        model.fit(
            X_train_valid, y_train_valid,
            eval_set=(X_test_valid, y_test_valid),
            early_stopping_rounds=40,
            cat_features=cat_features,
            verbose=False
        )

        # Validation AUC: this is the score Hyperopt optimizes
        valid_auc = roc_auc_score(y_test_valid, model.predict_proba(X_test_valid)[:, 1])
        mlflow.log_metric("valid_auc", valid_auc)

        # Test-set predictions (0.5 decision threshold), used for reporting only
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)

        # Confusion-matrix based rates, guarded against empty classes
        cm = confusion_matrix(y_test, y_pred)
        TN, FP, FN, TP = cm.ravel()
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
        fnr = FN / (FN + TP) if (FN + TP) > 0 else 0
        g_mean = np.sqrt(sensitivity * specificity)

        # Keys match the final-training cell ("recall_sensitivity") so runs are comparable
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall_sensitivity": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
            "specificity": specificity,
            "auc": roc_auc_score(y_test, y_pred_proba),
            "prauc": average_precision_score(y_test, y_pred_proba),
            "mcc": matthews_corrcoef(y_test, y_pred),
            "log_loss": log_loss(y_test, y_pred_proba),
            "brier_score": brier_score_loss(y_test, y_pred_proba),
            "cohen_kappa": cohen_kappa_score(y_test, y_pred),
            "false_positive_rate_FPR": fpr,
            "false_negative_rate_FNR": fnr,
            "geometric_mean_GMean": g_mean
        }

        # Log the test metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Figures and artifacts
        os.makedirs(output_dir, exist_ok=True)

        # Confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
        plt.title('Matriz de Confusão')
        plt.xlabel('Predito')
        plt.ylabel('Real')
        _log_current_figure("confusion_matrix.png")

        # ROC curve
        fpr_vals, tpr_vals, _ = roc_curve(y_test, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr_vals, tpr_vals, linestyle='--', label='Curva ROC (AUC = {:.3f})'.format(metrics["auc"]))
        plt.title('Curva ROC')
        plt.xlabel('Taxa de Falsos Positivos')
        plt.ylabel('Taxa de Verdadeiros Positivos')
        plt.legend()
        _log_current_figure("roc_curve.png")

        # Precision-recall curve
        precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(recall_vals, precision_vals, marker='.', label='PRAUC = {:.3f}'.format(metrics["prauc"]))
        plt.title('Curva de Precisão-Recall')
        plt.xlabel('Recall')
        plt.ylabel('Precisão')
        plt.legend()
        _log_current_figure("precision_recall_curve.png")

        # SHAP importance: mean absolute SHAP value per feature, sorted ascending for barh
        explainer = shap.Explainer(model)
        shap_values = explainer(X_test)
        shap_importance = np.abs(shap_values.values).mean(axis=0)
        sorted_idx = shap_importance.argsort()

        plt.figure(figsize=(7, 7))
        plt.barh(range(len(sorted_idx)), shap_importance[sorted_idx], align='center')
        plt.yticks(range(len(sorted_idx)), X_test.columns[sorted_idx])
        plt.title('SHAP Importance')
        plt.tight_layout()
        _log_current_figure("shap_importance.png")

        # SHAP beeswarm (top 15 features)
        plt.figure(figsize=(12, 8))
        shap.plots.beeswarm(shap_values, max_display=15, show=False)
        plt.title('SHAP Beeswarm')
        plt.tight_layout()
        _log_current_figure("shap_beeswarm.png")

        # Register the model in MLflow; `artifact_path` is the keyword the other
        # training cells of this notebook use for log_model.
        signature = infer_signature(X_test, y_pred_proba)
        mlflow.catboost.log_model(
            model,
            artifact_path="model_catboost",
            signature=signature
        )

        print(metrics)

    # Hyperopt minimizes, so return the negated validation AUC
    return {'loss': -valid_auc, 'status': STATUS_OK}


# Hyperopt search space used by the MLflow tuning run.
# The last three entries are fixed values, not sampled.
space_catboost = {
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 50)),  # Number of trees: 100..1000 in steps of 50
    'learning_rate': hp.loguniform('learning_rate', -3, -0.3),          # Learning rate, log-uniform on [exp(-3), exp(-0.3)] (~0.05 to ~0.74)
    'depth': scope.int(hp.quniform('depth', 4, 12, 1)),                 # Tree depth: 4..12
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -3, 2),                 # L2 regularization, log-uniform on [exp(-3), exp(2)] (~0.05 to ~7.4)
    'random_strength': hp.uniform('random_strength', 0, 2),             # Randomness applied when scoring splits
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),     # Bagging temperature
    'scale_pos_weight': hp.uniform('scale_pos_weight', 4, 8),           # Weight for the imbalanced classes
    'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 10, 100, 10)),  # Minimum samples per leaf: 10..100 in steps of 10
    'max_bin': scope.int(hp.quniform('max_bin', 128, 256, 32)),         # Maximum number of bins: 128..256 in steps of 32
    'grow_policy': hp.choice('grow_policy', ['Depthwise', 'Lossguide']),  # Tree growing policy (fmin reports the chosen index)
    'eval_metric': 'AUC',                                               # Evaluation metric
    'task_type': 'GPU',                                                 # Train on GPU
    'random_seed': 42                                                   # Reproducibility
}

In [None]:
# Hyperparameter tuning step: the parent run groups one nested run per Hyperopt trial
with mlflow.start_run(experiment_id=experiment_id, run_name='Hipertunnig', nested=True,  
                      description = 'Busca pelos melhores parametros. Os modelos testados são armazenados, mesmo que não tenha os melhores parametros. CatBoost',
                      tags = {"Execução do Hipert": "Melhores parametros", "objetivo": "garantir os melhores parametros para o modelo", "Versão da etapa": "1.0"}):

    cat_features = list(X_train_valid.select_dtypes(include=['object']))

    # Hyperopt search
    trials = Trials()
    best = fmin(
        fn=objective_mlflow,              # Objective function
        space=space_catboost,             # Search space
        algo=tpe.suggest,                 # Search algorithm (TPE)
        max_evals=50,                     # Number of evaluations
        trials=trials,                    # Stores the results of every trial
        rstate=np.random.default_rng(42)  # Reproducibility
    )

    # `best` holds raw values (grow_policy is an index into the choice list), so decode
    # it before logging; otherwise the run records 0/1 instead of the policy name.
    # `best` itself is left raw because the next cell indexes the choice list with it.
    best_decoded = space_eval(space_catboost, best)
    mlflow.log_params(best_decoded)
    print("Melhores hiperparâmetros:", best_decoded)

## Treinamento após hipertuning

In [None]:
# Final model: train CatBoost with the tuned hyperparameters and log everything to MLflow
with mlflow.start_run(experiment_id=experiment_id, run_name='Treinamento do melhor modelo modelo CatBoost', nested=True,
                     description='Treinando o CatBoost com os melhores parametros',
                     tags={"Versão do modelo": "1", "Algoritmo": "CatBoost"}):

    # Decode Hyperopt's raw result against the search space: space_eval resolves the
    # grow_policy index, applies the integer casts and carries over the fixed entries
    # (task_type, eval_metric, random_seed), so nothing from the space is re-typed here.
    best_params = {
        **space_eval(space_catboost, best),
        'loss_function': 'Logloss',
        'verbose': False,
    }
    mlflow.log_params(best_params)

    cat_features = list(X_train_valid.select_dtypes(include=['object']))

    # Train the final model with early stopping on the validation split
    final_model = CatBoostClassifier(**best_params, cat_features=cat_features)

    final_model.fit(
        X_train_valid, y_train_valid,
        eval_set=(X_test_valid, y_test_valid),
        early_stopping_rounds=40,
        verbose=10,
        plot=True
    )

    # Predictions on the held-out test split (0.5 decision threshold)
    y_pred_proba = final_model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba >= 0.5).astype(int)

    # Confusion-matrix based rates, guarded against empty classes
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
    fnr = FN / (FN + TP) if (FN + TP) > 0 else 0
    g_mean = np.sqrt(sensitivity * specificity)

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall_sensitivity": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "specificity": specificity,
        "auc": roc_auc_score(y_test, y_pred_proba),
        "prauc": average_precision_score(y_test, y_pred_proba),
        "mcc": matthews_corrcoef(y_test, y_pred),
        "log_loss": log_loss(y_test, y_pred_proba),
        "brier_score": brier_score_loss(y_test, y_pred_proba),
        "cohen_kappa": cohen_kappa_score(y_test, y_pred),
        "false_positive_rate_FPR": fpr,
        "false_negative_rate_FNR": fnr,
        "geometric_mean_GMean": g_mean
    }

    # Log each metric individually
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
    plt.title('Matriz de Confusão')
    plt.xlabel('Predito')
    plt.ylabel('Real')
    plt.savefig('confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')
    plt.close()

    # ROC curve (named like in objective_mlflow so the scalar FPR above is not overwritten)
    fpr_vals, tpr_vals, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr_vals, tpr_vals, linestyle='--', label='Curva ROC (AUC = {:.3f})'.format(metrics["auc"]))
    plt.title('Curva ROC')
    plt.xlabel('Taxa de Falsos Positivos')
    plt.ylabel('Taxa de Verdadeiros Positivos')
    plt.legend()
    plt.savefig('roc_curve.png')
    mlflow.log_artifact('roc_curve.png')
    plt.close()

    # Precision-recall curve
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_vals, precision_vals, marker='.', label='PRAUC = {:.3f}'.format(metrics["prauc"]))
    plt.title('Curva de Precisão-Recall')
    plt.xlabel('Recall')
    plt.ylabel('Precisão')
    plt.legend()
    plt.savefig('precision_recall_curve.png')
    mlflow.log_artifact('precision_recall_curve.png')
    plt.close()

    # SHAP importance: mean absolute SHAP value per feature, sorted ascending for barh
    explainer = shap.Explainer(final_model)
    shap_values = explainer(X_test)
    shap_importance = np.abs(shap_values.values).mean(axis=0)
    sorted_idx = shap_importance.argsort()

    plt.figure(figsize=(7, 7))
    plt.barh(range(len(sorted_idx)), shap_importance[sorted_idx], align='center')
    plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
    plt.title('SHAP Importance')
    plt.tight_layout()
    plt.savefig('shap_importance.png')
    mlflow.log_artifact('shap_importance.png')
    plt.close()

    # SHAP beeswarm (top 15 features)
    plt.figure(figsize=(12, 8))
    shap.plots.beeswarm(shap_values, max_display=15, show=False)
    plt.title('SHAP Beeswarm')
    plt.tight_layout()
    plt.savefig('shap_beeswarm.png')
    mlflow.log_artifact('shap_beeswarm.png')
    plt.close()

    # Register the model in MLflow with a signature inferred from test inputs/outputs
    signature = infer_signature(X_test, y_pred_proba)
    mlflow.catboost.log_model(
        final_model,
        artifact_path="model_catboost_final",
        signature=signature
    )
