### 1.1 Importar Libs

In [19]:
# Importa√ß√£o das bibliotecas necess√°rias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
import json
import pickle
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, 
                           accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, roc_curve)
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform, randint
from joblib import dump, load
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Configura√ß√µes de plotagem
plt.rcParams['figure.figsize'] = [12, 8]
sns.set_style("whitegrid")

print("Bibliotecas importadas com sucesso!")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")
print(f"Scikit-learn: {sklearn.__version__}")
print(f"XGBoost: {xgb.__version__}")

Bibliotecas importadas com sucesso!
Pandas: 2.3.2
NumPy: 2.3.3
Scikit-learn: 1.7.2
XGBoost: 3.1.2


### 1.2 Definir Comit√™ Heterog√™neo (Stacking)

In [20]:
# ======================================================================
# COMIT√ä HETEROG√äNEO (STACKING) - IMPLEMENTA√á√ÉO
# ======================================================================

from sklearn.base import BaseEstimator, ClassifierMixin

class HeterogeneousStackingCommittee(BaseEstimator, ClassifierMixin):
    def __init__(self, 
                 # Par√¢metros Decision Tree (vari√°veis)
                 dt_max_depth=None, dt_min_samples_split=2, dt_min_samples_leaf=1,
                 # Par√¢metros Decision Tree (fixos)
                 dt_criterion='gini', dt_max_features='sqrt',
                 # Par√¢metros Random Forest (vari√°veis)
                 rf_n_estimators=100, rf_max_depth=None,
                 # Par√¢metros Random Forest (fixos)
                 rf_criterion='gini', rf_max_features='sqrt',
                 rf_min_samples_split=16, rf_min_samples_leaf=1,
                 # Par√¢metros XGBoost (vari√°veis)
                 xgb_n_estimators=100, xgb_max_depth=6, xgb_learning_rate=0.3,
                 # Par√¢metros XGBoost (fixos - melhores valores)
                 xgb_subsample=0.6527464393047274, xgb_colsample_bytree=0.6502374440504688,
                 xgb_min_child_weight=1, xgb_gamma=0.13697900853304845,
                 xgb_reg_alpha=0.778102210605468, xgb_reg_lambda=1.518712387770365,
                 # Par√¢metros do meta-estimador
                 meta_C=1.0, meta_max_iter=1000,
                 # Configura√ß√µes gerais
                 cv=5, random_state=None):
        """
        Comit√™ Heterog√™neo usando StackingClassifier com √°rvores de decis√£o
        
        Estimadores base: Decision Tree, Random Forest, XGBoost
        Meta-estimador: Logistic Regression
        """
        # Par√¢metros Decision Tree
        self.dt_max_depth = dt_max_depth
        self.dt_min_samples_split = dt_min_samples_split
        self.dt_min_samples_leaf = dt_min_samples_leaf
        self.dt_criterion = dt_criterion
        self.dt_max_features = dt_max_features
        
        self.rf_n_estimators = rf_n_estimators
        self.rf_max_depth = rf_max_depth
        self.rf_criterion = rf_criterion
        self.rf_max_features = rf_max_features
        self.rf_min_samples_split = rf_min_samples_split
        self.rf_min_samples_leaf = rf_min_samples_leaf
        
        # Par√¢metros XGBoost
        self.xgb_n_estimators = xgb_n_estimators
        self.xgb_max_depth = xgb_max_depth
        self.xgb_learning_rate = xgb_learning_rate
        self.xgb_subsample = xgb_subsample
        self.xgb_colsample_bytree = xgb_colsample_bytree
        self.xgb_min_child_weight = xgb_min_child_weight
        self.xgb_gamma = xgb_gamma
        self.xgb_reg_alpha = xgb_reg_alpha
        self.xgb_reg_lambda = xgb_reg_lambda
        
        # Par√¢metros do meta-estimador
        self.meta_C = meta_C
        self.meta_max_iter = meta_max_iter
        
        # Configura√ß√µes gerais
        self.cv = cv
        self.random_state = random_state
        
    def fit(self, X, y):
        # Definir estimadores base com par√¢metros otimiz√°veis
        base_estimators = [
            ('decision_tree', DecisionTreeClassifier(
                max_depth=self.dt_max_depth,
                min_samples_split=self.dt_min_samples_split,
                min_samples_leaf=self.dt_min_samples_leaf,
                criterion=self.dt_criterion,
                max_features=self.dt_max_features,
                random_state=self.random_state
            )),
            ('random_forest', RandomForestClassifier(
                n_estimators=self.rf_n_estimators,
                max_depth=self.rf_max_depth,
                criterion=self.rf_criterion,
                max_features=self.rf_max_features,
                min_samples_split=self.rf_min_samples_split,
                min_samples_leaf=self.rf_min_samples_leaf,
                random_state=self.random_state
            )),
            ('xgboost', xgb.XGBClassifier(
                n_estimators=self.xgb_n_estimators,
                max_depth=self.xgb_max_depth,
                learning_rate=self.xgb_learning_rate,
                subsample=self.xgb_subsample,
                colsample_bytree=self.xgb_colsample_bytree,
                min_child_weight=self.xgb_min_child_weight,
                gamma=self.xgb_gamma,
                reg_alpha=self.xgb_reg_alpha,
                reg_lambda=self.xgb_reg_lambda,
                objective='binary:logistic',
                eval_metric='logloss',
                random_state=self.random_state,
                verbosity=0
            ))
        ]
        
        # Meta-estimador (Logistic Regression)
        meta_estimator = LogisticRegression(
            C=self.meta_C,
            max_iter=self.meta_max_iter,
            random_state=self.random_state
        )
        
        # Criar o StackingClassifier
        self.stacking_classifier = StackingClassifier(
            estimators=base_estimators,
            final_estimator=meta_estimator,
            cv=self.cv,
            stack_method='predict_proba',  # Usar probabilidades
            n_jobs=1  # Evitar conflitos de paraleliza√ß√£o
        )
        
        # Treinar o ensemble
        self.stacking_classifier.fit(X, y)
        self.classes_ = self.stacking_classifier.classes_
        
        return self
    
    def predict(self, X):
        return self.stacking_classifier.predict(X)
    
    def predict_proba(self, X):
        return self.stacking_classifier.predict_proba(X)
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))

print("Comit√™ Heterog√™neo (Stacking) com Decision Tree, Random Forest e XGBoost definido com sucesso!")

Comit√™ Heterog√™neo (Stacking) com Decision Tree, Random Forest e XGBoost definido com sucesso!


### 1.3 Carregar Datasets

In [9]:
# Carregamento e prepara√ß√£o inicial dos dados
print("Carregando datasets...")

# Carregar datasets pr√©-processados
train_data = pd.read_csv('../dataset_sepsis_prepared.csv')
test_data = pd.read_csv('../dataset_sepsis_test_prepared.csv')

print(f"Dataset de treino: {train_data.shape}")
print(f"Dataset de teste: {test_data.shape}")

# Separar features e target
X_train = train_data.drop('SepsisLabel', axis=1)
y_train = train_data['SepsisLabel']
X_test = test_data.drop('SepsisLabel', axis=1)
y_test = test_data['SepsisLabel']

# Normaliza√ß√£o dos dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nDistribui√ß√£o das classes:")
print("Treino:", y_train.value_counts().to_dict())
print("Teste:", y_test.value_counts().to_dict())

Carregando datasets...


Dataset de treino: (853006, 19)
Dataset de teste: (215171, 19)

Distribui√ß√£o das classes:
Treino: {0.0: 831112, 1.0: 21894}
Teste: {0.0: 209675, 1.0: 5496}

Distribui√ß√£o das classes:
Treino: {0.0: 831112, 1.0: 21894}
Teste: {0.0: 209675, 1.0: 5496}


In [10]:
train_data.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BUN,WBC,Platelets,Gender,Unit1,Unit2,HospAdmTime,ICULOS,Critical_Risk_Window,Time_Category,SepsisLabel
0,8,-1.109794,-0.46018,-0.936182,2.873365,3.044201,2.159974,-0.073601,-0.277186,-1.39358,0.450716,1.0,1.0,0.0,-12.06,9.0,0,0,0.0
1,47,0.569971,-2.43777,0.173477,0.39396,0.650783,0.430943,-0.997324,0.309171,0.245616,-0.275108,1.0,1.0,0.0,-0.05,48.0,0,1,0.0
2,6,0.15003,0.978068,0.016114,-0.983487,-0.55385,-0.198683,-0.073601,-0.310739,0.121368,-0.191266,1.0,1.0,0.0,-0.02,7.0,0,0,0.0
3,39,-0.269912,0.258944,0.289355,0.853109,0.405094,-0.11546,0.752015,0.298591,-0.003965,-3.15292,0.0,0.0,1.0,-75.85,43.0,0,1,0.0
4,127,0.569971,-0.46018,0.007012,0.761279,1.834255,1.05505,-0.520987,0.70214,-0.180545,0.030433,0.0,0.0,1.0,-0.03,128.0,1,2,0.0


## 2. Sampling para Busca de Hiperpar√¢metros

In [24]:
# ======================================================================
# SAMPLING ESTRATIFICADO PARA BUSCA DE HIPERPAR√ÇMETROS
# ======================================================================

print("=== PREPARA√á√ÉO DE AMOSTRA PARA BUSCA DE HIPERPAR√ÇMETROS ===")

# Amostra estratificada do dataset de treino (muito pequena devido √† complexidade)
_, X_sample, _, y_sample = train_test_split(
    X_train_scaled, y_train, 
    test_size=0.01, 
    stratify=y_train,
    random_state=10
)

print(f"Dataset original de treino: {X_train_scaled.shape[0]:,} amostras")
print(f"Amostra para busca de hiperpar√¢metros: {X_sample.shape[0]:,} amostras")
print(f"Redu√ß√£o: {(1 - X_sample.shape[0]/X_train_scaled.shape[0])*100:.1f}%")

print("\nDistribui√ß√£o das classes na amostra:")
print("Amostra:", pd.Series(y_sample).value_counts().to_dict())
print("Original:", y_train.value_counts().to_dict())

=== PREPARA√á√ÉO DE AMOSTRA PARA BUSCA DE HIPERPAR√ÇMETROS ===
Dataset original de treino: 853,006 amostras
Amostra para busca de hiperpar√¢metros: 8,531 amostras
Redu√ß√£o: 99.0%

Distribui√ß√£o das classes na amostra:
Amostra: {0.0: 8312, 1.0: 219}
Original: {0.0: 831112, 1.0: 21894}
Dataset original de treino: 853,006 amostras
Amostra para busca de hiperpar√¢metros: 8,531 amostras
Redu√ß√£o: 99.0%

Distribui√ß√£o das classes na amostra:
Amostra: {0.0: 8312, 1.0: 219}
Original: {0.0: 831112, 1.0: 21894}


## 3.1 Fun√ß√µes Auxiliares

In [12]:
## Fun√ß√£o auxiliar para c√°lculo do G-Mean
def gmean_score(y_true, y_pred):
    """Calcula o G-Mean (Geometric Mean) para problemas bin√°rios"""
    # Sensitivity (recall da classe positiva - sepsis)
    sensitivity = recall_score(y_true, y_pred, pos_label=1, zero_division=0)
    # Specificity (recall da classe negativa - sem sepsis)
    specificity = recall_score(y_true, y_pred, pos_label=0, zero_division=0)
    # G-Mean √© a m√©dia geom√©trica de sensitivity e specificity
    return np.sqrt(sensitivity * specificity)

# Fun√ß√£o para avaliar modelos
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Avalia um modelo treinado e retorna m√©tricas completas"""
    # Predi√ß√µes
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # M√©tricas de treino
    train_metrics = {
        'accuracy': accuracy_score(y_train, y_train_pred),
        'precision': precision_score(y_train, y_train_pred, zero_division=0),
        'recall': recall_score(y_train, y_train_pred, zero_division=0),
        'f1': f1_score(y_train, y_train_pred, zero_division=0),
        'gmean': gmean_score(y_train, y_train_pred)
    }
    
    # M√©tricas de teste
    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'precision': precision_score(y_test, y_test_pred, zero_division=0),
        'recall': recall_score(y_test, y_test_pred, zero_division=0),
        'f1': f1_score(y_test, y_test_pred, zero_division=0),
        'gmean': gmean_score(y_test, y_test_pred)
    }
    
    # AUC-ROC para problemas bin√°rios
    try:
        y_test_proba = model.predict_proba(X_test)[:, 1]  # Probabilidade da classe positiva
        test_metrics['auc_roc'] = roc_auc_score(y_test, y_test_proba)
        
        # AUC-ROC para treino tamb√©m
        y_train_proba = model.predict_proba(X_train)[:, 1]
        train_metrics['auc_roc'] = roc_auc_score(y_train, y_train_proba)
    except Exception as e:
        test_metrics['auc_roc'] = None
        train_metrics['auc_roc'] = None
    
    return train_metrics, test_metrics, y_test_pred

# Fun√ß√£o para plotar hist√≥rico de busca
def plot_search_history(all_search_results, search_results, model_name, metric='mean_test_score'):
    """Plota a evolu√ß√£o dos resultados durante a busca de hiperpar√¢metros"""
        
    plt.figure(figsize=(15, 6))
        
    results_df = pd.DataFrame(search_results.cv_results_)
    # Extrair melhor score de cada busca e seu desvio-padr√£o
    search_scores = []
    search_stds = []
    search_indices = []
    
    for i, search_result in enumerate(all_search_results):
        search_scores.append(search_result['best_score'])
        search_indices.append(i + 1)
        
        # Encontrar o desvio-padr√£o correspondente ao melhor score desta busca
        cv_results = search_result['cv_results']
        best_idx = np.argmax(cv_results['mean_test_score'])
        search_stds.append(cv_results['std_test_score'][best_idx])
    
    # GR√ÅFICO 1: Melhor F1-Score por Busca com Desvio-Padr√£o
    plt.subplot(1, 2, 1)
    plt.plot(search_indices, search_scores, 'b-o', alpha=0.8, markersize=8)
    
    # Adicionar sombra do desvio-padr√£o
    plt.fill_between(search_indices, 
                     np.array(search_scores) - np.array(search_stds),
                     np.array(search_scores) + np.array(search_stds), 
                     color='blue', alpha=0.3)
    
    plt.title(f'{model_name} - Melhor F1-Score por Busca (com Desvio-Padr√£o)')
    plt.xlabel('N√∫mero da Busca')
    plt.ylabel('Melhor F1-Score')
    plt.grid(True, alpha=0.3)
    
    # Destacar a melhor busca
    best_idx = search_scores.index(max(search_scores))
    plt.plot(search_indices[best_idx], search_scores[best_idx], 'ro', markersize=12, 
            markeredgecolor='darkred', markeredgewidth=2,
            label=f'Melhor: {search_scores[best_idx]:.4f} ¬± {search_stds[best_idx]:.4f}')
    plt.legend()
    
    # GR√ÅFICO 2: Itera√ß√µes da melhor busca
    plt.subplot(1, 2, 2)
    
    # Verificar se existe coluna de treino
    if 'mean_train_score' in results_df.columns:
        plt.plot(results_df['mean_train_score'], 'g-o', alpha=0.7, label='Treino')
    
    # Plotar valida√ß√£o
    plt.plot(results_df[metric], 'b-o', alpha=0.7, label='Valida√ß√£o')
    
    # DESTACAR A MELHOR ITERA√á√ÉO
    best_iteration_idx = results_df[metric].idxmax()
    best_iteration_score = results_df[metric].iloc[best_iteration_idx]
    
    plt.plot(best_iteration_idx, best_iteration_score, 'ro', markersize=15, 
            markeredgecolor='darkred', markeredgewidth=2,
            label=f'Melhor itera√ß√£o: #{best_iteration_idx + 1} ({best_iteration_score:.4f})')
    
    plt.title(f'{model_name} - Treino vs Valida√ß√£o (Melhor Busca)')
    plt.xlabel('Itera√ß√£o')
    plt.ylabel('F1-Score')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()


# Fun√ß√£o para executar m√∫ltiplas buscas de hiperpar√¢metros
def multiple_randomized_search(estimator, param_distributions, X, y, cv_strategy, 
                              n_searches=20, n_iter_per_search=80, scoring='f1', 
                              random_state=42, n_jobs=-1, verbose=0):
    """
    Executa m√∫ltiplas buscas RandomizedSearchCV e retorna a melhor configura√ß√£o global
    
    Parameters:
    -----------
    n_searches : int
        N√∫mero de execu√ß√µes do RandomizedSearchCV (default: 20)
    n_iter_per_search : int  
        N√∫mero de itera√ß√µes por execu√ß√£o (default: 80)
    """
    print(f"Executando {n_searches} buscas com {n_iter_per_search} itera√ß√µes cada...")
    
    best_overall_score = -np.inf
    best_overall_params = None
    best_search_result = None
    all_results = []
    
    for search_idx in range(n_searches):
        print(f"\nBusca {search_idx + 1}/{n_searches}...")
        
        # RandomizedSearchCV para esta execu√ß√£o
        search = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=param_distributions,
            n_iter=n_iter_per_search,
            scoring=scoring,
            cv=cv_strategy,
            random_state=None,
            n_jobs=n_jobs,
            return_train_score=True,
            verbose=0  # Menos verbose para m√∫ltiplas execu√ß√µes
        )
        
        search.fit(X, y)
        
        # Armazenar resultados desta busca
        search_results = {
            'search_idx': search_idx,
            'best_score': search.best_score_,
            'best_params': search.best_params_,
            'cv_results': search.cv_results_
        }
        all_results.append(search_results)
        
        # Verificar se esta √© a melhor busca at√© agora
        if search.best_score_ > best_overall_score:
            best_overall_score = search.best_score_
            best_overall_params = search.best_params_
            best_search_result = search
            
        print(f"Melhor score desta busca: {search.best_score_:.4f}")
        print(f"Melhor configura√ß√£o desta busca: {search.best_params_}")
        print(f"Melhor score geral at√© agora: {best_overall_score:.4f}")
    
    print(f"\nüéØ Busca completa! Melhor score geral: {best_overall_score:.4f}")
    print(f"Total de configura√ß√µes testadas: {n_searches * n_iter_per_search:,}")
    
    return best_search_result, all_results, best_overall_params


# Fun√ß√£o para plotar hist√≥rico de busca a partir de loaded_results
def plot_search_history_from_loaded(loaded_results, model_name, metric='mean_test_score'):
    """Plota a evolu√ß√£o dos resultados a partir de loaded_results"""
    
    plt.figure(figsize=(15, 6))
    
    # Extrair dados do loaded_results
    detailed_df = loaded_results['detailed_df']
    
    # Agrupar por search_idx para obter o melhor de cada busca e seu desvio-padr√£o
    best_per_search = detailed_df.loc[detailed_df.groupby('search_idx')['mean_test_score'].idxmax()]
    best_per_search = best_per_search[['search_idx', 'mean_test_score', 'std_test_score']].reset_index(drop=True)
    best_per_search['search_number'] = best_per_search['search_idx'] + 1  # 1-indexado
    
    # GR√ÅFICO 1: Melhor F1-Score por Busca com Desvio-Padr√£o
    plt.subplot(1, 2, 1)
    plt.plot(best_per_search['search_number'], best_per_search['mean_test_score'], 
             'b-o', alpha=0.8, markersize=8)
    
    # Adicionar sombra do desvio-padr√£o
    plt.fill_between(best_per_search['search_number'], 
                     best_per_search['mean_test_score'] - best_per_search['std_test_score'],
                     best_per_search['mean_test_score'] + best_per_search['std_test_score'], 
                     color='blue', alpha=0.3)
    
    plt.title(f'{model_name} - Melhor F1-Score por Busca (com Desvio-Padr√£o)')
    plt.xlabel('N√∫mero da Busca')
    plt.ylabel('Melhor F1-Score')
    plt.grid(True, alpha=0.3)
    
    # Destacar a melhor busca
    best_search_idx = best_per_search['mean_test_score'].idxmax()
    best_search_score = best_per_search['mean_test_score'].iloc[best_search_idx]
    best_search_std = best_per_search['std_test_score'].iloc[best_search_idx]
    best_search_number = best_per_search['search_number'].iloc[best_search_idx]
    
    plt.plot(best_search_number, best_search_score, 'ro', markersize=12, 
             markeredgecolor='darkred', markeredgewidth=2,
             label=f'Melhor: {best_search_score:.4f} ¬± {best_search_std:.4f}')
    plt.legend()
    
    # GR√ÅFICO 2: Itera√ß√µes da melhor busca
    plt.subplot(1, 2, 2)
    
    # Encontrar qual search_idx teve o melhor score geral
    best_overall_idx = detailed_df['mean_test_score'].idxmax()
    best_overall_search_idx = detailed_df.loc[best_overall_idx, 'search_idx']
    
    # Filtrar dados apenas da melhor busca
    best_search_data = detailed_df[detailed_df['search_idx'] == best_overall_search_idx].copy()
    best_search_data = best_search_data.sort_values('iteration').reset_index(drop=True)
    
    # Criar eixo X 1-indexado para itera√ß√µes
    iterations_1indexed = range(1, len(best_search_data) + 1)
    
    # Verificar se existe coluna de treino
    if 'mean_train_score' in best_search_data.columns and best_search_data['mean_train_score'].notna().any():
        plt.plot(iterations_1indexed, best_search_data['mean_train_score'], 
                'g-o', alpha=0.7, label='Treino')
    
    # Plotar valida√ß√£o sem sombra
    plt.plot(iterations_1indexed, best_search_data[metric], 'b-o', alpha=0.7, label='Valida√ß√£o')
    
    # DESTACAR A MELHOR ITERA√á√ÉO
    best_iteration_idx = best_search_data[metric].idxmax()
    best_iteration_score = best_search_data[metric].iloc[best_iteration_idx]
    best_iteration_number = best_iteration_idx + 1  # 1-indexado
    
    plt.plot(best_iteration_number, best_iteration_score, 'ro', markersize=15, 
             markeredgecolor='darkred', markeredgewidth=2,
             label=f'Melhor itera√ß√£o: #{best_iteration_number} ({best_iteration_score:.4f})')
    
    plt.title(f'{model_name} - Treino vs Valida√ß√£o (Melhor Busca #{best_overall_search_idx + 1})')
    plt.xlabel('Itera√ß√£o')
    plt.ylabel('F1-Score')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()

print("Fun√ß√µes auxiliares definidas com sucesso!")

Fun√ß√µes auxiliares definidas com sucesso!


### 3.1 Definir Folds

In [13]:
# Configura√ß√£o da valida√ß√£o cruzada estratificada
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## 4. Stacking - Busca de Hiperpar√¢metros

In [14]:
# Definir nome do modelo para uso em salvamento e exibi√ß√£o
MODEL_NAME = "Stacking"

In [25]:
# ======================================================================
# 4.1 BUSCA DE HIPERPARAMETROS 
# ======================================================================

print(f"=== BUSCA DE HIPERPAR√ÇMETROS - {MODEL_NAME} ===")

# Defini√ß√£o do Espa√ßo de Hiperpar√¢metros para Stacking
param_distributions = {
    # Par√¢metros Decision Tree (busca)
    'dt_max_depth': randint(3, 50),
    'dt_min_samples_split': randint(2, 40),
    'dt_min_samples_leaf': randint(1, 50),
    # Par√¢metros Decision Tree fixos: criterion='gini', max_features='sqrt'
    
    # Par√¢metros Random Forest (busca)
    'rf_n_estimators': randint(5, 300),
    'rf_max_depth': randint(3, 50),
    # Par√¢metros Random Forest fixos: criterion='gini', max_features='sqrt', min_samples_split=16, min_samples_leaf=1
    
    # Par√¢metros XGBoost (busca apenas n_estimators, max_depth, learning_rate)
    # Demais par√¢metros fixos nos melhores valores encontrados
    'xgb_n_estimators': randint(140, 381),
    'xgb_max_depth': randint(3, 11),
    'xgb_learning_rate': uniform(0.5, 1.5),
    
    # Par√¢metros do Meta-estimador (Logistic Regression)
    'meta_C': uniform(0.001, 10),  # Regulariza√ß√£o: valores menores = mais regulariza√ß√£o
    'meta_max_iter': [100, 500, 1000, 2000, 3000],  # Itera√ß√µes m√°ximas para converg√™ncia
}

# M√∫ltiplas execu√ß√µes do RandomizedSearchCV
print(f"Iniciando busca de hiperpar√¢metros para {MODEL_NAME}...")
model_search, model_all_searches, best_params = multiple_randomized_search(
    estimator=HeterogeneousStackingCommittee(random_state=42, cv=3),
    param_distributions=param_distributions,
    X=X_sample,                  
    y=y_sample,
    cv_strategy=cv_strategy,
    n_searches=20,  
    n_iter_per_search=2,       
    scoring='f1',
    n_jobs=1,  # Processamento sequencial
)

# Sele√ß√£o da Melhor Configura√ß√£o
print(f"\n--- RESULTADOS {MODEL_NAME} ---")
print("Melhores hiperpar√¢metros:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

print(f"\nMelhor F1-Score (CV): {model_search.best_score_:.4f}")

=== BUSCA DE HIPERPAR√ÇMETROS - Stacking ===
Iniciando busca de hiperpar√¢metros para Stacking...
Executando 20 buscas com 2 itera√ß√µes cada...

Busca 1/20...
Melhor score desta busca: 0.0265
Melhor configura√ß√£o desta busca: {'dt_max_depth': 4, 'dt_min_samples_leaf': 18, 'dt_min_samples_split': 34, 'meta_C': np.float64(3.176949833122895), 'meta_max_iter': 1000, 'rf_max_depth': 4, 'rf_n_estimators': 233, 'xgb_learning_rate': np.float64(1.7410978906984163), 'xgb_max_depth': 8, 'xgb_n_estimators': 269}
Melhor score geral at√© agora: 0.0265

Busca 2/20...
Melhor score desta busca: 0.0265
Melhor configura√ß√£o desta busca: {'dt_max_depth': 4, 'dt_min_samples_leaf': 18, 'dt_min_samples_split': 34, 'meta_C': np.float64(3.176949833122895), 'meta_max_iter': 1000, 'rf_max_depth': 4, 'rf_n_estimators': 233, 'xgb_learning_rate': np.float64(1.7410978906984163), 'xgb_max_depth': 8, 'xgb_n_estimators': 269}
Melhor score geral at√© agora: 0.0265

Busca 2/20...
Melhor score desta busca: 0.0263
Melho

KeyboardInterrupt: 

In [None]:
# Registro de Desempenho - plotar evolu√ß√£o
plot_search_history(model_all_searches, model_search, MODEL_NAME)

In [None]:
# ======================================================================
# 4.5 AN√ÅLISE DAS MELHORES CONFIGURA√á√ïES ENCONTRADAS
# ======================================================================

print(f"=== TOP CONFIGURA√á√ïES - {MODEL_NAME} ===")

# Extrair os melhores resultados de cada busca
best_configs = []

for i, search_result in enumerate(model_all_searches):
    config = {
        'Busca': i + 1,
        'F1_Score': search_result['best_score'],
        'RF_N_Est': search_result['best_params']['rf_n_estimators'],
        'RF_Depth': search_result['best_params']['rf_max_depth'],
        'SVM_C': search_result['best_params']['svm_C'],
        'SVM_Gamma': search_result['best_params']['svm_gamma'],
        'KNN_K': search_result['best_params']['knn_n_neighbors'],
        'MLP_Layers': str(search_result['best_params']['mlp_hidden_layers']),
        'Meta_C': search_result['best_params']['meta_C'],
        'CV_Folds': search_result['best_params']['cv']
    }
    best_configs.append(config)

# Converter para DataFrame e ordenar por F1-Score
results_df = pd.DataFrame(best_configs)
results_df = results_df.sort_values('F1_Score', ascending=False).reset_index(drop=True)
results_df['Ranking'] = range(1, len(results_df) + 1)

# Reordenar colunas
results_df = results_df[['Ranking', 'Busca', 'F1_Score', 'RF_N_Est', 'RF_Depth', 
                        'SVM_C', 'SVM_Gamma', 'KNN_K', 'MLP_Layers', 'Meta_C', 'CV_Folds']]

# Mostrar tabela formatada
print("Configura√ß√µes encontradas (ordenadas por F1-Score Bin√°rio):")
print("-" * 150)
print(results_df.to_string(index=False, float_format='%.4f'))

# Estat√≠sticas resumidas
print(f"\n--- ESTAT√çSTICAS DAS CONFIGURA√á√ïES ---")
print(f"Melhor F1-Score: {results_df['F1_Score'].max():.4f}")
print(f"F1-Score m√©dio: {results_df['F1_Score'].mean():.4f}")
print(f"Desvio padr√£o: {results_df['F1_Score'].std():.4f}")
print(f"F1-Score m√≠nimo: {results_df['F1_Score'].min():.4f}")

## 5. Salvar Resultados de Busca

In [None]:
# Salvar Resultados da Busca de Hiperpar√¢metros

print(f"=== SALVANDO RESULTADOS DA BUSCA - {MODEL_NAME} ===")

# Criar pasta se n√£o existir
os.makedirs('searches', exist_ok=True)

# 1. Salvar resultados detalhados de todas as buscas
search_detailed_results = []

for i, search_result in enumerate(model_all_searches):
    # Extrair informa√ß√µes de cada busca individual
    cv_results = search_result['cv_results']
    
    for j in range(len(cv_results['mean_test_score'])):
        search_detailed_results.append({
            'search_idx': search_result['search_idx'],
            'iteration': j,
            'mean_test_score': cv_results['mean_test_score'][j],
            'std_test_score': cv_results['std_test_score'][j],
            'mean_train_score': cv_results['mean_train_score'][j] if 'mean_train_score' in cv_results else None,
            'std_train_score': cv_results['std_train_score'][j] if 'std_train_score' in cv_results else None,
            'params': str(cv_results['params'][j]),
            **{k: (str(v) if isinstance(v, tuple) else v) for k, v in cv_results['params'][j].items()}
        })

# Converter para DataFrame e salvar
search_df = pd.DataFrame(search_detailed_results)
search_df.to_csv(f'searches/{MODEL_NAME.lower()}_all_searches.csv', index=False)

print(f"  Todos os Resultados salvos: searches/{MODEL_NAME.lower()}_all_searches.csv")
print(f"  Total de configura√ß√µes testadas: {len(search_df):,}")

# 2. Salvar resumo da melhor busca
best_search_summary = {
    'model_name': MODEL_NAME,
    'best_overall_score': model_search.best_score_,
    'best_overall_params': {k: (str(v) if isinstance(v, tuple) else v) for k, v in model_search.best_params_.items()},
    'search_config': {
        'n_searches': 5,
        'n_iter_per_search': 3,
        'scoring': 'f1',
        'cv_folds': 3,
        'total_configurations': len(search_df)
    },
    'top_configs': search_df.nlargest(len(search_df), 'mean_test_score')[
        ['mean_test_score', 'std_test_score', 'rf_n_estimators', 'svm_C', 'knn_n_neighbors', 
         'meta_C', 'cv']
    ].to_dict('records')
}

# Salvar resumo em JSON
with open(f'searches/{MODEL_NAME.lower()}_search_summary.json', 'w') as f:
    json.dump(best_search_summary, f, indent=2)

print(f"  Resumo salvo: searches/{MODEL_NAME.lower()}_search_summary.json")

# Mostrar estat√≠sticas da busca
print(f"\n--- ESTAT√çSTICAS DA BUSCA {MODEL_NAME} ---")
print(f"Melhor F1-Score: {model_search.best_score_:.4f}")
print(f"Desvio padr√£o do melhor: {search_df.loc[search_df['mean_test_score'].idxmax(), 'std_test_score']:.4f}")
print(f"F1-Score m√©dio geral: {search_df['mean_test_score'].mean():.4f}")
print(f"F1-Score m√≠nimo: {search_df['mean_test_score'].min():.4f}")
print(f"F1-Score m√°ximo: {search_df['mean_test_score'].max():.4f}")

## 5.2 Carregar Resultado de busca

In [None]:
def load_search_results(model_name, searches_folder='searches'):
    """
    Carrega resultados de busca salvos anteriormente
    
    Returns:
    --------
    dict: Dicion√°rio com todos os resultados carregados
    """
    print(f"=== CARREGANDO RESULTADOS DE BUSCA - {model_name.upper()} ===")
    
    results = {}
    
    # 1. Carregar DataFrame detalhado
    csv_path = os.path.join(searches_folder, f'{model_name.lower()}_all_searches.csv')
    if os.path.exists(csv_path):
        results['detailed_df'] = pd.read_csv(csv_path)
        print(f"‚úÖ Resultados detalhados carregados: {len(results['detailed_df']):,} configura√ß√µes")
    else:
        print(f"‚ö†Ô∏è  Arquivo n√£o encontrado: {csv_path}")
    
    # 2. Carregar resumo JSON
    json_path = os.path.join(searches_folder, f'{model_name.lower()}_search_summary.json')
    if os.path.exists(json_path):
        with open(json_path, 'r') as f:
            results['summary'] = json.load(f)
        print(f"‚úÖ Resumo carregado: F1-Score = {results['summary']['best_overall_score']:.4f}")
    else:
        print(f"‚ö†Ô∏è  Arquivo n√£o encontrado: {json_path}")
    
    # 3. Carregar backup pickle
    pkl_path = os.path.join(searches_folder, f'{model_name.lower()}_full_search.pkl')
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            results['full_backup'] = pickle.load(f)
        print(f"‚úÖ Backup completo carregado")
    else:
        print(f"‚ö†Ô∏è  Arquivo n√£o encontrado: {pkl_path}")
    
    return results


def get_best_params_from_saved(model_name, searches_folder='searches'):
    """
    Recupera os melhores par√¢metros de arquivos salvos
    
    Returns:
    --------
    dict: Melhores par√¢metros encontrados
    """
    # Tentar carregar do JSON primeiro
    json_path = os.path.join(searches_folder, f'{model_name.lower()}_search_summary.json')
    
    if os.path.exists(json_path):
        with open(json_path, 'r') as f:
            summary = json.load(f)
        return summary['best_overall_params']
    
    # Fallback para pickle
    pkl_path = os.path.join(searches_folder, f'{model_name.lower()}_full_search.pkl')
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            backup = pickle.load(f)
        return backup['best_params']
    
    print(f"‚ùå N√£o foi poss√≠vel carregar par√¢metros para {model_name}")
    return None


In [None]:
#### 4.2 Carregar Resultados Salvos (Fun√ß√£o Auxiliar)
# Exemplo de uso da fun√ß√£o (n√£o executar se j√° temos os resultados)
loaded_results = load_search_results(MODEL_NAME)

In [None]:
# Plotar a hist√≥ria da busca a partir dos resultados carregados
plot_search_history_from_loaded(loaded_results, MODEL_NAME)

In [None]:

#### 4.3 Recuperar Melhores Par√¢metros para Uso Posterior
# Exemplo de uso (descomente se precisar carregar par√¢metros salvos):
if 'loaded_results' in locals():
    best_params = get_best_params_from_saved(MODEL_NAME)
    if best_params:
        print(f"‚úÖ Par√¢metros carregados: {best_params}")
    best_score = loaded_results['summary']['best_overall_score']
    print(f"‚úÖ Melhor F1-Score carregado: {best_score:.4f}")
else:
    best_params = model_search.best_params_
    best_score = model_search.best_score_
    print(f"‚úÖ Usando par√¢metros da busca atual: {best_params}")
    print(f"‚úÖ Melhor F1-Score da busca atual: {best_score:.4f}")

## 6. Treinar Modelo Final e Salvar

In [None]:
# Treinamento Final com melhores hiperpar√¢metros
best_model = HeterogeneousStackingCommittee(**best_params, random_state=42)
best_model.fit(X_train_scaled, y_train)

print(f"\nModelo final {MODEL_NAME} treinado: {best_model}")

# Criar pasta se n√£o existir
os.makedirs('models', exist_ok=True)

# Salvar modelo treinado
dump(best_model, f'models/{MODEL_NAME.lower()}_trained.joblib')
print(f"Modelo salvo: models/{MODEL_NAME.lower()}_trained.joblib")

## 7. Avalia√ß√£o Final e Salvamento dos Resultados

In [None]:
# Carregar modelo
loaded_model = load(f'models/{MODEL_NAME.lower()}_trained.joblib')

In [None]:
print(f"=== AVALIA√á√ÉO E SALVAMENTO DOS RESULTADOS - {MODEL_NAME} ===")

# Criar pastas se n√£o existirem
os.makedirs('results', exist_ok=True)

# Avalia√ß√£o completa do modelo
print("\nAvaliando performance do modelo...")

if 'loaded_model' in locals():
    model = loaded_model
else:
    model = best_model

X_train_eval = X_train_scaled
y_train_eval = y_train
X_test_eval = X_test_scaled
y_test_eval = y_test

# Avaliar modelo
train_metrics, test_metrics, y_pred = evaluate_model(
    model, X_train_eval, X_test_eval, y_train_eval, y_test_eval, MODEL_NAME
)

In [None]:
# Compilar todos os resultados
model_final_results = {
    'model_name': MODEL_NAME,
    'best_params': best_params,
    'best_cv_score': best_score,
    'train_metrics': train_metrics,
    'test_metrics': test_metrics,
    'predictions': y_pred.tolist(),
    'test_labels': y_test_eval.tolist(),
    'evaluation_info': {
        'train_samples_used': len(X_train_eval),
        'test_samples_used': len(X_test_eval),
        'total_train_samples': len(X_train_scaled),
        'total_test_samples': len(X_test_scaled)
    }
}

# Salvar resultados em JSON
with open(f'results/{MODEL_NAME.lower()}_results.json', 'w') as f:
    json.dump(model_final_results, f, indent=2)

print(f"Resultados {MODEL_NAME} salvos em: results/{MODEL_NAME.lower()}_results.json")

# Mostrar resumo
print(f"\n--- RESUMO {MODEL_NAME} ---")
print(f"F1-Score CV: {model_final_results['best_cv_score']:.4f}")
print(f"F1-Score Teste: {test_metrics['f1']:.4f}")
print(f"Acur√°cia Teste: {test_metrics['accuracy']:.4f}")
print(f"Precis√£o Teste: {test_metrics['precision']:.4f}")
print(f"Recall Teste: {test_metrics['recall']:.4f}")
print(f"G-Mean Teste: {test_metrics['gmean']:.4f}")
if test_metrics['auc_roc']:
    print(f"AUC-ROC Teste: {test_metrics['auc_roc']:.4f}")