# **EXPERIMENTO 10 ANOS, com DIAS_EM_TRATAMENTO**

In [None]:
# ==============================================
# 1. Mount Google Drive
# ==============================================
# Colab-only step: exposes the user's Drive under /content/drive, which is the
# root of the OutPath folder used further down for the CSV and the models.
from google.colab import drive
drive.mount('/content/drive')

# ==============================================
# 2. Importações
# ==============================================
import pandas as pd
import numpy as np
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                             f1_score, matthews_corrcoef, roc_auc_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import warnings
warnings.filterwarnings('ignore')

# ==============================================
# 3. Load the dataset
# ==============================================
# Folder on the mounted Drive that holds the input CSV.
OutPath = "/content/drive/MyDrive/Colab Notebooks/5.SINANTB_Nacional_2001-2023/1.4 Resultados e-ou arquivos de saída/"
csv_path = os.path.join(OutPath, "Sinantb_desfechos_03-14-2025_20-42-05.csv")
data = pd.read_csv(csv_path)

# The target is the DESFECHO column; every other column is used as a feature.
y = data["DESFECHO"]
X = data.drop(columns="DESFECHO")
print(data.columns)

# ==============================================
# 4. One-off creation of the balanced test set
# ==============================================
# The hold-out set takes the same number of rows from the rarest and from the
# most frequent class: 20% of the rarest class's size from each. Every row that
# is not sampled here stays in the (still imbalanced) training pool.
np.random.seed(42)  # fixed seed so the split is reproducible

# value_counts() sorts by descending frequency, so the first label is the most
# frequent class and the last one the rarest. Taking both ends of the same
# sorted index (rather than idxmax()/idxmin()) guarantees two *different*
# classes even when the counts are tied; idxmin() and idxmax() would then both
# return the same label and the "balanced" test set would hold a single class.
class_counts = y.value_counts()
if len(class_counts) < 2:
    raise ValueError(
        "A coluna DESFECHO precisa ter pelo menos duas classes "
        "para montar o conjunto de teste balanceado."
    )
majority_class = class_counts.index[0]
minority_class = class_counts.index[-1]
test_size = int(class_counts.iloc[-1] * 0.2)

# Draw without replacement so no row appears twice in the test set.
# NOTE(review): .loc/.drop below select by index label, so this assumes the
# index labels of X and y are unique (true for a default read_csv index).
minority_indices = np.random.choice(y[y == minority_class].index, test_size, replace=False)
majority_indices = np.random.choice(y[y == majority_class].index, test_size, replace=False)
test_indices = np.concatenate([minority_indices, majority_indices])

X_test_bal = X.loc[test_indices]
y_test_bal = y.loc[test_indices]

# Remove the hold-out rows so that no test row is ever used for training.
X_train_raw = X.drop(test_indices)
y_train_raw = y.drop(test_indices)

# ==============================================
# 5. Models and hyper-parameter grids
# ==============================================
SEED = 42  # one seed shared by every estimator and sampler below

# Candidate classifiers, keyed by the name used in the reports and file names.
modelos = dict(
    DecisionTree=DecisionTreeClassifier(random_state=SEED),
    RandomForest=RandomForestClassifier(random_state=SEED),
    GradientBoosting=GradientBoostingClassifier(random_state=SEED),
    XGBoost=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=SEED),
)

# Grid-search space for each classifier; the keys must match those of `modelos`.
param_grids = dict(
    DecisionTree=dict(
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    RandomForest=dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    GradientBoosting=dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
    ),
    XGBoost=dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1],
        max_depth=[3, 5, 7],
    ),
)

# Class-balancing strategies applied to the training data.
balanceamentos = dict(
    Undersampling=RandomUnderSampler(random_state=SEED),
    Oversampling=RandomOverSampler(random_state=SEED),
    SMOTE=SMOTE(random_state=SEED),
)

# ==============================================
# 6. Experiments
# ==============================================
def _metrics_row(bal_name, mod_name, y_true, y_pred, y_proba):
    """Build one results-table row for a (balancing strategy, model) pair.

    Args:
        bal_name: Balancing strategy name (stored under 'Balanceamento').
        mod_name: Model name (stored under 'Modelo').
        y_true: True labels.
        y_pred: Predicted labels for the same rows.
        y_proba: Column 1 of predict_proba for the same rows (used for AUC-ROC).

    Returns:
        dict with the two identifying columns plus the evaluation metrics.

    The confusion matrix is unpacked as (tn, fp, fn, tp), so this only works
    for a two-class target.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        'Balanceamento': bal_name,
        'Modelo': mod_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'F1-Macro': f1_score(y_true, y_pred, average='macro'),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Specificity': tn / (tn + fp),
        'Sensitivity': tp / (tp + fn),
        'AUC-ROC': roc_auc_score(y_true, y_proba)
    }


results_train = []
results_test = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for bal_name, balancer in balanceamentos.items():
    # NOTE(review): the training pool is resampled *before* the CV split. With
    # RandomOverSampler / SMOTE, copies (or synthetic neighbours) of the same
    # original row can land in both the fitting and the validation part of a
    # fold, so the CV scores and the "train" table below are optimistic.
    # Resampling inside each fold (e.g. imblearn.pipeline.Pipeline) would avoid
    # that, but changes what gets tuned and saved, so it is only flagged here.
    # The hold-out test metrics are not affected.
    X_train_bal, y_train_bal = balancer.fit_resample(X_train_raw, y_train_raw)

    for mod_name, model in modelos.items():
        grid_search = GridSearchCV(model, param_grids[mod_name], cv=cv, scoring='f1_macro', n_jobs=-1)
        grid_search.fit(X_train_bal, y_train_bal)
        best_model = grid_search.best_estimator_

        print(f"\nMelhores parâmetros para {mod_name} com {bal_name}:")
        print(grid_search.best_params_)
        print(f"Melhor F1-Macro (treino - CV): {grid_search.best_score_:.4f}")

        # Persist the tuned model found by the grid search.
        model_path = os.path.join(OutPath, f"{mod_name}_{bal_name}_otimizado10anos.pkl")
        joblib.dump(best_model, model_path)
        print(f"Modelo salvo em: {model_path}")

        # Training metrics: cross-validated predictions of the best
        # configuration on the resampled training data, using the same folds.
        y_train_pred = cross_val_predict(best_model, X_train_bal, y_train_bal, cv=cv, method='predict')
        y_train_proba = cross_val_predict(best_model, X_train_bal, y_train_bal, cv=cv, method='predict_proba')[:, 1]
        results_train.append(_metrics_row(bal_name, mod_name, y_train_bal, y_train_pred, y_train_proba))

        # Test metrics: the fitted grid search predicts the balanced hold-out set.
        y_test_pred = grid_search.predict(X_test_bal)
        y_test_proba = grid_search.predict_proba(X_test_bal)[:, 1]
        results_test.append(_metrics_row(bal_name, mod_name, y_test_bal, y_test_pred, y_test_proba))

# ==============================================
# 7. Consolidated tables
# ==============================================
def highlight_max(s):
    """Return one CSS string per element of *s*, bolding the maximum value(s).

    Intended for ``Styler.apply``: every element equal to the maximum of *s*
    gets 'font-weight: bold' and every other element gets ''. Ties are all
    highlighted.
    """
    # Compute the maximum once instead of once per element.
    best = s.max()
    return ['font-weight: bold' if v == best else '' for v in s]

df_results_train = pd.DataFrame(results_train)
df_results_test = pd.DataFrame(results_test)

# Show both tables the same way: a heading, then the frame with the best value
# of every metric column in bold (the first two columns are the
# balancing-strategy and model names, so they are skipped).
for titulo, tabela in (
    ("\nTabela de Resultados - Treino:", df_results_train),
    ("\nTabela de Resultados - Teste:", df_results_test),
):
    print(titulo)
    colunas_metricas = tabela.columns[2:]
    display(tabela.style.apply(highlight_max, subset=colunas_metricas))


Mounted at /content/drive
Index(['CS_SEXO', 'CS_RACA', 'CS_ESCOL_N', 'TRATAMENTO', 'AGRAVALCOO',
       'AGRAVDIABE', 'AGRAVDOENC', 'AGRAVOUTRA', 'POP_LIBER', 'POP_RUA',
       'POP_SAUDE', 'POP_IMIG', 'AGRAVDROGA', 'AGRAVTABAC', 'REGIAO', 'IDADE',
       'DIAS_EM_TRATAMENTO', 'DESFECHO'],
      dtype='object')

Melhores parâmetros para DecisionTree com Undersampling:
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Melhor F1-Macro (treino - CV): 0.8344
Modelo salvo em: /content/drive/MyDrive/Colab Notebooks/5.SINANTB_Nacional_2001-2023/1.4 Resultados e-ou arquivos de saída/DecisionTree_Undersampling_otimizado10anos.pkl

Melhores parâmetros para RandomForest com Undersampling:
{'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
Melhor F1-Macro (treino - CV): 0.8378
Modelo salvo em: /content/drive/MyDrive/Colab Notebooks/5.SINANTB_Nacional_2001-2023/1.4 Resultados e-ou arquivos de saída/RandomForest_Undersampling_otimizado10anos.pkl


In [None]:
# =============================
# Importações necessárias
# =============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

# =============================
# Folder where the trained models were saved
# =============================
OutPath = "/content/drive/MyDrive/Colab Notebooks/5.SINANTB_Nacional_2001-2023/1.4 Resultados e-ou arquivos de saída/"

# Model and balancing-strategy names used to rebuild the saved file names
modelos = ["DecisionTree", "RandomForest", "GradientBoosting", "XGBoost"]
balanceamentos = ["Undersampling", "Oversampling", "SMOTE"]

# =============================
# Load each saved model and plot its feature importances
# =============================
for modelo in modelos:
    for bal in balanceamentos:
        nome_modelo = f"{modelo}_{bal}_otimizado10anos.pkl"
        caminho_modelo = os.path.join(OutPath, nome_modelo)

        # Skip combinations that have not been trained and saved yet.
        if not os.path.exists(caminho_modelo):
            print(f"Modelo não encontrado: {caminho_modelo}")
            continue

        clf = joblib.load(caminho_modelo)

        if not hasattr(clf, 'feature_importances_'):
            print(f"O modelo {modelo} ({bal}) não tem feature_importances_.")
            continue

        # Sort features from most to least important.
        importancias = clf.feature_importances_
        indices = np.argsort(importancias)[::-1]

        # Prefer the feature names stored in the fitted model, so this cell
        # also works in a fresh session where X from the training cell is not
        # defined. Fall back to X.columns for models that carry no names.
        nomes = getattr(clf, 'feature_names_in_', None)
        if nomes is None:
            nomes = X.columns
        features = np.asarray(nomes)

        # Horizontal bar chart, most important feature at the top.
        plt.figure(figsize=(10, 6))
        sns.barplot(x=importancias[indices], y=features[indices], palette="Blues_d")
        plt.title(f"Feature Importance - {modelo} ({bal})")
        plt.xlabel('Importância')
        plt.ylabel('Variável')
        plt.tight_layout()
        plt.show()
