# Treinamento e Avaliação de Modelos
## Tech Challenge Fase 4 - Predição de Níveis de Obesidade

> 📘 **Documentação:** Para contexto completo da estratégia de modelagem e justificativas das escolhas técnicas, consulte [00_GUIA_ANALISE.ipynb](00_GUIA_ANALISE.ipynb)

---

In [None]:
# Importa√ß√£o de bibliotecas
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score
)

# Modelos
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Importar tradu√ß√µes e cores padronizadas
from translations import (
    VARIABLE_NAMES, OBESITY_LABELS, OBESITY_ORDER,
    PRIMARY_COLOR, SECONDARY_COLOR, ACCENT_COLOR,
    translate_variable, get_obesity_label, get_color_palette
)

# Notebook-wide setup: silence library warnings, switch seaborn to its
# white-grid theme and make (12, 6) the default matplotlib figure size.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation messages for the import cell
print(
    "Bibliotecas importadas com sucesso!",
    "Tradu√ß√µes e cores padronizadas carregadas ‚úì",
    sep="\n",
)

## 1. Carregamento e Preparação dos Dados

In [None]:
# Read the obesity dataset from the project's data folder
df = pd.read_csv('../data/Obesity.csv')

# Add a body-mass-index column: Weight divided by Height squared
height_squared = df['Height'] ** 2
df['BMI'] = df['Weight'] / height_squared

# Report the frame's dimensions and the distinct values of the target column
n_rows, n_cols = df.shape
target_column = df['Obesity']
print(f"üìä Dataset: {n_rows} linhas, {n_cols} colunas")
print(f"\nüéØ Vari√°vel Alvo: Obesity")
print(f"\nClasses: {target_column.unique()}")
print(f"N√∫mero de classes: {target_column.nunique()}")

## 2. Pré-processamento

In [None]:
# Split the frame into the label column (y) and everything else (X)
y = df['Obesity']
X = df.drop(columns=['Obesity'])

# List the predictor columns and how many there are
feature_list = X.columns.tolist()
print("üìã Features:")
print(feature_list)
print(f"\nTotal de features: {len(feature_list)}")

In [None]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, NumPy numeric dtypes as numerical.
object_frame = X.select_dtypes(include=['object'])
numeric_frame = X.select_dtypes(include=[np.number])
categorical_cols = object_frame.columns.tolist()
numerical_cols = numeric_frame.columns.tolist()

print(f"üìä Vari√°veis Categ√≥ricas ({len(categorical_cols)}): {categorical_cols}")
print(f"üìä Vari√°veis Num√©ricas ({len(numerical_cols)}): {numerical_cols}")

In [None]:
# Integer-encode every categorical feature with its own LabelEncoder and keep
# the fitted encoders, keyed by column name, in `label_encoders`.
X_encoded = X.copy()
label_encoders = {col: LabelEncoder().fit(X[col]) for col in categorical_cols}
for col, encoder in label_encoders.items():
    X_encoded[col] = encoder.transform(X[col])

# Encode the target labels with a dedicated encoder
le_target = LabelEncoder()
le_target.fit(y)
y_encoded = le_target.transform(y)

print("‚úÖ Vari√°veis categ√≥ricas codificadas!")
print(f"\nüìå Mapeamento das classes:")
# Show which integer code was assigned to each class label
for code, label in enumerate(le_target.classes_):
    print(f"{code}: {label}")

In [None]:
# Standardise the numeric columns on a copy of the encoded features.
# NOTE(review): the scaler is fitted on every row of X_encoded, while the
# train/test split happens afterwards, so rows that end up in the test set
# contribute to the fitted statistics — consider fitting on training rows only.
scaler = StandardScaler()
scaler.fit(X_encoded[numerical_cols])

X_scaled = X_encoded.copy()
X_scaled[numerical_cols] = scaler.transform(X_encoded[numerical_cols])

print("‚úÖ Features num√©ricas normalizadas!")
print(f"\nüìä Shape final: {X_scaled.shape}")

## 3. Divisão dos Dados

In [None]:
# Stratified hold-out split: 20% of the rows go to the test set, fixed seed
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, **split_options)

# Report the absolute and relative sizes of both partitions
total_rows = len(X_scaled)
train_pct = X_train.shape[0] / total_rows * 100
test_pct = X_test.shape[0] / total_rows * 100
print(f"üìä Conjunto de Treino: {X_train.shape[0]} amostras")
print(f"üìä Conjunto de Teste: {X_test.shape[0]} amostras")
print(f"\n‚úÖ Propor√ß√£o: {train_pct:.1f}% treino / {test_pct:.1f}% teste")

## 4. Treinamento de Modelos Baseline

In [None]:
# Baseline classifiers to compare, keyed by display name.
# All of them share the same random seed.
seed = 42
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=seed),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=seed),
    'Gradient Boosting': GradientBoostingClassifier(random_state=seed),
    'XGBoost': XGBClassifier(random_state=seed, eval_metric='mlogloss'),
}

print("ü§ñ Modelos a serem treinados:")
for model_name in models:
    print(f"  - {model_name}")

In [None]:
# Fit every baseline, score it on the hold-out split and cross-validate it on
# the training split; everything is collected in `results`, keyed by model name.
results = {}
rule = "=" * 80

print("\n" + rule)
print("üöÄ TREINAMENTO DOS MODELOS")
print(rule)

for name, model in models.items():
    print(f"\nüîÑ Treinando {name}...")

    # Fit on the training split, then predict the hold-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Hold-out metrics; precision, recall and F1 use the 'weighted' average
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = (
        scorer(y_test, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )

    # 5-fold cross-validation on the training split (no `scoring` is passed)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    # Keep the fitted model, its metrics and its hold-out predictions
    results[name] = dict(
        model=model,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
        cv_mean=cv_mean,
        cv_std=cv_std,
        predictions=y_pred,
    )

    print(f"  ‚úÖ Acur√°cia: {accuracy*100:.2f}%")
    print(f"  üìä CV Score: {cv_mean*100:.2f}% (+/- {cv_std*100:.2f}%)")

print("\n" + rule)
print("‚úÖ TREINAMENTO CONCLU√çDO!")
print(rule)

## 5. Comparação de Modelos

In [None]:
# Tabulate each model's metrics as percentages: one row per model,
# one column per metric stored in `results`.
metric_columns = {
    'Acur√°cia (%)': 'accuracy',
    'Precis√£o (%)': 'precision',
    'Recall (%)': 'recall',
    'F1-Score (%)': 'f1_score',
    'CV Score (%)': 'cv_mean',
}
table = {'Modelo': list(results)}
for column, metric_key in metric_columns.items():
    table[column] = [scores[metric_key] * 100 for scores in results.values()]
results_df = pd.DataFrame(table)

# Rank by hold-out accuracy, best first, and renumber the rows
results_df = results_df.sort_values('Acur√°cia (%)', ascending=False).reset_index(drop=True)

wide_rule = "=" * 100
print("\nüìä COMPARA√á√ÉO DE MODELOS:")
print(wide_rule)
print(results_df.to_string(index=False))
print(wide_rule)

# The first row after sorting is the most accurate model
top_row = results_df.iloc[0]
best_model_name = top_row['Modelo']
best_accuracy = top_row['Acur√°cia (%)']

print(f"\nüèÜ MELHOR MODELO: {best_model_name}")
print(f"üéØ Acur√°cia: {best_accuracy:.2f}%")

# Compare the best accuracy with the 75% project goal
goal_reached = best_accuracy >= 75
goal_message = (
    "\n‚úÖ META ATINGIDA! Acur√°cia > 75%"
    if goal_reached
    else f"\n‚ö†Ô∏è Meta n√£o atingida. Necess√°rio otimizar modelos (atual: {best_accuracy:.2f}%, meta: 75%)"
)
print(goal_message)

In [None]:
# Two side-by-side charts: accuracy per model (left) and all hold-out
# metrics grouped by model (right), using the shared colour constants.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
accuracy_ax, metrics_ax = axes
accuracy_pct = results_df['Acur√°cia (%)']

axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
title_style = {'fontsize': 14, 'fontweight': 'bold', 'pad': 15}

# Left chart: horizontal bars; bars at or above 75% use the secondary colour,
# the others the accent colour. A dashed line marks the 75% goal.
colors_bars = [SECONDARY_COLOR if acc >= 75 else ACCENT_COLOR for acc in accuracy_pct]
accuracy_ax.barh(
    results_df['Modelo'], accuracy_pct,
    color=colors_bars, edgecolor='black', alpha=0.85, linewidth=1.2,
)
accuracy_ax.axvline(x=75, color=ACCENT_COLOR, linestyle='--', linewidth=2, label='Meta (75%)')
accuracy_ax.set_xlabel('Acur√°cia (%)', **axis_label_style)
accuracy_ax.set_title('Compara√ß√£o de Acur√°cia dos Modelos', **title_style)
accuracy_ax.legend(fontsize=10)
accuracy_ax.grid(axis='x', alpha=0.3, linestyle='--')

# Write each bar's value just to the right of the bar
for idx, val in enumerate(accuracy_pct):
    accuracy_ax.text(val + 0.5, idx, f'{val:.2f}%', va='center', fontweight='bold', fontsize=9)

# Right chart: grouped bars with the four hold-out metrics per model
shown_metrics = ['Acur√°cia (%)', 'Precis√£o (%)', 'Recall (%)', 'F1-Score (%)']
metrics_df = results_df.set_index('Modelo')[shown_metrics]

# One colour per metric from the shared palette helper
metric_colors = get_color_palette(4)
metrics_df.plot(
    kind='bar', ax=metrics_ax, color=metric_colors,
    edgecolor='black', width=0.8, alpha=0.85, linewidth=1.2,
)

metrics_ax.set_title('Compara√ß√£o de M√©tricas por Modelo', **title_style)
metrics_ax.set_ylabel('Percentual (%)', **axis_label_style)
metrics_ax.set_xlabel('Modelo', **axis_label_style)
metrics_ax.legend(title='M√©tricas', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
metrics_ax.tick_params(axis='x', rotation=45)
metrics_ax.grid(axis='y', alpha=0.3, linestyle='--')

plt.tight_layout()
plt.show()

## 6. Análise Detalhada do Melhor Modelo

In [None]:
# Pull the winning model and its stored hold-out predictions out of `results`
best_entry = results[best_model_name]
best_model = best_entry['model']
y_pred_best = best_entry['predictions']

# Map the encoder's class names through the project's label helper
class_names_pt = list(map(get_obesity_label, le_target.classes_))

# Per-class precision/recall/F1 report, four decimal places
rule = "=" * 80
print("\nüìã RELAT√ìRIO DE CLASSIFICA√á√ÉO - " + best_model_name.upper())
print(rule)
report_text = classification_report(
    y_test,
    y_pred_best,
    target_names=class_names_pt,
    digits=4,
)
print(report_text)
print(rule)

In [None]:
# Confusion matrix of the best model on the hold-out split
cm = confusion_matrix(y_test, y_pred_best)

# Map the encoder's class names through the project's label helper
class_names_pt = [get_obesity_label(cls) for cls in le_target.classes_]

# Annotated heatmap: rows are true classes, columns are predicted classes
heatmap_options = dict(
    annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names_pt,
    yticklabels=class_names_pt,
    square=True, linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    annot_kws={"fontsize": 10},
)
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(12, 10))
sns.heatmap(cm, **heatmap_options)
plt.title(f'Matriz de Confus√£o - {best_model_name}', fontsize=14, fontweight='bold', pad=20)
plt.ylabel('Classe Real', **axis_label_style)
plt.xlabel('Classe Prevista', **axis_label_style)
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.yticks(rotation=0, fontsize=9)
plt.tight_layout()
plt.show()

# Diagonal count divided by row total: the share of each true class that was
# predicted correctly.
class_accuracy = cm.diagonal() / cm.sum(axis=1)
narrow_rule = "=" * 70
print("\nüìä Acur√°cia por Classe:")
print(narrow_rule)
for label, acc in zip(class_names_pt, class_accuracy):
    print(f"  {label:25s}: {acc*100:6.2f}%")
print(narrow_rule)

## 7. Feature Importance (se aplicável)

In [None]:
# Feature importances are only reported for estimators that expose
# `feature_importances_`; any other model gets a notice instead.
if hasattr(best_model, 'feature_importances_'):
    # One row per feature (raw name, translated label, importance),
    # ordered from most to least important.
    feature_importance = pd.DataFrame({
        'Feature': X_scaled.columns,
        'Feature_PT': [translate_variable(col) for col in X_scaled.columns],
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nüìä IMPORT√ÇNCIA DAS FEATURES:")
    print("="*80)
    print(feature_importance[['Feature_PT', 'Importance']].to_string(index=False))
    print("="*80)

    # Horizontal bar chart of the 15 most important features
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(15)

    bars = plt.barh(top_features['Feature_PT'], top_features['Importance'],
                   color=SECONDARY_COLOR, edgecolor='black', alpha=0.85, linewidth=1.2)
    plt.xlabel('Import√¢ncia', fontsize=12, fontweight='bold')
    plt.title(f'Top 15 Features Mais Importantes - {best_model_name}',
              fontsize=14, fontweight='bold', pad=15)
    # Put the most important feature at the top of the chart
    plt.gca().invert_yaxis()
    plt.grid(axis='x', alpha=0.3, linestyle='--')
    plt.tight_layout()
    plt.show()

    # Summary of the three most important features
    print("\nüí° INSIGHTS:")
    print("="*80)
    top_3 = feature_importance.head(3)
    # Number the rows by rank. After sort_values the frame still carries its
    # original index labels (the feature's column position), so the label
    # yielded by iterrows() cannot be used as the 1, 2, 3 ranking.
    for rank, (_, row) in enumerate(top_3.iterrows(), start=1):
        print(f"  {rank}. {row['Feature_PT']}: {row['Importance']*100:.2f}% de import√¢ncia")
else:
    print(f"\n‚ö†Ô∏è {best_model_name} n√£o possui atributo feature_importances_")

## 8. Otimização do Melhor Modelo (GridSearch)

In [None]:
# Candidate hyper-parameter grids, keyed by the model's display name
param_grids = {
    'Random Forest': dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 20, 30, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    'XGBoost': dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 7, 10],
        learning_rate=[0.01, 0.1, 0.3],
        subsample=[0.8, 1.0],
    ),
    'Gradient Boosting': dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.3],
        subsample=[0.8, 1.0],
    ),
}

# Only models that have a grid defined above are tuned
search_space = param_grids.get(best_model_name)
if search_space is None:
    print(f"\n‚ö†Ô∏è Grid Search n√£o configurado para {best_model_name}")
else:
    print(f"\nüîß OTIMIZANDO {best_model_name.upper()}...")
    print("="*80)

    # Exhaustive 5-fold search over the grid, scored by accuracy, on all cores
    grid_search = GridSearchCV(
        estimator=best_model,
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    print("\n‚úÖ Otimiza√ß√£o conclu√≠da!")
    print(f"\nüìä Melhores Par√¢metros:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")

    # Score the refitted best estimator on the hold-out split
    optimized_model = grid_search.best_estimator_
    y_pred_optimized = optimized_model.predict(X_test)
    optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
    optimized_pct = optimized_accuracy * 100

    print(f"\nüìà COMPARA√á√ÉO:")
    print(f"  Modelo Original: {best_accuracy:.2f}%")
    print(f"  Modelo Otimizado: {optimized_pct:.2f}%")
    print(f"  Melhoria: {(optimized_pct - best_accuracy):.2f}%")

    # Replace the selected model only when tuning strictly improved the
    # hold-out accuracy (both values are fractions here, not percentages)
    baseline_accuracy = results[best_model_name]['accuracy']
    if optimized_accuracy > baseline_accuracy:
        best_model = optimized_model
        print("\n‚úÖ Modelo otimizado ser√° usado!")
    else:
        print("\n‚ö†Ô∏è Modelo original teve melhor desempenho.")

## 9. Salvar Modelo e Artefatos

In [None]:
# Persist the selected model together with the preprocessing objects and
# metadata produced by this notebook.
from pathlib import Path

# Save the trained model
model_path = '../models/best_model.pkl'
# joblib.dump writes to an existing folder only, so create it when missing
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"‚úÖ Modelo salvo em: {model_path}")

# Save the per-column feature encoders
encoders_path = '../models/label_encoders.pkl'
joblib.dump(label_encoders, encoders_path)
print(f"‚úÖ Label Encoders salvos em: {encoders_path}")

# Save the target encoder
target_encoder_path = '../models/target_encoder.pkl'
joblib.dump(le_target, target_encoder_path)
print(f"‚úÖ Target Encoder salvo em: {target_encoder_path}")

# Save the fitted scaler
scaler_path = '../models/scaler.pkl'
joblib.dump(scaler, scaler_path)
print(f"‚úÖ Scaler salvo em: {scaler_path}")

# Save the ordered list of feature names
features_path = '../models/feature_names.pkl'
joblib.dump(X_scaled.columns.tolist(), features_path)
print(f"‚úÖ Feature names salvos em: {features_path}")

# Save the model metrics.
# The accuracy is measured on the model actually being saved. Relying on
# whether `optimized_accuracy` exists would store the tuned model's score even
# when the original model was kept, or a stale value from an earlier run.
saved_model_accuracy = accuracy_score(y_test, best_model.predict(X_test))
metrics_path = '../models/model_metrics.pkl'
model_metrics = {
    'model_name': best_model_name,
    'accuracy': saved_model_accuracy,  # fraction in [0, 1], not a percentage
    'results_df': results_df
}
joblib.dump(model_metrics, metrics_path)
print(f"‚úÖ M√©tricas salvas em: {metrics_path}")

print("\n" + "="*80)
print("üéâ TODOS OS ARTEFATOS SALVOS COM SUCESSO!")
print("="*80)

## 10. Resumo Final

In [None]:
# Final recap: goal, selected model, its hold-out accuracy, saved artefact
# paths and the planned next steps.
print("\n" + "="*80)
print("üìä RESUMO DO PROJETO")
print("="*80)
print(f"\nüéØ Objetivo: Acur√°cia > 75%")
print(f"\nüèÜ Melhor Modelo: {best_model_name}")
# Report the accuracy of the model that was actually selected. Checking for the
# existence of `optimized_accuracy` would show the tuned model's score even
# when the original model was kept.
final_accuracy = accuracy_score(y_test, best_model.predict(X_test)) * 100
print(f"üìà Acur√°cia Alcan√ßada: {final_accuracy:.2f}%")
print(f"\nüì¶ Artefatos Salvos:")
print(f"  - Modelo: {model_path}")
print(f"  - Encoders: {encoders_path}")
print(f"  - Target Encoder: {target_encoder_path}")
print(f"  - Scaler: {scaler_path}")
print(f"  - Features: {features_path}")
print(f"  - M√©tricas: {metrics_path}")
print(f"\n‚úÖ Pr√≥ximos Passos:")
print(f"  1. Desenvolver aplica√ß√£o Streamlit para predi√ß√£o")
print(f"  2. Criar dashboard anal√≠tico para equipe m√©dica")
print(f"  3. Documentar e testar aplica√ß√£o")
print("="*80)