## 05 - Otimização de Hiperparâmetros e Predições Finais
1. Otimização de hiperparâmetros (Grid Search)
2. Treinamento do modelo otimizado
3. Predições no conjunto de validação
4. Comparação com modelos base

# Imports

In [None]:
import sys
sys.path.append('/home/jovyan/work')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from src import S3Client, ModelTrainer, ModelEvaluator, MLFlowClient, DatabaseClient

# Carregamento dos dados

In [None]:
# Object-store client used for every read/write in this notebook.
s3 = S3Client()


def _load_split(split_name):
    """Read the feature matrix and the 'target' column for ``split_name``.

    Features come from ``processed/X_<split>_scaled.csv`` and the target
    from ``processed/y_<split>.csv``; the features file is read first.
    """
    features = s3.read_csv(f'processed/X_{split_name}_scaled.csv')
    target = s3.read_csv(f'processed/y_{split_name}.csv')['target']
    return features, target


X_train, y_train = _load_split('train')
X_test, y_test = _load_split('test')
X_val, y_val = _load_split('val')

# Report the shape of each feature matrix as a quick sanity check.
for split_label, split_features in (
    ('Train', X_train),
    ('Test', X_test),
    ('Validation', X_val),
):
    print(f"üìä {split_label}: {split_features.shape}")

# Inicialização

In [None]:
# Helpers shared by the cells below: `trainer` builds and tunes models,
# `evaluator` computes metrics, `mlflow_client` records runs and models.
EXPERIMENT_NAME = "heart-disease-hyperparameter-tuning"

trainer = ModelTrainer()
evaluator = ModelEvaluator()
mlflow_client = MLFlowClient(experiment_name=EXPERIMENT_NAME)

## Parte 1: Otimização de Hiperparâmetros

### Random Forest - Grid Search

In [None]:
print("üîÑ Otimizando Random Forest...")

# Candidate values for each Random Forest hyperparameter; the search
# evaluates every combination of them.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Untuned estimator that the search starts from.
rf_base = trainer.get_model('random_forest')

# Number of combinations = product of the candidate-list lengths.
rf_grid_sizes = [len(candidates) for candidates in rf_param_grid.values()]
print(f"üìä Testando {np.prod(rf_grid_sizes)} combina√ß√µes...")
print("‚è±Ô∏è  Isso pode levar alguns minutos...\n")

# 5-fold cross-validated search scored on accuracy.
rf_tuned, rf_best_params = trainer.hyperparameter_tuning(
    model=rf_base,
    param_grid=rf_param_grid,
    X_train=X_train,
    y_train=y_train,
    cv=5,
    scoring='accuracy',
)

print("\nüèÜ Melhores hiperpar√¢metros Random Forest:")
for hp_name, hp_value in rf_best_params.items():
    print(f"   {hp_name}: {hp_value}")

### Logistic Regression - Grid Search

In [None]:
print("\nüîÑ Otimizando Logistic Regression...")

# Candidate values for each Logistic Regression hyperparameter.
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000],
}

# Untuned estimator that the search starts from.
lr_base = trainer.get_model('logistic_regression')

lr_combinations = np.prod([len(options) for options in lr_param_grid.values()])
print(f"üìä Testando {lr_combinations} combina√ß√µes...")

# Cross-validation settings, kept together so they are easy to spot.
lr_search_options = {'cv': 5, 'scoring': 'accuracy'}
lr_tuned, lr_best_params = trainer.hyperparameter_tuning(
    model=lr_base,
    param_grid=lr_param_grid,
    X_train=X_train,
    y_train=y_train,
    **lr_search_options,
)

print("\nüèÜ Melhores hiperpar√¢metros Logistic Regression:")
for hp_name, hp_value in lr_best_params.items():
    print(f"   {hp_name}: {hp_value}")

### SVM - Grid Search

In [None]:
print("\nüîÑ Otimizando SVM...")

# Candidate values for each SVM hyperparameter.
svm_c_values = [0.1, 1, 10, 100]
svm_gamma_values = ['scale', 'auto', 0.001, 0.01, 0.1]
svm_kernels = ['rbf', 'poly']
svm_param_grid = {
    'C': svm_c_values,
    'gamma': svm_gamma_values,
    'kernel': svm_kernels,
}

# Untuned estimator that the search starts from.
svm_base = trainer.get_model('svm')

svm_combinations = np.prod([len(options) for options in svm_param_grid.values()])
print(f"üìä Testando {svm_combinations} combina√ß√µes...")
print("‚ö†Ô∏è  SVM pode demorar mais...")

# 5-fold cross-validated search scored on accuracy.
svm_search = trainer.hyperparameter_tuning(
    model=svm_base,
    param_grid=svm_param_grid,
    X_train=X_train,
    y_train=y_train,
    cv=5,
    scoring='accuracy',
)
svm_tuned, svm_best_params = svm_search

print("\nüèÜ Melhores hiperpar√¢metros SVM:")
for hp_name, hp_value in svm_best_params.items():
    print(f"   {hp_name}: {hp_value}")

## Parte 2: Avaliar Modelos Otimizados

In [None]:
# Tuned estimators keyed by the name used for evaluation, for the MLflow
# run ("tuned_<name>") and for the registered model.
tuned_models = {
    'random_forest_optimized': rf_tuned,
    'logistic_regression_optimized': lr_tuned,
    'svm_optimized': svm_tuned
}

# One metrics dict per tuned model, in the same order as `tuned_models`.
results_tuned = []

for name, model in tuned_models.items():
    print(f"\nüîÑ Avaliando: {name}")

    # Predictions on the test split; probabilities only when the
    # estimator exposes `predict_proba`.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

    # Compute the evaluation metrics.
    metrics = evaluator.evaluate_model(y_test, y_pred, y_pred_proba, model_name=name)
    results_tuned.append(metrics)

    # Log to MLflow. The run is closed in `finally` so that a failure while
    # logging does not leave it open and swallow the next model's logs.
    run = mlflow_client.start_run(run_name=f"tuned_{name}")
    try:
        mlflow_client.log_params(model.get_params())
        mlflow_client.log_metrics({
            'test_accuracy': metrics['accuracy'],
            'test_precision': metrics['precision'],
            'test_recall': metrics['recall'],
            'test_f1': metrics['f1_score']
        })
        mlflow_client.log_model(model, artifact_path="model", registered_model_name=name)
    finally:
        mlflow_client.end_run()

print("\n‚úÖ Modelos otimizados avaliados!")

### Comparação: Base vs Otimizado

In [None]:
# Registered names of the untuned models, loaded from the MLflow registry.
base_models_names = ['random_forest', 'logistic_regression', 'svm']

# One metrics dict per base model that could be loaded and evaluated.
results_base = []

for name in base_models_names:
    try:
        # Load the latest registered version of the base model.
        model_uri = f"models:/{name}/latest"
        model = mlflow_client.load_model(model_uri)

        # Predictions on the test split; probabilities only when the
        # loaded object exposes `predict_proba`.
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

        # Compute the evaluation metrics.
        metrics = evaluator.evaluate_model(y_test, y_pred, y_pred_proba, model_name=name)
        results_base.append(metrics)
    except Exception as exc:
        # Best effort: a missing or unusable base model is skipped, not
        # fatal. `Exception` (not a bare `except`) keeps KeyboardInterrupt
        # and SystemExit working, and the cause is printed so that real
        # errors are not mistaken for "model not found".
        print(f"‚ö†Ô∏è  Modelo base {name} n√£o encontrado")
        print(f"   ({type(exc).__name__}: {exc})")

In [None]:
# Column order of the comparison table. Passing it to the DataFrame keeps
# the columns present even when no base model could be evaluated.
_COMPARISON_COLUMNS = [
    'Model', 'Base Accuracy', 'Tuned Accuracy', 'Improvement', 'Improvement %',
]
_TUNED_SUFFIX = '_optimized'


def _base_name_of(tuned_name):
    """Strip the '_optimized' suffix so tuned results match base names."""
    if tuned_name.endswith(_TUNED_SUFFIX):
        return tuned_name[:-len(_TUNED_SUFFIX)]
    return tuned_name


# Pair results by model name rather than by position: a base model that
# failed to load is absent from `results_base`, so a positional zip would
# compare different algorithms with each other.
# NOTE(review): assumes each metrics dict echoes the `model_name` passed to
# `evaluate_model`, as the rest of the notebook already does.
tuned_by_base_name = {
    _base_name_of(tuned['model_name']): tuned for tuned in results_tuned
}

comparison_data = []

for base in results_base:
    tuned = tuned_by_base_name.get(base['model_name'])
    if tuned is None:
        # No tuned counterpart for this base model; nothing to compare.
        continue

    improvement = tuned['accuracy'] - base['accuracy']
    # A relative improvement is undefined for a zero baseline.
    improvement_pct = (
        improvement / base['accuracy'] * 100 if base['accuracy'] else float('nan')
    )
    comparison_data.append({
        'Model': base['model_name'].replace('_', ' ').title(),
        'Base Accuracy': base['accuracy'],
        'Tuned Accuracy': tuned['accuracy'],
        'Improvement': improvement,
        'Improvement %': improvement_pct
    })

comparison_df = pd.DataFrame(comparison_data, columns=_COMPARISON_COLUMNS)

print("\nüìä Compara√ß√£o Base vs Otimizado:")
display(comparison_df)

### Visualizações

In [None]:
# Grouped bar chart: base vs tuned accuracy for each model.
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(comparison_df))
width = 0.35

# (horizontal offset, dataframe column, legend label, colour) per series.
series_specs = (
    (-width / 2, 'Base Accuracy', 'Base', '#3498db'),
    (width / 2, 'Tuned Accuracy', 'Otimizado', '#e74c3c'),
)
bars1, bars2 = [
    ax.bar(x + offset, comparison_df[column], width,
           label=legend_label, color=colour, alpha=0.8)
    for offset, column, legend_label, colour in series_specs
]

ax.set_xlabel('Modelo')
ax.set_ylabel('Accuracy')
ax.set_title('Compara√ß√£o: Modelos Base vs Otimizados')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Write each bar's value just above it, centred horizontally.
for bar in (*bars1, *bars2):
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_height, f'{bar_height:.3f}',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## Parte 3: Predições no Conjunto de Validação

In [None]:
def _accuracy_of(result):
    """Return the accuracy stored in one metrics dict."""
    return result['accuracy']


# Tuned model with the highest test accuracy (first one wins on ties).
best_tuned = max(results_tuned, key=_accuracy_of)
best_model_name, best_accuracy = best_tuned['model_name'], best_tuned['accuracy']

print(f"\nüèÜ Melhor modelo otimizado: {best_model_name}")
print(f"   Accuracy: {best_accuracy:.4f}")

# Fetch the fitted estimator that produced those metrics.
best_model = tuned_models[best_model_name]

In [None]:
print("\nüîÑ Realizando predi√ß√µes no conjunto de valida√ß√£o...")

# Predictions on the validation split; probabilities only when the
# estimator exposes `predict_proba`.
y_val_pred = best_model.predict(X_val)
if hasattr(best_model, 'predict_proba'):
    y_val_pred_proba = best_model.predict_proba(X_val)
else:
    y_val_pred_proba = None

# Compute the evaluation metrics under a "<model>_validation" name.
validation_run_name = f"{best_model_name}_validation"
val_metrics = evaluator.evaluate_model(
    y_val, y_val_pred, y_val_pred_proba, model_name=validation_run_name
)

print("\nüìä Resultados no Conjunto de Valida√ß√£o:")
# Labels are padded to a common width so the values line up.
for metric_label, metric_key in (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
):
    print(f"   {metric_label:<10} {val_metrics[metric_key]:.4f}")

In [None]:
# Confusion matrix of the validation predictions, drawn as a heatmap.
cm_val = evaluator.get_confusion_matrix(y_val, y_val_pred)

# Tick labels used on both axes.
class_labels = ['Saud√°vel', 'Doen√ßa']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_val,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Matriz de Confus√£o - Valida√ß√£o\n{best_model_name}')
plt.ylabel('Real')
plt.xlabel('Predito')
plt.tight_layout()
plt.show()

# Salvar predições e Modelo Final

In [None]:
# Second probability column when probabilities are available; otherwise
# the predicted label itself is stored in the probability column.
if y_val_pred_proba is not None:
    disease_scores = y_val_pred_proba[:, 1]
else:
    disease_scores = y_val_pred

# One row per validation sample, with a flag for correct predictions.
predictions_df = pd.DataFrame({
    'patient_id': range(len(y_val)),
    'true_label': y_val.values,
    'predicted_label': y_val_pred,
    'probability_disease': disease_scores,
    'correct': y_val.values == y_val_pred,
})

print("\nüìä Primeiras predi√ß√µes:")
display(predictions_df.head(10))

# Summary: how many predictions match the true label.
total_count = len(predictions_df)
correct_count = predictions_df['correct'].sum()

print(f"\n‚úÖ Predi√ß√µes corretas: {correct_count}/{total_count} ({correct_count/total_count*100:.1f}%)")

In [None]:
# Persist the validation predictions through the object-store client.
PREDICTIONS_KEY = 'predictions/validation_predictions.csv'
s3.write_csv(predictions_df, PREDICTIONS_KEY)
print("\n‚úÖ Predi√ß√µes salvas no MinIO!")

In [None]:
# Registrar como modelo de produ√ß√£o
model_version = mlflow_client.register_model(
    run_id=mlflow_client.client.search_runs(
        experiment_ids=[mlflow_client.experiment_id],
        filter_string=f"tags.mlflow.runName = 'tuned_{best_model_name}'",
        max_results=1
    )[0].info.run_id,
    model_name="production_model",
    artifact_path="model"
)

# Transicionar para produ√ß√£o
mlflow_client.transition_model_stage(
    model_name="production_model",
    version=model_version,
    stage="Production"
)

print(f"\n‚úÖ Modelo de produ√ß√£o registrado: production_model v{model_version}")

# Resumo Final

# Final report: base vs tuned accuracies, the selected model and the
# average relative improvement.
summary_rule = "=" * 60

print("\n" + summary_rule)
print("üìä RESUMO FINAL DO PROJETO")
print(summary_rule)

print("\nüéØ MODELOS BASE (Artigo):")
for result in results_base:
    print(f"   {result['model_name']}: {result['accuracy']:.4f}")

print("\nüöÄ MODELOS OTIMIZADOS:")
for result in results_tuned:
    print(f"   {result['model_name']}: {result['accuracy']:.4f}")

print(f"\nüèÜ MELHOR MODELO: {best_model_name}")
print(f"   Test Accuracy:       {best_accuracy:.4f}")
print(f"   Validation Accuracy: {val_metrics['accuracy']:.4f}")

# The comparison table is empty (and may have no columns at all) when no
# base model could be loaded; report that instead of raising KeyError.
if 'Improvement %' in comparison_df.columns and not comparison_df.empty:
    print(f"\nüìà MELHORIA M√âDIA: {comparison_df['Improvement %'].mean():.2f}%")
else:
    print("\nüìà MELHORIA M√âDIA: N/A")

print("\n‚úÖ PROJETO CONCLU√çDO!")
print(summary_rule)