### 03 - Treinamento dos Modelos

Treina os modelos do artigo + melhorias propostas

# Imports

In [None]:
import sys
sys.path.append('/home/jovyan/work')

import pandas as pd
import numpy as np
from src import S3Client, ModelTrainer, MLFlowClient

# Carregar dados processados

In [None]:
# Project S3 wrapper; every split below is fetched through it.
s3 = S3Client()

# Feature matrices (the object keys suggest they were scaled upstream).
X_train, X_test = (
    s3.read_csv('processed/X_train_scaled.csv'),
    s3.read_csv('processed/X_test_scaled.csv'),
)

# Labels: only the 'target' column of each label file is kept.
y_train, y_test = (
    s3.read_csv('processed/y_train.csv')['target'],
    s3.read_csv('processed/y_test.csv')['target'],
)

# Sanity check: report the shape of each loaded split.
print(f"üìä X_train: {X_train.shape}")
print(f"üìä X_test: {X_test.shape}")
print(f"üìä y_train: {y_train.shape}")
print(f"üìä y_test: {y_test.shape}")

# Inicializar trainer e MLFlow

In [None]:
# Name of the MLFlow experiment that groups the runs of this notebook.
EXPERIMENT_NAME = "heart-disease-prediction"

# Trainer used by the cells below for fitting, cross-validating and saving.
trainer = ModelTrainer()

# Tracking client bound to the experiment above.
mlflow_client = MLFlowClient(experiment_name=EXPERIMENT_NAME)

# Fase 1: Modelos do Artigo

KNN, Random Forest, Logistic Regression, SVM, Gaussian NB, Decision Tree

## Treinar modelos do artigo

In [None]:
# Fit the baseline models on the training split; the result is used below as
# a name -> fitted model mapping.
article_models = trainer.train_article_models(X_train, y_train)

# Summary: how many models were trained, then one line per model name.
n_article_models = len(article_models)
print(f"\n‚úÖ {n_article_models} modelos do artigo treinados!")
for model_name in article_models.keys():
    print(f"   - {model_name}")

# Fase 2: Modelos de Melhoria

Gradient Boosting, Random Forest tuned, XGBoost

## Treinar modelos de melhoria

In [None]:
# Fit the improvement candidates on the same training split; the result is
# used below as a name -> fitted model mapping.
improved_models = trainer.train_improved_models(X_train, y_train)

# Summary: how many models were trained, then one line per model name.
n_improved_models = len(improved_models)
print(f"\n‚úÖ {n_improved_models} modelos de melhoria treinados!")
for model_name in improved_models.keys():
    print(f"   - {model_name}")

# Combinar todos os modelos

In [None]:
# Merge both groups into one name -> model mapping. Article models come
# first; on a name clash the improved model replaces the article one.
all_models = dict(article_models)
all_models.update(improved_models)

total_models = len(all_models)
print(f"\nüìä Total de modelos treinados: {total_models}")

# Validação Cruzada

## Cross-validation para cada modelo

In [None]:
# Number of folds passed to trainer.cross_validate for every model.
N_FOLDS = 5

# Cross-validation outcome per model name; later cells read the 'mean' and
# 'std' entries of each value.
cv_results = {}

for model_name, estimator in all_models.items():
    print(f"\nüîÑ Cross-validation: {model_name}")
    cv_results[model_name] = trainer.cross_validate(
        estimator, X_train, y_train, cv=N_FOLDS
    )

# Comparar resultados CV

In [None]:
# Flatten the per-model CV results into three parallel columns.
model_names = list(cv_results.keys())
mean_scores = [result['mean'] for result in cv_results.values()]
std_scores = [result['std'] for result in cv_results.values()]

cv_table = pd.DataFrame({
    'model': model_names,
    'cv_mean': mean_scores,
    'cv_std': std_scores,
})

# Best mean score first, so the top row is the best model by CV.
cv_df = cv_table.sort_values(by='cv_mean', ascending=False)

print("\nüìä Resultados Cross-Validation:")
display(cv_df)

# Salvar Modelos no MLFlow

In [None]:
# Register every trained model in MLFlow, one run per model: hyperparameters
# (when the estimator exposes get_params), the cross-validation summary (when
# one was computed for that name) and the model itself.
for name, model in all_models.items():
    # Start a dedicated run for this model
    run = mlflow_client.start_run(run_name=f"train_{name}")

    # try/finally guarantees the run is closed even when a logging call
    # raises. Without it a failure would leave the run open; presumably
    # (MLFlowClient is not visible here -- confirm) the next start_run would
    # then fail or attach to the stale run.
    try:
        # Log hyperparameters
        if hasattr(model, 'get_params'):
            mlflow_client.log_params(model.get_params())

        # Log CV metrics
        if name in cv_results:
            mlflow_client.log_metrics({
                'cv_mean_accuracy': cv_results[name]['mean'],
                'cv_std_accuracy': cv_results[name]['std']
            })

        # Log the model and register it under its own name
        mlflow_client.log_model(model, artifact_path="model", registered_model_name=name)
    finally:
        # Always close the run, on success and on failure alike
        mlflow_client.end_run()

    # Only reached when every logging step above succeeded
    print(f"‚úÖ {name} registrado no MLFlow")


# Melhor Modelo

In [None]:
# cv_df is sorted by descending cv_mean, so its first row is the winner.
top_row = cv_df.iloc[0]
best_model_name = top_row['model']
best_cv_score = top_row['cv_mean']

print(f"\nüèÜ Melhor modelo (CV): {best_model_name}")
print(f"   Accuracy: {best_cv_score:.4f}")

# Hand the winning fitted model and its name to the trainer for saving.
best_model = all_models[best_model_name]
trainer.save_best_model(best_model, best_model_name)