In [None]:
# Notebook: Entrenamiento y Evaluación de Modelos
# ================================================

# %% [markdown]
# # 🚀 Pipeline Completo de Entrenamiento y Evaluación
# 
# Este notebook ejecuta el pipeline completo:
# 1. Carga y preprocesamiento de datos
# 2. Ingeniería de características
# 3. Entrenamiento de 4 modelos (LSTM, GRU, TCN, TFT)
# 4. Evaluación y comparación

# %% [markdown]
# ## 📦 Imports

# %%
import sys
import os
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings from the libraries below are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

# Add the parent directory to the import path so the `src` package below
# resolves (assumes the notebook runs from a subfolder of the project root
# — TODO confirm). Must stay ahead of the `from src...` imports.
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils import load_config, set_seed, create_directories
from src.data_loader import StockDataLoader
from src.features import FeatureEngineering
from src.train import TrainingPipeline
from src.evaluate import evaluate_from_pipeline

# Global plot styling: seaborn grid theme and a default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úì Imports completados")

# %% [markdown]
# ## ⚙️ Configuración

# %%
# Load the YAML configuration, hand its seed to set_seed, and let
# create_directories prepare whatever paths it derives from the config.
config = load_config('../config.yaml')
set_seed(config['training']['random_seed'])
create_directories(config)

# Echo the settings that control windowing and the chronological split
print("Configuraci√≥n cargada:")
features_cfg = config['features']
print(f"  - Lookback: {features_cfg['lookback_window']} d√≠as")
print(f"  - Horizonte: {features_cfg['prediction_horizon']} d√≠a(s)")
data_cfg = config['data']
print(f"  - Train hasta: {data_cfg['train_end']}")
print(f"  - Val hasta: {data_cfg['val_end']}")

# %% [markdown]
# ## 🔍 Exploración Rápida de Datos

# %%
# Load the full dataset through the project loader
loader = StockDataLoader(config)
df = loader.load_full_data()

# Headline numbers: row count, distinct tickers, covered date range
print(f"\nüìä Dataset:")
print(f"  - Filas: {len(df):,}")
print(f"  - Tickers: {df['ticker'].nunique()}")
date_col = df['Date']
print(f"  - Rango: {date_col.min().date()} ‚Üí {date_col.max().date()}")

# Two side-by-side panels: closing-price histogram and per-ticker row counts
fig, (ax_prices, ax_counts) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: distribution of closing prices across the whole frame
ax_prices.hist(df['Close'], bins=100, edgecolor='black', alpha=0.7)
ax_prices.set_xlabel('Precio de Cierre ($)')
ax_prices.set_ylabel('Frecuencia')
ax_prices.set_title('Distribuci√≥n de Precios')

# Right panel: the ten tickers with the most rows
top_tickers = df['ticker'].value_counts().head(10)
positions = range(len(top_tickers))
ax_counts.barh(positions, top_tickers.values)
ax_counts.set_yticks(positions)
ax_counts.set_yticklabels(top_tickers.index)
ax_counts.set_xlabel('N√∫mero de registros')
ax_counts.set_title('Top 10 Tickers por cantidad de datos')

fig.tight_layout()
plt.show()

# %% [markdown]
# ## 🏋️ Entrenamiento de Modelos
# 
# **ADVERTENCIA**: Esto puede tardar varios minutos u horas dependiendo de:
# - Tamaño del dataset
# - Hardware disponible (GPU recomendada)
# - Número de epochs

# %%
# Architectures to fit. All four are enabled here; for a quicker trial run,
# shorten the list (for example to ['lstm', 'gru']).
models_to_train = ['lstm', 'gru', 'tcn', 'tft']

# Build the training pipeline from the same config file used above
pipeline = TrainingPipeline('../config.yaml')

print(f"\nüéØ Modelos a entrenar: {models_to_train}")
print("‚è≥ Iniciando entrenamiento (puede tardar)...\n")

# Run the pipeline for the selected models and keep what it returns;
# later cells read it as a mapping of model name -> dict with a 'history' entry.
trained_models = pipeline.run(models=models_to_train)

# %% [markdown]
# ## 📊 Visualización de Historiales de Entrenamiento

# %%
import json

# One row of panels per trained model: loss on the left, MAE on the right.
# squeeze=False keeps `axes` two-dimensional even when only one model was
# trained, so `axes[row]` is always a pair of Axes. (The previous reshape +
# special-cased indexing handed a whole row array to the plotting calls and
# crashed in the single-model case.)
n_models = len(trained_models)
fig, axes = plt.subplots(n_models, 2, figsize=(14, 4 * n_models), squeeze=False)

# (history key, legend/title label, y-axis label) for each column
panels = (
    ('loss', 'Loss', 'Loss (MSE)'),
    ('mae', 'MAE', 'MAE'),
)

for row, (model_name, model_data) in enumerate(trained_models.items()):
    # `.history` is read as a dict of per-epoch metric lists
    history = model_data['history'].history

    for ax, (metric, label, ylabel) in zip(axes[row], panels):
        ax.plot(history[metric], label=f'Train {label}', linewidth=2)
        ax.plot(history[f'val_{metric}'], label=f'Val {label}', linewidth=2)
        ax.set_title(f'{model_name.upper()} - {label}', fontweight='bold')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../results/plots/training_history.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Historiales guardados en results/plots/training_history.png")

# %% [markdown]
# ## 🧪 Evaluación en Test Set
# 
# Ahora evaluaremos los modelos en datos nunca vistos (test set)

# %%
# Rebuild the test split by walking the pipeline's own preparation steps
print("‚è≥ Preparando datos de test...")

# Raw splits first, then feature engineering applied to all three splits
raw_splits = pipeline.load_and_prepare_data()
train_df, val_df, test_df = raw_splits
featured_splits = pipeline.engineer_features(train_df, val_df, test_df)
train_df, val_df, test_df = featured_splits

# Sequence creation returns a 13-item tuple; the two extra items that follow
# the train and val arrays are discarded, while the test ones are kept.
results = pipeline.create_sequences(train_df, val_df, test_df)
(
    X_train, y_train, _, _,
    X_val, y_val, _, _,
    X_test, y_test, test_tickers, test_dates,
    feature_cols,
) = results

# Normalise all six arrays in a single call to the pipeline
normalized = pipeline.normalize_data(X_train, y_train, X_val, y_val, X_test, y_test)
X_train, y_train, X_val, y_val, X_test, y_test = normalized

print(f"‚úì Test set preparado: {X_test.shape}")

# Score every selected model on the held-out arrays
print("\n‚è≥ Evaluando modelos en test set...\n")
eval_results, df_results = evaluate_from_pipeline(X_test, y_test, models=models_to_train)

# %% [markdown]
# ## 📈 Análisis de Resultados

# %%
# Print the metrics table framed by separator rules
rule = "="*70
print("\n" + rule)
print("RANKING DE MODELOS (por RMSE)")
print(rule)
print(df_results.to_string(index=False))
print(rule + "\n")

# The first row is taken as the winner
# (assumes df_results arrives already sorted by RMSE — TODO confirm).
top_row = df_results.iloc[0]
best_model = top_row['Model']
best_rmse = top_row['RMSE']
best_r2 = top_row['R2']

print(f"üèÜ MEJOR MODELO: {best_model}")
print(f"   RMSE: ${best_rmse:.4f}")
print(f"   R¬≤: {best_r2:.4f}")

# %% [markdown]
# ## 🎯 Análisis de Errores

# %%
# Locate the evaluation record of the best model. Records are matched on the
# lower-cased model name; fail with an explicit message instead of an opaque
# IndexError when nothing matches.
best_key = str(best_model).lower()
best_result = next((r for r in eval_results if r['model_name'] == best_key), None)
if best_result is None:
    raise ValueError(f"No evaluation result found for model {best_model!r}")

# Work on flat float arrays so that a (n, 1) prediction array and a (n,)
# target array cannot silently broadcast into an (n, n) error matrix.
# NOTE(review): assumes y_true and y_pred hold the same number of values in
# the same order — confirm against evaluate_from_pipeline.
y_true = np.asarray(best_result['y_true'], dtype=float).ravel()
y_pred = np.asarray(best_result['y_pred'], dtype=float).ravel()
errors = y_pred - y_true

# Percentage error is undefined where the target is exactly zero; those
# samples are left out rather than producing inf/NaN, which would break the
# histogram and the MAPE below.
nonzero_target = y_true != 0
percentage_errors = errors[nonzero_target] / y_true[nonzero_target] * 100

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: distribution of absolute errors
axes[0].hist(errors, bins=100, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].axvline(0, color='red', linestyle='--', linewidth=2, label='Error = 0')
axes[0].set_xlabel('Error ($)')
axes[0].set_ylabel('Frecuencia')
axes[0].set_title(f'{best_model} - Distribuci√≥n de Errores')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Panel 2: distribution of percentage errors (zero-target samples excluded)
axes[1].hist(percentage_errors, bins=100, edgecolor='black', alpha=0.7, color='coral')
axes[1].axvline(0, color='red', linestyle='--', linewidth=2)
axes[1].set_xlabel('Error (%)')
axes[1].set_ylabel('Frecuencia')
axes[1].set_title('Distribuci√≥n de Errores Porcentuales')
axes[1].grid(alpha=0.3)

# Panel 3: error against the true value, to spot level-dependent bias
axes[2].scatter(y_true, errors, alpha=0.3, s=10)
axes[2].axhline(0, color='red', linestyle='--', linewidth=2)
axes[2].set_xlabel('Valor Real ($)')
axes[2].set_ylabel('Error ($)')
axes[2].set_title('Errores vs Valores Reales')
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../results/plots/error_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics. np.abs is used instead of the `.abs()` method, which
# NumPy arrays do not have; MAPE is NaN when no sample has a non-zero target.
mape = np.abs(percentage_errors).mean() if percentage_errors.size else float('nan')
print("\nEstad√≠sticas de Errores:")
print(f"  Media: ${errors.mean():.4f}")
print(f"  Mediana: ${np.median(errors):.4f}")
print(f"  Std: ${errors.std():.4f}")
print(f"  MAPE: {mape:.2f}%")

# %% [markdown]
# ## 💾 Guardar Resultados Finales

# %%
import json


def _json_default(value):
    """Convert NumPy scalars and arrays to built-in types for ``json.dump``.

    ``json`` calls this only for objects it cannot serialise itself, e.g.
    ``np.int64`` or ``np.float32`` metric values coming out of a DataFrame.

    Raises:
        TypeError: for any other unsupported type, mirroring json's default.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Final summary: winning model, the full metrics table, and dataset sizes
summary = {
    'best_model': best_model,
    'metrics': df_results.to_dict('records'),
    'dataset_info': {
        'train_samples': int(len(X_train)),
        'val_samples': int(len(X_val)),
        'test_samples': int(len(X_test)),
        # list() so an Index/array of column names serialises like a plain list
        'features': list(feature_cols),
        'tickers': int(df['ticker'].nunique())
    }
}

with open('../results/final_summary.json', 'w') as f:
    json.dump(summary, f, indent=2, default=_json_default)

print("‚úì Resumen guardado en results/final_summary.json")

# %% [markdown]
# ## 🎉 Conclusión
# 
# El pipeline completo ha sido ejecutado exitosamente. Los resultados incluyen:
# - **Modelos entrenados**: Guardados en `results/models/`
# - **Gráficos**: Guardados en `results/plots/`
# - **Tablas**: Guardadas en `results/tables/`
# - **Resumen**: `results/final_summary.json`
# 
# ### Próximos pasos:
# 1. Analizar resultados en profundidad
# 2. Ajustar hiperparámetros si es necesario
# 3. Probar con diferentes ventanas temporales
# 4. Implementar predicciones en producción

# Closing banner framed by two separator rules
banner_rule = "="*70
print("\n" + banner_rule)
print("‚úÖ PIPELINE COMPLETADO EXITOSAMENTE")
print(banner_rule)