In [None]:
# Configuración e imports
import sys
import os
sys.path.append('../scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Matplotlib defaults applied to every figure created after this cell
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Confirm the setup cell ran and show the current working directory
print("✅ Librerías importadas correctamente")
print(f"📍 Directorio actual: {os.getcwd()}")


In [None]:
# Función para cargar modelos
def load_model_safe(model_path):
    """Unpickle a model bundle, reporting failures instead of raising.

    Args:
        model_path: Path of the pickle file; it is opened in binary mode.

    Returns:
        The unpickled object, or ``None`` when the file does not exist or
        any other ``Exception`` is raised while opening or unpickling it.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files from a trusted source.
    """
    try:
        with open(model_path, 'rb') as handle:
            loaded = pickle.load(handle)
        print(f"✅ Modelo cargado: {model_path}")
        return loaded
    except Exception as err:
        # A missing file gets its own message; anything else is reported with the error text
        if isinstance(err, FileNotFoundError):
            print(f"❌ No se encontró: {model_path}")
        else:
            print(f"❌ Error cargando {model_path}: {err}")
    return None

# Load the two trained model bundles. load_model_safe returns None when a file
# cannot be read, so either name may be None from this point on.
# NOTE(review): the bundles are unpickled from disk — only use trusted files.
global_model = load_model_safe('../models/xgboost_global_features.pkl')
nearby_model = load_model_safe('../models/xgboost_nearby_features.pkl')

# For every bundle that loaded, print its validation R² and feature count.
# The bundles are indexed as mappings: ['metrics']['val_r2'] and ['features'].
print("\n📊 INFORMACIÓN DE MODELOS:")
if global_model:
    print(f"🌐 Global - R²: {global_model['metrics']['val_r2']:.4f}, Features: {len(global_model['features'])}")
if nearby_model:
    print(f"🔍 Cercanas - R²: {nearby_model['metrics']['val_r2']:.4f}, Features: {len(nearby_model['features'])}")

# Load the validation datasets. Unlike the model loading above, nothing is
# caught here: a missing or unreadable CSV makes pd.read_csv raise.
print("\n📂 Cargando datasets...")
val_global = pd.read_csv('../data/val_global.csv')
val_nearby = pd.read_csv('../data/val_nearby.csv')

# Show the (rows, columns) shape of each validation frame
print(f"✅ Val Global: {val_global.shape}")
print(f"✅ Val Cercanas: {val_nearby.shape}")


In [None]:
# Comparación de métricas
def _plot_metric_bars(ax, labels, values, title, ylabel, label_offset, value_format):
    """Draw one two-colour bar panel and annotate every bar with its value.

    Args:
        ax: Matplotlib axes to draw on.
        labels: Bar labels used as x positions.
        values: Bar heights, one per label.
        title: Panel title.
        ylabel: Y-axis label.
        label_offset: Gap, in data units, between a bar tip and its text.
        value_format: Format spec applied to each value (``''`` means ``str``).

    Returns:
        The bar container returned by ``ax.bar``.
    """
    bars = ax.bar(labels, values, color=['steelblue', 'orange'], alpha=0.7)
    ax.set_title(title, fontsize=12)
    ax.set_ylabel(ylabel)

    for bar, value in zip(bars, values):
        height = bar.get_height()
        # Keep the label outside the bar: above a positive bar, below a negative one
        if height >= 0:
            y_text, v_align = height + label_offset, 'bottom'
        else:
            y_text, v_align = height - label_offset, 'top'
        ax.text(bar.get_x() + bar.get_width() / 2, y_text,
                format(value, value_format), ha='center', va=v_align)
    return bars


def create_metrics_comparison():
    """Plot and print a side-by-side comparison of both models' validation metrics.

    Reads the notebook-level ``global_model`` and ``nearby_model`` bundles
    (``['metrics']`` with ``val_r2`` / ``val_rmse`` / ``val_mae`` and
    ``['features']``). When either bundle is falsy a message is printed and
    nothing is plotted.

    Side effects:
        Shows a 2x2 bar-chart figure (R², RMSE, MAE, feature count), prints
        a summary table, the R² difference (nearby minus global), the
        feature-count reduction in percent and a one-line verdict based on
        a ±0.01 R² band.

    Returns:
        None.
    """
    if not global_model or not nearby_model:
        print("❌ No se pueden comparar modelos - algunos no están disponibles")
        return

    global_metrics = global_model['metrics']
    nearby_metrics = nearby_model['metrics']
    n_global_features = len(global_model['features'])
    n_nearby_features = len(nearby_model['features'])

    df_metrics = pd.DataFrame({
        'Modelo': ['Global', 'Cercanas'],
        'R²': [global_metrics['val_r2'], nearby_metrics['val_r2']],
        'RMSE': [global_metrics['val_rmse'], nearby_metrics['val_rmse']],
        'MAE': [global_metrics['val_mae'], nearby_metrics['val_mae']],
        'Features': [n_global_features, n_nearby_features]
    })

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('🆚 Comparación de Modelos: Global vs Estaciones Cercanas', fontsize=16)

    # (axes, column, title, y-label, label offset in data units, value format)
    panels = [
        (axes[0, 0], 'R²', 'R² Score (Mayor es Mejor)', 'R²', 0.01, '.3f'),
        (axes[0, 1], 'RMSE', 'RMSE (Menor es Mejor)', 'RMSE', 0.1, '.2f'),
        (axes[1, 0], 'MAE', 'MAE (Menor es Mejor)', 'MAE', 0.05, '.2f'),
        (axes[1, 1], 'Features', 'Número de Features', 'Cantidad de Features', 0.5, ''),
    ]
    for ax, column, title, ylabel, offset, fmt in panels:
        _plot_metric_bars(ax, df_metrics['Modelo'], df_metrics[column],
                          title, ylabel, offset, fmt)

    # R² may be negative, so the axis must always contain 0 and both bars.
    # For positive scores this reduces to (0, max * 1.1).
    r2_low = min(0.0, df_metrics['R²'].min() * 1.1)
    r2_high = max(0.0, df_metrics['R²'].max() * 1.1)
    if r2_low < r2_high:  # skipped for an all-zero or NaN column; autoscale is kept
        axes[0, 0].set_ylim(r2_low, r2_high)

    plt.tight_layout()
    plt.show()

    # Summary table
    print("\n📋 TABLA RESUMEN DE MÉTRICAS:")
    print("="*60)
    print(df_metrics.to_string(index=False, float_format='%.4f'))

    # Positive r2_diff means the nearby-stations model scored higher
    r2_diff = nearby_metrics['val_r2'] - global_metrics['val_r2']
    feature_reduction = (1 - n_nearby_features / n_global_features) * 100

    print(f"\n🔍 ANÁLISIS:")
    print(f"   Diferencia R²: {r2_diff:+.4f}")
    print(f"   Reducción features: {feature_reduction:.1f}%")

    # Differences within ±0.01 R² are reported as similar performance
    if r2_diff > 0.01:
        print("🎉 ¡Modelo CERCANAS es significativamente mejor!")
    elif r2_diff > -0.01:
        print("🤝 Performance similar entre ambos modelos")
    else:
        print("🌐 Modelo GLOBAL tiene mejor performance")

# Render the four-panel metric comparison and print the summary for the loaded models
create_metrics_comparison()


In [None]:
# Análisis de Feature Importance
def _plot_top_importances(ax, ranked, color, title, top_n=15):
    """Draw a horizontal bar chart of the first ``top_n`` rows of ``ranked``.

    Args:
        ax: Matplotlib axes to draw on.
        ranked: DataFrame with ``feature`` and ``importance`` columns,
            already sorted by descending importance.
        color: Bar colour.
        title: Panel title.
        top_n: Number of leading rows to plot.
    """
    top = ranked.head(top_n)
    positions = range(len(top))
    ax.barh(positions, top['importance'], color=color, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    ax.set_xlabel('Importancia')
    ax.set_title(title)
    ax.invert_yaxis()  # first (most important) row ends up at the top


def _print_top_unique(ranked, unique_names, list_below=20, top_n=10):
    """Print up to ``top_n`` model-exclusive features with their importance.

    Nothing is printed when ``unique_names`` is empty or holds
    ``list_below`` or more names.
    """
    if unique_names and len(unique_names) < list_below:
        listed = ranked[ranked['feature'].isin(unique_names)].head(top_n)
        for row in listed.itertuples(index=False):
            print(f"   {row.feature}: {row.importance:.4f}")


def plot_feature_importance():
    """Compare feature importances of the global and nearby-stations models.

    Reads the notebook-level ``global_model`` and ``nearby_model`` bundles
    (``['features']`` plus ``['model'].feature_importances_``). When either
    bundle is falsy a message is printed and nothing is plotted.

    Side effects:
        Shows a two-panel figure with the top 15 features of each model,
        prints the number of shared features, a table of the 10 shared
        features with the highest nearby-model importance, and the features
        exclusive to each model (listed only when there are fewer than 20).

    Returns:
        None.
    """
    if not global_model or not nearby_model:
        print("❌ No se pueden analizar features - modelos no disponibles")
        return

    global_features = global_model['features']
    nearby_features = nearby_model['features']

    # One row per feature, most important first
    df_global = pd.DataFrame({
        'feature': global_features,
        'importance': global_model['model'].feature_importances_,
    }).sort_values('importance', ascending=False)

    df_nearby = pd.DataFrame({
        'feature': nearby_features,
        'importance': nearby_model['model'].feature_importances_,
    }).sort_values('importance', ascending=False)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
    _plot_top_importances(ax1, df_global, 'steelblue', '🌐 Top 15 Features - Modelo Global')
    _plot_top_importances(ax2, df_nearby, 'orange', '🔍 Top 15 Features - Modelo Cercanas')
    plt.tight_layout()
    plt.show()

    global_set = set(global_features)
    nearby_set = set(nearby_features)

    # Shared features
    common_features = global_set & nearby_set
    print(f"\n🔗 FEATURES EN COMÚN: {len(common_features)}")

    if common_features:
        # A single inner join replaces per-feature scans over an unordered set,
        # so the row order no longer depends on set iteration order.
        # drop_duplicates keeps the first (highest-importance) row per name.
        global_side = (
            df_global.drop_duplicates('feature')[['feature', 'importance']]
            .rename(columns={'importance': 'global_importance'})
        )
        nearby_side = (
            df_nearby.drop_duplicates('feature')[['feature', 'importance']]
            .rename(columns={'importance': 'nearby_importance'})
        )
        df_common = (
            global_side.merge(nearby_side, on='feature', how='inner')
            .assign(difference=lambda d: d['nearby_importance'] - d['global_importance'])
            # mergesort is stable, which keeps ties in a reproducible order
            .sort_values('nearby_importance', ascending=False, kind='mergesort')
        )

        print("\n📊 TOP 10 FEATURES COMUNES:")
        print("="*80)
        print(f"{'Feature':<35} {'Global':<10} {'Cercanas':<10} {'Diferencia':<10}")
        print("="*80)

        for row in df_common.head(10).itertuples(index=False):
            print(f"{row.feature:<35} {row.global_importance:<10.4f} "
                  f"{row.nearby_importance:<10.4f} {row.difference:>+9.4f}")

    # Features that only one of the models uses
    global_unique = global_set - nearby_set
    nearby_unique = nearby_set - global_set

    print(f"\n🌐 Features únicas del modelo GLOBAL: {len(global_unique)}")
    _print_top_unique(df_global, global_unique)

    print(f"\n🔍 Features únicas del modelo CERCANAS: {len(nearby_unique)}")
    _print_top_unique(df_nearby, nearby_unique)

# Plot the top feature importances of both models and print the shared/unique feature breakdown
plot_feature_importance()


In [None]:
# Resumen final y recomendaciones
def generate_final_report(r2_tolerance=0.01):
    """Print the closing report: R² verdict, model complexity and next steps.

    Reads the notebook-level ``global_model`` and ``nearby_model`` bundles
    (``['metrics']['val_r2']`` and ``['features']``). When either bundle is
    falsy, the comparison section is replaced by an error line; the static
    recommendations are printed in every case.

    Args:
        r2_tolerance: Validation-R² differences whose absolute value does
            not exceed this are reported as a tie. The default of 0.01
            matches the threshold ``create_metrics_comparison`` uses for
            "Performance similar", so both cells reach the same verdict.
            Pass ``0`` to declare a winner on any non-zero difference.

    Returns:
        None. All output goes to stdout.
    """

    print("🎯 REPORTE FINAL - ENFOQUE SIMPLIFICADO")
    print("="*70)

    if global_model and nearby_model:
        global_r2 = global_model['metrics']['val_r2']
        nearby_r2 = nearby_model['metrics']['val_r2']
        # Positive when the nearby-stations model scored higher
        r2_diff = nearby_r2 - global_r2

        print(f"🎯 ESTACIÓN OBJETIVO: 014 - Pacifico (más concurrida)")
        print(f"📊 PERFORMANCE GLOBAL: R² = {global_r2:.4f}")
        print(f"📊 PERFORMANCE CERCANAS: R² = {nearby_r2:.4f}")

        # Name a winner only when the gap exceeds the tolerance. A strict
        # float comparison would almost never reach the tie branch.
        if r2_diff > r2_tolerance:
            winner = "ESTACIONES CERCANAS 🔍"
            improvement = nearby_r2 - global_r2
            print(f"\n🏆 GANADOR: {winner}")
            print(f"   Mejora de R²: +{improvement:.4f}")
        elif r2_diff < -r2_tolerance:
            winner = "MODELO GLOBAL 🌐"
            difference = global_r2 - nearby_r2
            print(f"\n🏆 GANADOR: {winner}")
            print(f"   Ventaja de R²: +{difference:.4f}")
        else:
            print(f"\n🤝 EMPATE - Performance similar")

        # Complexity: feature counts and the nearby model's relative reduction
        global_features = len(global_model['features'])
        nearby_features = len(nearby_model['features'])
        reduction = (1 - nearby_features / global_features) * 100

        print(f"\n📊 COMPLEJIDAD DEL MODELO:")
        print(f"   Global: {global_features} features")
        print(f"   Cercanas: {nearby_features} features")
        print(f"   Reducción: {reduction:.1f}%")

    else:
        print("❌ No se pudieron cargar ambos modelos para la comparación")

    # Static recommendations; none of the text below depends on the models
    print(f"\n🔄 PRÓXIMOS PASOS RECOMENDADOS:")
    print("="*50)

    print("1. 📈 ESCALAMIENTO GRADUAL:")
    print("   - Entrenar para Top 2 estaciones más concurridas")
    print("   - Entrenar para Top 5 estaciones más concurridas")
    print("   - Evaluar en qué punto se degrada la performance")

    print("\n2. 🔧 OPTIMIZACIÓN DE FEATURES:")
    print("   - Crear features específicas para la estación objetivo")
    print("   - Incorporar información de clima/eventos")
    print("   - Probar diferentes ventanas temporales (15min, 45min, 60min)")

    print("\n3. 🎯 MEJORAS DEL MODELO:")
    print("   - Tuning de hiperparámetros con Grid/Random Search")
    print("   - Probar otros algoritmos (Random Forest, LightGBM)")
    print("   - Implementar ensemble de modelos")

    print("\n4. 🧪 VALIDACIÓN ADICIONAL:")
    print("   - Cross-validation temporal")
    print("   - Validación en datos de test (Septiembre 2024+)")
    print("   - Análisis de estacionalidad y tendencias")

    print("\n5. 🚀 DEPLOYMENT:")
    print("   - Crear API para predicciones en tiempo real")
    print("   - Dashboard de monitoreo")
    print("   - Sistema de alertas por baja performance")

    print(f"\n✅ LOGROS DEL ENFOQUE SIMPLIFICADO:")
    print("   ✓ Redujo complejidad del problema original")
    print("   ✓ Estableció baseline sólida para 1 estación")
    print("   ✓ Permitió identificar features más relevantes")
    print("   ✓ Facilitó interpretabilidad del modelo")
    print("   ✓ Creó framework escalable para múltiples estaciones")

# Print the closing report: R² verdict, feature-count comparison and recommended next steps
generate_final_report()
