In [None]:
# Setup and imports
import sys
import os
# Add ../scripts (relative to the working directory) to the module search path.
sys.path.append('../scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Configure matplotlib: default style, 12x8 default figure size, font size 10
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# Confirm the imports succeeded and show the working directory, which the
# relative paths used in this notebook are resolved against.
print("‚úÖ Librer√≠as importadas correctamente")
print(f"üìç Directorio actual: {os.getcwd()}")


In [None]:
# Helper for loading models
def load_model_safe(model_path):
    """Unpickle the object stored at ``model_path`` without ever raising.

    Args:
        model_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when the file does not exist or
        any other error occurs while opening or unpickling it. In every
        case a status line is printed.

    Note:
        ``pickle.load`` can execute arbitrary code contained in the file;
        only use this on model files from a trusted source.
    """
    loaded = None
    try:
        with open(model_path, 'rb') as handle:
            payload = pickle.load(handle)
        print(f"‚úÖ Modelo cargado: {model_path}")
        loaded = payload
    except FileNotFoundError:
        print(f"‚ùå No se encontr√≥: {model_path}")
    except Exception as err:
        # Best-effort loader: report the problem and fall through to None.
        print(f"‚ùå Error cargando {model_path}: {err}")
    return loaded

# Load the trained models (load_model_safe returns None when loading fails)
global_model = load_model_safe('../models/xgboost_global_features.pkl')
nearby_model = load_model_safe('../models/xgboost_nearby_features.pkl')

# Summarise each model that loaded; the loaded object is indexed with
# 'metrics' (which holds 'val_r2') and 'features' (whose length is reported).
print("\nüìä INFORMACI√ìN DE MODELOS:")
if global_model:
    print(f"üåê Global - R¬≤: {global_model['metrics']['val_r2']:.4f}, Features: {len(global_model['features'])}")
if nearby_model:
    print(f"üîç Cercanas - R¬≤: {nearby_model['metrics']['val_r2']:.4f}, Features: {len(nearby_model['features'])}")

# Load the validation datasets (not guarded: a missing CSV raises here)
print("\nüìÇ Cargando datasets...")
val_global = pd.read_csv('../data/val_global.csv')
val_nearby = pd.read_csv('../data/val_nearby.csv')

# Report the (rows, columns) shape of each validation set
print(f"‚úÖ Val Global: {val_global.shape}")
print(f"‚úÖ Val Cercanas: {val_nearby.shape}")


In [None]:
# Metrics comparison
def _plot_metric_bars(ax, labels, values, title, ylabel, label_offset, value_format):
    """Draw one two-bar panel and write each bar's value next to its top.

    Args:
        ax: Matplotlib axes to draw on.
        labels: Bar labels, one per model.
        values: Bar heights, aligned with ``labels``.
        title: Panel title.
        ylabel: Y-axis label.
        label_offset: Gap, in data units, added to the bar height when
            placing the value label.
        value_format: Format spec applied to each value (``''`` for ``str``).
    """
    bars = ax.bar(labels, values, color=['steelblue', 'orange'], alpha=0.7)
    ax.set_title(title, fontsize=12)
    ax.set_ylabel(ylabel)
    for bar, value in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_offset,
                format(value, value_format), ha='center', va='bottom')


def create_metrics_comparison():
    """Plot and print a side-by-side comparison of the two models' metrics.

    Reads the module-level ``global_model`` and ``nearby_model`` objects,
    using their ``'metrics'`` entry (``'val_r2'``, ``'val_rmse'``,
    ``'val_mae'``) and the length of their ``'features'`` entry. Shows a
    2x2 bar-chart figure (R², RMSE, MAE, feature count), prints a summary
    table, and prints a short verdict based on the R² difference.

    Returns:
        None. If either model is missing or falsy, a message is printed and
        nothing is plotted.
    """
    if not global_model or not nearby_model:
        print("‚ùå No se pueden comparar modelos - algunos no est√°n disponibles")
        return

    global_metrics = global_model['metrics']
    nearby_metrics = nearby_model['metrics']
    n_global_features = len(global_model['features'])
    n_nearby_features = len(nearby_model['features'])

    # One row per model, one column per compared quantity.
    df_metrics = pd.DataFrame({
        'Modelo': ['Global', 'Cercanas'],
        'R¬≤': [global_metrics['val_r2'], nearby_metrics['val_r2']],
        'RMSE': [global_metrics['val_rmse'], nearby_metrics['val_rmse']],
        'MAE': [global_metrics['val_mae'], nearby_metrics['val_mae']],
        'Features': [n_global_features, n_nearby_features],
    })

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('üÜö Comparaci√≥n de Modelos: Global vs Estaciones Cercanas', fontsize=16)

    # (axes, column, title, y-label, label offset, value format) per panel.
    panels = [
        (axes[0, 0], 'R¬≤', 'R¬≤ Score (Mayor es Mejor)', 'R¬≤', 0.01, '.3f'),
        (axes[0, 1], 'RMSE', 'RMSE (Menor es Mejor)', 'RMSE', 0.1, '.2f'),
        (axes[1, 0], 'MAE', 'MAE (Menor es Mejor)', 'MAE', 0.05, '.2f'),
        (axes[1, 1], 'Features', 'N√∫mero de Features', 'Cantidad de Features', 0.5, ''),
    ]
    for ax, column, title, ylabel, offset, value_format in panels:
        _plot_metric_bars(ax, df_metrics['Modelo'], df_metrics[column],
                          title, ylabel, offset, value_format)

    # R² can be zero or negative. Anchor the axis at 0 but extend it downwards
    # when needed, and never request an inverted or empty range. For positive
    # scores this is the same (0, max * 1.1) range as before.
    r2_values = df_metrics['R¬≤']
    r2_low = min(0.0, r2_values.min() * 1.1)
    r2_high = max(0.0, r2_values.max() * 1.1)
    if r2_low < r2_high:
        axes[0, 0].set_ylim(r2_low, r2_high)

    plt.tight_layout()
    plt.show()

    # Summary table
    print("\nüìã TABLA RESUMEN DE M√âTRICAS:")
    print("="*60)
    print(df_metrics.to_string(index=False, float_format='%.4f'))

    # Analysis: R² gap and relative reduction in feature count.
    r2_diff = nearby_metrics['val_r2'] - global_metrics['val_r2']
    if n_global_features:
        feature_reduction = (1 - n_nearby_features / n_global_features) * 100
    else:
        # No baseline to compare against; report NaN instead of dividing by zero.
        feature_reduction = float('nan')

    print("\nüîç AN√ÅLISIS:")
    print(f"   Diferencia R¬≤: {r2_diff:+.4f}")
    print(f"   Reducci√≥n features: {feature_reduction:.1f}%")

    # A gap within +/-0.01 R² is reported as equivalent performance.
    if r2_diff > 0.01:
        print("üéâ ¬°Modelo CERCANAS es significativamente mejor!")
    elif r2_diff > -0.01:
        print("ü§ù Performance similar entre ambos modelos")
    else:
        print("üåê Modelo GLOBAL tiene mejor performance")

create_metrics_comparison()


In [None]:
# Feature importance analysis
def _importance_frame(features, importances, model_label):
    """Build a feature/importance table sorted by descending importance."""
    frame = pd.DataFrame({
        'feature': features,
        'importance': importances,
        'model': model_label,
    })
    return frame.sort_values('importance', ascending=False)


def _plot_top_features(ax, df_sorted, top_n, color, title):
    """Draw a horizontal bar chart of the first ``top_n`` rows of ``df_sorted``."""
    top = df_sorted.head(top_n)
    positions = range(len(top))
    ax.barh(positions, top['importance'], color=color, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    ax.set_xlabel('Importancia')
    ax.set_title(title)
    # Highest importance at the top of the chart.
    ax.invert_yaxis()


def _print_unique_features(df_sorted, unique_names):
    """Print up to 10 of ``unique_names`` with their importance.

    Nothing is printed when the set is empty or has 20 or more entries.
    NOTE(review): the ``< 20`` gate means large sets are never listed even
    though only the top 10 would be shown - confirm this is intended.
    """
    if unique_names and len(unique_names) < 20:
        selected = df_sorted[df_sorted['feature'].isin(unique_names)].head(10)
        for row in selected.itertuples(index=False):
            print(f"   {row.feature}: {row.importance:.4f}")


def plot_feature_importance(top_n=15):
    """Compare feature importances between the global and nearby models.

    Reads the module-level ``global_model`` and ``nearby_model`` objects,
    using their ``'features'`` entry and the ``feature_importances_``
    attribute of their ``'model'`` entry. Shows the top features of each
    model side by side, prints a table of the features both models share,
    and lists the features exclusive to each model.

    Args:
        top_n: Number of features shown per model in the bar charts.
            Defaults to 15, the value previously hard-coded.

    Returns:
        None. If either model is missing or falsy, a message is printed and
        nothing is plotted.
    """
    if not global_model or not nearby_model:
        print("‚ùå No se pueden analizar features - modelos no disponibles")
        return

    global_features = global_model['features']
    nearby_features = nearby_model['features']

    df_global = _importance_frame(
        global_features, global_model['model'].feature_importances_, 'Global')
    df_nearby = _importance_frame(
        nearby_features, nearby_model['model'].feature_importances_, 'Cercanas')

    # Side-by-side charts of each model's most important features.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
    _plot_top_features(ax1, df_global, top_n, 'steelblue',
                       f'üåê Top {top_n} Features - Modelo Global')
    _plot_top_features(ax2, df_nearby, top_n, 'orange',
                       f'üîç Top {top_n} Features - Modelo Cercanas')
    plt.tight_layout()
    plt.show()

    global_names = set(global_features)
    nearby_names = set(nearby_features)

    # Features used by both models.
    common_features = global_names & nearby_names
    print(f"\nüîó FEATURES EN COM√öN: {len(common_features)}")

    if common_features:
        # Index each table by feature once instead of scanning it per feature.
        # drop_duplicates keeps the first (highest-importance) row per name,
        # matching what a first-match lookup on the sorted table returns.
        global_lookup = df_global.drop_duplicates('feature').set_index('feature')['importance']
        nearby_lookup = df_nearby.drop_duplicates('feature').set_index('feature')['importance']

        # Sets iterate in arbitrary order; sorting the names (assumes they are
        # mutually comparable, e.g. all strings) and using a stable sort keeps
        # the printed order of tied rows reproducible.
        common_sorted = sorted(common_features)
        df_common = pd.DataFrame({
            'feature': common_sorted,
            'global_importance': global_lookup.loc[common_sorted].to_numpy(),
            'nearby_importance': nearby_lookup.loc[common_sorted].to_numpy(),
        })
        df_common['difference'] = df_common['nearby_importance'] - df_common['global_importance']
        df_common = df_common.sort_values('nearby_importance', ascending=False, kind='stable')

        print("\nüìä TOP 10 FEATURES COMUNES:")
        print("="*80)
        print(f"{'Feature':<35} {'Global':<10} {'Cercanas':<10} {'Diferencia':<10}")
        print("="*80)

        for row in df_common.head(10).itertuples(index=False):
            print(f"{row.feature:<35} {row.global_importance:<10.4f} "
                  f"{row.nearby_importance:<10.4f} {row.difference:>+9.4f}")

    # Features that only one of the two models uses.
    global_unique = global_names - nearby_names
    nearby_unique = nearby_names - global_names

    print(f"\nüåê Features √∫nicas del modelo GLOBAL: {len(global_unique)}")
    _print_unique_features(df_global, global_unique)

    print(f"\nüîç Features √∫nicas del modelo CERCANAS: {len(nearby_unique)}")
    _print_unique_features(df_nearby, nearby_unique)

plot_feature_importance()


In [None]:
# Resumen final y recomendaciones
def generate_final_report(tie_tolerance=0.01):
    """Print the final report: winner, model complexity and next steps.

    Reads the module-level ``global_model`` and ``nearby_model`` objects,
    using ``['metrics']['val_r2']`` and the length of ``['features']``.
    The recommendations and achievements sections are printed whether or
    not the models are available.

    Args:
        tie_tolerance: Largest absolute R² difference still reported as a
            tie. Defaults to 0.01, the threshold the metrics comparison in
            this notebook uses for "similar performance". Pass ``0`` to
            declare a winner on any non-zero difference.

    Returns:
        None.
    """
    print("üéØ REPORTE FINAL - ENFOQUE SIMPLIFICADO")
    print("="*70)

    if global_model and nearby_model:
        global_r2 = global_model['metrics']['val_r2']
        nearby_r2 = nearby_model['metrics']['val_r2']

        print("üéØ ESTACI√ìN OBJETIVO: 014 - Pacifico (m√°s concurrida)")
        print(f"üìä PERFORMANCE GLOBAL: R¬≤ = {global_r2:.4f}")
        print(f"üìä PERFORMANCE CERCANAS: R¬≤ = {nearby_r2:.4f}")

        # Pick the winner. Exact float equality would make the tie branch
        # practically unreachable, so differences within the tolerance are
        # treated as a tie.
        r2_gap = nearby_r2 - global_r2
        if r2_gap > tie_tolerance:
            winner = "ESTACIONES CERCANAS üîç"
            print(f"\nüèÜ GANADOR: {winner}")
            print(f"   Mejora de R¬≤: +{r2_gap:.4f}")
        elif r2_gap < -tie_tolerance:
            winner = "MODELO GLOBAL üåê"
            print(f"\nüèÜ GANADOR: {winner}")
            print(f"   Ventaja de R¬≤: +{-r2_gap:.4f}")
        else:
            print("\nü§ù EMPATE - Performance similar")

        # Complexity: feature counts and relative reduction.
        global_features = len(global_model['features'])
        nearby_features = len(nearby_model['features'])
        if global_features:
            reduction = (1 - nearby_features / global_features) * 100
        else:
            # No baseline to compare against; report NaN instead of dividing by zero.
            reduction = float('nan')

        print("\nüìä COMPLEJIDAD DEL MODELO:")
        print(f"   Global: {global_features} features")
        print(f"   Cercanas: {nearby_features} features")
        print(f"   Reducci√≥n: {reduction:.1f}%")

    else:
        print("‚ùå No se pudieron cargar ambos modelos para la comparaci√≥n")

    # Static recommendations, printed regardless of model availability.
    print("\nüîÑ PR√ìXIMOS PASOS RECOMENDADOS:")
    print("="*50)

    print("1. üìà ESCALAMIENTO GRADUAL:")
    print("   - Entrenar para Top 2 estaciones m√°s concurridas")
    print("   - Entrenar para Top 5 estaciones m√°s concurridas")
    print("   - Evaluar en qu√© punto se degrada la performance")

    print("\n2. üîß OPTIMIZACI√ìN DE FEATURES:")
    print("   - Crear features espec√≠ficas para la estaci√≥n objetivo")
    print("   - Incorporar informaci√≥n de clima/eventos")
    print("   - Probar diferentes ventanas temporales (15min, 45min, 60min)")

    print("\n3. üéØ MEJORAS DEL MODELO:")
    print("   - Tuning de hiperpar√°metros con Grid/Random Search")
    print("   - Probar otros algoritmos (Random Forest, LightGBM)")
    print("   - Implementar ensemble de modelos")

    print("\n4. üß™ VALIDACI√ìN ADICIONAL:")
    print("   - Cross-validation temporal")
    print("   - Validaci√≥n en datos de test (Septiembre 2024+)")
    print("   - An√°lisis de estacionalidad y tendencias")

    print("\n5. üöÄ DEPLOYMENT:")
    print("   - Crear API para predicciones en tiempo real")
    print("   - Dashboard de monitoreo")
    print("   - Sistema de alertas por baja performance")

    print("\n‚úÖ LOGROS DEL ENFOQUE SIMPLIFICADO:")
    print("   ‚úì Redujo complejidad del problema original")
    print("   ‚úì Estableci√≥ baseline s√≥lida para 1 estaci√≥n")
    print("   ‚úì Permiti√≥ identificar features m√°s relevantes")
    print("   ‚úì Facilit√≥ interpretabilidad del modelo")
    print("   ‚úì Cre√≥ framework escalable para m√∫ltiples estaciones")

generate_final_report()
