<a href="https://colab.research.google.com/github/ninja-marduk/ml_precipitation_prediction/blob/feature%2Fbase-models/models/base_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Configuración del entorno (compatible con Colab y local)
import os
import sys
from pathlib import Path
import shutil
import time
import psutil

# Regenerar el código con las condiciones específicas
notebook_globals = {
    "USE_CROSS_VALIDATION": False,
    "ENABLED_MODELS": ['CNN', 'GRU'],
    "ENABLED_EXPERIMENTS": ['time+cycles', 'all_features'],
    "ENABLED_HORIZONS": [3],
}

# Detectar si estamos en Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # Si estamos en Colab, clonar el repositorio
    !git clone https://github.com/ninja-marduk/ml_precipitation_prediction.git
    %cd ml_precipitation_prediction
    # Instalar dependencias necesarias
    !pip install -r requirements.txt
    !pip install xarray netCDF4 optuna matplotlib seaborn lightgbm xgboost scikit-learn ace_tools_open cartopy
    BASE_PATH = '/content/drive/MyDrive/ml_precipitation_prediction'
else:
    # Si estamos en local, usar la ruta actual
    if '/models' in os.getcwd():
        BASE_PATH = Path('..')
    else:
        BASE_PATH = Path('.')

print(f"Entorno configurado. Usando ruta base: {BASE_PATH}")

# Si BASE_PATH viene como string, lo convertimos
BASE_PATH = Path(BASE_PATH)

# Ahora puedes concatenar correctamente
data_output_dir = BASE_PATH / 'data' / 'output'
model_output_dir = BASE_PATH / 'models' / 'output'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cloning into 'ml_precipitation_prediction'...
remote: Enumerating objects: 904, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 904 (delta 26), reused 6 (delta 6), pack-reused 863 (from 1)[K
Receiving objects: 100% (904/904), 99.82 MiB | 43.29 MiB/s, done.
Resolving deltas: 100% (494/494), done.
/content/ml_precipitation_prediction/ml_precipitation_prediction
Entorno configurado. Usando ruta base: /content/drive/MyDrive/ml_precipitation_prediction


In [None]:
import os
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Input
import traceback

# ==== Control variables ====
# Switches read by the training loop further down in this cell.
USE_CROSS_VALIDATION = False
ENABLED_MODELS = ['GRU'] # First round: ENABLED_MODELS = ['CNN', 'GRU']; second round: ENABLED_MODELS = ['LSTM', 'BLSTM']
ENABLED_EXPERIMENTS = ['time+cycles', 'time+cycles+elev', 'time+cycles+elev+cluster']
ENABLED_HORIZONS = [3]
INPUT_WINDOW = 60  # 5 years (monthly)
OUTPUT_HORIZON = 3  # number of consecutive target steps each model predicts

# ==== Environment configuration ====
# Report whether TensorFlow can see a GPU; USE_GPU is True when at least one is listed.
print("🔍 Detectando dispositivo disponible...")
gpu_devices = tf.config.list_physical_devices('GPU')
USE_GPU = bool(gpu_devices)
if USE_GPU:
    print("✅ GPU detectada:", gpu_devices[0].name)
    print("ℹ️ Entrenamiento acelerado con GPU activado.")
else:
    print("⚠️ No se detectó GPU. Usando CPU.")

# ==== Funciones auxiliares ====
def build_model(model_type, input_shape, output_neurons):
    """Build and compile a small Keras regressor of the requested family.

    Args:
        model_type: One of 'LSTM', 'GRU', 'BLSTM' or 'CNN'.
        input_shape: Shape of a single sample without the batch axis. The CNN
            branch appends a channel axis to it, so a 2-tuple is expected there.
        output_neurons: Width of the final Dense layer (one unit per predicted step).

    Returns:
        A Sequential model compiled with the Adam optimizer and MSE loss.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported = ('LSTM', 'GRU', 'BLSTM', 'CNN')
    # Fail fast: previously an unknown name silently produced a model made of
    # only Input + Dense, whose output keeps the un-reduced input axes.
    if model_type not in supported:
        raise ValueError(
            f"Tipo de modelo no soportado: {model_type!r}. Opciones: {supported}"
        )

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import (
        Input, Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, Bidirectional, Reshape,
    )

    model = Sequential()
    model.add(Input(shape=input_shape, dtype='float32'))
    if model_type == 'LSTM':
        model.add(LSTM(64))
    elif model_type == 'GRU':
        model.add(GRU(64))
    elif model_type == 'BLSTM':
        model.add(Bidirectional(LSTM(64)))
    else:  # 'CNN'
        # Conv2D needs a trailing channel axis
        model.add(Reshape((*input_shape, 1)))
        model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
        model.add(MaxPooling2D((2, 2)))
        model.add(Flatten())
    model.add(Dense(output_neurons))
    model.compile(optimizer='adam', loss='mse')
    return model

def evaluate(y_true, y_pred):
    """Compute regression metrics for one pair of target / prediction vectors.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        Tuple ``(rmse, mae, mape, r2)`` where ``mape`` is expressed in percent.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # Clamp the magnitude instead of adding epsilon to the signed value:
    # "y_true + 1e-5" becomes exactly zero for y_true == -1e-5 and yields inf.
    denom = np.maximum(np.abs(y_true), 1e-5)
    mape = np.mean(np.abs(y_true - y_pred) / denom) * 100
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, mape, r2

class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves float32 (x, y) mini-batches from in-memory arrays.

    Args:
        x_set: Input array; the first axis indexes samples.
        y_set: Target array with the same number of samples as ``x_set``.
        batch_size: Number of samples per batch (the last batch may be smaller).
        **kwargs: Forwarded to the Keras base-class constructor.

    Raises:
        ValueError: If ``x_set`` and ``y_set`` have different lengths.
    """
    def __init__(self, x_set, y_set, batch_size=16, **kwargs):
        # The base constructor must run; skipping it is what triggers the
        # "_warn_if_super_not_called" warning during model.fit.
        super().__init__(**kwargs)
        if len(x_set) != len(y_set):
            raise ValueError(
                f"x_set e y_set deben tener la misma longitud: {len(x_set)} != {len(y_set)}"
            )
        self.x, self.y = x_set.astype(np.float32), y_set.astype(np.float32)
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches, counting a trailing partial batch
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        # Contiguous slice for batch number ``idx``
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size
        return self.x[start:stop], self.y[start:stop]

def to_dataset(x, y, batch_size=16):
    """Wrap (x, y) arrays in a batched, prefetching ``tf.data.Dataset``.

    Args:
        x: Inputs; converted to a float32 tensor.
        y: Targets; converted to a float32 tensor.
        batch_size: Samples per batch. Defaults to 16, the previously hard-coded value.

    Returns:
        A dataset of ``(x_batch, y_batch)`` pairs with AUTOTUNE prefetching.
    """
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    y = tf.convert_to_tensor(y, dtype=tf.float32)
    return tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# ==== Directories and dataset ====
# Relies on `model_output_dir` and `data_output_dir` defined by the setup cell.
print("📂 Configurando directorios y cargando dataset...")
try:
    model_output_dir_STH = model_output_dir / 'ST_HybridWaveStack'
    curves_dir = model_output_dir_STH / "learning_curves"
    # curves_dir lives inside model_output_dir_STH, so one call creates both;
    # exist_ok avoids the separate check-then-create step.
    curves_dir.mkdir(parents=True, exist_ok=True)
    print(f"✔️ Modelos en: {model_output_dir_STH}")
    print(f"✔️ Curvas en: {curves_dir}")

    file_path = data_output_dir / "complete_dataset_with_features_with_clusters_elevation_with_windows.nc"
    ds = xr.open_dataset(file_path)
    print(f"✔️ Dataset cargado desde: {file_path}")
except Exception as e:
    # Keep the original exception attached as the explicit cause
    raise RuntimeError(f"❌ Error cargando dataset o creando carpetas: {e}") from e

# ==== Experiment configuration ====
# Feature groups from which every experiment's variable list is assembled.
_TIME_CYCLE_VARS = ['year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos']
_LAG_VARS = [f'total_precipitation_lag{k}' for k in (1, 2, 3, 4, 12, 24, 36)]
_TERRAIN_VARS = ['elevation', 'slope', 'aspect']
_CLUSTER_VARS = ['cluster_elevation']

# Experiment name -> ordered list of dataset variables fed to the model.
# Each value is a fresh list, so experiments never share a list object.
experiment_settings = {
    "time+cycles": list(_TIME_CYCLE_VARS),
    "time+cycles+lag": _TIME_CYCLE_VARS + _LAG_VARS,
    "time+cycles+lag+elev": _TIME_CYCLE_VARS + _LAG_VARS + _TERRAIN_VARS,
    "time+cycles+elev": _TIME_CYCLE_VARS + _TERRAIN_VARS,
    "time+cycles+elev+cluster": _TIME_CYCLE_VARS + _TERRAIN_VARS + _CLUSTER_VARS,
    "all_features": _TIME_CYCLE_VARS + _LAG_VARS + _TERRAIN_VARS + _CLUSTER_VARS,
}

# Variable validation
# Every experiment (enabled or not) must only reference data variables that
# exist in the loaded dataset; abort on the first experiment that does not.
ds_vars = set(ds.data_vars)
for name, vars_list in experiment_settings.items():
    # Keep the order from the experiment definition so the error message is stable
    missing = [v for v in vars_list if v not in ds_vars]
    if missing:
        raise ValueError(f"❌ Faltan vars para '{name}': {missing}")
print("✅ Variables requeridas presentes.")

# One dict per trained (experiment, model, horizon, split); turned into a DataFrame later.
results = []

# ==== Modular training ====
# For every enabled experiment: build the feature tensor, cut it into sliding
# windows, then train / evaluate / save each enabled model.
#
# NOTE(review): `X[mask]` collapses the (time, cell) axes into one axis ordered
# time-major, so a window of INPUT_WINDOW consecutive rows spans neighbouring
# grid cells rather than consecutive time steps of a single cell. Confirm this
# is intended; it is left unchanged here because the saved models depend on it.
for exp_name, variables in experiment_settings.items():
    if exp_name not in ENABLED_EXPERIMENTS:
        continue
    print(f"\n🚀 Experimento: {exp_name}")
    try:
        # Prepare data
        cluster_idx = variables.index('cluster_elevation') if 'cluster_elevation' in variables else None
        subset = ds[variables].to_array().transpose('time','latitude','longitude','variable').values
        if cluster_idx is not None:
            # Replace cluster labels by integer codes so the array can be cast to float
            cd = subset[...,cluster_idx]
            subset[...,cluster_idx] = LabelEncoder().fit_transform(cd.ravel()).reshape(cd.shape)
        subset = subset.astype(np.float32)
        target = ds['total_precipitation'].values
        samples,lat,lon,feats = subset.shape
        X = subset.reshape(samples, lat*lon, feats)
        y = target.reshape(samples, lat*lon)
        # Drop every (time, cell) entry whose target is missing
        mask = ~np.isnan(y)
        X, y = X[mask], y[mask]

        # Not enough rows for a single window: skip cleanly instead of failing
        # later inside the NaN filter with an axis error.
        n_windows = len(X) - INPUT_WINDOW - OUTPUT_HORIZON
        if n_windows <= 0:
            print(f"⚠️ No quedan secuencias válidas para '{exp_name}'. Saltando.")
            continue

        # Build sequences: INPUT_WINDOW rows of features -> next OUTPUT_HORIZON targets
        X_seq, Y_targets = [], []
        for i in range(n_windows):
            X_seq.append(X[i:i + INPUT_WINDOW])
            Y_targets.append([y[i + INPUT_WINDOW + h] for h in range(OUTPUT_HORIZON)])
        X_seq = np.array(X_seq)
        Y_targets = np.array(Y_targets)

        # --- Filter out sequences containing NaNs and report how many were lost ---
        def filtrar_secuencias(Xs, ys):
            """Return only the (window, target) pairs that contain no NaN."""
            total = len(Xs)
            valid = (~np.isnan(Xs).any(axis=(1,2))) & (~np.isnan(ys).any(axis=1))
            kept = valid.sum()
            lost = total - kept
            pct = 100 * lost/total
            print(f"📉 Secuencias orig.: {total}, válidas: {kept}, eliminadas: {lost} ({pct:.2f}%)")
            return Xs[valid], ys[valid]

        X_seq, Y_targets = filtrar_secuencias(X_seq, Y_targets)

        if len(X_seq)==0:
            print(f"⚠️ No quedan secuencias válidas para '{exp_name}'. Saltando.")
            continue

        input_shape = (X_seq.shape[1], X_seq.shape[2])

        for model_name in ENABLED_MODELS:
            print(f"🔧 Modelo: {model_name}")
            # Partition. Materialised as a list: KFold.split returns a one-shot
            # generator that would be empty from the second horizon onwards.
            if USE_CROSS_VALIDATION:
                splitter = list(KFold(n_splits=3, shuffle=False).split(X_seq))
            else:
                split_idx = int(len(X_seq) * 0.7)
                splitter = [(
                    np.arange(split_idx),
                    np.arange(split_idx, len(X_seq))
                )]

            for h in ENABLED_HORIZONS:
                for split_id, (tr_idx, va_idx) in enumerate(splitter, start=1):
                    # File name. The fold id is part of the CV name; without it
                    # every fold after the first hit "already exists" and was skipped.
                    postfix = f"_H{h}_F{split_id}" if USE_CROSS_VALIDATION else "_NoCV"
                    model_path = model_output_dir_STH / f"{exp_name.replace('+','_')}_{model_name}{postfix}.h5"
                    # Check before the (expensive) scaling rather than after it
                    if model_path.exists():
                        print("⏩ Ya existe. Skip.")
                        continue

                    X_tr, X_va = X_seq[tr_idx], X_seq[va_idx]
                    y_tr, y_va = Y_targets[tr_idx], Y_targets[va_idx]
                    # Keep the unscaled validation targets for the metrics below
                    y_va_raw = y_va

                    # Scaling: statistics are fitted on the training split only
                    scalerX=StandardScaler()
                    shpX = X_tr.shape
                    X_tr = scalerX.fit_transform(X_tr.reshape(-1,shpX[-1])).reshape(shpX)
                    shpX2 = X_va.shape
                    X_va = scalerX.transform(X_va.reshape(-1,shpX[-1])).reshape(shpX2)
                    scalery=StandardScaler()
                    y_tr = scalery.fit_transform(y_tr.reshape(-1,1)).reshape(y_tr.shape)
                    y_va = scalery.transform(y_va.reshape(-1,1)).reshape(y_va.shape)

                    # Data generators
                    train_gen = DataGenerator(X_tr, y_tr, batch_size=16)
                    val_gen   = DataGenerator(X_va, y_va, batch_size=16)

                    model = build_model(model_name, input_shape, output_neurons = OUTPUT_HORIZON)
                    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=9, restore_best_weights=True)
                    history = model.fit(train_gen,
                                        validation_data=val_gen,
                                        epochs=50,
                                        verbose=1,
                                        callbacks=[es])

                    # Evaluation in the target's original units: metrics on
                    # standardised values (MAPE in particular) are not interpretable
                    # and would not match the evaluation cell, which inverse-transforms.
                    y_pred_scaled = model.predict(X_va)  # shape (samples, OUTPUT_HORIZON)
                    y_pred = scalery.inverse_transform(
                        y_pred_scaled.reshape(-1, 1)
                    ).reshape(y_pred_scaled.shape)
                    y_true = y_va_raw

                    # One metric tuple per forecast step, averaged in the result row
                    metrics = [evaluate(y_true[:, i], y_pred[:, i]) for i in range(OUTPUT_HORIZON)]

                    results.append({
                        'experiment': exp_name,
                        'model': model_name,
                        'horizon': h,
                        'fold': split_id,
                        'cv_enabled': USE_CROSS_VALIDATION,
                        'RMSE': np.mean([m[0] for m in metrics]),
                        'MAE': np.mean([m[1] for m in metrics]),
                        'MAPE': np.mean([m[2] for m in metrics]),
                        'R2': np.mean([m[3] for m in metrics]),
                        'epochs': len(history.history['loss'])
                    })

                    # Learning curve
                    plt.figure()
                    plt.plot(history.history['loss'], label='Train')
                    plt.plot(history.history['val_loss'], label='Val')
                    plt.title(f"{exp_name} - {model_name} - {postfix.lstrip('_')}")
                    plt.xlabel('Epoch')
                    plt.ylabel('Loss')
                    plt.legend()
                    fname = f"{exp_name.replace('+','_')}_{model_name}{postfix}.png"
                    plt.savefig(curves_dir / fname)
                    plt.close()

                    model.save(model_path)
                    print(f"💾 Modelo guardado: {model_path.name}")

    except Exception as e:
        # Best effort: report the failure and move on to the next experiment
        print(f"❌ Error en '{exp_name}': {e}\n{traceback.format_exc()}")

# ==== Save results ====
# `results` holds one dict per model trained in this run (skipped models add nothing).
# The CSV is written relative to the current working directory.
results_df = pd.DataFrame(results)
results_df.to_csv("resultados_modelos_cv_8anios_mvp.csv", index=False)
print(results_df.head())

# Show the same frame through the ace_tools_open helper
import ace_tools_open as tools
tools.display_dataframe_to_user(name="Resultados CV MVP", dataframe=results_df)
print("✅ Proceso finalizado con éxito.")


🔍 Detectando dispositivo disponible...
✅ GPU detectada: /physical_device:GPU:0
ℹ️ Entrenamiento acelerado con GPU activado.
📂 Configurando directorios y cargando dataset...
✔️ Modelos en: /content/drive/MyDrive/ml_precipitation_prediction/models/output/ST_HybridWaveStack
✔️ Curvas en: /content/drive/MyDrive/ml_precipitation_prediction/models/output/ST_HybridWaveStack/learning_curves
✔️ Dataset cargado desde: /content/drive/MyDrive/ml_precipitation_prediction/data/output/complete_dataset_with_features_with_clusters_elevation_with_windows.nc
✅ Variables requeridas presentes.

🚀 Experimento: time+cycles
📉 Secuencias orig.: 2101387, válidas: 2101387, eliminadas: 0 (0.00%)
🔧 Modelo: GRU
Epoch 1/50


  self._warn_if_super_not_called()


[1m41553/91936[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m4:11[0m 5ms/step - loss: 0.6829

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pathlib import Path
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import traceback

# ==== Directories and dataset ====
print("📂 Configurando directorios y cargando dataset...")
try:
    # Prefer the locations computed by the setup cell so the notebook also works
    # outside Colab; fall back to the Colab Drive layout when this cell is run
    # on its own (previously these absolute paths were hard-coded).
    _colab_root = Path("/content/drive/MyDrive/ml_precipitation_prediction")
    model_output_dir_STH = Path(
        globals().get('model_output_dir', _colab_root / 'models' / 'output')
    ) / "ST_HybridWaveStack"
    data_output_dir = Path(globals().get('data_output_dir', _colab_root / 'data' / 'output'))

    # Every saved Keras model in the output folder, in a stable order
    model_files = sorted(model_output_dir_STH.glob("*.h5"))
    print(f"✔️ Modelos encontrados: {len(model_files)}")
    print(f"🧠 Ejemplos: {[m.name for m in model_files[:3]]}")

    ds_path = data_output_dir / "complete_dataset_with_features_with_clusters_elevation_with_windows.nc"
    ds = xr.open_dataset(ds_path)
    print(f"✔️ Dataset cargado desde: {ds_path}")
except Exception as e:
    # Keep the original exception attached as the explicit cause
    raise RuntimeError(f"❌ Error cargando modelos o dataset: {e}") from e

# ==== General configuration ====
# NOTE(review): the training cell uses INPUT_WINDOW = 60 while this value is 96;
# confirm which window the saved models were actually trained with.
input_window = 96
horizon = 3

# Feature lists per experiment. The entries must match the lists used at
# training time; the experiments enabled in the training cell
# ('time+cycles+elev', 'time+cycles+elev+cluster') were missing here, so their
# models could not be paired with the right inputs.
experiment_settings = {
    "all_features": [
        'year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos',
        'total_precipitation_lag1', 'total_precipitation_lag2', 'total_precipitation_lag3',
        'total_precipitation_lag4', 'total_precipitation_lag12', 'total_precipitation_lag24', 'total_precipitation_lag36',
        'elevation', 'slope', 'aspect', 'cluster_elevation'
    ],
    "time+cycles": [
        'year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos'
    ],
    "time+cycles+lag": [
        'year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos',
        'total_precipitation_lag1', 'total_precipitation_lag2', 'total_precipitation_lag3',
        'total_precipitation_lag4', 'total_precipitation_lag12', 'total_precipitation_lag24', 'total_precipitation_lag36'
    ],
    "time+cycles+lag+elev": [
        'year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos',
        'total_precipitation_lag1', 'total_precipitation_lag2', 'total_precipitation_lag3',
        'total_precipitation_lag4', 'total_precipitation_lag12', 'total_precipitation_lag24', 'total_precipitation_lag36',
        'elevation', 'slope', 'aspect'
    ],
    "time+cycles+elev": [
        'year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos',
        'elevation', 'slope', 'aspect'
    ],
    "time+cycles+elev+cluster": [
        'year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos',
        'elevation', 'slope', 'aspect', 'cluster_elevation'
    ]
}
target_var = 'total_precipitation'

# ==== Evaluation function ====
def evaluate(y_true, y_pred):
    """Compute regression metrics for one pair of target / prediction vectors.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        Tuple ``(rmse, mae, mape, r2)`` where ``mape`` is expressed in percent.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # Clamp the magnitude instead of adding epsilon to the signed value:
    # "y_true + 1e-5" becomes exactly zero for y_true == -1e-5 and yields inf.
    denom = np.maximum(np.abs(y_true), 1e-5)
    mape = np.mean(np.abs(y_true - y_pred) / denom) * 100
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, mape, r2

# ==== Model evaluation ====
results = []
# Per-cell maps, filled from the first model that evaluates successfully
pred_map = None
true_map = None
mape_map = None

lat = ds.latitude.size
lon = ds.longitude.size
n_cells = lat * lon

# Model tags the training cell puts in file names: <experiment>_<MODEL><postfix>.h5
_MODEL_TAGS = ('CNN', 'GRU', 'LSTM', 'BLSTM')


def _experiment_key_from_stem(stem, settings, model_tags=_MODEL_TAGS):
    """Return the `settings` key encoded in a model file stem, or None.

    The tokens before the first model tag must match a key exactly (with '+'
    written as '_'), so 'time_cycles_elev_GRU_NoCV' is never mistaken for
    'time+cycles'.
    """
    parts = stem.split("_")
    tag_pos = next((i for i, part in enumerate(parts) if part in model_tags), None)
    if not tag_pos:  # no tag found, or nothing in front of it
        return None
    prefix = "_".join(parts[:tag_pos])
    return next((key for key in settings if key.replace('+', '_') == prefix), None)


def _per_cell_mean(values, cells, size):
    """Average `values` grouped by integer cell id; cells with no samples become NaN."""
    sums = np.bincount(cells, weights=values, minlength=size)
    counts = np.bincount(cells, minlength=size)
    with np.errstate(invalid='ignore', divide='ignore'):
        return sums / counts


for model_path in model_files:
    print(f"🔍 Evaluando modelo: {model_path.name}")
    try:
        experiment_key = _experiment_key_from_stem(model_path.stem, experiment_settings)
        if experiment_key is None:
            print(f"⚠️ No se pudo mapear correctamente el experimento para {model_path.name}")
            continue
        variables = experiment_settings[experiment_key]

        # Load first: the window length and the number of outputs are read
        # from the model itself instead of trusting notebook-level constants.
        model = tf.keras.models.load_model(model_path, compile=False)
        window = model.input_shape[1] or input_window
        n_out = model.output_shape[-1]
        if model.input_shape[-1] != len(variables):
            print(f"⚠️ {model_path.name} espera {model.input_shape[-1]} variables pero "
                  f"'{experiment_key}' define {len(variables)}. Saltando.")
            continue

        subset = ds[variables].to_array().transpose('time', 'latitude', 'longitude', 'variable').values
        # Make the axis order explicit so it always matches `subset`
        target = ds[target_var].transpose('time', 'latitude', 'longitude').values

        # Encode the cluster label if present
        cluster_idx = variables.index('cluster_elevation') if 'cluster_elevation' in variables else None
        if cluster_idx is not None:
            encoded = LabelEncoder().fit_transform(subset[..., cluster_idx].ravel()).reshape(subset[..., cluster_idx].shape)
            subset[..., cluster_idx] = encoded

        # Preprocessing (same flattening as the training cell)
        subset = subset.astype(np.float32)
        samples, lat, lon, feats = subset.shape
        n_cells = lat * lon
        X_all = subset.reshape(samples, n_cells, feats)
        y_all = target.reshape(samples, n_cells)

        mask = ~np.isnan(y_all)
        # Grid cell of every row that survives the mask (row-major, like X_all[mask])
        cell_of_row = np.flatnonzero(mask.ravel()) % n_cells
        X_all = X_all[mask]
        y_all = y_all[mask]

        # Build sequences the way the training cell does:
        # `window` rows of features -> the next `n_out` targets.
        n_seq = len(X_all) - window - n_out
        if n_seq <= 0:
            print(f"⚠️ No quedan secuencias válidas para {model_path.name}. Saltando.")
            continue
        X_seq = np.array([X_all[i:i + window] for i in range(n_seq)])
        target_rows = np.arange(n_seq)[:, None] + window + np.arange(n_out)[None, :]
        y_seq = y_all[target_rows]

        # Drop sequences containing NaNs (training does the same before splitting)
        valid = (~np.isnan(X_seq).any(axis=(1, 2))) & (~np.isnan(y_seq).any(axis=1))
        X_seq, y_seq, target_rows = X_seq[valid], y_seq[valid], target_rows[valid]

        # Split
        split_idx = int(len(X_seq) * 0.7)
        X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
        y_train, y_test = y_seq[:split_idx], y_seq[split_idx:]
        test_cells = cell_of_row[target_rows[split_idx:]]

        # Scaling: statistics come from the training part only
        scaler_X = StandardScaler()
        scaler_y = StandardScaler()
        scaler_X.fit(X_train.reshape(-1, X_train.shape[-1]))
        X_test_scaled = scaler_X.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)
        scaler_y.fit(y_train.reshape(-1, 1))

        # Predict and bring the predictions back to the target's units
        y_pred_scaled = model.predict(X_test_scaled, verbose=0)
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).reshape(y_pred_scaled.shape)
        y_true = y_test

        # One metric tuple per forecast step, averaged over the steps
        per_step = [evaluate(y_true[:, k], y_pred[:, k]) for k in range(n_out)]
        rmse, mae, mape, r2 = (float(np.mean(column)) for column in zip(*per_step))
        results.append({
            'model': model_path.name,
            'experiment': experiment_key,
            'RMSE': rmse,
            'MAE': mae,
            'MAPE (%)': mape,
            'R2': r2
        })

        # Keep maps from the first successful model. Samples are grouped by their
        # real grid cell; reshaping the flat test vector to (steps, lat, lon) was
        # only aligned if no cell was masked and the split fell on a grid boundary.
        if pred_map is None:
            pred_map = _per_cell_mean(y_pred.ravel(), test_cells.ravel(), n_cells).reshape(lat, lon)
            true_map = _per_cell_mean(y_true.ravel(), test_cells.ravel(), n_cells).reshape(lat, lon)
            mape_map = np.abs((true_map - pred_map) / np.maximum(np.abs(true_map), 1e-5)) * 100

    except Exception as e:
        # Best effort: report the failure and continue with the next model
        print(f"❌ Error evaluando {model_path.name}: {traceback.format_exc()}")

# === Save and show results ===
# One row per successfully evaluated model, written next to the model files.
df_results = pd.DataFrame(results)
df_results.to_csv(model_output_dir_STH / "metrics_modelos_test.csv", index=False)

# Create the NetCDF and CSV map outputs if maps were produced
if pred_map is not None:
    # Bundle the three 2-D maps on the dataset's latitude / longitude coordinates
    ds_out = xr.Dataset(
        {
            "predicted_mean": (("lat", "lon"), pred_map),
            "true_mean": (("lat", "lon"), true_map),
            "mape": (("lat", "lon"), mape_map),
        },
        coords={"lat": ds.latitude.values, "lon": ds.longitude.values},
    )
    ds_out.to_netcdf(model_output_dir_STH / "predictions_and_mape.nc")
    # Same MAPE grid as a table: rows are latitudes, columns are longitudes
    mape_table = pd.DataFrame(mape_map, columns=ds.longitude.values, index=ds.latitude.values)
    mape_table.to_csv(model_output_dir_STH / "mape_per_cell.csv")

    def plot_map(data, title, cmap='viridis', vmin=None, vmax=None):
        """Draw a 2-D field on a PlateCarree map with coastlines, borders and gridlines.

        `data` is plotted against `ds_out.lon` / `ds_out.lat` (read from the
        enclosing scope); `title` is used for both the axes title and the
        colorbar label; `vmin` / `vmax` are passed to pcolormesh.
        """
        plt.figure(figsize=(10, 6))
        ax = plt.axes(projection=ccrs.PlateCarree())
        ax.set_title(title, fontsize=14)
        mesh = plt.pcolormesh(ds_out.lon, ds_out.lat, data, cmap=cmap,
                              shading='auto', vmin=vmin, vmax=vmax)
        plt.colorbar(mesh, ax=ax, orientation='vertical', label=title)
        ax.coastlines()
        ax.add_feature(cfeature.BORDERS, linestyle=':')
        ax.add_feature(cfeature.LAND, facecolor='lightgray')
        ax.add_feature(cfeature.LAKES, edgecolor='gray')
        ax.add_feature(cfeature.RIVERS, edgecolor='blue')
        ax.gridlines(draw_labels=True)
        plt.show()

    # Map of the mean prediction (3 months)
    plot_map(ds_out['predicted_mean'], 'Predicción promedio - 3 meses', cmap='Blues')

    # Map of the MAPE (3 months); colour scale clipped to 0-100 %
    plot_map(ds_out['mape'], 'MAPE (%) - 3 meses', cmap='Reds', vmin=0, vmax=100)

# Show the metrics table through the ace_tools_open helper
import ace_tools_open as tools
tools.display_dataframe_to_user(name="Resultados Test Flexible", dataframe=df_results)
print("📊 Evaluación finalizada.")
