<a href="https://colab.research.google.com/github/ninja-marduk/ml_precipitation_prediction/blob/feature%2Fbase-models/models/base_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Configuración del entorno (compatible con Colab y local)
import os
import sys
from pathlib import Path
import shutil
import time
import psutil

# Regenerar el código con las condiciones específicas
notebook_globals = {
    "USE_CROSS_VALIDATION": False,
    "ENABLED_MODELS": ['CNN', 'GRU'],
    "ENABLED_EXPERIMENTS": ['time+cycles', 'all_features'],
    "ENABLED_HORIZONS": [3],
}

# Detectar si estamos en Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # Si estamos en Colab, clonar el repositorio
    !git clone https://github.com/ninja-marduk/ml_precipitation_prediction.git
    %cd ml_precipitation_prediction
    # Instalar dependencias necesarias
    !pip install -r requirements.txt
    !pip install xarray netCDF4 optuna matplotlib seaborn lightgbm xgboost scikit-learn ace_tools
    BASE_PATH = '/content/drive/MyDrive/ml_precipitation_prediction'
else:
    # Si estamos en local, usar la ruta actual
    if '/models' in os.getcwd():
        BASE_PATH = Path('..')
    else:
        BASE_PATH = Path('.')

print(f"Entorno configurado. Usando ruta base: {BASE_PATH}")

# Si BASE_PATH viene como string, lo convertimos
BASE_PATH = Path(BASE_PATH)

# Ahora puedes concatenar correctamente
data_output_dir = BASE_PATH / 'data' / 'output'
model_output_dir = BASE_PATH / 'models' / 'output'


Entorno configurado. Usando ruta base: ..


In [None]:
# Versión final optimizada - Entrenamiento modular y controlado por variables locales
import os
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.client import device_lib

# ==== Control variables ====
# NOTE(review): the first cell defines `notebook_globals` with
# USE_CROSS_VALIDATION=False, while this cell assigns True. The values below are
# the ones this cell puts in scope; confirm which setting is intended.
USE_CROSS_VALIDATION = True
ENABLED_MODELS = ['CNN', 'GRU']
ENABLED_EXPERIMENTS = ['time+cycles', 'all_features']
ENABLED_HORIZONS = [3]
input_window = 96  # 8 years of monthly steps
SEED = 42

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation (and therefore the reported metrics) is repeatable across runs.
tf.keras.utils.set_random_seed(SEED)

# ==== Environment configuration ====
print("🔍 Detectando dispositivo disponible...")
gpu_devices = tf.config.list_physical_devices('GPU')
USE_GPU = bool(gpu_devices)

if USE_GPU:
    print("✅ GPU detectada:", gpu_devices[0].name)
    try:
        # Mixed precision is only enabled when a GPU is present.
        from tensorflow.keras import mixed_precision
        mixed_precision.set_global_policy('mixed_float16')
        print("⚡ Política 'mixed_float16' activada.")
    except Exception as e:
        # Best effort: training still proceeds with the default policy.
        print(f"⚠️ No se pudo activar mixed precision: {e}")
else:
    print("⚠️ No se detectó GPU. Usando CPU.")
    print("ℹ️ En Colab puedes activar GPU en Entorno de ejecución > Cambiar tipo de entorno de ejecución.")

# ==== Funciones auxiliares ====
def build_model(model_type, input_shape, output_neurons):
    """Build and compile a small Keras regressor for windowed sequences.

    Args:
        model_type: One of 'LSTM', 'GRU', 'BLSTM' or 'CNN'.
        input_shape: 2-tuple describing one sample; the caller in this notebook
            passes (window length, number of features). The CNN branch appends
            a trailing channel axis of size 1 before the 2-D convolution.
        output_neurons: Width of the final Dense layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported = ('LSTM', 'GRU', 'BLSTM', 'CNN')
    # Validate before touching TensorFlow: previously an unknown name fell through
    # every branch and silently produced a Dense-only model on sequence input.
    if model_type not in supported:
        raise ValueError(
            f"❌ Tipo de modelo no soportado: {model_type!r}. Opciones válidas: {supported}"
        )

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, Bidirectional, Reshape, Input

    model = Sequential()
    model.add(Input(shape=input_shape))
    if model_type == 'LSTM':
        model.add(LSTM(64))
    elif model_type == 'GRU':
        model.add(GRU(64))
    elif model_type == 'BLSTM':
        model.add(Bidirectional(LSTM(64)))
    else:  # 'CNN'
        # Treat the (window, features) matrix as a single-channel image.
        model.add(Reshape((*input_shape, 1)))
        model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
        model.add(MaxPooling2D((2, 2)))
        model.add(Flatten())
    # Keep the regression output in float32 even when the global policy is
    # 'mixed_float16' (enabled in this notebook when a GPU is detected), so the
    # predictions and the loss are not computed in half precision.
    model.add(Dense(output_neurons, dtype='float32'))
    model.compile(optimizer='adam', loss='mse')
    return model

def evaluate(y_true, y_pred):
    """Score predictions against observations.

    Args:
        y_true: Array of observed values.
        y_pred: Array of predicted values, same shape as ``y_true``.

    Returns:
        Tuple ``(rmse, mae, mape, r2)``. MAPE is expressed in percent and adds
        1e-5 to the denominator to avoid dividing by an exact zero.
    """
    residual = y_true - y_pred
    scores = (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        100 * np.mean(np.abs(residual / (y_true + 1e-5))),
        r2_score(y_true, y_pred),
    )
    return scores

def to_dataset(x, y, batch_size=16):
    """Wrap feature/target arrays in a batched, prefetching ``tf.data.Dataset``.

    Args:
        x: Input samples; sliced along the first axis.
        y: Targets aligned with ``x`` along the first axis.
        batch_size: Samples per batch. Defaults to 16, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        A dataset yielding ``(x_batch, y_batch)`` pairs in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# ==== Directories and dataset ====
print("📂 Configurando directorios y cargando dataset...")
try:
    # Rebuild the output path from BASE_PATH instead of extending
    # `model_output_dir` in place: the in-place form appended another
    # 'ST_HybridWaveStack' level every time this cell was re-executed.
    model_output_dir = BASE_PATH / 'models' / 'output' / 'ST_HybridWaveStack'
    curves_dir = model_output_dir / "learning_curves"

    # exist_ok makes directory creation idempotent without a separate exists() check.
    model_output_dir.mkdir(parents=True, exist_ok=True)
    curves_dir.mkdir(parents=True, exist_ok=True)

    file_path = data_output_dir / "complete_dataset_with_features_with_clusters_elevation_with_windows.nc"
    ds = xr.open_dataset(file_path)
    print(f"✔️ Dataset cargado desde: {file_path}")
except Exception as e:
    # Chain the original exception so the underlying traceback is not lost.
    raise RuntimeError(f"❌ Error cargando dataset o creando carpetas: {e}") from e

# ==== Experiment configuration ====
# Each experiment is a cumulative feature set: calendar/cyclic terms, then
# precipitation lags, then terrain descriptors, then the elevation cluster label.
_time_cycle_vars = ['year', 'month', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos']
_lag_vars = [f'total_precipitation_lag{k}' for k in (1, 2, 3, 4, 12, 24, 36)]
_terrain_vars = ['elevation', 'slope', 'aspect']

experiment_settings = {
    "time+cycles": [*_time_cycle_vars],
    "time+cycles+lag": _time_cycle_vars + _lag_vars,
    "time+cycles+lag+elev": _time_cycle_vars + _lag_vars + _terrain_vars,
    "all_features": _time_cycle_vars + _lag_vars + _terrain_vars + ['cluster_elevation'],
}

# ==== Variable validation ====
# Fail fast when any configured experiment (enabled or not) references a
# variable that is not among the dataset's data variables.
ds_vars = set(ds.data_vars)
for name, vars_list in experiment_settings.items():
    missing = list(filter(lambda v: v not in ds_vars, vars_list))
    if not missing:
        continue
    raise ValueError(f"❌ Faltan variables necesarias para el experimento '{name}': {missing}")
print("✅ Variables requeridas presentes.")

import traceback

results = []


def _build_sequences(features, target, window, horizons):
    """Slice per-cell temporal windows and their horizon targets.

    Args:
        features: Float array shaped (time, cells, feats).
        target: Float array shaped (time, cells).
        window: Number of consecutive time steps in each input sequence.
        horizons: Lead times (in time steps) to forecast; the target for
            horizon ``h`` of a window starting at ``i`` is ``target[i + window + h - 1]``.

    Returns:
        ``(X_seq, Y_targets)`` where ``X_seq`` is (n, window, feats) and
        ``Y_targets`` maps each horizon to an (n,) array aligned with ``X_seq``.
        Samples are ordered by window start time first, then by cell. A sample
        is kept only if every horizon target is observed and the input window
        contains no NaN feature.
    """
    from numpy.lib.stride_tricks import sliding_window_view

    horizons = list(horizons)
    n_times, n_cells, n_feats = features.shape
    # Last usable start index is n_times - window - max(h), hence the +1 in the count.
    n_starts = n_times - window - max(horizons) + 1
    if n_starts <= 0:
        return (np.empty((0, window, n_feats), dtype=features.dtype),
                {h: np.empty((0,), dtype=target.dtype) for h in horizons})

    # (n_starts, cells): the target of every enabled horizon must be observed.
    usable = np.ones((n_starts, n_cells), dtype=bool)
    for h in horizons:
        first = window + h - 1
        usable &= ~np.isnan(target[first:first + n_starts])

    # Drop windows that contain a NaN feature anywhere (they would poison the loss).
    has_gap = np.isnan(features).any(axis=2)  # (time, cells)
    usable &= ~sliding_window_view(has_gap, window, axis=0)[:n_starts].any(axis=-1)

    # View shaped (n_starts, cells, feats, window); reorder to (..., window, feats)
    # and materialise only the usable (start, cell) pairs.
    windows = sliding_window_view(features, window, axis=0)[:n_starts]
    X_seq = windows.transpose(0, 1, 3, 2)[usable]
    Y_targets = {
        h: target[window + h - 1:window + h - 1 + n_starts][usable]
        for h in horizons
    }
    return X_seq, Y_targets


# ==== Modular training ====
for exp_name, variables in experiment_settings.items():
    if exp_name not in ENABLED_EXPERIMENTS:
        continue

    print(f"\n🚀 Experimento: {exp_name}")
    try:
        cluster_idx = variables.index('cluster_elevation') if 'cluster_elevation' in variables else None
        subset = ds[variables].to_array().transpose('time', 'latitude', 'longitude', 'variable').values

        if cluster_idx is not None:
            # Replace cluster labels with integer codes so the stack can be cast to float.
            cluster_plane = subset[..., cluster_idx]
            encoded = LabelEncoder().fit_transform(cluster_plane.ravel()).reshape(cluster_plane.shape)
            subset[..., cluster_idx] = encoded

        subset = subset.astype(np.float32)
        # Make the target's axis order explicit so it lines up with `subset`.
        target = ds['total_precipitation'].transpose('time', 'latitude', 'longitude').values
        samples, lat, lon, feats = subset.shape
        # Keep time and grid cell as separate axes. Flattening them together
        # (boolean-masking the 2-D (time, cell) grid) made every "window" a run of
        # neighbouring cells instead of consecutive months of one cell.
        X = subset.reshape(samples, lat * lon, feats)
        y = target.reshape(samples, lat * lon)

        if not (~np.isnan(y)).any():
            print(f"⚠️ No hay datos válidos tras aplicar la máscara en '{exp_name}'. Saltando experimento.")
            continue

        X_seq, Y_targets = _build_sequences(X, y, input_window, ENABLED_HORIZONS)

        if len(X_seq) == 0:
            print(f"⚠️ No se generaron secuencias para '{exp_name}'. Saltando...")
            continue

        input_shape = (X_seq.shape[1], X_seq.shape[2])

        for model_name in ENABLED_MODELS:
            print(f"🔧 Modelo: {model_name}")
            for h in ENABLED_HORIZONS:
                print(f"🕒 Horizonte: {h} meses")
                kf = KFold(n_splits=3, shuffle=False)
                fold_splits = list(enumerate(kf.split(X_seq), start=1))
                if not USE_CROSS_VALIDATION:
                    # Without cross-validation run only the last fold: samples are
                    # ordered by window start, so it validates on the most recent block.
                    fold_splits = fold_splits[-1:]

                for fold, (train_idx, val_idx) in fold_splits:
                    run_tag = f"{exp_name.replace('+','_')}_{model_name}_H{h}_F{fold}"
                    model_path = model_output_dir / f"{run_tag}.h5"
                    # Check for an existing artefact before building the model or
                    # copying the fold arrays, so skipped folds cost nothing.
                    if model_path.exists():
                        print(f"⏩ Modelo ya existe: {model_path.name}. Saltando...")
                        continue

                    X_train_fold, X_val_fold = X_seq[train_idx], X_seq[val_idx]
                    y_train_fold, y_val_fold = Y_targets[h][train_idx], Y_targets[h][val_idx]

                    train_ds = to_dataset(X_train_fold, y_train_fold)
                    val_ds = to_dataset(X_val_fold, y_val_fold)

                    model = build_model(model_name, input_shape, 1)
                    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

                    history = model.fit(train_ds,
                                        validation_data=val_ds,
                                        epochs=20,
                                        verbose=1,
                                        callbacks=[es])

                    y_pred = model.predict(X_val_fold).flatten()
                    y_true = y_val_fold.flatten()
                    rmse, mae, mape, r2 = evaluate(y_true, y_pred)

                    results.append({
                        'experiment': exp_name,
                        'model': model_name,
                        'horizon': h,
                        'fold': fold,
                        'RMSE': rmse,
                        'MAE': mae,
                        'MAPE': mape,
                        'R2': r2,
                        'epochs': len(history.history['loss'])
                    })

                    # Save the learning curve for this fold.
                    fig, ax = plt.subplots()
                    ax.plot(history.history['loss'], label='Train')
                    ax.plot(history.history['val_loss'], label='Val')
                    ax.set_title(f'{exp_name} - {model_name} - H{h} - F{fold}')
                    ax.set_xlabel('Epoch')
                    ax.set_ylabel('Loss')
                    ax.legend()
                    fig.savefig(curves_dir / f"{run_tag}.png")
                    plt.close(fig)

                    model.save(model_path)
                    print(f"💾 Guardado: {model_path.name}")

    except Exception as e:
        # Keep going with the next experiment, but leave the full traceback in the
        # output so the failure can be diagnosed.
        print(f"❌ Error en experimento '{exp_name}': {e}")
        traceback.print_exc()

# ==== Save results ====
results_df = pd.DataFrame(results)

if results_df.empty:
    # Nothing was trained in this run (every fold was skipped or every experiment
    # failed). Do not overwrite a results file from an earlier run with an empty one.
    print("⚠️ No se generaron resultados nuevos; no se sobrescribe el CSV existente.")
else:
    results_df.to_csv("resultados_modelos_cv_8anios_mvp.csv", index=False)

    try:
        # `ace_tools` may be missing or may not expose this helper in the current
        # environment; fall back to the notebook's own display.
        import ace_tools as tools
        tools.display_dataframe_to_user(name="Resultados CV MVP", dataframe=results_df)
    except (ImportError, AttributeError):
        try:
            display(results_df)  # injected into the namespace by IPython/Jupyter
        except NameError:
            print(results_df.to_string(index=False))
    print("✅ Resultados guardados y mostrados al usuario.")
    print("🎉 Proceso finalizado con éxito.")


🔍 Detectando dispositivo disponible...
✅ GPU detectada: /physical_device:GPU:0
⚡ Política 'mixed_float16' activada.
📂 Configurando directorios y cargando dataset...
✔️ Dataset cargado desde: /content/drive/MyDrive/ml_precipitation_prediction/data/output/complete_dataset_with_features_with_clusters_elevation_with_windows.nc
✅ Variables requeridas presentes.

🚀 Experimento: time+cycles
🔧 Modelo: CNN
🕒 Horizonte: 3 meses
Epoch 1/20
[1m87557/87557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 3ms/step - loss: 9169.0938 - val_loss: 13252.8125
Epoch 2/20
[1m87557/87557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 3ms/step - loss: 9037.1875 - val_loss: 11750.7334
Epoch 3/20
[1m87557/87557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 3ms/step - loss: 9085.4043 - val_loss: 12658.6133
Epoch 4/20
[1m87557/87557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 3ms/step - loss: 9069.9932 - val_loss: 12931.8350
Epoch 5/20
[1m87557/87557[0m [32m━━━━━━━━━━