<a href="https://colab.research.google.com/github/ninja-marduk/ml_precipitation_prediction/blob/main/models/base_models_STHyMOUNTAIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📘 Entrenamiento de Modelos Baseline para Predicción Espaciotemporal de Precipitación Mensual STHyMOUNTAIN

Este notebook implementa modelos baseline para la predicción de precipitaciones usando datos espaciotemporales.

## 🔍 Implementación de Modelos Avanzados y Técnicas de Validación

Además de los modelos tabulares baseline, implementaremos:

1. **Optimización avanzada con Optuna** para los modelos tabulares XGBoost y LightGBM
2. **Validación robusta** mediante:
   - Hold-Out Validation (ya implementada)
   - Cross-Validation (k=5)
   - Bootstrapping (100 muestras)
3. **Modelos de Deep Learning** para capturar patrones espaciales y temporales:
   - Redes CNN para patrones espaciales
   - Redes ConvLSTM para patrones espaciotemporales

El objetivo es proporcionar una evaluación completa de diferentes enfoques de modelado para la predicción de precipitación en regiones montañosas.

In [None]:
# Configuración del entorno (compatible con Colab y local)
import os
import sys
from pathlib import Path
import shutil
import time
import psutil

# Detect whether the notebook is running inside Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')   
    # On Colab, clone the repository and make it the working directory
    # (IPython shell / magic commands, not plain Python)
    !git clone https://github.com/ninja-marduk/ml_precipitation_prediction.git
    %cd ml_precipitation_prediction
    # Install the required dependencies
    !pip install -r requirements.txt
    !pip install xarray netCDF4 optuna matplotlib seaborn lightgbm xgboost scikit-learn
    # NOTE(review): BASE_PATH points at a Drive folder, not at the clone that
    # was just made the working directory — confirm where the data lives.
    BASE_PATH = '/content/drive/MyDrive/ml_precipitation_prediction'
else:
    # Running locally: derive the base path from the current working directory
    if '/models' in os.getcwd():
        BASE_PATH = Path('..')
    else:
        BASE_PATH = Path('.')

print(f"Entorno configurado. Usando ruta base: {BASE_PATH}")

# BASE_PATH is a plain string on the Colab branch; normalise it to a Path
BASE_PATH = Path(BASE_PATH)

# Path objects can be joined with the "/" operator
model_output_dir = BASE_PATH / 'models' / 'output'
model_output_dir.mkdir(parents=True, exist_ok=True)

print(f"Directorio para salida de modelos creado: {model_output_dir}")

# Implementación de resiliencia para interacción con Google Drive y restauración de datos
def backup_dataframe(df, backup_path):
    """Write ``df`` to ``backup_path`` as a Parquet file (best effort).

    The index is not stored. A failure is reported on stdout instead of
    being raised, so a failed backup never interrupts the notebook.
    """
    try:
        df.to_parquet(backup_path, index=False)
        outcome = f"Respaldo del DataFrame guardado en: {backup_path}"
    except Exception as exc:
        outcome = f"Error al guardar respaldo del DataFrame: {exc}"
    print(outcome)

def restore_dataframe(backup_path):
    """Load a DataFrame from a Parquet backup, if one exists.

    Args:
        backup_path: Location of the backup file (``str`` or ``Path``).

    Returns:
        The restored DataFrame, or ``None`` when the file is missing or
        cannot be read. Problems are reported on stdout, never raised.
    """
    try:
        backup_path = Path(backup_path)  # also accept plain string paths
        if not backup_path.exists():
            print(f"No se encontró el archivo de respaldo en: {backup_path}")
            return None

        # Imported here because this helper is defined and called in the
        # setup cell, before the notebook's import cell binds ``pd``.
        import pandas as pd

        df_restored = pd.read_parquet(backup_path)
        print(f"DataFrame restaurado desde: {backup_path}")
        return df_restored
    except Exception as e:
        print(f"Error al restaurar el DataFrame: {e}")
        return None

# Location of the temporary DataFrame backup
temp_dir = BASE_PATH / 'data' / 'output' / 'temp'
temp_dir.mkdir(parents=True, exist_ok=True)
temp_file_path = temp_dir / 'dataframe_backup.parquet'

# Initial backup of the main DataFrame; only happens when `df` already exists
# in the namespace (e.g. this cell is re-run after the data was loaded)
if 'df' in locals() and df is not None:
    backup_dataframe(df, temp_file_path)

# Retry settings read by mount_google_drive()
max_retries = 3
retry_delay = 5  # seconds

def mount_google_drive():
    """Mount Google Drive at ``/content/drive``, retrying on failure.

    Makes up to ``max_retries`` attempts and sleeps ``retry_delay`` seconds
    between consecutive attempts (both are module-level settings).

    Returns:
        ``True`` as soon as a mount call succeeds, ``False`` once every
        attempt has failed.
    """
    attempt_no = 0
    while attempt_no < max_retries:
        attempt_no += 1
        try:
            from google.colab import drive
            drive.mount('/content/drive')
            print("Google Drive montado exitosamente.")
            return True
        except Exception as err:
            print(f"Error al montar Google Drive (intento {attempt_no}/{max_retries}): {err}")
            # No pause after the final attempt
            if attempt_no < max_retries:
                time.sleep(retry_delay)
    print("No se pudo montar Google Drive después de varios intentos.")
    return False

# On Colab, (re)mount Drive with retries; if that fails, fall back to the
# local Parquet backup of the DataFrame
if IN_COLAB:
    if not mount_google_drive():
        print("Usando datos en memoria o restaurando desde respaldo local.")
        df = restore_dataframe(temp_file_path)

# Pickle files from which previously saved baseline models are reloaded
model_files = {
    'RandomForest': model_output_dir / 'RandomForest.pkl',
    'XGBoost': model_output_dir / 'XGBoost.pkl',
    'LightGBM': model_output_dir / 'LightGBM.pkl'
}

def load_saved_model(model_name, model_path):
    """Load a pickled model from disk.

    Args:
        model_name: Label used only in the log messages.
        model_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when the file is missing or cannot
        be loaded. Problems are reported on stdout, never raised.
    """
    # Imported here because this helper runs in the setup cell, before the
    # notebook's import cell binds ``pickle``; without it every call failed
    # with a swallowed NameError.
    import pickle

    try:
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that this project wrote itself.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
            print(f"Modelo {model_name} cargado desde: {model_path}")
            return model
    except Exception as e:
        print(f"Error al cargar el modelo {model_name}: {e}")
        return None

# Start from an empty registry of baseline models
modelos_base = {}

# Try to load each saved model; an entry is None when loading fails
for model_name, model_path in model_files.items():
    if model_name not in modelos_base:
        modelos_base[model_name] = load_saved_model(model_name, model_path)

# Resilience for the CNN and ConvLSTM models

# Backup locations of the CNN and ConvLSTM Keras models
cnn_model_path = model_output_dir / 'cnn_model.h5'
convlstm_model_path = model_output_dir / 'convlstm_model.h5'

def backup_model(model, model_path):
    """Save ``model`` to ``model_path`` via its ``save`` method (best effort).

    A failure is reported on stdout instead of being raised.
    """
    try:
        model.save(model_path)
        outcome = f"Modelo respaldado en: {model_path}"
    except Exception as exc:
        outcome = f"Error al guardar respaldo del modelo: {exc}"
    print(outcome)

def restore_model(model_path):
    """Load a Keras model from a backup file, if one exists.

    Args:
        model_path: Location of the saved model (``str`` or ``Path``).

    Returns:
        The loaded model, or ``None`` when the file is missing or cannot be
        loaded. Problems are reported on stdout, never raised.
    """
    try:
        model_path = Path(model_path)  # also accept plain string paths
        if not model_path.exists():
            print(f"No se encontró el archivo de respaldo en: {model_path}")
            return None

        # Imported here because this helper is called in the setup cell,
        # before the cell that binds ``tf`` has run. Deferring it until a
        # file is actually found also avoids loading TensorFlow needlessly.
        import tensorflow as tf

        model = tf.keras.models.load_model(model_path)
        print(f"Modelo restaurado desde: {model_path}")
        return model
    except Exception as e:
        print(f"Error al restaurar el modelo: {e}")
        return None

# Back up the Keras models when they already exist in the namespace
if 'cnn_model' in locals() and cnn_model is not None:
    backup_model(cnn_model, cnn_model_path)
if 'convlstm_model' in locals() and convlstm_model is not None:
    backup_model(convlstm_model, convlstm_model_path)

# Otherwise try to restore them from disk; each name ends up bound to None
# when no backup file is available
if 'cnn_model' not in locals() or cnn_model is None:
    cnn_model = restore_model(cnn_model_path)
if 'convlstm_model' not in locals() or convlstm_model is None:
    convlstm_model = restore_model(convlstm_model_path)

# Make sure Drive is still mounted before working with the CNN / ConvLSTM
# backups. This reuses mount_google_drive() and the retry settings defined
# earlier in this cell instead of redefining identical copies of them.
if IN_COLAB:
    if not mount_google_drive():
        print("Usando datos en memoria o restaurando desde respaldo local para modelos CNN y ConvLSTM.")

In [None]:
# 1. Importaciones necesarias
import numpy as np
import pandas as pd
import xarray as xr
import optuna
import pickle
import datetime
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Importaciones para barras de progreso y mejora de visualización
from tqdm.notebook import tqdm, trange
from IPython.display import display, HTML, clear_output
import time

# Plot styling shared by the rest of the notebook
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)
# Colours of the active matplotlib property cycle, kept for manual plotting
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
# ST-HybridWaveStack: Deep Stacking CNN, LSTM, GRU, BLSTM + ELM con clusterización de elevación y datos CHIRPS reales

# ==============================================
# 1. Configuraciones generales y carga de datos
# ==============================================
import os
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, Bidirectional, Reshape
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.base import BaseEstimator, RegressorMixin

# ==============================================
# 2. Cargar dataset real con y sin cluster_elevation
# ==============================================
def load_dataset(file_path):
    """Open a NetCDF file with xarray and flatten it into a DataFrame.

    Args:
        file_path: Path of the NetCDF file.

    Returns:
        ``(df, ds)``: the dataset converted with ``to_dataframe()`` (index
        reset into columns) and the open ``xarray.Dataset``. Returns
        ``(None, None)`` when loading or conversion fails; the error is
        reported on stdout, never raised.
    """
    ds = None
    try:
        print(f"Intentando cargar el archivo: {file_path}")
        ds = xr.open_dataset(file_path)
        print("Archivo cargado exitosamente con xarray")

        print("\nInformación del dataset:")
        # Dataset.info() writes its summary to stdout itself and returns
        # None; wrapping it in print() appended a stray "None".
        ds.info()
        print("\nVariables disponibles:")
        for var_name, variable in ds.data_vars.items():
            print(f"- {var_name}: {variable.shape}")

        df = ds.to_dataframe().reset_index()
        return df, ds
    except Exception as e:
        print(f"Error al cargar el archivo NetCDF: {e}")
        # Do not leave the file handle open when the conversion failed
        if ds is not None:
            ds.close()
        return None, None

# Dataset location
# (data_file_path is not referenced again in the visible part of the notebook)
data_file_path = BASE_PATH / 'data' / 'output'
data_file = BASE_PATH / 'data' / 'output' / 'complete_dataset_with_features_with_clusters_elevation.nc'
print(f"Buscando archivo en: {data_file}")

# Load the dataset; both results are None when loading failed
df, ds_with_clusters = load_dataset(data_file)

# Report whether loading succeeded
if df is not None:
    print(f"Dataset cargado con éxito. Dimensiones: {df.shape}")
    print("\nPrimeras filas del DataFrame:")
    display(df.head())
else:
    print("No se pudo cargar el dataset. Verificar la ruta y el formato del archivo.")

# Variables used as model features
vars_to_use = ['total_precipitation', 'month_sin', 'month_cos', 'elevation', 'slope', 'aspect', 'cluster_elevation']

# Stack the selected variables into one array ordered
# (time, latitude, longitude, variable).
# NOTE(review): ds_with_clusters is None when load_dataset() failed, in which
# case the next line raises — the failure branch above only prints a message.
data_array = ds_with_clusters[vars_to_use].to_array().transpose('time', 'latitude', 'longitude', 'variable')
data_np = data_array.values

# Pre-process the categorical variable 'cluster_elevation'
from sklearn.preprocessing import LabelEncoder

# Position of the categorical variable along the last axis
cluster_elevation_index = vars_to_use.index('cluster_elevation')

# Use LabelEncoder to turn the cluster labels ('low', 'medium', 'high', etc.)
# into numeric codes
label_encoder = LabelEncoder()
# Extract the 3D (time, latitude, longitude) slice for 'cluster_elevation'
categorical_feature_slice = data_np[..., cluster_elevation_index]
original_shape = categorical_feature_slice.shape

# Flatten the slice for LabelEncoder, then reshape back
# (fit_transform expects 1D input)
flattened_slice = categorical_feature_slice.ravel()
encoded_values = label_encoder.fit_transform(flattened_slice)

# Assign the reshaped encoded values back to data_np
# NOTE(review): integer codes impose an ordering on the cluster labels; the
# closing comment of this cell mentions one-hot encoding as an alternative.
data_np[..., cluster_elevation_index] = encoded_values.reshape(original_shape)

# Cast to float to avoid dtype problems downstream
data_np = data_np.astype(float)

# Target variable
target = ds_with_clusters['total_precipitation'].values  # [time, lat, lon]

# Flatten the spatial grid into a single "cell" axis, keeping time as axis 0
samples, lat, lon, n_features = data_np.shape
X_grid = data_np.reshape(samples, lat * lon, n_features)  # [time, cell, feature]
y_grid = target.reshape(samples, lat * lon)               # [time, cell]

# Per-sample arrays without NaN targets. The boolean mask merges the time and
# cell axes into one, so X / y are NOT time series any more; they are kept only
# for code that uses individual samples.
mask = ~np.isnan(y_grid)
X = X_grid[mask]
y = y_grid[mask]

# ==============================================
# 3. Build windows for 12-month-ahead prediction
# ==============================================
def create_sequences(X, y, window=12):
    """Slide a window of length ``window`` along axis 0 of ``X``.

    Args:
        X: Array whose first axis is the sequence (time) axis.
        y: Targets aligned with the first axis of ``X``.
        window: Number of consecutive steps in each input window.

    Returns:
        ``(X_seq, y_seq)`` where ``X_seq[i]`` is ``X[i:i + window]`` and
        ``y_seq[i]`` is ``y[i + window]``. Both are empty arrays when
        ``len(X) <= window``.
    """
    X_seq, y_seq = [], []
    for i in range(len(X) - window):
        X_seq.append(X[i:i+window])
        y_seq.append(y[i+window])
    return np.array(X_seq), np.array(y_seq)


def _create_cell_sequences(X_grid, y_grid, window=12):
    """Build temporal windows separately for every grid cell.

    Windowing the masked ``X`` / ``y`` arrays would mix neighbouring cells of
    the same month inside one window, because masking merges time and space.
    Here each cell's own time series is windowed instead.

    Args:
        X_grid: Features shaped ``[time, cell, feature]``.
        y_grid: Targets shaped ``[time, cell]``.
        window: Number of consecutive time steps per input window.

    Returns:
        ``(X_seq, y_seq)`` shaped ``[n, window, feature]`` and ``[n]``. Only
        windows whose ``window`` input steps and target step all have a
        non-NaN target are kept.

    Raises:
        ValueError: If the series is not longer than ``window`` or no cell
            yields a complete window.
    """
    n_time = y_grid.shape[0]
    if n_time <= window:
        raise ValueError(
            f"Need more than {window} time steps to build sequences, got {n_time}"
        )

    observed = ~np.isnan(y_grid)
    # usable[i, c] is True when steps i .. i+window of cell c are all observed
    usable = np.lib.stride_tricks.sliding_window_view(
        observed, window + 1, axis=0
    ).all(axis=-1)

    X_parts, y_parts = [], []
    for cell in np.flatnonzero(usable.any(axis=0)):
        X_cell, y_cell = create_sequences(X_grid[:, cell], y_grid[:, cell], window=window)
        keep = usable[:, cell]
        X_parts.append(X_cell[keep])
        y_parts.append(y_cell[keep])

    if not y_parts:
        raise ValueError("No grid cell has a complete window of observed values")
    return np.concatenate(X_parts), np.concatenate(y_parts)


X_seq, y_seq = _create_cell_sequences(X_grid, y_grid, window=12)

# Train/test split
# NOTE(review): a shuffled split of overlapping windows lets near-identical
# windows land on both sides; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# ==============================================
# 4. Clase ELM
# ==============================================
class ELMRegressor(BaseEstimator, RegressorMixin):
    """Extreme Learning Machine regressor.

    One hidden layer with random, untrained input weights and biases; only
    the output weights are fitted, using the pseudo-inverse of the hidden
    activations.

    Args:
        n_hidden: Number of hidden units.
        activation: Element-wise callable applied to the hidden layer.
        random_state: Optional integer seed for the random hidden layer.
            ``None`` (default) draws from NumPy's global random state, as
            before, so ``np.random.seed`` still controls the result.
    """

    def __init__(self, n_hidden=1000, activation=np.tanh, random_state=None):
        self.n_hidden = n_hidden
        self.activation = activation
        self.random_state = random_state

    def _hidden(self, X):
        """Return the hidden-layer activations for ``X``."""
        return self.activation(np.dot(X, self.input_weights) + self.biases)

    def fit(self, X, y):
        """Draw the random hidden layer and solve for the output weights.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: Targets with ``n_samples`` rows.

        Returns:
            ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # np.random exposes the same normal() API as a RandomState instance
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        input_size = X.shape[1]
        self.input_weights = rng.normal(size=(input_size, self.n_hidden))
        self.biases = rng.normal(size=(self.n_hidden,))
        H = self._hidden(X)
        self.output_weights = np.dot(np.linalg.pinv(H), y)
        return self

    def predict(self, X):
        """Return predictions for ``X`` using the fitted weights."""
        return np.dot(self._hidden(np.asarray(X)), self.output_weights)

# ==============================================
# 5. Modelos base
# ==============================================
def build_model(model_type, input_shape, output_neurons):
    """Build and compile one of the base Keras models.

    Args:
        model_type: One of ``'LSTM'``, ``'GRU'``, ``'BLSTM'`` or ``'CNN'``.
        input_shape: ``(timesteps, features_per_timestep)`` of one sample.
        output_neurons: Size of the final Dense layer.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer and MSE loss.

    Raises:
        ValueError: If ``model_type`` is not supported. Previously an unknown
            type silently produced a network made of the Dense layer only.
    """
    supported = ('LSTM', 'GRU', 'BLSTM', 'CNN')
    # Validate before touching Keras so a typo fails fast and cheaply
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )

    model = Sequential()
    if model_type == 'LSTM':
        model.add(LSTM(64, input_shape=input_shape))
    elif model_type == 'GRU':
        model.add(GRU(64, input_shape=input_shape))
    elif model_type == 'BLSTM':
        model.add(Bidirectional(LSTM(64), input_shape=input_shape))
    else:  # 'CNN'
        # Add a trailing channel axis so each sample becomes
        # (timesteps, features_per_timestep, 1), i.e. (height, width, channels)
        model.add(Reshape((*input_shape, 1), input_shape=input_shape))
        # padding='same' keeps the spatial size through the convolution
        model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
        model.add(MaxPooling2D((2, 2)))
        model.add(Flatten())
    model.add(Dense(output_neurons))
    model.compile(optimizer='adam', loss='mse')
    return model

# ==============================================
# 6. Entrenamiento de modelos base
# ==============================================
# Base-model predictions are rebuilt from scratch on every run. Reusing lists
# left over in the notebook namespace would add stale or duplicated columns to
# the meta-model input when this cell is executed more than once.
preds_train = []
preds_test = []

# Architectures to train. A list already defined in the notebook namespace
# takes precedence; otherwise train every type that build_model() supports.
if 'model_names' not in globals():
    model_names = ['CNN', 'LSTM', 'GRU', 'BLSTM']

# Collapse any trailing axes so each sample is (window, features_per_timestep)
X_train_feed = X_train.reshape((X_train.shape[0], X_train.shape[1], -1))
X_test_feed = X_test.reshape((X_test.shape[0], X_test.shape[1], -1))

# Input shape for the models: (timesteps, features_per_timestep)
input_shape_for_build = (X_train_feed.shape[1], X_train_feed.shape[2])

# One output neuron: y_train is expected to hold a single target value per
# sequence (1-D).
output_neurons_for_build = 1

# Early stopping callback: reuse `es` from the notebook namespace when present,
# otherwise create a default one
if 'es' not in globals():
    print("Warning: EarlyStopping callback 'es' not found in global scope. Defining a default one.")
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


plot_dir = "ST_HybridWaveStack/plots"
os.makedirs(plot_dir, exist_ok=True)

for name in model_names:
    print(f"\nTraining {name} model...")

    model = build_model(name, input_shape_for_build, output_neurons_for_build)

    print(f"Input shape for model: {input_shape_for_build}")
    print(f"Output neurons for model: {output_neurons_for_build}")
    model.summary()

    history = model.fit(X_train_feed, y_train,
                        epochs=50,
                        batch_size=32,
                        validation_split=0.2,  # hold out 20% of the training data
                        callbacks=[es],
                        verbose=1)

    # Predictions of this base model, flattened to 1-D (one value per sample)
    pred_train_model = model.predict(X_train_feed).flatten()
    pred_test_model = model.predict(X_test_feed).flatten()

    # Save the learning curves
    plt.figure()
    plt.plot(history.history['loss'], label='train_loss')
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='val_loss')
    plt.title(f'Learning Curve {name}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(f'{plot_dir}/learning_{name}.png')
    plt.close()

    # Collect the predictions as inputs for the meta-model
    preds_train.append(pred_train_model)
    preds_test.append(pred_test_model)

print("\nBase models training complete.")
print(f"Number of training prediction sets: {len(preds_train)}")
print(f"Number of test prediction sets: {len(preds_test)}")
if preds_train:
    print(f"Shape of first training prediction set: {preds_train[0].shape}")
if preds_test:
    print(f"Shape of first test prediction set: {preds_test[0].shape}")


# ==============================================
# 7. Meta-modelo ELM
# ==============================================
# Flatten each base model's predictions to 1-D
preds_train_flat = [pred.flatten() for pred in preds_train]
preds_test_flat = [pred.flatten() for pred in preds_test]

# Stack them as columns: one row per sample, one column per base model.
# This 2-D array is the input of the ELM meta-model.
X_train_elm = np.column_stack(preds_train_flat)
X_test_elm = np.column_stack(preds_test_flat)

# Flatten the targets to 1-D as well
y_train_elm = y_train.flatten()
y_test_elm = y_test.flatten()

# Fit the meta-model on the base-model predictions and predict the test set
elm = ELMRegressor(n_hidden=500)
elm.fit(X_train_elm, y_train_elm)
y_pred_elm = elm.predict(X_test_elm)

# ==============================================
# 8. Evaluación
# ==============================================
def print_metrics(y_true, y_pred, label):
    """Print and return RMSE, MAE and R² for a set of predictions.

    Args:
        y_true: Observed values (any array-like; flattened to 1-D).
        y_pred: Predicted values with the same number of elements.
        label: Name shown at the start of the printed line.

    Returns:
        ``dict`` with float entries ``'rmse'``, ``'mae'`` and ``'r2'``.
        ``'r2'`` is NaN when ``y_true`` is constant, because the score is
        undefined in that case.

    Raises:
        ValueError: If the inputs are empty or differ in size.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size == 0:
        raise ValueError("print_metrics needs at least one sample")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred differ in size: {y_true.size} vs {y_pred.size}"
        )

    residuals = y_true - y_pred
    ss_res = float(np.sum(residuals ** 2))
    ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))

    rmse = float(np.sqrt(ss_res / y_true.size))
    mae = float(np.mean(np.abs(residuals)))
    r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else float('nan')

    print(f"{label} -> RMSE: {rmse:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")
    return {'rmse': rmse, 'mae': mae, 'r2': r2}
# Evaluate the stacked ELM meta-model on the hold-out set
print_metrics(y_test_elm, y_pred_elm, 'ELM')

# Only the output directory is prepared here; the scatter plot is drawn in the
# visualisation step. Labelling and saving at this point, with no figure or
# data plotted yet, would write a blank scatter_elm.png.
plot_dir = "ST_HybridWaveStack/plots"
os.makedirs(plot_dir, exist_ok=True)

# ==============================================
# 9. Results visualisation
# ==============================================
# Scatter of observed vs. ELM-predicted values for the test set
plt.figure()
plt.scatter(y_test, y_pred_elm, alpha=0.7)
plt.xlabel('Real')
plt.ylabel('Predicho ELM')
plt.title('Comparación ELM')
# Same location as plot_dir, written out literally
plt.savefig("ST_HybridWaveStack/plots/scatter_elm.png")
plt.close()

# ==============================================
# 10. Note on cluster_elevation
# ==============================================
# The 'cluster_elevation' variable is used as a categorical feature. It may:
# - Improve the model's ability to generalise in mountainous regions
# - Act as an abstract representation of the terrain
# It should be included as an additional feature, normalised or one-hot
# encoded if necessary.

print("Pipeline completo ejecutado. Resultados y gráficos guardados.")
