<a href="https://colab.research.google.com/github/ninja-marduk/ml_precipitation_prediction/blob/feature%2Fbase-models/models/base_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Configuración del entorno (compatible con Colab y local)
import os
import sys
from pathlib import Path
import shutil
import time
import psutil

# Detect whether the notebook is running inside Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # On Colab, clone the repository into the current working directory
    !git clone https://github.com/ninja-marduk/ml_precipitation_prediction.git
    %cd ml_precipitation_prediction
    # Install the required dependencies
    !pip install -r requirements.txt
    !pip install xarray netCDF4 optuna matplotlib seaborn lightgbm xgboost scikit-learn
    # NOTE(review): the clone above lands in the working directory, while
    # BASE_PATH points at a folder on the mounted Drive — confirm the Drive
    # folder is where the data and model outputs are expected to live.
    BASE_PATH = '/content/drive/MyDrive/ml_precipitation_prediction'
else:
    # Running locally: use the parent directory when the working directory
    # path contains '/models', otherwise the working directory itself
    if '/models' in os.getcwd():
        BASE_PATH = Path('..')
    else:
        BASE_PATH = Path('.')

print(f"Entorno configurado. Usando ruta base: {BASE_PATH}")

# BASE_PATH is a plain string on the Colab branch; normalise it to a Path
BASE_PATH = Path(BASE_PATH)

# With a Path in hand, sub-directories can be composed with the `/` operator
data_output_dir = BASE_PATH / 'data' / 'output'
model_output_dir = BASE_PATH / 'models' / 'output'
# Only the model output directory is created here; data_output_dir is not
model_output_dir.mkdir(parents=True, exist_ok=True)

print(f"Directorio para salida de modelos creado: {model_output_dir}")

# Implementación de resiliencia para interacción con Google Drive y restauración de datos
def backup_dataframe(df, backup_path):
    """Write ``df`` to ``backup_path`` as a Parquet file, best-effort.

    Args:
        df: Object exposing ``to_parquet(path, index=...)``.
        backup_path: Destination handed to ``to_parquet`` unchanged.

    Returns:
        None. The outcome (success or the caught error) is only printed;
        no exception is propagated to the caller.
    """
    try:
        df.to_parquet(backup_path, index=False)
        outcome = f"Respaldo del DataFrame guardado en: {backup_path}"
    except Exception as exc:
        # Deliberately non-fatal: a failed backup must not stop the notebook.
        outcome = f"Error al guardar respaldo del DataFrame: {exc}"
    print(outcome)

def restore_dataframe(backup_path):
    """Load a DataFrame back from a Parquet backup, best-effort.

    Args:
        backup_path: Location of the backup file. Both ``str`` and
            ``pathlib.Path`` values are accepted.

    Returns:
        The object returned by ``pandas.read_parquet``, or ``None`` when the
        file does not exist or reading it fails. Failures are printed, never
        raised.
    """
    try:
        # Normalise first so plain string paths work with .exists() too.
        if not Path(backup_path).exists():
            print(f"No se encontró el archivo de respaldo en: {backup_path}")
            return None

        # Imported here because this helper is defined (and may be called)
        # in a cell that runs before the notebook's `import pandas as pd`.
        import pandas as pd

        df_restored = pd.read_parquet(backup_path)
        print(f"DataFrame restaurado desde: {backup_path}")
        return df_restored
    except Exception as e:
        print(f"Error al restaurar el DataFrame: {e}")
        return None

# Location of the temporary DataFrame backup
temp_dir = BASE_PATH / 'data' / 'output' / 'temp'
temp_dir.mkdir(parents=True, exist_ok=True)
temp_file_path = temp_dir / 'dataframe_backup.parquet'

# Initial backup of the main DataFrame.
# The name lookup guards against `df` not being defined yet when this
# cell runs; in that case no backup is written.
if 'df' in locals() and df is not None:
    backup_dataframe(df, temp_file_path)

# Retry settings read by mount_google_drive()
max_retries = 3
retry_delay = 5  # seconds

def mount_google_drive(retries=None, delay=None, mount_point='/content/drive'):
    """Mount Google Drive, retrying failed mount attempts.

    Args:
        retries: Maximum number of mount attempts. ``None`` (default) uses
            the module-level ``max_retries``.
        delay: Seconds to sleep between attempts. ``None`` (default) uses
            the module-level ``retry_delay``.
        mount_point: Directory passed to ``drive.mount``.

    Returns:
        True once a mount attempt succeeds; False when ``google.colab``
        cannot be imported or every attempt fails. Errors are printed,
        never raised.
    """
    # Resolve the module-level defaults lazily so explicit arguments work
    # even where those globals are not defined.
    if retries is None:
        retries = max_retries
    if delay is None:
        delay = retry_delay

    try:
        from google.colab import drive
    except ImportError as e:
        # A missing package will not appear on a later attempt, so do not
        # burn the retry budget (and its sleeps) on it.
        print(f"google.colab no está disponible; no se puede montar Google Drive: {e}")
        return False

    for attempt in range(retries):
        try:
            drive.mount(mount_point)
        except Exception as e:
            print(f"Error al montar Google Drive (intento {attempt + 1}/{retries}): {e}")
            # No point sleeping after the final attempt.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            print("Google Drive montado exitosamente.")
            return True
    print("No se pudo montar Google Drive después de varios intentos.")
    return False

# On Colab, try to mount Drive; if that fails, fall back to the local
# Parquet backup (restore_dataframe returns None when there is none).
if IN_COLAB:
    if not mount_google_drive():
        print("Usando datos en memoria o restaurando desde respaldo local.")
        df = restore_dataframe(temp_file_path)

# Expected on-disk locations of previously saved models, keyed by model name
model_files = {
    'RandomForest': model_output_dir / 'RandomForest.pkl',
    'XGBoost': model_output_dir / 'XGBoost.pkl',
    'LightGBM': model_output_dir / 'LightGBM.pkl'
}

def load_saved_model(model_name, model_path):
    """Load a pickled model from disk, best-effort.

    Args:
        model_name: Label used only in the printed messages.
        model_path: Path of the pickle file to open.

    Returns:
        The unpickled object, or ``None`` when the file cannot be opened or
        unpickled. Failures are printed, never raised.
    """
    # Imported here because this helper is defined and called in a cell that
    # runs before the notebook's `import pickle`.
    import pickle

    try:
        with open(model_path, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only load
            # files that this project wrote itself.
            model = pickle.load(f)
    except Exception as e:
        print(f"Error al cargar el modelo {model_name}: {e}")
        return None
    print(f"Modelo {model_name} cargado desde: {model_path}")
    return model

# Start `modelos_base` as an empty dictionary
modelos_base = {}

# Try to load each saved model. Entries are set to None when loading fails,
# because load_saved_model returns None in that case.
for model_name, model_path in model_files.items():
    # The dict was just created empty, so this check is true for every name.
    if model_name not in modelos_base:
        modelos_base[model_name] = load_saved_model(model_name, model_path)

# Resilience helpers for the CNN and ConvLSTM models

# Backup locations for the CNN and ConvLSTM models
cnn_model_path = model_output_dir / 'cnn_model.h5'
convlstm_model_path = model_output_dir / 'convlstm_model.h5'

def backup_model(model, model_path):
    """Persist ``model`` to ``model_path`` via its ``save`` method, best-effort.

    Args:
        model: Object exposing ``save(path)``.
        model_path: Destination handed to ``model.save`` unchanged.

    Returns:
        None. The outcome (success or the caught error) is only printed;
        no exception is propagated to the caller.
    """
    try:
        model.save(model_path)
        outcome = f"Modelo respaldado en: {model_path}"
    except Exception as exc:
        # Deliberately non-fatal: a failed backup must not stop the notebook.
        outcome = f"Error al guardar respaldo del modelo: {exc}"
    print(outcome)

def restore_model(model_path):
    """Load a Keras model back from a backup file, best-effort.

    Args:
        model_path: Location of the saved model. Both ``str`` and
            ``pathlib.Path`` values are accepted.

    Returns:
        The object returned by ``tf.keras.models.load_model``, or ``None``
        when the file does not exist or loading fails. Failures are printed,
        never raised.
    """
    try:
        # Normalise first so plain string paths work with .exists() too.
        if not Path(model_path).exists():
            print(f"No se encontró el archivo de respaldo en: {model_path}")
            return None

        # Imported here because this helper is defined and called in a cell
        # that runs before the notebook's `import tensorflow as tf`.
        import tensorflow as tf

        model = tf.keras.models.load_model(model_path)
        print(f"Modelo restaurado desde: {model_path}")
        return model
    except Exception as e:
        print(f"Error al restaurar el modelo: {e}")
        return None

# Back up the Keras models when they are already defined and not None
if 'cnn_model' in locals() and cnn_model is not None:
    backup_model(cnn_model, cnn_model_path)
if 'convlstm_model' in locals() and convlstm_model is not None:
    backup_model(convlstm_model, convlstm_model_path)

# Otherwise try to restore them from disk; restore_model returns None when
# no backup is available, so both names end up defined after this point.
if 'cnn_model' not in locals() or cnn_model is None:
    cnn_model = restore_model(cnn_model_path)
if 'convlstm_model' not in locals() or convlstm_model is None:
    convlstm_model = restore_model(convlstm_model_path)

# Retry settings read by mount_google_drive() (same values as set earlier
# in this cell)
max_retries = 3
retry_delay = 5  # seconds

def mount_google_drive(retries=None, delay=None, mount_point='/content/drive'):
    """Mount Google Drive, retrying failed mount attempts.

    Note: a function with this name is also defined earlier in this cell;
    Python keeps whichever definition ran last.

    Args:
        retries: Maximum number of mount attempts. ``None`` (default) uses
            the module-level ``max_retries``.
        delay: Seconds to sleep between attempts. ``None`` (default) uses
            the module-level ``retry_delay``.
        mount_point: Directory passed to ``drive.mount``.

    Returns:
        True once a mount attempt succeeds; False when ``google.colab``
        cannot be imported or every attempt fails. Errors are printed,
        never raised.
    """
    # Resolve the module-level defaults lazily so explicit arguments work
    # even where those globals are not defined.
    if retries is None:
        retries = max_retries
    if delay is None:
        delay = retry_delay

    try:
        from google.colab import drive
    except ImportError as e:
        # A missing package will not appear on a later attempt, so do not
        # burn the retry budget (and its sleeps) on it.
        print(f"google.colab no está disponible; no se puede montar Google Drive: {e}")
        return False

    for attempt in range(retries):
        try:
            drive.mount(mount_point)
        except Exception as e:
            print(f"Error al montar Google Drive (intento {attempt + 1}/{retries}): {e}")
            # No point sleeping after the final attempt.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            print("Google Drive montado exitosamente.")
            return True
    print("No se pudo montar Google Drive después de varios intentos.")
    return False

# On Colab, attempt the Drive mount again; on failure only a notice is
# printed here (no restore is triggered by this branch).
if IN_COLAB:
    if not mount_google_drive():
        print("Usando datos en memoria o restaurando desde respaldo local para modelos CNN y ConvLSTM.")

In [None]:
# 1. Importaciones necesarias
import numpy as np
import pandas as pd
import xarray as xr
import optuna
import pickle
import datetime
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Importaciones para barras de progreso y mejora de visualización
from tqdm.notebook import tqdm, trange
from IPython.display import display, HTML, clear_output
import time

# Configurar visualización más atractiva
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
# ST-HybridWaveStack - Evaluación comparativa de configuraciones

import os
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, Bidirectional, Reshape
from tensorflow.keras.callbacks import EarlyStopping

# Funciones auxiliares
def create_sequences(X, y, window=12):
    """Slice ``X`` into overlapping windows paired with the next ``y`` value.

    Window ``i`` is ``X[i:i + window]`` and its target is ``y[i + window]``,
    for every ``i`` in ``range(len(X) - window)``.

    Args:
        X: Indexable sequence of inputs (sliced along its first axis).
        y: Indexable sequence of targets aligned with ``X``.
        window: Number of consecutive rows per window.

    Returns:
        Tuple ``(X_seq, y_seq)`` of NumPy arrays. When ``len(X) <= window``
        no windows are produced and both arrays are empty.
    """
    starts = range(len(X) - window)
    windows = [X[start:start + window] for start in starts]
    targets = [y[start + window] for start in starts]
    return np.array(windows), np.array(targets)

def build_model(model_type, input_shape, output_neurons):
    """Build and compile a small Keras regression model.

    Args:
        model_type: One of ``'LSTM'``, ``'GRU'``, ``'BLSTM'`` or ``'CNN'``.
        input_shape: Per-sample input shape. The CNN branch appends a
            trailing channel axis of size 1 before the convolution.
        output_neurons: Number of units in the final Dense layer.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer and MSE loss.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported = ('LSTM', 'GRU', 'BLSTM', 'CNN')
    # Fail loudly: an unknown name used to fall through every branch and
    # silently produce a model made of the output Dense layer alone.
    if model_type not in supported:
        raise ValueError(
            f"Tipo de modelo no soportado: {model_type!r}. "
            f"Opciones válidas: {', '.join(supported)}"
        )

    model = Sequential()
    # Declare the input once with an explicit Input object rather than
    # passing `input_shape=` to the first layer, which Keras warns about.
    model.add(tf.keras.Input(shape=input_shape))

    if model_type == 'LSTM':
        model.add(LSTM(64))
    elif model_type == 'GRU':
        model.add(GRU(64))
    elif model_type == 'BLSTM':
        model.add(Bidirectional(LSTM(64)))
    else:  # 'CNN'
        # Conv2D needs a channel axis, so add one of size 1.
        model.add(Reshape((*input_shape, 1)))
        model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
        model.add(MaxPooling2D((2, 2)))
        model.add(Flatten())

    model.add(Dense(output_neurons))
    model.compile(optimizer='adam', loss='mse')
    return model

def evaluate(y_true, y_pred):
    """Compute regression metrics for a set of predictions.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        Tuple ``(rmse, mae, mape, r2)``. ``mape`` is expressed in percent
        and is computed as the mean of
        ``|(y_true - y_pred) / (y_true + 1e-5)|`` times 100; the small
        constant keeps the denominator away from zero when ``y_true`` is 0.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    relative_error = np.abs((y_true - y_pred) / (y_true + 1e-5))
    percentage_error = np.mean(relative_error) * 100
    determination = r2_score(y_true, y_pred)
    return np.sqrt(squared_error), absolute_error, percentage_error, determination

# Open the NetCDF dataset holding the features and the precipitation target.
# Any failure is re-raised as RuntimeError so the cell stops immediately
# instead of running the experiments without data.
try:
    file_path = data_output_dir / "complete_dataset_with_features_with_clusters_elevation.nc"
    ds = xr.open_dataset(file_path)
    print(f"✔️ Dataset cargado desde: {file_path}")
except Exception as e:
    # Chain explicitly so the traceback marks the original error as the cause.
    raise RuntimeError(f"❌ Error cargando dataset: {e}") from e

# Experiment configurations: experiment name -> dataset variables used as
# model inputs for that experiment.
experiment_settings = {
    "precip+time": [
        'month_sin', 'month_cos',
    ],
    "precip+time+elev": [
        'month_sin', 'month_cos',
        'elevation', 'slope', 'aspect',
    ],
    "all_features": [
        'month_sin', 'month_cos',
        'elevation', 'slope', 'aspect',
        'cluster_elevation',
    ],
}

# Column-wise accumulator for the results table; each column gets its own
# independent list, filled with one entry per (experiment, model) run.
results = {
    column: []
    for column in ('experiment', 'model', 'RMSE', 'MAE', 'MAPE', 'R2')
}

# Run every experiment configuration: build the input tensor from the
# selected variables, window it, then train and score each architecture.
# Inputs are only the variables listed for the experiment;
# `total_precipitation` is used solely as the target.
for exp_name, variables in experiment_settings.items():
    print(f"\n🚀 Ejecutando experimento: {exp_name}")

    try:
        # Position of the cluster variable in the feature axis (None if absent)
        cluster_elevation_index = variables.index('cluster_elevation') if 'cluster_elevation' in variables else None
        # Stack the selected variables on a trailing axis:
        # (time, latitude, longitude, variable)
        subset_array = ds[variables].to_array().transpose('time', 'latitude', 'longitude', 'variable')
        subset_np = subset_array.values

        if cluster_elevation_index is not None:
            # Replace the cluster channel with integer codes, written back in place.
            # presumably cluster_elevation holds categorical labels — TODO confirm dtype
            cluster_data = subset_np[..., cluster_elevation_index]
            encoded = LabelEncoder().fit_transform(cluster_data.ravel()).reshape(cluster_data.shape)
            subset_np[..., cluster_elevation_index] = encoded

        subset_np = subset_np.astype(np.float32)
        target = ds['total_precipitation'].values

        # Flatten the two spatial axes: X -> (samples, lat*lon, feats),
        # y -> (samples, lat*lon)
        samples, lat, lon, feats = subset_np.shape
        X = subset_np.reshape(samples, lat * lon, feats)
        y = target.reshape(samples, lat * lon)

        # Drop entries whose target is NaN. The 2-D boolean mask collapses the
        # first two axes, so X becomes (n_valid, feats) and y becomes (n_valid,).
        mask = ~np.isnan(y)
        X = X[mask]
        y = y[mask]

        # NOTE(review): after the masking above, consecutive rows follow the
        # flattened (time, cell) order, so each 12-row window spans neighbouring
        # rows of that flat list rather than 12 time steps of a single grid
        # cell — confirm this is the intended sequence definition.
        X_seq, y_seq = create_sequences(X, y, window=12)
        # NOTE(review): train_test_split shuffles by default, so overlapping
        # windows can land on both sides of the split — confirm acceptable.
        X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

        # Keep (n, window, features) layout; the -1 folds any trailing axes
        X_train_feed = X_train.reshape((X_train.shape[0], X_train.shape[1], -1))
        X_test_feed = X_test.reshape((X_test.shape[0], X_test.shape[1], -1))
        input_shape = (X_train_feed.shape[1], X_train_feed.shape[2])

        for model_name in ['LSTM', 'GRU', 'BLSTM', 'CNN']:
            print(f"\t🏗️ Entrenando modelo {model_name}...")
            # Single-output regressor, rebuilt from scratch for every run
            model = build_model(model_name, input_shape, 1)
            # Stop after 10 epochs without val_loss improvement and roll back
            # to the best weights seen
            es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            model.fit(X_train_feed, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0, callbacks=[es])

            y_pred = model.predict(X_test_feed).flatten()
            y_true = y_test.flatten()

            # Append one row to the column-wise results accumulator
            rmse, mae, mape, r2 = evaluate(y_true, y_pred)
            results['experiment'].append(exp_name)
            results['model'].append(model_name)
            results['RMSE'].append(rmse)
            results['MAE'].append(mae)
            results['MAPE'].append(mape)
            results['R2'].append(r2)
            print(f"\t✅ {model_name} -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, MAPE: {mape:.2f}%, R²: {r2:.3f}")

    except Exception as err:
        # Any failure is reported and the loop moves on to the next
        # experiment; models of this experiment that were not yet scored
        # get no row in `results`.
        print(f"❌ Error en experimento '{exp_name}': {err}")

# Final leaderboard: one row per (experiment, model) run, best RMSE first.
# The table is written to CSV, printed, and left as the cell's last
# expression so the notebook also renders it.
results_df = pd.DataFrame(results).sort_values(by='RMSE')
results_df.to_csv('resultados_comparativos_modelos.csv', index=False)
print("\n📊 Resultados finales ordenados por RMSE:", results_df, sep="\n")
results_df


✔️ Dataset cargado desde: /content/drive/MyDrive/ml_precipitation_prediction/data/output/complete_dataset_with_features_with_clusters_elevation.nc

🚀 Ejecutando experimento: precip+time
	🏗️ Entrenando modelo LSTM...


  super().__init__(**kwargs)


Epoch 1/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 7ms/step - loss: 15573.6006 - val_loss: 10470.9414
Epoch 2/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 7ms/step - loss: 10482.1719 - val_loss: 10482.6914
Epoch 3/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 7ms/step - loss: 10444.9043 - val_loss: 10510.0811
Epoch 4/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 7ms/step - loss: 10478.2305 - val_loss: 10474.4268
Epoch 5/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 8ms/step - loss: 10465.0820 - val_loss: 10471.7578
Epoch 6/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 8ms/step - loss: 10487.4619 - val_loss: 10472.5469
Epoch 7/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 8ms/step - loss: 10478.0820 - val_loss: 10492.5840
Epoch 8/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

  super().__init__(**kwargs)


Epoch 1/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 8ms/step - loss: 15481.3652 - val_loss: 10474.9951
Epoch 2/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m342s[0m 8ms/step - loss: 10484.7236 - val_loss: 10473.6943
Epoch 3/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 8ms/step - loss: 10486.7979 - val_loss: 10470.1777
Epoch 4/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 8ms/step - loss: 10467.8867 - val_loss: 10466.8398
Epoch 5/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 8ms/step - loss: 10471.9482 - val_loss: 10467.0967
Epoch 6/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 8ms/step - loss: 10453.0225 - val_loss: 10466.8906
Epoch 7/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 8ms/step - loss: 10468.4893 - val_loss: 10466.3555
Epoch 8/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

  super().__init__(**kwargs)


Epoch 1/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 8ms/step - loss: 13293.0840 - val_loss: 10482.6797
Epoch 2/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 8ms/step - loss: 10472.0654 - val_loss: 10466.9697
Epoch 3/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 8ms/step - loss: 10444.4287 - val_loss: 10468.5791
Epoch 4/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 8ms/step - loss: 10469.4473 - val_loss: 10468.4736
Epoch 5/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 8ms/step - loss: 10456.3633 - val_loss: 10466.6924
Epoch 6/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 8ms/step - loss: 10440.4873 - val_loss: 10466.2842
Epoch 7/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 8ms/step - loss: 10498.3652 - val_loss: 10463.0576
Epoch 8/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

  super().__init__(**kwargs)


Epoch 1/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 3ms/step - loss: 11912.6494 - val_loss: 10555.7471
Epoch 2/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 3ms/step - loss: 10556.7314 - val_loss: 10495.3965
Epoch 3/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 3ms/step - loss: 10489.4551 - val_loss: 10481.2920
Epoch 4/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 3ms/step - loss: 10480.6504 - val_loss: 10474.6514
Epoch 5/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 3ms/step - loss: 10499.0068 - val_loss: 10473.0049
Epoch 6/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 3ms/step - loss: 10491.9453 - val_loss: 10485.2070
Epoch 7/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 3ms/step - loss: 10465.7480 - val_loss: 10470.0898
Epoch 8/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

  super().__init__(**kwargs)


Epoch 1/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 8ms/step - loss: 19704.6484 - val_loss: 8517.9912
Epoch 2/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 8ms/step - loss: 8136.3984 - val_loss: 7286.7954
Epoch 3/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 8ms/step - loss: 7425.9297 - val_loss: 7895.3003
Epoch 4/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 8ms/step - loss: 7238.7827 - val_loss: 6824.1514
Epoch 5/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 8ms/step - loss: 6907.7339 - val_loss: 6682.5112
Epoch 6/50
[1m42029/42029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 8ms/step - loss: 6917.1748 - val_loss: 6669.1782
Epoch 7/50
[1m42023/42029[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - loss: 6801.5972