## Goal: Training of *Exploration* models

Trains 3 model types (QRF, LightGBM and EBM) x 4 forecast horizons as baseline models on the split dataset for detailed exploration.

# 1. Data Import and Setup

Imports necessary libraries, sets up environment paths.

In [None]:
# Standard library imports
import os
import sys

# Third-party imports
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from interpret.glassbox import ExplainableBoostingRegressor
from mapie.regression import MapieQuantileRegressor
from quantile_forest import RandomForestQuantileRegressor

# Append project root to sys.path for local imports
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', '..')))

# Local application imports
from src.utils.model import split_dataset

Defines constants:
* INPUT_DIR must be the same as the one defined in *00 Preprocessing/Feature Engineering*.
* MODEL_DIR is the directory where the exploration models will be saved.

In [None]:
# Relative locations of the input datasets and of the saved exploration models.
INPUT_DIR = "../../../../data/input/"
MODEL_DIR = "../../../../models/exploration/"

# Number of weekly horizons to predict (one model per horizon).
# Reduce it to explore only the first week(s).
NUMBER_OF_WEEK = 4
SEED = 42

# Columns dropped from the feature matrix: the target at every horizon,
# station_code, and features removed by Feature Selection (none listed here).
TO_DROP = [
    "water_flow_week1",
    "station_code",
    "water_flow_week2",
    "water_flow_week3",
    "water_flow_week4",
]

# 2. Data Loading
Loads the baseline datasets and creates the directory where the models are saved.

In [None]:
def _load_dataset(name):
    """Read ``{INPUT_DIR}{name}.csv`` and return it indexed by ``ObsDate``.

    The ``ObsDate`` column is parsed with ``pd.to_datetime`` before it
    becomes the index, so an unparsable date raises instead of being kept
    as a string.
    """
    frame = pd.read_csv(f"{INPUT_DIR}{name}.csv")
    frame["ObsDate"] = pd.to_datetime(frame["ObsDate"])
    return frame.set_index("ObsDate")


# Load the training set and the two test sets through the same helper so the
# three frames are guaranteed to be parsed and indexed identically.
ds_train = _load_dataset("ds_train")
ds_test_spatio_temporal = _load_dataset("ds_test_spatio_temporal")
ds_test_temporal = _load_dataset("ds_test_temporal")

# exist_ok=True replaces the check-then-create pattern, which could fail if
# the directory appeared between the existence check and makedirs.
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
# Feature matrices: every column of each dataset except those in TO_DROP.
X_train = ds_train.drop(columns=TO_DROP)
X_test_spatio_temporal = ds_test_spatio_temporal.drop(columns=TO_DROP)
X_test_temporal = ds_test_temporal.drop(columns=TO_DROP)

# Targets keyed by zero-based horizon index: key k holds water_flow_week{k+1}.
# The week-1 training target is always present, whatever NUMBER_OF_WEEK is.
y_train = {0: ds_train["water_flow_week1"]}
y_train.update(
    {week: ds_train[f"water_flow_week{week+1}"] for week in range(1, NUMBER_OF_WEEK)}
)

y_test_spatio_temporal = {
    week: ds_test_spatio_temporal[f"water_flow_week{week+1}"]
    for week in range(0, NUMBER_OF_WEEK)
}

y_test_temporal = {
    week: ds_test_temporal[f"water_flow_week{week+1}"]
    for week in range(0, NUMBER_OF_WEEK)
}

# Empty per-horizon containers; `mapie` and `qrf` are filled by the training
# cells, `mapie_enbpi` and `mapie_aci` are not populated in the visible cells.
mapie_enbpi = {}
mapie = {}
qrf = {}
mapie_aci = {}



In [None]:

# Define constants
ALPHA = 0.1  # shared by the LightGBM quantile objective and the MAPIE wrapper
TIME_VALIDATION = "1997-01-01"  # date handed to split_dataset
LGBM_PARAMS = {
    "max_depth": 15,
    "learning_rate": 0.01,
    "n_estimators": 500,
    "colsample_bytree": 0.7,
    "objective": "quantile",
    "alpha": ALPHA,
}

# split_dataset is a project helper; presumably 0.75 is the training fraction
# and TIME_VALIDATION the temporal cut-off -- confirm against src.utils.model.
# val_temporal is returned but not used in this cell.
train_mapie, val_spatio_temporal, val_temporal = split_dataset(ds_train, 0.75, TIME_VALIDATION)

# Reuse TO_DROP instead of repeating the column list, so the features used
# here always match X_train / X_test_* even if TO_DROP is extended.
X_train_mapie = train_mapie.drop(columns=TO_DROP)
y_train_mapie = {
    i: train_mapie[f"water_flow_week{i+1}"] for i in range(NUMBER_OF_WEEK)
}

# Calibration set: the spatio-temporal validation split.
X_val = val_spatio_temporal.drop(columns=TO_DROP)
y_val = {
    i: val_spatio_temporal[f"water_flow_week{i+1}"] for i in range(NUMBER_OF_WEEK)
}

for i in range(NUMBER_OF_WEEK):
    print(f"Training week {i}")
    # Initialize and train MapieQuantileRegressor; cv="split" makes MAPIE fit
    # on the training data and calibrate on the explicit X_calib / y_calib.
    regressor = lgb.LGBMRegressor(**LGBM_PARAMS)
    mapie[i] = MapieQuantileRegressor(estimator=regressor, method="quantile", cv="split", alpha=ALPHA)
    mapie[i].fit(X_train_mapie, y_train_mapie[i], X_calib=X_val, y_calib=y_val[i])

    # Save the model under a timestamped name so reruns never overwrite it.
    timestamp = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
    model_path = f"{MODEL_DIR}mapie_quantile_{timestamp}_week_{i}.pkl"
    joblib.dump(mapie[i], model_path)


In [None]:
for i in range(NUMBER_OF_WEEK):
    print(f"Training week {i}")
    # Train one RandomForestQuantileRegressor per horizon on the full
    # training set. random_state=SEED makes the forest reproducible; it was
    # previously unseeded even though SEED is defined for this notebook.
    qrf[i] = RandomForestQuantileRegressor(
        n_estimators=10,
        max_depth=10,
        min_samples_leaf=10,
        random_state=SEED,
    )
    qrf[i].fit(X_train, y_train[i])

    # Save the model under a timestamped name so reruns never overwrite it.
    timestamp = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
    model_path = f"{MODEL_DIR}qrf_quantile_{timestamp}_week_{i}.pkl"
    joblib.dump(qrf[i], model_path)

In [None]:
NUM_ENSEMBLES = 5

# A dictionary to hold the list of EBM models per week
ebm_ensembles = {}

# Seeded generator so the bootstrap resamples are reproducible across runs
# (the global np.random state used before was never seeded).
bootstrap_rng = np.random.default_rng(SEED)
n_rows = len(X_train)

for i in range(NUMBER_OF_WEEK):
    print(f"Training EBM ensemble for week {i}")

    # This will store all seed models for a single week
    models_i = []

    for seed in range(NUM_ENSEMBLES):
        print(f"Training EBM ensemble {seed} for week {i}")
        # 1. Draw a bootstrap sample (with replacement, same size as X_train).
        sample_indices = bootstrap_rng.choice(n_rows, size=n_rows, replace=True)
        X_sample = X_train.iloc[sample_indices]
        # The indices are row positions, so use .iloc: the target Series is
        # indexed by ObsDate, and plain [] with integers relies on pandas'
        # deprecated positional fallback.
        y_sample = y_train[i].iloc[sample_indices]

        # 2. Train an EBM with consistent binning parameters
        ebm_model = ExplainableBoostingRegressor(
            outer_bags=1,
            inner_bags=1,
            max_bins=128,
            learning_rate=0.05,
            interactions=3,
            early_stopping_rounds=100,
            random_state=SEED,  # ensures same binning
        )
        ebm_model.fit(X_sample, y_sample)

        models_i.append(ebm_model)

    # Store the list of models for week i *before* saving; previously the
    # dump ran first, so the saved file never contained the current week.
    ebm_ensembles[i] = models_i

    timestamp = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_path = f"{MODEL_DIR}ebm_ensemble_{timestamp}_week_{i}.pkl"

    # Each per-week file holds that week's list of fitted EBMs, mirroring the
    # one-artifact-per-week layout used for the other model types.
    joblib.dump(models_i, file_path)
    print(f"Saved EBM ensembles to {file_path}")