# SERA Monthly Prediction

## Imports and Functions

In [1]:
import tensorflow as tf
import os
import numpy as np
import random
import keras
SEED = 42


def set_seeds(seed=SEED):
    """Apply one seed to every random number source used in this notebook.

    Sets the PYTHONHASHSEED environment variable, then seeds the ``random``
    module, TensorFlow, NumPy and Keras, in that order.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    seeders = (
        random.seed,
        tf.random.set_seed,
        np.random.seed,
        keras.utils.set_random_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def set_global_determinism(seed=SEED):
    """Seed everything through set_seeds and request deterministic TensorFlow runs.

    Besides seeding, this sets the TF_DETERMINISTIC_OPS and
    TF_CUDNN_DETERMINISTIC environment variables to '1' and limits
    TensorFlow's inter-op and intra-op thread pools to one thread each.
    """
    set_seeds(seed=seed)

    for flag in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
        os.environ[flag] = '1'

    threading_cfg = tf.config.threading
    threading_cfg.set_inter_op_parallelism_threads(1)
    threading_cfg.set_intra_op_parallelism_threads(1)


# Apply the deterministic setup with the notebook-wide seed.
set_global_determinism(SEED)

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import TimeSeriesSplit

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, GRU, Conv1D, Flatten, Reshape

from sklearn.svm import SVR
from xgboost import XGBRegressor
import sys
from catboost import CatBoostRegressor


In [3]:
# Change the working directory to the project's root path (the parent of the
# directory the kernel starts in) so the relative "data/..." and "functions/"
# paths used by the following cells resolve.
# The root is resolved only once and remembered: re-running this cell returns
# to the same directory instead of climbing one level higher on every run, as a
# bare os.chdir("../") would.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(PROJECT_ROOT)
print(os.getcwd())

FIRST_YEAR = 1972
FREQUENCY = "monthly"

c:\Users\marti\Desktop\data\hw_extra


In [4]:
# Put the project's "functions/" folder first on the import path so the local
# Predictions module can be imported. Any copy of the entry that is already
# present is dropped first, so re-running the cell does not pile up duplicates.
folder_path = os.path.abspath("functions/")
sys.path[:] = [folder_path] + [entry for entry in sys.path if entry != folder_path]

from Predictions import (
    PredictionExperiment,
    PredictionModel,
    SERA,
    sera_objective,
    piecewise_linear_phi_np
)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [5]:
# Names of the five indices handed to PredictionExperiment. The experiment
# cells also subtract their count from the column count of the predictor
# tables to size the model inputs, and use it as the number of model outputs.
indices_of_interest = [
    "HWN",
    "HWF",
    "HWD",
    "HWM",
    "HWA",
]

# Earlier bounds tuple; none of the visible cells reads it.
bounds_v1 = (
    -1.1692892810242344,
    -0.30647585455315646,
    4.561547586528888,
    6.499969486244418,
)

# Bounds passed to SERA(...) and piecewise_linear_phi_np(...) by the experiment
# cells: identical to bounds_v1 except that the third entry is 3.0.
bounds = bounds_v1[:2] + (3.0,) + bounds_v1[3:]


## California

In [6]:
# Select the region and show the metadata table listing its predictor files.
region = "california"
metadata_path = f"data/climate_features/{region}/metadata.csv"
metadata = pd.read_csv(metadata_path).reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,6e47cb06,predictor_6e47cb06_1.parquet,1,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
1,6e47cb06,predictor_6e47cb06_2.parquet,2,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
2,6e47cb06,predictor_6e47cb06_3.parquet,3,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
3,6e47cb06,predictor_6e47cb06_4.parquet,4,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
4,6e47cb06,predictor_6e47cb06_5.parquet,5,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
5,6e47cb06,predictor_6e47cb06_6.parquet,6,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
6,6e47cb06,predictor_6e47cb06_7.parquet,7,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
7,6e47cb06,predictor_6e47cb06_8.parquet,8,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
8,6e47cb06,predictor_6e47cb06_9.parquet,9,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
9,6e47cb06,predictor_6e47cb06_10.parquet,10,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...


In [7]:
# Work out which experiment ids from the metadata have no rows yet in the
# stored results, so only those are executed.
# NOTE(review): this reads data/sera_results/ while the experiment cells in this
# notebook save to data/sera_results_v2/ - confirm which results file the
# resume check is meant to use.
results_path = f"data/sera_results/{region}_results/results.csv"
if os.path.exists(results_path):
    results = pd.read_csv(results_path)
else:
    # No results stored yet for this region: treat every id as pending
    # instead of stopping the cell with FileNotFoundError.
    results = pd.DataFrame(columns=["id_data"])
ids_results = results["id_data"].unique()
id_experiments = metadata["id"].unique()

# Set lookup avoids scanning the whole array of finished ids for each candidate.
finished_ids = set(ids_results)
ids_to_execute = [exp_id for exp_id in id_experiments if exp_id not in finished_ids]
print(len(ids_to_execute))

0


In [8]:
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb
class XGBCustomObjective(BaseEstimator, RegressorMixin):
    """
    Wrapper class to make XGBoost with custom objectives compatible with sklearn.
    This is needed for MultiOutputRegressor.

    Parameters
    ----------
    objective_func : callable or None
        Handed to ``xgb.train`` as ``obj``. When it is falsy, the booster is
        trained with the built-in ``reg:squarederror`` objective instead.
    random_state : int
        Forwarded to XGBoost as ``seed``.
    n_estimators : int
        Forwarded to ``xgb.train`` as ``num_boost_round``.
    learning_rate : float
        Forwarded to XGBoost as ``eta``.
    max_depth, subsample, colsample_bytree
        Forwarded to XGBoost under the same names.
    **kwargs
        Extra entries merged last into the XGBoost parameter dict, so they
        take precedence over the values derived from the arguments above.

    Attributes
    ----------
    model
        The trained booster returned by ``xgb.train``; ``None`` until ``fit``
        has been called.
    """

    # The explicit constructor arguments. Every other key given to set_params
    # is treated as an extra XGBoost parameter and stored in ``self.kwargs``.
    _INIT_PARAM_NAMES = (
        'objective_func',
        'random_state',
        'n_estimators',
        'learning_rate',
        'max_depth',
        'subsample',
        'colsample_bytree',
    )

    def __init__(self, objective_func=None, random_state=42, n_estimators=15, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, **kwargs):
        self.objective_func = objective_func
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.kwargs = kwargs
        self.model = None

    def fit(self, X, y):
        """Train a booster on ``X`` / ``y``, store it in ``self.model`` and return ``self``."""
        dtrain = xgb.DMatrix(X, label=y)

        params = {
            'max_depth': self.max_depth,
            'eta': self.learning_rate,  # learning_rate
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'seed': self.random_state,  # random_state
        }
        if self.objective_func:
            # Custom objective: switch off the default evaluation metric.
            params['disable_default_eval_metric'] = 1
        else:
            # No custom objective: fall back to the standard squared error.
            params['objective'] = 'reg:squarederror'
        # Merge the extras last so that explicitly supplied settings (including
        # an 'objective' for the built-in path) are honoured, not overwritten.
        params.update(self.kwargs)

        self.model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=self.n_estimators,
            obj=self.objective_func or None,  # None selects the objective from params
            verbose_eval=False,
        )
        return self

    def predict(self, X):
        """Return the booster's predictions for ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. The exception derives from both
            ValueError and AttributeError, so code that caught the previous
            AttributeError keeps working.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This XGBCustomObjective instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        return self.model.predict(xgb.DMatrix(X))

    def get_params(self, deep=True):
        """Get parameters for this estimator (required for sklearn compatibility).

        Returns the explicit constructor arguments followed by the extra
        XGBoost parameters held in ``self.kwargs``. ``deep`` is accepted for
        API compatibility and is not used.
        """
        params = {name: getattr(self, name) for name in self._INIT_PARAM_NAMES}
        params.update(self.kwargs)
        return params

    def set_params(self, **params):
        """Set parameters for this estimator (required for sklearn compatibility).

        Keys naming a constructor argument update the matching attribute; any
        other key is stored in ``self.kwargs`` as an extra XGBoost parameter.
        Only the known constructor names are matched, so a key such as
        'model', 'kwargs' or 'fit' can no longer overwrite internal state or
        methods of the estimator.
        """
        for key, value in params.items():
            if key in self._INIT_PARAM_NAMES:
                setattr(self, key, value)
            else:
                self.kwargs[key] = value
        return self

In [9]:
# Run every regressor on each id still pending in ids_to_execute, with the
# SERA loss built with initial_weight=0.3, and append the metrics to the
# region's results file.
stages = ("prediction", "training", "CV", "TSCV")
for k, experiment_id in enumerate(ids_to_execute, start=1):
    print("Executing", experiment_id, "iter", k)
    # One predictor table per file suffix 1..12.
    data = {
        i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet")
        for i in range(1, 13)
    }
    n_targets = len(indices_of_interest)
    # Keras shape (1, n_features): every column except the target indices.
    input_shape = (1, len(data[1].columns) - n_targets)

    rnn16_model = Sequential([
        SimpleRNN(16, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per index
    ])
    lstm16_model = Sequential([
        LSTM(16, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per index
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(8, activation="relu"),
        Dense(n_targets),
    ])
    xgb_model = XGBCustomObjective(
        objective_func=sera_objective(piecewise_linear_phi_np(bounds, initial_weight=0.3)),
        n_estimators=15,
        learning_rate=0.1,
    )

    regressors = [xgb_model, rnn16_model, lstm16_model, cnn_rnn_model, lp_model]
    name_regressors = ["CXGB15", "RNN16", "LSTM16", "CNNRNN16", "MLP16"]
    assert len(regressors) == len(name_regressors)

    experiment_1 = PredictionExperiment(
        data,
        indices_of_interest,
        regressors,
        name_regressors,
        5,
        experiment_id,
        loss_fn=SERA(bounds=bounds, T=100, initial_weight=0.3, fn="piecewise2"),
    )
    experiment_1.execute_experiment()

    # r2, mape and mae for each stage first, then sera for each stage.
    for stage in stages:
        for metric in ("r2", "mape", "mae"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    for stage in stages:
        experiment_1.get_metrics("sera", stage=stage, show=False)

    experiment_1.save_results(f"data/sera_results_v2/{region}_results/results.csv")

In [10]:
# Run every regressor on each id still pending in ids_to_execute, this time
# with the SERA loss built with initial_weight=0.1, and append the metrics to
# the region's results file.
# NOTE(review): the XGBoost objective below is still built with
# initial_weight=0.3 while the SERA loss uses 0.1 - confirm the mismatch is
# intended.
stages = ("prediction", "training", "CV", "TSCV")
for k, experiment_id in enumerate(ids_to_execute, start=1):
    print("Executing", experiment_id, "iter", k)
    # One predictor table per file suffix 1..12.
    data = {
        i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet")
        for i in range(1, 13)
    }
    n_targets = len(indices_of_interest)
    # Keras shape (1, n_features): every column except the target indices.
    input_shape = (1, len(data[1].columns) - n_targets)

    rnn16_model = Sequential([
        SimpleRNN(16, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per index
    ])
    lstm16_model = Sequential([
        LSTM(16, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per index
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(8, activation="relu"),
        Dense(n_targets),
    ])
    xgb_model = XGBCustomObjective(
        objective_func=sera_objective(piecewise_linear_phi_np(bounds, initial_weight=0.3)),
        n_estimators=15,
        learning_rate=0.1,
    )

    regressors = [xgb_model, rnn16_model, lstm16_model, cnn_rnn_model, lp_model]
    name_regressors = ["CXGB15", "RNN16", "LSTM16", "CNNRNN16", "MLP16"]
    assert len(regressors) == len(name_regressors)

    experiment_1 = PredictionExperiment(
        data,
        indices_of_interest,
        regressors,
        name_regressors,
        5,
        experiment_id,
        loss_fn=SERA(bounds=bounds, T=100, initial_weight=0.1, fn="piecewise2"),
    )
    experiment_1.execute_experiment()

    # r2, mape and mae for each stage first, then sera for each stage.
    for stage in stages:
        for metric in ("r2", "mape", "mae"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    for stage in stages:
        experiment_1.get_metrics("sera", stage=stage, show=False)

    experiment_1.save_results(f"data/sera_results_v2/{region}_results/results.csv")

## Chile

In [11]:
# Switch to the Chile region and show the metadata table listing its predictor files.
region = "chile"
metadata = pd.read_csv(f"data/climate_features/{region}/metadata.csv").reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,978f49d7,predictor_978f49d7_1.parquet,1,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
1,978f49d7,predictor_978f49d7_2.parquet,2,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
2,978f49d7,predictor_978f49d7_3.parquet,3,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
3,978f49d7,predictor_978f49d7_4.parquet,4,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
4,978f49d7,predictor_978f49d7_5.parquet,5,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
...,...,...,...,...
247,458d357c,predictor_458d357c_8.parquet,8,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...
248,458d357c,predictor_458d357c_9.parquet,9,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...
249,458d357c,predictor_458d357c_10.parquet,10,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...
250,458d357c,predictor_458d357c_11.parquet,11,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...


In [14]:
# Work out which experiment ids from the metadata have no rows yet in the
# stored results, so only those are executed.
results_path = f"data/sera_results_v2/{region}_results/results.csv"
if os.path.exists(results_path):
    results = pd.read_csv(results_path)
else:
    # No results stored yet for this region: treat every id as pending
    # instead of stopping the cell with FileNotFoundError.
    results = pd.DataFrame(columns=["id_data"])
ids_results = results["id_data"].unique()
id_experiments = metadata["id"].unique()

# Set lookup avoids scanning the whole array of finished ids for each candidate.
finished_ids = set(ids_results)
ids_to_execute = [exp_id for exp_id in id_experiments if exp_id not in finished_ids]
print(len(ids_to_execute))


17


In [None]:
# XGBoost-only run over the ids still pending in ids_to_execute, with the SERA
# loss built with initial_weight=0.3. Only the custom-objective XGBoost model
# is evaluated in this cell, so no Keras models are built.
stages = ("prediction", "training", "CV", "TSCV")
for k, experiment_id in enumerate(ids_to_execute, start=1):
    print("Executing", experiment_id, "iter", k)
    # One predictor table per file suffix 1..12.
    data = {
        i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet")
        for i in range(1, 13)
    }

    xgb_model = XGBCustomObjective(
        objective_func=sera_objective(piecewise_linear_phi_np(bounds, initial_weight=0.3)),
        n_estimators=15,
        learning_rate=0.1,
    )

    regressors = [xgb_model]
    name_regressors = ["CXGB15"]
    assert len(regressors) == len(name_regressors)

    experiment_1 = PredictionExperiment(
        data,
        indices_of_interest,
        regressors,
        name_regressors,
        5,
        experiment_id,
        loss_fn=SERA(bounds=bounds, T=100, initial_weight=0.3, fn="piecewise2"),
    )
    experiment_1.execute_experiment()

    # r2, mape and mae for each stage first, then sera for each stage.
    for stage in stages:
        for metric in ("r2", "mape", "mae"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    for stage in stages:
        experiment_1.get_metrics("sera", stage=stage, show=False)

    experiment_1.save_results(f"data/sera_results_v2/{region}_results/results.csv")

Executing 4d17ba1a iter 1
Train predicting  1 CXGB15
Train predicting  2 CXGB15
Train predicting  3 CXGB15
Train predicting  4 CXGB15
Train predicting  5 CXGB15
Train predicting  6 CXGB15
Train predicting  7 CXGB15
Train predicting  8 CXGB15
Train predicting  9 CXGB15
Train predicting  10 CXGB15
Train predicting  11 CXGB15
Train predicting  12 CXGB15
Executing 3adff093 iter 2
Train predicting  1 CXGB15
Train predicting  2 CXGB15
Train predicting  3 CXGB15
Train predicting  4 CXGB15
Train predicting  5 CXGB15
Train predicting  6 CXGB15
Train predicting  7 CXGB15
Train predicting  8 CXGB15
Train predicting  9 CXGB15
Train predicting  10 CXGB15
Train predicting  11 CXGB15


In [None]:
# Run every regressor with the SERA loss built with initial_weight=0.1 and
# append the metrics to the region's results file.
# NOTE(review): this loop walks id_experiments (every id in the metadata), not
# ids_to_execute, so ids that already have stored results are run again -
# confirm that is intended.
# NOTE(review): the XGBoost objective below is still built with
# initial_weight=0.3 while the SERA loss uses 0.1 - confirm the mismatch is
# intended.
stages = ("prediction", "training", "CV", "TSCV")
for k, experiment_id in enumerate(id_experiments, start=1):
    print("Executing", experiment_id, "iter", k)
    # One predictor table per file suffix 1..12.
    data = {
        i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet")
        for i in range(1, 13)
    }
    n_targets = len(indices_of_interest)
    # Keras shape (1, n_features): every column except the target indices.
    input_shape = (1, len(data[1].columns) - n_targets)

    rnn16_model = Sequential([
        SimpleRNN(16, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per index
    ])
    lstm16_model = Sequential([
        LSTM(16, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per index
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(8, activation="relu"),
        Dense(n_targets),
    ])
    xgb_model = XGBCustomObjective(
        objective_func=sera_objective(piecewise_linear_phi_np(bounds, initial_weight=0.3)),
        n_estimators=15,
        learning_rate=0.1,
    )

    regressors = [xgb_model, rnn16_model, lstm16_model, cnn_rnn_model, lp_model]
    name_regressors = ["CXGB15", "RNN16", "LSTM16", "CNNRNN16", "MLP16"]
    assert len(regressors) == len(name_regressors)

    experiment_1 = PredictionExperiment(
        data,
        indices_of_interest,
        regressors,
        name_regressors,
        5,
        experiment_id,
        loss_fn=SERA(bounds=bounds, T=100, initial_weight=0.1, fn="piecewise2"),
    )
    experiment_1.execute_experiment()

    # r2, mape and mae for each stage first, then sera for each stage.
    for stage in stages:
        for metric in ("r2", "mape", "mae"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    for stage in stages:
        experiment_1.get_metrics("sera", stage=stage, show=False)

    experiment_1.save_results(f"data/sera_results_v2/{region}_results/results.csv")