# Monthly Prediction

## Imports and Functions

In [1]:
import tensorflow as tf
import os
import numpy as np
import random
import keras
# Single seed shared by every random number generator used in this notebook.
SEED = 42


def set_seeds(seed=SEED):
    """Seed the Python, NumPy, TensorFlow and Keras random number generators.

    Args:
        seed: Integer seed applied to every generator; defaults to ``SEED``.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # assigning it here is only inherited by child processes; it does not
    # change string hashing in the kernel that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The three generators are independent of each other, so the order in which
    # they are seeded does not matter.
    for seed_generator in (random.seed, np.random.seed, tf.random.set_seed):
        seed_generator(seed)

    # Kept as the final call, as in the original set-up.
    keras.utils.set_random_seed(seed)

def set_global_determinism(seed=SEED):
    """Seed all generators and put TensorFlow in a deterministic configuration.

    Besides calling ``set_seeds``, this sets the ``TF_DETERMINISTIC_OPS`` and
    ``TF_CUDNN_DETERMINISTIC`` environment flags to ``'1'`` and limits
    TensorFlow's inter-op and intra-op thread pools to one thread each.

    Args:
        seed: Seed forwarded to ``set_seeds``; defaults to ``SEED``.
    """
    set_seeds(seed=seed)

    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

    # One thread per pool -- presumably to rule out run-to-run differences
    # caused by parallel op scheduling, at the cost of speed.
    threading_config = tf.config.threading
    threading_config.set_inter_op_parallelism_threads(1)
    threading_config.set_intra_op_parallelism_threads(1)


# Apply the deterministic set-up once, before any model is created.
set_global_determinism(SEED)

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import TimeSeriesSplit

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, GRU, Conv1D, Flatten, Reshape

from sklearn.svm import SVR
from xgboost import XGBRegressor
import sys
from catboost import CatBoostRegressor


In [3]:
# Switch the working directory to the project's root (the parent of the folder
# the notebook starts in) so that the relative "data/..." and "functions/"
# paths used further down resolve.
#
# The start-up directory is remembered the first time this cell runs. That
# makes the cell idempotent: re-running it lands in the same directory instead
# of climbing one level higher on every execution, as a bare os.chdir("../")
# would.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir))
os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Notebook-level settings. NOTE(review): in the visible cells FIRST_YEAR is not
# referenced and FREQUENCY only appears in commented-out calls -- confirm they
# are still needed.
FIRST_YEAR = 1972
FREQUENCY = "monthly"

c:\Users\marti\Desktop\data\hw_extra


In [4]:
# Make the project's helper modules under functions/ importable. The path is
# resolved against the current working directory, so this cell has to run after
# the working directory has been moved to the project root.
folder_path = os.path.abspath("functions/")
sys.path.insert(0, folder_path)

from Predictions import PredictionExperiment, PredictionModel


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [5]:
# Names of the indices the models are trained to predict. The experiment cells
# size each network's output by the length of this list and take the number of
# input features as a predictor table's column count minus this length.
indices_of_interest = [
    "HWN",
    "HWF",
    "HWD",
    "HWM",
    "HWA",
]


## California

In [6]:
# Load the catalogue of predictor tables available for this region (one row per
# predictor file) and show it.
region = "california"
metadata_path = f"data/locally_features/{region}/metadata.csv"
metadata = pd.read_csv(metadata_path).reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices,stations
0,355d8add,predictor_355d8add_1.parquet,1,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,USC00040983
1,355d8add,predictor_355d8add_2.parquet,2,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,USC00040983
2,355d8add,predictor_355d8add_3.parquet,3,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,USC00040983
3,355d8add,predictor_355d8add_4.parquet,4,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,USC00040983
4,355d8add,predictor_355d8add_5.parquet,5,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,USC00040983
...,...,...,...,...,...
139,1d1adaa3,predictor_1d1adaa3_8.parquet,8,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,california_NOAA_sur
140,1d1adaa3,predictor_1d1adaa3_9.parquet,9,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,california_NOAA_sur
141,1d1adaa3,predictor_1d1adaa3_10.parquet,10,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,california_NOAA_sur
142,1d1adaa3,predictor_1d1adaa3_11.parquet,11,775d0d46-00376815-a6cfee77-e4199a9f-e15d8862-8...,california_NOAA_sur


In [7]:
# Work out which experiment ids from the metadata have no rows in this region's
# results file yet, and report how many are pending.
results = pd.read_csv(f"data/locally_results/{region}_results/results.csv")
ids_results = results["id_data"].unique()
id_experiments = metadata["id"].unique()
ids_to_execute = [
    experiment_id
    for experiment_id in id_experiments
    if experiment_id not in ids_results
]
print(len(ids_to_execute))

12


In [8]:
# Run the model comparison for every experiment id that has no saved results
# yet. The loop walks `ids_to_execute` (built in the previous cell) instead of
# every id in the metadata, so re-running this cell does not repeat experiments
# that are already present in results.csv.
for k, experiment_id in enumerate(ids_to_execute, start=1):
    print("Executing", experiment_id, "iter", k)

    # One predictor table per season file (file suffixes 1..12).
    data = {
        season: pd.read_parquet(
            f"data/locally_features/{region}/predictor_{experiment_id}_{season}.parquet"
        )
        for season in range(1, 13)
    }

    # Every network takes a single time step whose width is the column count
    # minus the target columns. The first two networks are sized from table 1
    # and the remaining three from table 12, exactly as before -- presumably
    # all twelve tables share the same columns (TODO confirm).
    n_targets = len(indices_of_interest)
    shape_from_first = (1, len(data[1].columns) - n_targets)
    shape_from_last = (1, len(data[12].columns) - n_targets)

    # Keras models are created in a fixed order so that seeded weight
    # initialisation stays reproducible from run to run.
    rnn_model = Sequential([
        SimpleRNN(16, activation="tanh", input_shape=shape_from_first),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per target index
    ])
    lstm_model = Sequential([
        LSTM(16, activation="tanh", input_shape=shape_from_first),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per target index
    ])
    # Stacked LSTM
    stacked_lstm = Sequential([
        LSTM(16, activation="tanh", return_sequences=True, input_shape=shape_from_last),
        LSTM(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=shape_from_last),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = Sequential([
        Flatten(input_shape=shape_from_last),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(8, activation="relu"),
        Dense(n_targets),
    ])

    # Label/estimator pairs, classical models first and networks after.
    # NOTE(review): the stacked LSTM is reported as "CNNRNN16" and the Conv1D +
    # SimpleRNN model as "CNNLSTM16". The labels are left untouched because
    # rows already saved in results.csv use them -- confirm before renaming.
    named_regressors = [
        ("Linear", LinearRegression()),
        ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
        ("SVR-rbf", SVR(kernel='rbf')),
        ("SVR-linear", SVR(kernel='linear')),
        ("XGB10", XGBRegressor(random_state=42, n_estimators=10, learning_rate=0.1)),
        ("RNN16", rnn_model),
        ("LSTM16", lstm_model),
        ("CNNRNN16", stacked_lstm),
        ("CNNLSTM16", cnn_rnn_model),
        ("MLP16", lp_model),
    ]
    name_regressors = [label for label, _ in named_regressors]
    regressors = [estimator for _, estimator in named_regressors]
    assert len(regressors) == len(name_regressors)

    # NOTE(review): the positional 5 is presumably a fold count -- confirm
    # against PredictionExperiment's signature.
    experiment = PredictionExperiment(data, indices_of_interest, regressors, name_regressors, 5, experiment_id)
    experiment.execute_experiment()

    # Compute every metric for every stage, in the same order as before
    # (r2, mape, mae within prediction, training, CV, TSCV).
    for stage in ("prediction", "training", "CV", "TSCV"):
        for metric in ("r2", "mape", "mae"):
            experiment.get_metrics(metric, stage=stage, show=False)

    #experiment.top_results("r2", 5, stage="prediction", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    #experiment.top_results("cv_r2", 5, stage="CV", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    experiment.save_results(f"data/locally_results/{region}_results/results.csv")

Executing 355d8add iter 1
Executing e8b133a1 iter 2
Executing 706912a1 iter 3
Executing 7828ad33 iter 4
Executing 05727793 iter 5
Executing 37aa5f1e iter 6
Executing 072d4a2e iter 7
Executing aa74505a iter 8
Executing 8066d7ca iter 9
Executing c6b5cfea iter 10
Executing 34185832 iter 11
Executing 1d1adaa3 iter 12


## Chile

In [6]:
# Load the catalogue of predictor tables available for this region (one row per
# predictor file) and show it.
region = "chile"
metadata = (
    pd.read_csv(f"data/locally_features/{region}/metadata.csv")
    .reset_index(drop=True)
)
display(metadata)

Unnamed: 0,id,filename,season,indices,stations
0,41e7367e,predictor_41e7367e_1.parquet,1,4914419e-626d65d6-7fb9d7a1-68aad4b4-9a40f504-b...,330007
1,41e7367e,predictor_41e7367e_2.parquet,2,4914419e-626d65d6-7fb9d7a1-68aad4b4-9a40f504-b...,330007
2,41e7367e,predictor_41e7367e_3.parquet,3,4914419e-626d65d6-7fb9d7a1-68aad4b4-9a40f504-b...,330007
3,41e7367e,predictor_41e7367e_4.parquet,4,4914419e-626d65d6-7fb9d7a1-68aad4b4-9a40f504-b...,330007
4,41e7367e,predictor_41e7367e_5.parquet,5,4914419e-626d65d6-7fb9d7a1-68aad4b4-9a40f504-b...,330007
...,...,...,...,...,...
139,a4d41909,predictor_a4d41909_8.parquet,8,e01fec28-2812e0a6-208ab882-266e3e9c-2dbfd54e-a...,chile_central_sur
140,a4d41909,predictor_a4d41909_9.parquet,9,e01fec28-2812e0a6-208ab882-266e3e9c-2dbfd54e-a...,chile_central_sur
141,a4d41909,predictor_a4d41909_10.parquet,10,e01fec28-2812e0a6-208ab882-266e3e9c-2dbfd54e-a...,chile_central_sur
142,a4d41909,predictor_a4d41909_11.parquet,11,e01fec28-2812e0a6-208ab882-266e3e9c-2dbfd54e-a...,chile_central_sur


In [7]:
# Work out which experiment ids from the metadata have no rows in this region's
# results file yet, and report how many are pending.
results = pd.read_csv(f"data/locally_results/{region}_results/results.csv")
ids_results = results["id_data"].unique()
id_experiments = metadata["id"].unique()
ids_to_execute = [
    experiment_id
    for experiment_id in id_experiments
    if experiment_id not in ids_results
]
print(len(ids_to_execute))


5


In [8]:
# Run the model comparison for every experiment id that is still pending
# (`ids_to_execute`) and append its metrics to this region's results file.
for k, experiment_id in enumerate(ids_to_execute, start=1):
    print("Executing", experiment_id, "iter", k)

    # One predictor table per season file (file suffixes 1..12).
    data = {
        season: pd.read_parquet(
            f"data/locally_features/{region}/predictor_{experiment_id}_{season}.parquet"
        )
        for season in range(1, 13)
    }

    # Every network takes a single time step whose width is the column count
    # minus the target columns. The first two networks are sized from table 1
    # and the remaining three from table 12 -- presumably all twelve tables
    # share the same columns (TODO confirm).
    n_targets = len(indices_of_interest)
    shape_from_first = (1, len(data[1].columns) - n_targets)
    shape_from_last = (1, len(data[12].columns) - n_targets)

    # Keras models are created in a fixed order so that seeded weight
    # initialisation stays reproducible from run to run.
    rnn_model = Sequential([
        SimpleRNN(16, activation="tanh", input_shape=shape_from_first),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per target index
    ])
    lstm_model = Sequential([
        LSTM(16, activation="tanh", input_shape=shape_from_first),
        Dropout(0.1),  # Regularization
        Dense(8, activation="relu"),
        Dense(n_targets),  # One output per target index
    ])
    # Stacked LSTM
    stacked_lstm = Sequential([
        LSTM(16, activation="tanh", return_sequences=True, input_shape=shape_from_last),
        LSTM(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=shape_from_last),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = Sequential([
        Flatten(input_shape=shape_from_last),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(8, activation="relu"),
        Dense(n_targets),
    ])

    # Label/estimator pairs, classical models first and networks after.
    # NOTE(review): the stacked LSTM is reported as "CNNRNN16" and the Conv1D +
    # SimpleRNN model as "CNNLSTM16" -- the labels look shifted; confirm before
    # relying on them when reading results.csv.
    named_regressors = [
        ("Linear", LinearRegression()),
        ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
        ("SVR-rbf", SVR(kernel='rbf')),
        ("SVR-linear", SVR(kernel='linear')),
        ("XGB10", XGBRegressor(random_state=42, n_estimators=10, learning_rate=0.1)),
        ("RNN16", rnn_model),
        ("LSTM16", lstm_model),
        ("CNNRNN16", stacked_lstm),
        ("CNNLSTM16", cnn_rnn_model),
        ("MLP16", lp_model),
    ]
    name_regressors = [label for label, _ in named_regressors]
    regressors = [estimator for _, estimator in named_regressors]
    assert len(regressors) == len(name_regressors)

    # NOTE(review): the positional 5 is presumably a fold count -- confirm
    # against PredictionExperiment's signature.
    experiment = PredictionExperiment(data, indices_of_interest, regressors, name_regressors, 5, experiment_id)
    experiment.execute_experiment()

    # Compute every metric for every stage, in the same order as before
    # (r2, mape, mae within prediction, training, CV, TSCV).
    for stage in ("prediction", "training", "CV", "TSCV"):
        for metric in ("r2", "mape", "mae"):
            experiment.get_metrics(metric, stage=stage, show=False)

    #experiment.top_results("r2", 5, stage="prediction", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    #experiment.top_results("cv_r2", 5, stage="CV", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    experiment.save_results(f"data/locally_results/{region}_results/results.csv")

Executing 231a6330 iter 1
Executing 43ded9cf iter 2
Executing e7fa2923 iter 3
Executing b469e259 iter 4
Executing a4d41909 iter 5
