# Summerly Prediction

## Imports and Functions

In [1]:
import tensorflow as tf
import os
import numpy as np
import random
import keras
# Seed shared by every random number generator that is seeded in this cell.
SEED = 42

def set_seeds(seed=SEED):
    """Seed every random number generator this notebook touches.

    Args:
        seed: Value handed to Python's ``random``, TensorFlow, NumPy and Keras,
            and exported (as a string) through ``PYTHONHASHSEED``.
    """
    # Note: the running interpreter fixed its hash seed at start-up, so this
    # assignment only affects processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same order as before: Python, TensorFlow, NumPy, then Keras.
    seeders = (
        random.seed,
        tf.random.set_seed,
        np.random.seed,
        keras.utils.set_random_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def set_global_determinism(seed=SEED):
    """Seed all generators and restrict TensorFlow to single-threaded execution.

    Args:
        seed: Forwarded unchanged to ``set_seeds``.
    """
    set_seeds(seed=seed)

    # Flags meant to request deterministic TensorFlow / cuDNN kernels.
    # NOTE(review): whether they are honoured depends on the installed
    # TensorFlow version — confirm.
    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

    # One thread between ops and one thread inside each op.
    threading_config = tf.config.threading
    threading_config.set_inter_op_parallelism_threads(1)
    threading_config.set_intra_op_parallelism_threads(1)

# Apply the seeding and threading settings once, before any model is built below.
set_global_determinism(seed=SEED)

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import TimeSeriesSplit

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, GRU, Conv1D, Flatten, Reshape

from sklearn.svm import SVR
from xgboost import XGBRegressor
import sys
from catboost import CatBoostRegressor


In [3]:
# Change the working directory to the project's root path (the parent of the
# notebook's folder) so the relative "functions/" and "data/" paths used by
# later cells resolve.
#
# The root is resolved only the first time this cell runs in a kernel.
# Re-running the cell returns to that same directory instead of climbing one
# more level on every execution.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../")
os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Run-level settings. Within this notebook FREQUENCY is only referenced by the
# commented-out top_results calls.
FIRST_YEAR= 1972
FREQUENCY= "monthly"

c:\Users\marti\Desktop\data\hw_extra


In [4]:
# Make the project's "functions/" folder importable (it provides the
# Predictions module imported right below).
folder_path = os.path.abspath("functions/")
# Drop an entry left by an earlier run of this cell before re-inserting, so
# the folder stays first on the search path without piling up duplicates.
if folder_path in sys.path:
    sys.path.remove(folder_path)
sys.path.insert(0, folder_path)

from Predictions import (
    PredictionExperiment,
    PredictionModel
)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [5]:
# Target columns: the networks below emit one output per entry, and the
# remaining columns of each predictor table are counted as input features.
# NOTE(review): presumably heat-wave indices — confirm the meaning of each code.
indices_of_interest = ["HWN", "HWF", "HWD", "HWM", "HWA"]


## California

In [6]:
# Region whose experiment catalogue is loaded; later cells interpolate it into their paths.
region = "california"
metadata_path = f"data/summer_features/{region}/metadata.csv"
# Read the catalogue and give it a fresh 0..n-1 index in one chained expression.
metadata = pd.read_csv(metadata_path).reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices


In [7]:
# Ids that already have rows in the stored results file.
results = pd.read_csv(f"data/climate_results/{region}_results/results.csv")
ids_results = results["id_data"].unique()

# Ids listed in the metadata; those missing from the results still need a run.
id_experiments = metadata["id"].unique()
ids_to_execute = [
    experiment_id
    for experiment_id in id_experiments
    if experiment_id not in ids_results
]
print(len(ids_to_execute))

0


In [8]:
def build_monthly_nn_regressors(input_shape, n_targets):
    """Create the Keras candidate models and their labels for one experiment.

    The networks are built only when this function is called, so a run that
    fits the linear baseline alone does not construct six unused models on
    every iteration.

    Args:
        input_shape: Tuple passed as ``input_shape`` to the first layer of
            every model, e.g. ``(1, n_features)``.
        n_targets: Number of units in each model's final Dense layer.

    Returns:
        A ``(models, names)`` pair of equally long lists in matching order.
        The labels follow the model they sit next to: the stacked LSTM is
        "SLSTM16" and the Conv1D + SimpleRNN model is "CNNRNN16".
    """
    rnn_model = Sequential([
        SimpleRNN(8, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(4, activation="relu"),
        Dense(n_targets),
    ])
    lstm_model = Sequential([
        LSTM(8, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(4, activation="relu"),
        Dense(n_targets),
    ])
    # Stacked LSTM
    stacked_lstm = Sequential([
        LSTM(16, activation="tanh", return_sequences=True, input_shape=input_shape),
        LSTM(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(8, activation="relu"),
        Dense(n_targets),
    ])
    lp8_model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(8, activation="relu"),
        Dropout(0.1),
        Dense(4, activation="relu"),
        Dense(n_targets),
    ])
    # Keeping each label beside its model prevents the two lists from drifting apart.
    named_models = [
        ("RNN8", rnn_model),
        ("LSTM8", lstm_model),
        ("SLSTM16", stacked_lstm),
        ("CNNRNN16", cnn_rnn_model),
        ("MLP16", lp_model),
        ("MLP8", lp8_model),
    ]
    return [model for _, model in named_models], [name for name, _ in named_models]


# NOTE(review): this cell reads predictors from data/climate_features and
# writes to data/climate_results_NN, while the metadata comes from
# data/summer_features and the pending-id check reads data/climate_results.
# It also iterates over id_experiments, not ids_to_execute. Confirm that both
# are intended.
for k, experiment_id in enumerate(id_experiments, start=1):
    print("Executing",experiment_id, "iter", k)
    # One predictor table per calendar month (keys 1..12).
    data = {i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet") for i in range(1,13)}
    # Only the linear baseline is fitted. To fit the neural networks instead,
    # replace the two assignments below with:
    # regressors, name_regressors = build_monthly_nn_regressors(
    #     (1, len(data[1].columns) - len(indices_of_interest)), len(indices_of_interest))
    regressors = [LinearRegression()]
    name_regressors = ["Linear"]
    assert len(regressors) == len(name_regressors)
    # The literal 5 is passed positionally; presumably the number of CV folds — confirm in Predictions.
    experiment_1 = PredictionExperiment(data, indices_of_interest, regressors, name_regressors, 5, experiment_id)
    experiment_1.execute_experiment()
    # Compute every metric for every stage, in the same order as before.
    for stage in ("prediction", "training", "CV", "TSCV"):
        for metric in ("r2", "mape", "mae"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    #experiment_1.top_results("r2", 5, stage="prediction", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    #experiment_1.top_results("cv_r2", 5, stage="CV", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    experiment_1.save_results(f"data/climate_results_NN/{region}_results/results.csv")

## Chile

In [16]:
# Switch the region; the paths in the following cells are built from it.
region = "chile"
# Read the experiment catalogue and give it a fresh 0..n-1 index in one chained expression.
metadata = pd.read_csv(f"data/summer_features/{region}/metadata.csv").reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,6bfb94aa,predictor_6bfb94aa_1.parquet,1,54fdbdb3-2afba2c1-1d705895-6e909044-ef22ff7b-8...
1,d5ec141a,predictor_d5ec141a_1.parquet,1,PDO-ONI-SAM-DMI
2,3c59cc03,predictor_3c59cc03_1.parquet,1,c8260118-1e8ced6f-26976a3c-e0fba9b4-7e5d8e97-6...
3,cb34c388,predictor_cb34c388_1.parquet,1,c8260118-1e8ced6f-26976a3c-e0fba9b4-7e5d8e97-P...
4,4f14de2d,predictor_4f14de2d_1.parquet,1,c8260118-1e8ced6f-26976a3c-e0fba9b4-7e5d8e97-6...
5,891b838a,predictor_891b838a_1.parquet,1,c8260118-1e8ced6f-26976a3c-e0fba9b4-7e5d8e97-7...
6,5bcdae55,predictor_5bcdae55_1.parquet,1,c8260118-1e8ced6f-26976a3c-e0fba9b4-7e5d8e97-7...


In [19]:
# Ids that already have rows in the stored results file.
results = pd.read_csv(f"data/summer_results/{region}_results/results.csv")
ids_results = results["id_data"].unique()

# Ids listed in the metadata; those missing from the results still need a run.
id_experiments = metadata["id"].unique()
ids_to_execute = [
    experiment_id
    for experiment_id in id_experiments
    if experiment_id not in ids_results
]
print(len(ids_to_execute))


5


In [20]:
def _recurrent_regressor(cell, units, input_shape, n_targets):
    """Build: one recurrent layer -> dropout -> Dense(units // 2, relu) -> linear output."""
    return Sequential([
        cell(units, activation="tanh", input_shape=input_shape),
        Dropout(0.1),  # Regularization
        Dense(units // 2, activation="relu"),
        Dense(n_targets),
    ])


def _mlp_regressor(units, input_shape, n_targets):
    """Build: flatten -> Dense(units, relu) -> dropout -> Dense(units // 2, relu) -> linear output."""
    return Sequential([
        Flatten(input_shape=input_shape),
        Dense(units, activation="relu"),
        Dropout(0.1),
        Dense(units // 2, activation="relu"),
        Dense(n_targets),
    ])


# Run every experiment that has no stored results yet. Each iteration fits all
# candidate regressors, computes the metrics and writes them to the results file.
for k, experiment_id in enumerate(ids_to_execute, start=1):
    print("Executing",experiment_id, "iter", k)
    # Only one predictor table (key 1) is loaded per experiment here.
    data = {1: pd.read_parquet(f"data/summer_features/{region}/predictor_{experiment_id}_1.parquet")}

    # Every column that is not a target index counts as an input feature; the
    # Keras models receive the features as a single time step.
    n_targets = len(indices_of_interest)
    input_shape = (1, len(data[1].columns) - n_targets)

    # Neural networks, created in the same order as before.
    rnn_model = _recurrent_regressor(SimpleRNN, 8, input_shape, n_targets)
    lstm_model = _recurrent_regressor(LSTM, 8, input_shape, n_targets)
    rnn16_model = _recurrent_regressor(SimpleRNN, 16, input_shape, n_targets)
    lstm16_model = _recurrent_regressor(LSTM, 16, input_shape, n_targets)
    # Stacked LSTM
    stacked_lstm = Sequential([
        LSTM(16, activation="tanh", return_sequences=True, input_shape=input_shape),
        LSTM(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    cnn_rnn_model = Sequential([
        Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
        Reshape((1, 16)),  # Back to time dimension
        SimpleRNN(8, activation="tanh"),
        Dropout(0.1),
        Dense(n_targets),
    ])
    lp_model = _mlp_regressor(16, input_shape, n_targets)
    lp8_model = _mlp_regressor(8, input_shape, n_targets)

    kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1)

    # Each label sits next to its model, so the two lists cannot get out of step.
    # NOTE(review): the XGBL* and XGBT* entries used to be built with identical
    # arguments, so their results were duplicates under different names. The
    # booster is now explicit, reading "L" as linear and "T" as tree — confirm
    # this matches the intended meaning of the labels.
    named_regressors = [
        ("Linear", LinearRegression()),
        ("RF5", RandomForestRegressor(random_state=42, n_estimators=5)),
        ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
        ("SVR-rbf", SVR(kernel='rbf')),
        ("SVR-linear", SVR(kernel='linear')),
        ("SVR-cubic", SVR(kernel='poly', degree=3)),
        ("XGBL10", XGBRegressor(booster="gblinear", random_state=42, n_estimators=10, learning_rate=0.1)),
        ("XGBL15", XGBRegressor(booster="gblinear", random_state=42, n_estimators=15, learning_rate=0.1)),
        ("XGBT10", XGBRegressor(booster="gbtree", random_state=42, n_estimators=10, learning_rate=0.1)),
        ("XGBT15", XGBRegressor(booster="gbtree", random_state=42, n_estimators=15, learning_rate=0.1)),
        ("GPR", GaussianProcessRegressor(kernel=kernel, random_state=42, n_restarts_optimizer=10)),
        ("RNN8", rnn_model),
        ("LSTM8", lstm_model),
        ("RNN16", rnn16_model),
        ("LSTM16", lstm16_model),
        ("SLSTM16", stacked_lstm),
        ("CNNRNN16", cnn_rnn_model),
        ("MLP16", lp_model),
        ("MLP8", lp8_model),
    ]
    name_regressors = [name for name, _ in named_regressors]
    regressors = [model for _, model in named_regressors]
    assert len(regressors) == len(name_regressors)

    # The literal 5 is passed positionally; presumably the number of CV folds — confirm in Predictions.
    experiment_1 = PredictionExperiment(data, indices_of_interest, regressors, name_regressors, 5, experiment_id)
    experiment_1.execute_experiment()
    # Compute every metric for every stage, in the same order as before.
    for stage in ("prediction", "training", "CV", "TSCV"):
        for metric in ("r2", "mape", "mae"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    #experiment_1.top_results("r2", 5, stage="prediction", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    #experiment_1.top_results("cv_r2", 5, stage="CV", top_data_path=f"data/results/{FREQUENCY}/top_results.csv")
    experiment_1.save_results(f"data/summer_results/{region}_results/results.csv")

Executing 3c59cc03 iter 1
Executing cb34c388 iter 2
Executing 4f14de2d iter 3
Executing 891b838a iter 4
Executing 5bcdae55 iter 5
