# Monthly Prediction by Label

## Imports and Functions

In [1]:
import tensorflow as tf
import os
import numpy as np
import random
import keras
SEED = 42

def set_seeds(seed=SEED):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed handed to Python's ``random``, TensorFlow, NumPy and
            Keras. Its string form is also exported as ``PYTHONHASHSEED``.
    """
    # NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so
    # setting it here presumably only affects child processes -- confirm intent.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same call order as before: random -> TensorFlow -> NumPy -> Keras.
    seeders = (
        random.seed,
        tf.random.set_seed,
        np.random.seed,
        keras.utils.set_random_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def set_global_determinism(seed=SEED):
    """Seed all RNGs and restrict TensorFlow to one thread per op pool.

    Args:
        seed: Seed forwarded unchanged to ``set_seeds``.
    """
    set_seeds(seed=seed)

    # Environment flags intended to request deterministic TF / cuDNN kernels.
    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

    # Limit both the inter-op and the intra-op thread pools to a single thread.
    threading_config = tf.config.threading
    threading_config.set_inter_op_parallelism_threads(1)
    threading_config.set_intra_op_parallelism_threads(1)

# Apply the seeding and determinism settings once, at notebook start, using the
# module-level SEED so that every later cell runs against the same RNG state.
set_global_determinism(seed=SEED)

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import TimeSeriesSplit

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, GRU, Conv1D, Flatten, Reshape

from sklearn.svm import SVR
from xgboost import XGBRegressor
import sys
from catboost import CatBoostRegressor


In [3]:
# Change the working directory to the project's root, i.e. the parent of the
# directory the kernel was started in. All later cells use paths relative to it.
#
# The guard keeps this cell idempotent: a bare `os.chdir("../")` climbs one
# directory higher on every re-run, which silently breaks the `data/...` paths.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Notebook-wide configuration constants.
FIRST_YEAR = 1972
FREQUENCY = "monthly"

c:\Users\marti\Desktop\data\hw_extra


In [4]:
# Make the project's local `functions/` folder importable. The path is resolved
# against the current working directory, so the cell that switches to the
# project root has to run before this one.
folder_path = os.path.abspath("functions/")
if folder_path not in sys.path:  # do not stack duplicate entries when the cell is re-run
    sys.path.insert(0, folder_path)

from Predictions import (
    PredictionExperiment,
    PredictionModel
)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [5]:
# Label columns that the experiments can predict. The experiment cells split
# them into a "frequency" group (HWN, HWF) and an "intensity" group
# (HWD, HWM, HWA) when choosing where results are saved.
indices_of_interest = [
    "HWN",
    "HWF",
    "HWD",
    "HWM",
    "HWA",
]


## California

In [21]:
# Select the region and feature family, then load the catalogue of predictor
# files available for that combination.
region = "california"
features = "climate"
metadata_path = f"data/{features}_features/{region}/metadata.csv"
metadata = pd.read_csv(metadata_path).reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,6e47cb06,predictor_6e47cb06_1.parquet,1,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
1,6e47cb06,predictor_6e47cb06_2.parquet,2,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
2,6e47cb06,predictor_6e47cb06_3.parquet,3,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
3,6e47cb06,predictor_6e47cb06_4.parquet,4,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
4,6e47cb06,predictor_6e47cb06_5.parquet,5,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
5,6e47cb06,predictor_6e47cb06_6.parquet,6,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
6,6e47cb06,predictor_6e47cb06_7.parquet,7,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
7,6e47cb06,predictor_6e47cb06_8.parquet,8,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
8,6e47cb06,predictor_6e47cb06_9.parquet,9,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
9,6e47cb06,predictor_6e47cb06_10.parquet,10,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...


In [22]:
# Compare the experiment ids listed in the metadata against the ids already
# present in the saved results, and report how many have no results yet.
results_path = f"data/{features}_results/{region}_results/results.csv"
results = pd.read_csv(results_path)
ids_results = results["id_data"].unique()
id_experiments = metadata["id"].unique()
ids_to_execute = [
    experiment_id
    for experiment_id in id_experiments
    if experiment_id not in ids_results
]
print(len(ids_to_execute))

0


In [23]:
# Run the label-wise prediction experiment for the selected experiment ids.
# For every id and every label group: load the twelve predictor tables, drop
# the labels that do not belong to the group, build a fixed panel of classical
# and neural regressors, evaluate them and save the results.
#
# NOTE(review): this cell is a near-copy of the other region's experiment cell;
# factoring both into one shared function would remove the duplication.
# NOTE(review): the id below is hard-coded and does not appear in the metadata
# preview displayed for this region in this notebook -- confirm it is intended.
sep_indices = [["HWN", "HWF"]]
for k, experiment_id in enumerate(["5cb3fa02"], start=1):  # not named `id`: that shadows the builtin
    print("Executing", experiment_id, "iter", k)
    for indices in sep_indices:
        print(indices)

        # Predictor files are numbered 1..12 (presumably one per month -- the
        # metadata calls this column `season`; TODO confirm).
        data = {
            month: pd.read_parquet(
                f"data/{features}_features/{region}/predictor_{experiment_id}_{month}.parquet"
            )
            for month in range(1, 13)
        }

        # Keep every column except the labels outside the current group.
        labels_to_remove = [label for label in indices_of_interest if label not in indices]
        columns_to_keep = data[1].columns.difference(labels_to_remove)
        data = {month: table[columns_to_keep] for month, table in data.items()}

        # The networks see one time step holding all non-label columns and
        # emit one value per label in the group.
        n_outputs = len(indices)
        input_shape = (1, len(columns_to_keep) - n_outputs)

        # Models are created in a fixed order so that seeded weight
        # initialisation stays reproducible between runs.
        rnn_model = Sequential([
            SimpleRNN(16, activation="tanh", input_shape=input_shape),
            Dropout(0.1),  # Regularization
            Dense(8, activation="relu"),
            Dense(n_outputs),
        ])
        lstm_model = Sequential([
            LSTM(16, activation="tanh", input_shape=input_shape),
            Dropout(0.1),  # Regularization
            Dense(8, activation="relu"),
            Dense(n_outputs),
        ])
        # Stacked LSTM
        stacked_lstm = Sequential([
            LSTM(16, activation="tanh", return_sequences=True, input_shape=input_shape),
            LSTM(8, activation="tanh"),
            Dropout(0.1),
            Dense(n_outputs),
        ])
        cnn_rnn_model = Sequential([
            Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
            Reshape((1, 16)),  # Back to time dimension
            SimpleRNN(8, activation="tanh"),
            Dropout(0.1),
            Dense(n_outputs),
        ])
        lp_model = Sequential([
            Flatten(input_shape=input_shape),
            Dense(16, activation="relu"),
            Dropout(0.1),
            Dense(8, activation="relu"),
            Dense(n_outputs),
        ])

        # Each name is paired with its model so the two can no longer drift
        # apart. The previous parallel lists saved the stacked LSTM under
        # "CNNRNN16" and the CNN-RNN under "CNNLSTM16"; rows written before
        # this fix still carry those old labels.
        classical_models = [
            ("Linear", LinearRegression()),
            ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
            ("SVR-rbf", SVR(kernel='rbf')),
            ("SVR-linear", SVR(kernel='linear')),
            ("XGB10", XGBRegressor(random_state=42, n_estimators=10, learning_rate=0.1)),
        ]
        neural_models = [
            ("RNN16", rnn_model),
            ("LSTM16", lstm_model),
            ("StackedLSTM16", stacked_lstm),
            ("CNNRNN16", cnn_rnn_model),
            ("MLP16", lp_model),
        ]
        name_regressors, regressors = map(list, zip(*(classical_models + neural_models)))

        # The literal 5 is passed positionally; presumably the number of
        # cross-validation folds -- TODO confirm against PredictionExperiment.
        experiment_1 = PredictionExperiment(data, indices, regressors, name_regressors, 5, experiment_id)
        experiment_1.execute_experiment()

        # Same twelve metric computations as before, in the same order.
        for stage in ("prediction", "training", "CV", "TSCV"):
            for metric in ("r2", "mape", "mae"):
                experiment_1.get_metrics(metric, stage=stage, show=False)

        # Groups containing HWN go to the "frequency" file, all others to "intensity".
        results_kind = "frequency" if "HWN" in indices else "intensity"
        experiment_1.save_results(f"data/labelly_results/{region}_results/results_{results_kind}.csv")

Executing 5cb3fa02 iter 1
['HWN', 'HWF']


## Chile

In [6]:
# Switch to the Chile region and load the catalogue of its predictor files.
region = "chile"
features = "climate"
metadata = pd.read_csv(f"data/{features}_features/{region}/metadata.csv").reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,978f49d7,predictor_978f49d7_1.parquet,1,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
1,978f49d7,predictor_978f49d7_2.parquet,2,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
2,978f49d7,predictor_978f49d7_3.parquet,3,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
3,978f49d7,predictor_978f49d7_4.parquet,4,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
4,978f49d7,predictor_978f49d7_5.parquet,5,fde0e327-340e2882-43701738-e306f58b-e601b072-e...
...,...,...,...,...
247,458d357c,predictor_458d357c_8.parquet,8,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...
248,458d357c,predictor_458d357c_9.parquet,9,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...
249,458d357c,predictor_458d357c_10.parquet,10,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...
250,458d357c,predictor_458d357c_11.parquet,11,32f131d2-69ffcfa8-4af95abb-4a86cb22-52eda853-3...


In [7]:
# Compare the experiment ids listed in the metadata against the ids already
# present in the saved results, and report how many have no results yet.
results_path = f"data/{features}_results/{region}_results/results.csv"
results = pd.read_csv(results_path)
ids_results = results["id_data"].unique()
id_experiments = metadata["id"].unique()
ids_to_execute = [
    experiment_id
    for experiment_id in id_experiments
    if experiment_id not in ids_results
]
print(len(ids_to_execute))

0


In [8]:
# Run the label-wise prediction experiment for the selected experiment ids.
# For every id and every label group: load the twelve predictor tables, drop
# the labels that do not belong to the group, build a fixed panel of classical
# and neural regressors, evaluate them and save the results.
#
# NOTE(review): this cell is a near-copy of the other region's experiment cell;
# factoring both into one shared function would remove the duplication.
# NOTE(review): `id_experiments[4:]` is a manual resume offset. `ids_to_execute`
# is computed from `data/{features}_results/...`, a different file than the
# `data/labelly_results/...` files written here -- confirm which one should
# drive resumption.
sep_indices = [["HWN", "HWF"], ["HWD", "HWM", "HWA"]]
for k, experiment_id in enumerate(id_experiments[4:], start=1):  # not named `id`: that shadows the builtin
    print("Executing", experiment_id, "iter", k)
    for indices in sep_indices:
        print(indices)

        # Predictor files are numbered 1..12 (presumably one per month -- the
        # metadata calls this column `season`; TODO confirm).
        data = {
            month: pd.read_parquet(
                f"data/{features}_features/{region}/predictor_{experiment_id}_{month}.parquet"
            )
            for month in range(1, 13)
        }

        # Keep every column except the labels outside the current group.
        labels_to_remove = [label for label in indices_of_interest if label not in indices]
        columns_to_keep = data[1].columns.difference(labels_to_remove)
        data = {month: table[columns_to_keep] for month, table in data.items()}

        # The networks see one time step holding all non-label columns and
        # emit one value per label in the group.
        n_outputs = len(indices)
        input_shape = (1, len(columns_to_keep) - n_outputs)

        # Models are created in a fixed order so that seeded weight
        # initialisation stays reproducible between runs.
        rnn_model = Sequential([
            SimpleRNN(16, activation="tanh", input_shape=input_shape),
            Dropout(0.1),  # Regularization
            Dense(8, activation="relu"),
            Dense(n_outputs),
        ])
        lstm_model = Sequential([
            LSTM(16, activation="tanh", input_shape=input_shape),
            Dropout(0.1),  # Regularization
            Dense(8, activation="relu"),
            Dense(n_outputs),
        ])
        # Stacked LSTM
        stacked_lstm = Sequential([
            LSTM(16, activation="tanh", return_sequences=True, input_shape=input_shape),
            LSTM(8, activation="tanh"),
            Dropout(0.1),
            Dense(n_outputs),
        ])
        cnn_rnn_model = Sequential([
            Conv1D(16, kernel_size=1, activation="relu", input_shape=input_shape),
            Reshape((1, 16)),  # Back to time dimension
            SimpleRNN(8, activation="tanh"),
            Dropout(0.1),
            Dense(n_outputs),
        ])
        lp_model = Sequential([
            Flatten(input_shape=input_shape),
            Dense(16, activation="relu"),
            Dropout(0.1),
            Dense(8, activation="relu"),
            Dense(n_outputs),
        ])

        # Each name is paired with its model so the two can no longer drift
        # apart. The previous parallel lists saved the stacked LSTM under
        # "CNNRNN16" and the CNN-RNN under "CNNLSTM16"; rows written before
        # this fix still carry those old labels.
        classical_models = [
            ("Linear", LinearRegression()),
            ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
            ("SVR-rbf", SVR(kernel='rbf')),
            ("SVR-linear", SVR(kernel='linear')),
            ("XGB10", XGBRegressor(random_state=42, n_estimators=10, learning_rate=0.1)),
        ]
        neural_models = [
            ("RNN16", rnn_model),
            ("LSTM16", lstm_model),
            ("StackedLSTM16", stacked_lstm),
            ("CNNRNN16", cnn_rnn_model),
            ("MLP16", lp_model),
        ]
        name_regressors, regressors = map(list, zip(*(classical_models + neural_models)))

        # The literal 5 is passed positionally; presumably the number of
        # cross-validation folds -- TODO confirm against PredictionExperiment.
        experiment_1 = PredictionExperiment(data, indices, regressors, name_regressors, 5, experiment_id)
        experiment_1.execute_experiment()

        # Same twelve metric computations as before, in the same order.
        for stage in ("prediction", "training", "CV", "TSCV"):
            for metric in ("r2", "mape", "mae"):
                experiment_1.get_metrics(metric, stage=stage, show=False)

        # Groups containing HWN go to the "frequency" file, all others to "intensity".
        results_kind = "frequency" if "HWN" in indices else "intensity"
        experiment_1.save_results(f"data/labelly_results/{region}_results/results_{results_kind}.csv")

Executing 4d17ba1a iter 1
['HWN', 'HWF']
['HWD', 'HWM', 'HWA']
Executing 3adff093 iter 2
['HWN', 'HWF']
['HWD', 'HWM', 'HWA']
Executing b33fc639 iter 3
['HWN', 'HWF']
['HWD', 'HWM', 'HWA']
Executing 511854f2 iter 4
['HWN', 'HWF']
['HWD', 'HWM', 'HWA']
Executing 9bd58418 iter 5
['HWN', 'HWF']


KeyboardInterrupt: 