# Monthly Prediction

## Imports and Functions

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import TimeSeriesSplit


from sklearn.svm import SVR
import os
import numpy as np
from xgboost import XGBRegressor
import sys
from catboost import CatBoostRegressor

In [2]:
# Switch the working directory to the project's root so that the relative
# paths used further down ("functions/", "data/...") resolve.
#
# The move is guarded: an unconditional os.chdir("../") climbs one directory
# further every time this cell is re-run, which silently breaks every relative
# path afterwards. The root is recognised by its "functions" sub-folder.
# NOTE(review): assumes the notebook's own folder has no "functions" directory.
if not os.path.isdir("functions"):
    os.chdir("../")
print(os.getcwd())

# Notebook-wide constants.
FIRST_YEAR = 1972
FREQUENCY = "monthly"

c:\Users\marti\Desktop\data\hw_extra


In [3]:
# Make the project's local modules importable: resolve the "functions/" folder
# (relative to the current working directory) and put it at the front of the
# module search path so it takes precedence over other entries.
folder_path = os.path.abspath("functions/")
sys.path[:0] = [folder_path]

# Project-local classes; this import only works after the path tweak above.
from Predictions import PredictionExperiment, PredictionModel

In [4]:
# Kernel for the Gaussian-process model: an RBF term plus a white-noise term.
kernel_rbf_noise = RBF(length_scale=1.0) + WhiteKernel(noise_level=1)

# Every candidate model is declared next to its display name, so a label can
# never drift away from the estimator it describes. The two parallel lists
# used by the rest of the notebook are derived from these pairs below.
_named_regressors = [
    ("Linear", LinearRegression()),
    # Random forests with 5 / 10 / 15 trees.
    ("RF5", RandomForestRegressor(random_state=42, n_estimators=5)),
    ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
    ("RF15", RandomForestRegressor(random_state=42, n_estimators=15)),
    # Support-vector regressors; the linear and polynomial ones are capped at
    # 200 solver iterations. "SVR-cubic" relies on the 'poly' kernel's default degree.
    ("SVR-rbf", SVR(kernel='rbf')),
    ("SVR-linear", SVR(kernel='linear', max_iter=200)),
    ("SVR-cubic", SVR(kernel='poly', max_iter=200)),
    # XGBoost with the linear booster, 5 / 10 / 15 boosting rounds.
    ("XGBL5", XGBRegressor(booster="gblinear", random_state=42, n_estimators=5, learning_rate=0.1)),
    ("XGBL10", XGBRegressor(booster="gblinear", random_state=42, n_estimators=10, learning_rate=0.1)),
    ("XGBL15", XGBRegressor(booster="gblinear", random_state=42, n_estimators=15, learning_rate=0.1)),
    # XGBoost with the tree booster, 5 / 10 / 15 boosting rounds.
    ("XGBT5", XGBRegressor(booster="gbtree", random_state=42, n_estimators=5, learning_rate=0.1)),
    ("XGBT10", XGBRegressor(booster="gbtree", random_state=42, n_estimators=10, learning_rate=0.1)),
    ("XGBT15", XGBRegressor(booster="gbtree", random_state=42, n_estimators=15, learning_rate=0.1)),
    # Gaussian process with the kernel defined above and 10 optimizer restarts.
    ("GPR-rbf-noise", GaussianProcessRegressor(kernel=kernel_rbf_noise, random_state=42, n_restarts_optimizer=10)),
    # CatBoost with 10 / 20 / 30 iterations, training output silenced.
    ("CBR-10", CatBoostRegressor(learning_rate=0.1, iterations=10, verbose=False, random_state=42)),
    ("CBR-20", CatBoostRegressor(learning_rate=0.1, iterations=20, verbose=False, random_state=42)),
    ("CBR-30", CatBoostRegressor(learning_rate=0.1, iterations=30, verbose=False, random_state=42)),
]

# Parallel lists: estimator instances and their labels, in the same order.
regressors = [model for _, model in _named_regressors]
name_regressors = [label for label, _ in _named_regressors]

# Names of the target indices handed to PredictionExperiment.
indices_of_interest = ["HWN", "HWF", "HWD", "HWM", "HWA"]

# Sanity check: one label per estimator.
assert len(regressors) == len(name_regressors)

## California

In [11]:
# Select the region and load the table that describes its predictor files
# (columns shown in the output: id, filename, season, indices).
region = "california"
metadata_path = f"data/climate_features/{region}/metadata.csv"
# Chained, non-mutating form of the index reset; the old index is discarded.
metadata = pd.read_csv(metadata_path).reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,6e47cb06,predictor_6e47cb06_1.parquet,1,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
1,6e47cb06,predictor_6e47cb06_2.parquet,2,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
2,6e47cb06,predictor_6e47cb06_3.parquet,3,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
3,6e47cb06,predictor_6e47cb06_4.parquet,4,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
4,6e47cb06,predictor_6e47cb06_5.parquet,5,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
5,6e47cb06,predictor_6e47cb06_6.parquet,6,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
6,6e47cb06,predictor_6e47cb06_7.parquet,7,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
7,6e47cb06,predictor_6e47cb06_8.parquet,8,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
8,6e47cb06,predictor_6e47cb06_9.parquet,9,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...
9,6e47cb06,predictor_6e47cb06_10.parquet,10,df9a31c5-20a07cea-cfb03125-9169e0dc-0b0bffae-b...


In [12]:
# Work out which experiment ids of the current region have no stored results.
results = pd.read_csv(f"data/climate_results/{region}_results/results.csv")
ids_results = results["id_data"].unique()    # ids already present in the results file
id_experiments = metadata["id"].unique()     # ids listed in the region's metadata
# Keep, in metadata order, only the ids that are absent from the results.
ids_to_execute = list(filter(lambda exp_id: exp_id not in ids_results, id_experiments))
print(len(ids_to_execute))

0


In [13]:
# Run the experiment for every id of the current region that has no stored
# results yet, then write the outcome to the region's results file (the same
# file the pending-id check reads).
#
# The loop variable is `experiment_id` rather than `id`: a module-level `id`
# would shadow the builtin id() for the rest of the kernel session.
for experiment_id in ids_to_execute:
    print("Executing", experiment_id)
    # One feature table per file suffix 1..12, keyed by that suffix.
    # NOTE(review): the metadata shown for this region lists seasons 1-10 only;
    # confirm that files 11 and 12 exist, otherwise read_parquet will raise.
    data = {
        i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet")
        for i in range(1, 13)
    }
    # NOTE(review): the positional 5 is presumably the number of CV folds -- TODO confirm.
    experiment_1 = PredictionExperiment(data, indices_of_interest, regressors, name_regressors, 5, experiment_id)
    experiment_1.execute_experiment()
    # Request r2 and mape for each stage without displaying them (show=False).
    # "prediction" stays positional, exactly as in the original calls.
    experiment_1.get_metrics("r2", "prediction", show=False)
    experiment_1.get_metrics("mape", "prediction", show=False)
    for stage in ("training", "CV", "TSCV"):
        for metric in ("r2", "mape"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    experiment_1.save_results(f"data/climate_results/{region}_results/results.csv")

## Chile

In [14]:
# Switch to the next region and load the table that describes its predictor
# files (columns shown in the output: id, filename, season, indices).
region = "chile"
# Chained, non-mutating form of the index reset; the old index is discarded.
metadata = pd.read_csv(f"data/climate_features/{region}/metadata.csv").reset_index(drop=True)
display(metadata)

Unnamed: 0,id,filename,season,indices
0,978f49d7,predictor_978f49d7_1.parquet,1,8334b687-f25567c1-43701738-e306f58b-e601b072-e...
1,978f49d7,predictor_978f49d7_2.parquet,2,8334b687-f25567c1-43701738-e306f58b-e601b072-e...
2,978f49d7,predictor_978f49d7_3.parquet,3,8334b687-f25567c1-43701738-e306f58b-e601b072-e...
3,978f49d7,predictor_978f49d7_4.parquet,4,8334b687-f25567c1-43701738-e306f58b-e601b072-e...
4,978f49d7,predictor_978f49d7_5.parquet,5,8334b687-f25567c1-43701738-e306f58b-e601b072-e...
...,...,...,...,...
127,3832cbd6,predictor_3832cbd6_8.parquet,8,a1bff473-4e788494-13e2f761-f5811892-2748fd3a-4...
128,3832cbd6,predictor_3832cbd6_9.parquet,9,a1bff473-4e788494-13e2f761-f5811892-2748fd3a-4...
129,3832cbd6,predictor_3832cbd6_10.parquet,10,a1bff473-4e788494-13e2f761-f5811892-2748fd3a-4...
130,3832cbd6,predictor_3832cbd6_11.parquet,11,a1bff473-4e788494-13e2f761-f5811892-2748fd3a-4...


In [15]:
# Work out which experiment ids of the current region have no stored results.
results = pd.read_csv(f"data/climate_results/{region}_results/results.csv")
ids_results = results["id_data"].unique()    # ids already present in the results file
id_experiments = metadata["id"].unique()     # ids listed in the region's metadata
# Keep, in metadata order, only the ids that are absent from the results.
ids_to_execute = list(filter(lambda exp_id: exp_id not in ids_results, id_experiments))
print(len(ids_to_execute))


0


In [None]:
# Run the experiment for every id of the current region that has no stored
# results yet, then write the outcome to the region's results file (the same
# file the pending-id check reads).
#
# The loop variable is `experiment_id` rather than `id`: a module-level `id`
# would shadow the builtin id() for the rest of the kernel session.
for experiment_id in ids_to_execute:
    print("Executing", experiment_id)
    # One feature table per file suffix 1..12, keyed by that suffix.
    # NOTE(review): the displayed metadata tail ends at season 11 for the last
    # id; confirm that all 12 files exist, otherwise read_parquet will raise.
    data = {
        i: pd.read_parquet(f"data/climate_features/{region}/predictor_{experiment_id}_{i}.parquet")
        for i in range(1, 13)
    }
    # NOTE(review): the positional 5 is presumably the number of CV folds -- TODO confirm.
    experiment_1 = PredictionExperiment(data, indices_of_interest, regressors, name_regressors, 5, experiment_id)
    experiment_1.execute_experiment()
    # Request r2 and mape for each stage without displaying them (show=False).
    # "prediction" stays positional, exactly as in the original calls.
    experiment_1.get_metrics("r2", "prediction", show=False)
    experiment_1.get_metrics("mape", "prediction", show=False)
    for stage in ("training", "CV", "TSCV"):
        for metric in ("r2", "mape"):
            experiment_1.get_metrics(metric, stage=stage, show=False)
    experiment_1.save_results(f"data/climate_results/{region}_results/results.csv")