# Chile Yearly Prediction, Bi-seasonal Features by Index

## Imports and Functions

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor


from sklearn.svm import SVR
import os
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor

In [2]:
# Make the project root (two levels above the notebook's start directory) the
# working directory, so the relative "data/..." paths used below resolve.
# The root is resolved to an absolute path only the first time this cell runs:
# a plain os.chdir("../../") would climb two more levels on every re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Interpolated into the feature-file path as ".../from{FIRST_YEAR}/...".
FIRST_YEAR = 1972

c:\Users\marti\Desktop\data\hw_extra


In [3]:
def model_predict(data, regressor, len_pred, name_regressor=None, show=False, indices_of_interest=["HWN", "HWF", "HWD", "HWM", "HWA"] ):
    """Fit ``regressor`` on the leading rows of ``data`` and score it on the last ``len_pred`` rows.

    One model is fitted per entry of ``indices_of_interest``. Rows are split by
    position (no shuffling): the final ``len_pred`` rows form the test set and
    everything before them the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Date`` column accepted by ``pd.to_datetime`` and one
        column per target. Every column that is neither ``Date`` nor listed in
        ``indices_of_interest`` is used as a feature, so any other column
        present in ``data`` is treated as a model input. The frame is not
        modified.
    regressor : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        re-fitted for every target.
    len_pred : int
        Number of trailing rows held out for testing; must satisfy
        ``0 < len_pred < len(data)``.
    name_regressor : str, optional
        Label used in the figure title when ``show`` is true.
    show : bool, default False
        If true, draw one subplot per target with the training values, the
        test values and the predictions for both.
    indices_of_interest : list of str
        Names of the target columns. The default list is only read, never
        mutated.

    Returns
    -------
    list
        ``[mae_errors, mae_errors_training, r2_scores, r2_scores_training]``;
        each element is a list holding one value per target, in the order of
        ``indices_of_interest``.

    Raises
    ------
    ValueError
        If ``len_pred`` does not leave at least one training row and one test
        row.
    """
    targets = list(indices_of_interest)

    n_rows = len(data)
    if not 0 < len_pred < n_rows:
        raise ValueError(
            f"len_pred must be between 1 and {n_rows - 1} (got {len_pred})"
        )

    # Month labels ("YYYY-MM") for the x axis. Kept in a separate Series so the
    # caller's 'Date' column is left exactly as it was passed in.
    dates = pd.to_datetime(data['Date']).dt.to_period('M').astype(str)

    features = data.columns.difference(["Date"] + targets)

    # The feature matrix and the positional split are identical for every
    # target, so they are computed once.
    X = data[features]
    X_train, X_test = X[:-len_pred], X[-len_pred:]
    train_dates, test_dates = dates[:-len_pred], dates[-len_pred:]

    plotting = bool(show) and len(targets) > 0
    if plotting:
        # squeeze=False always yields a 2-D array of Axes; without it a single
        # target returns a bare Axes object and axs[i] would fail.
        fig, axs = plt.subplots(len(targets), 1, figsize=(25, 15), squeeze=False)
        axs = axs[:, 0]

    mae_errors = []
    mae_errors_training = []
    r2_scores = []
    r2_scores_training = []
    for i, target in enumerate(targets):
        y = data[target]
        y_train, y_test = y[:-len_pred], y[-len_pred:]

        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        y_pred_train = regressor.predict(X_train)

        # Evaluation on the held-out rows and on the rows used for fitting.
        mae_errors.append(mean_absolute_error(y_test, y_pred))
        mae_errors_training.append(mean_absolute_error(y_train, y_pred_train))
        r2_scores.append(r2_score(y_test, y_pred))
        r2_scores_training.append(r2_score(y_train, y_pred_train))

        if plotting:
            # (x values, y values, legend label, marker, colour, line style)
            series = [
                (train_dates, y_train, "Training", 'o', 'green', '-'),
                (train_dates, y_pred_train, "Predicted Training", 'x', 'red', '-'),
                (test_dates, y_test, "Test", 'o', 'blue', '-'),
                (test_dates, y_pred, "Predicted Test", 'x', 'red', '--'),
            ]
            for x_values, y_values, label, marker, color, linestyle in series:
                axs[i].plot(
                    x_values,
                    y_values,
                    label=label,
                    marker=marker,
                    color=color,
                    linestyle=linestyle,
                    linewidth=1.5
                )
            axs[i].set_title(f"Prediction for {target}")
            axs[i].legend()

    if plotting:
        # rect leaves room at the top of the figure for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.suptitle(f"Model: {name_regressor}")
        plt.show()

    return [mae_errors, mae_errors_training, r2_scores, r2_scores_training]


In [5]:
# Names of the target indices; each one has its own feature file on disk.
indices = ["HWN", "HWF", "HWD", "HWM", "HWA"]

# Map every index name to the DataFrame read from its CSV. The path is
# relative to the current working directory and depends on FIRST_YEAR.
data_per_index = {
    index: pd.read_csv(f"data/features/chile/from{FIRST_YEAR}/{index}/HWs_chile_features_biseasonly_year.csv")
    for index in indices
}


In [6]:
# Kernel for the Gaussian-process model: RBF term plus a white-noise term.
kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1)

# Each estimator is declared next to its display name so the two lists derived
# below cannot drift out of step.
_named_predictors = [
    ("Linear", LinearRegression()),
    ("RF10", RandomForestRegressor(random_state=42, n_estimators=10)),
    ("RF50", RandomForestRegressor(random_state=42, n_estimators=50)),
    ("RF100", RandomForestRegressor(random_state=42, n_estimators=100)),
    ("SVR-rbf", SVR(kernel='rbf')),
    ("XGB10", XGBRegressor(random_state=42, n_estimators=10, learning_rate=0.1)),
    ("XGB25", XGBRegressor(random_state=42, n_estimators=25, learning_rate=0.1)),
    ("XGB50", XGBRegressor(random_state=42, n_estimators=50, learning_rate=0.1)),
    ("XGB100", XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1)),
    ("GPR", GaussianProcessRegressor(kernel=kernel, random_state=42, n_restarts_optimizer=10)),
]

# Parallel lists consumed by the cells below (same order in both).
predictors = [estimator for _, estimator in _named_predictors]
name_predictors = [label for label, _ in _named_predictors]

In [7]:
# Number of trailing rows of each dataset held out as the test set.
N_TEST_ROWS = 60

# For every index, evaluate every predictor on that index's own dataset.
# stats_per_index[index][k] is the list returned by model_predict for the
# k-th predictor: [test MAE, training MAE, test R^2, training R^2], each a
# one-element list because a single target is passed per call.
stats_per_index = {
    index: [
        model_predict(
            data_per_index[index],
            reg,
            N_TEST_ROWS,
            name_regressor=name,
            show=False,
            indices_of_interest=[index],
        )
        for reg, name in zip(predictors, name_predictors)
    ]
    for index in indices
}

In [8]:
# Test-set MAE table: one row per model, one column per index.
# model_predict returns [test MAE, training MAE, test R^2, training R^2];
# position 0 selects the test MAE and the trailing [0] picks the single target
# each model was fitted on.
# The frame is built in one constructor call instead of being grown row by row
# with .loc, so the metric columns are inferred as numeric, and the column
# names are derived from `indices` so they always match the values.
indices_of_interest = ["Model"] + indices
results_df = pd.DataFrame(
    [
        [name_predictors[i]] + [stats_per_index[index][i][0][0] for index in indices]
        for i in range(len(predictors))
    ],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear,0.31,1.36,0.92,0.56,1.03
1,RF10,0.21,0.86,0.6,0.42,0.69
2,RF50,0.2,0.83,0.6,0.38,0.69
3,RF100,0.2,0.84,0.59,0.38,0.68
4,SVR-rbf,0.22,0.92,0.63,0.39,0.73
5,XGB10,0.22,0.88,0.6,0.39,0.69
6,XGB25,0.22,0.87,0.61,0.4,0.71
7,XGB50,0.22,0.87,0.61,0.4,0.71
8,XGB100,0.22,0.87,0.61,0.39,0.71
9,GPR,0.23,0.93,0.65,0.4,0.72


In [9]:
# Training-set MAE table: one row per model, one column per index.
# model_predict returns [test MAE, training MAE, test R^2, training R^2];
# position 1 selects the training MAE and the trailing [0] picks the single
# target each model was fitted on.
# The frame is built in one constructor call instead of being grown row by row
# with .loc, so the metric columns are inferred as numeric, and the column
# names are derived from `indices` so they always match the values.
indices_of_interest = ["Model"] + indices
results_df = pd.DataFrame(
    [
        [name_predictors[i]] + [stats_per_index[index][i][1][0] for index in indices]
        for i in range(len(predictors))
    ],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear,0.14,0.57,0.46,0.24,0.44
1,RF10,0.07,0.27,0.21,0.12,0.22
2,RF50,0.06,0.26,0.2,0.11,0.2
3,RF100,0.06,0.25,0.19,0.11,0.2
4,SVR-rbf,0.08,0.32,0.21,0.11,0.21
5,XGB10,0.1,0.41,0.32,0.17,0.32
6,XGB25,0.05,0.2,0.16,0.08,0.14
7,XGB50,0.02,0.07,0.06,0.03,0.06
8,XGB100,0.0,0.01,0.01,0.0,0.01
9,GPR,0.16,0.35,0.2,0.28,0.19


In [10]:
# Test-set R^2 table: one row per model, one column per index.
# model_predict returns [test MAE, training MAE, test R^2, training R^2];
# position 2 selects the test R^2 and the trailing [0] picks the single target
# each model was fitted on.
# A negative R^2 means the model scores worse on the held-out rows than a
# constant predictor equal to the mean of those rows.
# The frame is built in one constructor call instead of being grown row by row
# with .loc, so the metric columns are inferred as numeric, and the column
# names are derived from `indices` so they always match the values.
indices_of_interest = ["Model"] + indices
results_df = pd.DataFrame(
    [
        [name_predictors[i]] + [stats_per_index[index][i][2][0] for index in indices]
        for i in range(len(predictors))
    ],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear,-1.3,-1.67,-1.8,-1.19,-1.26
1,RF10,-0.18,-0.24,-0.1,-0.36,-0.08
2,RF50,-0.11,-0.06,-0.11,-0.12,-0.1
3,RF100,-0.11,-0.11,-0.12,-0.11,-0.09
4,SVR-rbf,-0.25,-0.28,-0.24,-0.12,-0.19
5,XGB10,-0.24,-0.21,-0.13,-0.14,-0.13
6,XGB25,-0.19,-0.19,-0.12,-0.11,-0.08
7,XGB50,-0.2,-0.18,-0.1,-0.08,-0.06
8,XGB100,-0.19,-0.18,-0.07,-0.07,-0.05
9,GPR,-0.3,-0.23,-0.27,-0.24,-0.1


In [11]:
# Training-set R^2 table: one row per model, one column per index.
# model_predict returns [test MAE, training MAE, test R^2, training R^2];
# position 3 selects the training R^2 and the trailing [0] picks the single
# target each model was fitted on.
# The frame is built in one constructor call instead of being grown row by row
# with .loc, so the metric columns are inferred as numeric, and the column
# names are derived from `indices` so they always match the values.
indices_of_interest = ["Model"] + indices
results_df = pd.DataFrame(
    [
        [name_predictors[i]] + [stats_per_index[index][i][3][0] for index in indices]
        for i in range(len(predictors))
    ],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear,0.46,0.45,0.42,0.45,0.45
1,RF10,0.84,0.85,0.85,0.82,0.83
2,RF50,0.89,0.88,0.88,0.87,0.88
3,RF100,0.89,0.89,0.89,0.88,0.89
4,SVR-rbf,0.84,0.61,0.75,0.86,0.75
5,XGB10,0.71,0.71,0.7,0.7,0.69
6,XGB25,0.93,0.93,0.92,0.93,0.94
7,XGB50,0.99,0.99,0.99,0.99,0.99
8,XGB100,1.0,1.0,1.0,1.0,1.0
9,GPR,0.26,0.76,0.86,0.2,0.87
