# Chile Season Prediction

## Imports and Functions

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor


from sklearn.svm import SVR
import os
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor

In [2]:
# Move from the notebook's folder to the project's root path (two levels up)
# so that the relative "data/..." paths used below resolve.
# The starting directory is remembered the first time this cell runs, which
# makes the cell idempotent: re-running it lands in the same directory instead
# of climbing two more levels on every execution.
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))
print(os.getcwd())

# First year covered by the feature files; it is part of the data folder name.
FIRST_YEAR = 1972

c:\Users\marti\Desktop\data\hw_extra


In [3]:
def _plot_index_panel(ax, index_name, train_dates, y_train, y_pred_train,
                      test_dates, y_test, y_pred):
    """Draw observed and predicted values of one index on the axes ``ax``."""
    # (x, y, label, marker, color, linestyle) for each of the four curves.
    curves = [
        (train_dates, y_train, "Training", "o", "green", "-"),
        (train_dates, y_pred_train, "Predicted Training", "x", "red", "-"),
        (test_dates, y_test, "Test", "o", "blue", "-"),
        (test_dates, y_pred, "Predicted Test", "x", "red", "--"),
    ]
    for x_values, y_values, label, marker, color, linestyle in curves:
        ax.plot(
            x_values,
            y_values,
            label=label,
            marker=marker,
            color=color,
            linestyle=linestyle,
            linewidth=1.5,
        )
    ax.set_title(f"Prediction for {index_name}")
    ax.legend()


def model_predict(data, regressor, len_pred, name_regressor=None, show=True):
    """Fit ``regressor`` to each heatwave index and score it on a held-out tail.

    The last ``len_pred`` rows of ``data`` form the test set and all earlier
    rows the training set (no shuffling). For each of the targets
    "HWN", "HWF", "HWD", "HWM" and "HWA" the regressor is re-fitted on the
    training rows and evaluated on both splits.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Date" column and the five target columns. Every other
        column is used as a feature. The frame is not modified.
    regressor : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is re-fitted for every target, so on return it is fitted on the last
        one ("HWA").
    len_pred : int
        Number of trailing rows held out for testing; must satisfy
        ``0 < len_pred < len(data)``.
    name_regressor : str, optional
        Only used in the figure title.
    show : bool, default True
        When true, draw one panel per target and display the figure.

    Returns
    -------
    list
        ``[mae_test, mae_train, r2_test, r2_train]``; each element is a list
        with one value per target, in the target order given above.

    Raises
    ------
    ValueError
        If ``len_pred`` would leave the training or the test split empty.
    """
    indices_of_interest = ["HWN", "HWF", "HWD", "HWM", "HWA"]

    n_rows = len(data)
    # ``[:-0]`` is an empty slice and a negative value would silently swap the
    # roles of the two splits, so reject anything outside 1..n_rows-1 up front.
    if not 0 < len_pred < n_rows:
        raise ValueError(
            f"len_pred must be between 1 and {n_rows - 1}, got {len_pred}"
        )

    # Month labels ("YYYY-MM") for the x axis. They are built as a separate
    # Series so that the caller's "Date" column is left untouched.
    dates = pd.to_datetime(data["Date"]).dt.to_period("M").astype(str)
    train_dates, test_dates = dates.iloc[:-len_pred], dates.iloc[-len_pred:]

    # The feature matrix and its split are the same for every target.
    features = data.columns.difference(["Date"] + indices_of_interest)
    X = data[features]
    X_train, X_test = X.iloc[:-len_pred], X.iloc[-len_pred:]

    if show:
        fig, axs = plt.subplots(len(indices_of_interest), 1, figsize=(25, 15))

    mae_errors = []
    mae_errors_training = []
    r2_scores = []
    r2_scores_training = []
    for i, index_name in enumerate(indices_of_interest):
        y = data[index_name]
        y_train, y_test = y.iloc[:-len_pred], y.iloc[-len_pred:]

        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        y_pred_train = regressor.predict(X_train)

        # Evaluation on the held-out tail and on the training rows.
        mae_errors.append(mean_absolute_error(y_test, y_pred))
        mae_errors_training.append(mean_absolute_error(y_train, y_pred_train))
        r2_scores.append(r2_score(y_test, y_pred))
        r2_scores_training.append(r2_score(y_train, y_pred_train))

        if show:
            _plot_index_panel(
                axs[i], index_name,
                train_dates, y_train, y_pred_train,
                test_dates, y_test, y_pred,
            )

    if show:
        # The rect leaves a band at the top of the figure for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.suptitle(f"Model: {name_regressor}")
        plt.show()

    return [mae_errors, mae_errors_training, r2_scores, r2_scores_training]


In [4]:
# Load the twelve bi-season feature tables, keyed by season number (1-12).
data = {}
for season in range(1, 13):
    csv_path = f"data/features/chile/from{FIRST_YEAR}modes_1/HWs_chile_features_biseason_{season}.csv"
    data[season] = pd.read_csv(csv_path)

##  Summer DJ-JF-FM

### Linear Regression

In [5]:
# Season number -> metrics returned by model_predict for linear regression.
linear_stats = dict()

In [6]:
# Linear regression on bi-season 1; the last 5 rows are the test window, no plots.
linear_stats[1] = model_predict(
    data[1],
    regressor=LinearRegression(),
    len_pred=5,
    name_regressor="Linear",
    show=False,
)

In [7]:
# Linear regression on bi-season 2; the last 5 rows are the test window, no plots.
linear_stats[2] = model_predict(
    data[2],
    regressor=LinearRegression(),
    len_pred=5,
    name_regressor="Linear",
    show=False,
)

In [8]:
# Linear regression on bi-season 3; the last 5 rows are the test window, no plots.
linear_stats[3] = model_predict(
    data[3],
    regressor=LinearRegression(),
    len_pred=5,
    name_regressor="Linear",
    show=False,
)

### RF

In [9]:
# Season number -> metrics returned by model_predict for the random forest.
RF_stats = dict()

In [10]:
# Random forest (100 trees, fixed seed) on bi-season 1; last 5 rows held out.
RF_stats[1] = model_predict(
    data[1],
    regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    len_pred=5,
    name_regressor="RF",
    show=False,
)

In [11]:
# Random forest (100 trees, fixed seed) on bi-season 2; last 5 rows held out.
RF_stats[2] = model_predict(
    data[2],
    regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    len_pred=5,
    name_regressor="RF",
    show=False,
)

In [12]:
# Random forest (100 trees, fixed seed) on bi-season 3; last 5 rows held out.
RF_stats[3] = model_predict(
    data[3],
    regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    len_pred=5,
    name_regressor="RF",
    show=False,
)

### SVR (reported as "SVC" in the results tables)

In [13]:
# Season number -> metrics returned by model_predict for the RBF-kernel SVR.
svc_stats = dict()

In [14]:
# Support vector regression (RBF kernel) on bi-season 1; last 5 rows held out.
svc_stats[1] = model_predict(
    data[1],
    regressor=SVR(kernel='rbf'),
    len_pred=5,
    name_regressor="SVC",
    show=False,
)

In [15]:
# Support vector regression (RBF kernel) on bi-season 2; last 5 rows held out.
svc_stats[2] = model_predict(
    data[2],
    regressor=SVR(kernel='rbf'),
    len_pred=5,
    name_regressor="SVC",
    show=False,
)

In [16]:
# Support vector regression (RBF kernel) on bi-season 3; last 5 rows held out.
svc_stats[3] = model_predict(
    data[3],
    regressor=SVR(kernel='rbf'),
    len_pred=5,
    name_regressor="SVC",
    show=False,
)

### XGBoost

In [17]:
# Season number -> metrics returned by model_predict for XGBoost.
XGBoost_stats = dict()

In [18]:
# XGBoost (100 rounds, learning rate 0.1, fixed seed) on bi-season 1; last 5 rows held out.
XGBoost_stats[1] = model_predict(
    data[1],
    regressor=XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    len_pred=5,
    name_regressor="XGB",
    show=False,
)

In [19]:
# XGBoost (100 rounds, learning rate 0.1, fixed seed) on bi-season 2; last 5 rows held out.
XGBoost_stats[2] = model_predict(
    data[2],
    regressor=XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    len_pred=5,
    name_regressor="XGB",
    show=False,
)

In [20]:
# XGBoost (100 rounds, learning rate 0.1, fixed seed) on bi-season 3; last 5 rows held out.
XGBoost_stats[3] = model_predict(
    data[3],
    regressor=XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    len_pred=5,
    name_regressor="XGB",
    show=False,
)

### Gaussian Process Regressor (GPR)

In [21]:
# Season number -> metrics returned by model_predict for the Gaussian process.
GPR_stats = dict()

# Kernel shared by every GPR run below: an RBF term plus a white-noise term.
rbf_term = RBF(length_scale=1.0)
noise_term = WhiteKernel(noise_level=1)
kernel = rbf_term + noise_term

In [22]:
# Gaussian process (shared kernel, 10 optimizer restarts) on bi-season 1; last 5 rows held out.
GPR_stats[1] = model_predict(
    data[1],
    regressor=GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42),
    len_pred=5,
    name_regressor="GPR",
    show=False,
)

In [23]:
# Gaussian process (shared kernel, 10 optimizer restarts) on bi-season 2; last 5 rows held out.
GPR_stats[2] = model_predict(
    data[2],
    regressor=GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42),
    len_pred=5,
    name_regressor="GPR",
    show=False,
)

In [24]:
# Gaussian process (shared kernel, 10 optimizer restarts) on bi-season 3; last 5 rows held out.
GPR_stats[3] = model_predict(
    data[3],
    regressor=GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42),
    len_pred=5,
    name_regressor="GPR",
    show=False,
)

### Results

In [25]:
# Test-set MAE for seasons 1-3: one row per (season, model), one column per index.
# Note: this name is also read by the R^2 cells further down.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 0 of model_predict's return value holds the test-set MAE per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][0]
    for season in range(1, 4)
    for model_name, stats in stats_by_model.items()
]
results_mae_df = pd.DataFrame(rows, columns=indices_of_interest)

results_mae_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.16,0.79,0.7,0.18,0.39
1,RF,1,0.15,0.82,0.67,0.17,0.42
2,SVC,1,0.15,0.98,0.84,0.16,0.43
3,XGBoost,1,0.15,0.72,0.66,0.23,0.5
4,GPR,1,0.19,0.88,0.76,0.19,0.38
5,Linear,2,0.4,1.89,1.23,0.61,1.25
6,RF,2,0.42,2.01,1.23,0.59,1.24
7,SVC,2,0.44,2.07,1.27,0.64,1.41
8,XGBoost,2,0.39,2.0,1.1,0.58,1.21
9,GPR,2,0.4,1.89,1.14,0.57,1.19


In [26]:
# Training-set MAE for seasons 1-3: one row per (season, model), one column per index.
# Note: this name is also read by the R^2 cells further down.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 1 of model_predict's return value holds the training-set MAE per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][1]
    for season in range(1, 4)
    for model_name, stats in stats_by_model.items()
]
results_mae_df_training = pd.DataFrame(rows, columns=indices_of_interest)

results_mae_df_training.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.13,0.51,0.41,0.2,0.39
1,RF,1,0.05,0.19,0.16,0.08,0.15
2,SVC,1,0.09,0.38,0.3,0.14,0.28
3,XGBoost,1,0.0,0.0,0.0,0.0,0.0
4,GPR,1,0.14,0.52,0.44,0.23,0.44
5,Linear,2,0.18,0.78,0.56,0.28,0.57
6,RF,2,0.09,0.38,0.26,0.14,0.27
7,SVC,2,0.14,0.65,0.44,0.22,0.49
8,XGBoost,2,0.0,0.0,0.0,0.0,0.0
9,GPR,2,0.2,0.82,0.6,0.33,0.64


In [27]:
# Test-set R^2 for seasons 1-3: one row per (season, model), one column per index.
# The column list is defined here so the cell does not depend on a variable
# left behind by an earlier cell.
result_columns = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 2 of model_predict's return value holds the test-set R^2 per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][2]
    for season in range(1, 4)
    for model_name, stats in stats_by_model.items()
]
results_r2_df = pd.DataFrame(rows, columns=result_columns)
results_r2_df.style.set_caption("R^2 Model Results").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.05,-0.46,-0.67,0.06,-0.17
1,RF,1,0.29,-0.5,-0.66,0.37,-0.03
2,SVC,1,0.15,-1.05,-1.39,0.32,-0.41
3,XGBoost,1,0.45,-0.37,-0.67,-0.27,-0.63
4,GPR,1,0.02,-0.51,-0.65,0.21,0.15
5,Linear,2,-3.52,-3.01,-3.61,-1.66,-2.1
6,RF,2,-4.28,-3.62,-3.72,-1.9,-2.44
7,SVC,2,-4.45,-3.93,-3.99,-2.06,-2.82
8,XGBoost,2,-3.64,-3.64,-3.2,-1.93,-2.55
9,GPR,2,-3.89,-3.03,-2.9,-1.47,-1.85


In [28]:
# Training-set R^2 for seasons 1-3: one row per (season, model), one column per index.
# The column list is defined here so the cell does not depend on a variable
# left behind by an earlier cell.
result_columns = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 3 of model_predict's return value holds the training-set R^2 per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][3]
    for season in range(1, 4)
    for model_name, stats in stats_by_model.items()
]
results_r2_df_training = pd.DataFrame(rows, columns=result_columns)
results_r2_df_training.style.set_caption("R^2 Model Results Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.39,0.39,0.35,0.33,0.35
1,RF,1,0.91,0.9,0.9,0.92,0.91
2,SVC,1,0.71,0.55,0.54,0.6,0.51
3,XGBoost,1,1.0,1.0,1.0,1.0,1.0
4,GPR,1,0.33,0.38,0.29,0.2,0.22
5,Linear,2,0.33,0.34,0.34,0.28,0.28
6,RF,2,0.86,0.85,0.84,0.85,0.85
7,SVC,2,0.63,0.36,0.49,0.5,0.3
8,XGBoost,2,1.0,1.0,1.0,1.0,1.0
9,GPR,2,0.23,0.23,0.21,0.13,0.14


##  Winter JJ-JA-AS

### Linear Regression

In [29]:
# Linear regression on bi-season 7; the last 5 rows are the test window, no plots.
linear_stats[7] = model_predict(
    data[7],
    regressor=LinearRegression(),
    len_pred=5,
    show=False,
)

In [30]:
# Linear regression on bi-season 8; the last 5 rows are the test window, no plots.
linear_stats[8] = model_predict(
    data[8],
    regressor=LinearRegression(),
    len_pred=5,
    show=False,
)

In [31]:
# Linear regression on bi-season 9; the last 5 rows are the test window, no plots.
linear_stats[9] = model_predict(
    data[9],
    regressor=LinearRegression(),
    len_pred=5,
    show=False,
)

### RF

In [32]:
# Random forest (100 trees, fixed seed) on bi-season 7; last 5 rows held out.
RF_stats[7] = model_predict(
    data[7],
    regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    len_pred=5,
    show=False,
)

In [33]:
# Random forest (100 trees, fixed seed) on bi-season 8; last 5 rows held out.
RF_stats[8] = model_predict(
    data[8],
    regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    len_pred=5,
    show=False,
)

In [34]:
# Random forest (100 trees, fixed seed) on bi-season 9; last 5 rows held out.
RF_stats[9] = model_predict(
    data[9],
    regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    len_pred=5,
    show=False,
)

### SVR (reported as "SVC" in the results tables)

In [35]:
# Support vector regression (RBF kernel) on bi-season 7; last 5 rows held out.
svc_stats[7] = model_predict(
    data[7],
    regressor=SVR(kernel='rbf'),
    len_pred=5,
    show=False,
)

In [36]:
# Support vector regression (RBF kernel) on bi-season 8; last 5 rows held out.
svc_stats[8] = model_predict(
    data[8],
    regressor=SVR(kernel='rbf'),
    len_pred=5,
    show=False,
)

In [37]:
# Support vector regression (RBF kernel) on bi-season 9; last 5 rows held out.
svc_stats[9] = model_predict(
    data[9],
    regressor=SVR(kernel='rbf'),
    len_pred=5,
    show=False,
)

### XGBoost

In [38]:
# XGBoost (100 rounds, learning rate 0.1, fixed seed) on bi-season 7; last 5 rows held out.
XGBoost_stats[7] = model_predict(
    data[7],
    regressor=XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    len_pred=5,
    show=False,
)

In [39]:
# XGBoost (100 rounds, learning rate 0.1, fixed seed) on bi-season 8; last 5 rows held out.
XGBoost_stats[8] = model_predict(
    data[8],
    regressor=XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    len_pred=5,
    show=False,
)

In [40]:
# XGBoost (100 rounds, learning rate 0.1, fixed seed) on bi-season 9; last 5 rows held out.
XGBoost_stats[9] = model_predict(
    data[9],
    regressor=XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    len_pred=5,
    show=False,
)

### GPR

In [41]:
# Gaussian process (shared kernel, 10 optimizer restarts) on bi-season 7; last 5 rows held out.
GPR_stats[7] = model_predict(
    data[7],
    regressor=GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42),
    len_pred=5,
    show=False,
)

In [42]:
# Gaussian process (shared kernel, 10 optimizer restarts) on bi-season 8; last 5 rows held out.
GPR_stats[8] = model_predict(
    data[8],
    regressor=GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42),
    len_pred=5,
    show=False,
)

In [43]:
# Gaussian process (shared kernel, 10 optimizer restarts) on bi-season 9; last 5 rows held out.
GPR_stats[9] = model_predict(
    data[9],
    regressor=GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42),
    len_pred=5,
    show=False,
)

### Results

In [44]:
# Test-set MAE for seasons 7-9: one row per (season, model), one column per index.
# Note: this name is also read by the R^2 cells further down.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 0 of model_predict's return value holds the test-set MAE per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][0]
    for season in range(7, 10)
    for model_name, stats in stats_by_model.items()
]
results_mae_df = pd.DataFrame(rows, columns=indices_of_interest)

results_mae_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.05,0.21,0.15,0.17,0.27
1,RF,7,0.04,0.24,0.16,0.15,0.25
2,SVC,7,0.07,0.27,0.19,0.15,0.25
3,XGBoost,7,0.07,0.47,0.33,0.25,0.36
4,GPR,7,0.07,0.27,0.2,0.18,0.29
5,Linear,8,0.19,0.63,0.45,0.48,0.77
6,RF,8,0.22,0.74,0.54,0.51,0.83
7,SVC,8,0.22,0.78,0.6,0.52,0.86
8,XGBoost,8,0.27,0.88,0.64,0.55,0.91
9,GPR,8,0.17,0.67,0.46,0.4,0.68


In [45]:
# Training-set MAE for seasons 7-9: one row per (season, model), one column per index.
# Note: this name is also read by the R^2 cells further down.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 1 of model_predict's return value holds the training-set MAE per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][1]
    for season in range(7, 10)
    for model_name, stats in stats_by_model.items()
]
results_mae_df_training = pd.DataFrame(rows, columns=indices_of_interest)

results_mae_df_training.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.09,0.33,0.29,0.15,0.27
1,RF,7,0.04,0.14,0.12,0.06,0.1
2,SVC,7,0.08,0.25,0.21,0.12,0.2
3,XGBoost,7,0.0,0.0,0.0,0.0,0.0
4,GPR,7,0.09,0.32,0.28,0.15,0.27
5,Linear,8,0.14,0.57,0.42,0.28,0.51
6,RF,8,0.05,0.23,0.18,0.09,0.18
7,SVC,8,0.11,0.42,0.32,0.2,0.38
8,XGBoost,8,0.0,0.0,0.0,0.0,0.0
9,GPR,8,0.14,0.57,0.44,0.28,0.51


In [46]:
# Test-set R^2 for seasons 7-9: one row per (season, model), one column per index.
# The column list is defined here so the cell does not depend on a variable
# left behind by an earlier cell.
result_columns = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 2 of model_predict's return value holds the test-set R^2 per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][2]
    for season in range(7, 10)
    for model_name, stats in stats_by_model.items()
]
results_r2_df = pd.DataFrame(rows, columns=result_columns)
results_r2_df.style.set_caption("R^2 Model Results").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.45,0.3,0.41,-0.2,-0.22
1,RF,7,0.34,0.26,0.3,0.11,0.0
2,SVC,7,-0.14,-0.17,-0.03,-0.03,-0.1
3,XGBoost,7,-0.08,-1.07,-1.24,-1.37,-1.38
4,GPR,7,0.11,0.1,0.15,-0.25,-0.24
5,Linear,8,-1.42,-0.78,-1.61,-2.25,-1.98
6,RF,8,-2.23,-1.34,-2.08,-2.88,-2.68
7,SVC,8,-2.72,-1.63,-2.77,-3.0,-2.95
8,XGBoost,8,-4.29,-2.01,-3.35,-3.24,-3.43
9,GPR,8,-1.16,-1.08,-1.47,-1.36,-1.38


In [47]:
# Training-set R^2 for seasons 7-9: one row per (season, model), one column per index.
# The column list is defined here so the cell does not depend on a variable
# left behind by an earlier cell.
result_columns = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
stats_by_model = {
    "Linear": linear_stats,
    "RF": RF_stats,
    "SVC": svc_stats,
    "XGBoost": XGBoost_stats,
    "GPR": GPR_stats,
}
# Element 3 of model_predict's return value holds the training-set R^2 per index.
# All rows are collected first and the frame is built in one go, instead of
# enlarging an empty frame one row at a time through .loc.
rows = [
    [model_name, season] + stats[season][3]
    for season in range(7, 10)
    for model_name, stats in stats_by_model.items()
]
results_r2_df_training = pd.DataFrame(rows, columns=result_columns)
results_r2_df_training.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.45,0.38,0.37,0.41,0.41
1,RF,7,0.89,0.87,0.87,0.88,0.89
2,SVC,7,0.58,0.57,0.55,0.55,0.52
3,XGBoost,7,1.0,1.0,1.0,1.0,1.0
4,GPR,7,0.41,0.37,0.33,0.34,0.35
5,Linear,8,0.35,0.31,0.31,0.31,0.32
6,RF,8,0.9,0.88,0.88,0.91,0.91
7,SVC,8,0.6,0.33,0.36,0.56,0.48
8,XGBoost,8,1.0,1.0,1.0,1.0,1.0
9,GPR,8,0.26,0.28,0.25,0.26,0.28
