# California Season Prediction by Index

## Imports and Functions

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor


from sklearn.svm import SVR
import os
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor

In [2]:
# Make the project root the working directory so the relative "data/..." paths
# used later in the notebook resolve from there.
# The root is resolved once and cached in PROJECT_ROOT: a bare
# os.chdir("../../") climbs two more directories every time the cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Interpolated into the feature-file directory name ("from{FIRST_YEAR}") when loading data.
FIRST_YEAR = 1972

c:\Users\marti\Desktop\data\hw_extra


In [3]:
def model_predict(data, regressor, len_pred, name_regressor=None, show=False, indices_of_interest=["HWN", "HWF", "HWD", "HWM", "HWA"] ):
    """Fit ``regressor`` on each target column and score a chronological hold-out.

    The last ``len_pred`` rows of ``data`` form the test set and every row
    before them forms the training set (no shuffling). Every column that is
    neither ``"Date"`` nor one of ``indices_of_interest`` is used as a feature.
    The same ``regressor`` object is re-fitted once per target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"Date"`` column parseable by ``pd.to_datetime``, the
        target columns and the feature columns. It is not modified.
    regressor : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    len_pred : int
        Number of trailing rows held out for testing; must satisfy
        ``0 < len_pred < len(data)``.
    name_regressor : str, optional
        Label used in the figure title when ``show`` is true.
    show : bool
        When true, draw one subplot per target with training/test values and
        their predictions.
    indices_of_interest : sequence of str
        Names of the target columns.

    Returns
    -------
    list
        ``[mae_test, mae_train, r2_test, r2_train]``; each element is a list
        holding one value per entry of ``indices_of_interest``, in order.

    Raises
    ------
    ValueError
        If ``len_pred`` does not leave at least one training and one test row.
    """
    targets = list(indices_of_interest)

    # len_pred == 0 would produce an empty training slice (X[:-0] is empty) and
    # a negative value would silently flip the split, so reject both up front.
    if not 0 < len_pred < len(data):
        raise ValueError(
            f"len_pred must be between 1 and {len(data) - 1}, got {len_pred}"
        )

    # Work on a copy so the caller's DataFrame keeps its original "Date" column.
    data = data.copy()
    data['Date'] = pd.to_datetime(data['Date'])
    data['Date'] = data['Date'].dt.to_period('M').astype(str)

    features = data.columns.difference(["Date"] + targets)

    # Features and dates do not depend on the target, so split them once.
    X = data[features]
    dates = data["Date"]
    X_train, X_test = X[:-len_pred], X[-len_pred:]
    train_dates, test_dates = dates[:-len_pred], dates[-len_pred:]

    if show:
        # squeeze=False always yields a 2-D array of Axes, so indexing also
        # works when there is a single target (otherwise a bare Axes is returned).
        fig, axs = plt.subplots(len(targets), 1, figsize=(25,15), squeeze=False)

    mae_errors = []
    mae_errors_training = []
    r2_scores = []
    r2_scores_training = []
    for i, target in enumerate(targets):
        y = data[target]
        y_train, y_test = y[:-len_pred], y[-len_pred:]

        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        y_pred_train = regressor.predict(X_train)

        # Evaluation on the hold-out and on the training rows
        mae_errors.append(mean_absolute_error(y_test, y_pred))
        mae_errors_training.append(mean_absolute_error(y_train, y_pred_train))
        r2_scores.append(r2_score(y_test, y_pred))
        r2_scores_training.append(r2_score(y_train, y_pred_train))

        if show:
            ax = axs[i, 0]
            # (x values, y values, label, marker, color, linestyle)
            series = (
                (train_dates, y_train, "Training", 'o', 'green', '-'),
                (train_dates, y_pred_train, "Predicted Training", 'x', 'red', '-'),
                (test_dates, y_test, "Test", 'o', 'blue', '-'),
                (test_dates, y_pred, "Predicted Test", 'x', 'red', '--'),
            )
            for x_vals, y_vals, label, marker, color, linestyle in series:
                ax.plot(
                    x_vals,
                    y_vals,
                    label=label,
                    marker=marker,
                    color=color,
                    linestyle=linestyle,
                    linewidth=1.5
                )
            ax.set_title(f"Prediction for {target}")
            ax.legend()
    if show:
        # The rect leaves room at the top of the figure for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.suptitle(f"Model: {name_regressor}")
        plt.show()

    return [mae_errors, mae_errors_training, r2_scores, r2_scores_training]


In [4]:
# Load the twelve numbered bi-season feature tables (1..12) for every heatwave index.
# Result layout: data_per_index[index][table number] -> DataFrame.
indices = ["HWN", "HWF", "HWD", "HWM", "HWA"]
data_per_index = {}
for index in indices:
    data = {}
    for i in range(1, 13):
        data[i] = pd.read_csv(f"data/features/california/from{FIRST_YEAR}/{index}/HWs_cali_features_biseason_{i}.csv")
    data_per_index[index] = data

##   DJ-JF-FM

### Linear Regression

In [5]:
# Linear regression on bi-season datasets 1-3: one fresh estimator per
# (index, dataset) pair, with the last 5 rows held out by model_predict.
linear_stats_per_index = {}
for index in indices:
    linear_stats = {
        season: model_predict(
            data_per_index[index][season],
            LinearRegression(),
            5,
            indices_of_interest=[index],
        )
        for season in (1, 2, 3)
    }
    linear_stats_per_index[index] = linear_stats

### RF

In [6]:
# Random forest (100 trees, fixed seed) on bi-season datasets 1-3: one fresh
# estimator per (index, dataset) pair, last 5 rows held out by model_predict.
RF_stats_per_index = {}
for index in indices:
    RF_stats = {
        season: model_predict(
            data_per_index[index][season],
            RandomForestRegressor(random_state=42, n_estimators=100),
            5,
            indices_of_interest=[index],
        )
        for season in (1, 2, 3)
    }
    RF_stats_per_index[index] = RF_stats

### Support Vector Regression (SVR)

In [7]:
# RBF-kernel support vector regression (SVR) on bi-season datasets 1-3; the
# variable keeps its "svc" name because the results cells look it up by that name.
svc_stats_per_index = {}
for index in indices:
    svc_stats = {
        season: model_predict(
            data_per_index[index][season],
            SVR(kernel='rbf'),
            5,
            indices_of_interest=[index],
        )
        for season in (1, 2, 3)
    }
    svc_stats_per_index[index] = svc_stats

### XGBoost

In [8]:
# XGBoost (20 trees, learning rate 0.1, fixed seed) on bi-season datasets 1-3:
# one fresh estimator per (index, dataset) pair, last 5 rows held out.
XGBoost_stats_per_index = {}
for index in indices:
    XGBoost_stats = {
        season: model_predict(
            data_per_index[index][season],
            XGBRegressor(random_state=42, n_estimators=20, learning_rate=0.1),
            5,
            indices_of_interest=[index],
        )
        for season in (1, 2, 3)
    }
    XGBoost_stats_per_index[index] = XGBoost_stats

### GaussianProcess Regressor

In [9]:
# Gaussian process regression on bi-season datasets 1-3 with an RBF + white-noise
# kernel; a new regressor is built for every (index, dataset) pair from the
# same kernel specification, and the last 5 rows are held out by model_predict.
kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1)
GPR_stats_per_index = {}
for index in indices:
    GPR_stats = {
        season: model_predict(
            data_per_index[index][season],
            GaussianProcessRegressor(kernel=kernel, random_state=42, n_restarts_optimizer=10),
            5,
            indices_of_interest=[index],
        )
        for season in (1, 2, 3)
    }
    GPR_stats_per_index[index] = GPR_stats

### Results

In [10]:
# Test-set MAE of the linear model for HWN on bi-season dataset 1.
# Indexing: [target index][dataset number][metric slot, 0 = test MAE][position in indices_of_interest].
linear_stats_per_index["HWN"][1][0][0]

0.26983348381708394

In [11]:
# Test-set MAE table for datasets 1-3: slot 0 of model_predict's return value,
# one row per (dataset, model) and one column per heatwave index.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(1, 4):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][0][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.27,1.07,1.26,0.45,0.8
1,RF,1,0.11,1.16,1.0,0.39,0.62
2,SVC,1,0.11,1.15,1.04,0.3,0.48
3,XGBoost,1,0.16,1.19,1.14,0.41,0.77
4,GPR,1,0.14,1.3,1.08,0.39,0.66
5,Linear,2,0.32,1.59,1.65,0.77,1.13
6,RF,2,0.36,2.07,1.41,0.75,1.4
7,SVC,2,0.29,1.52,1.11,0.7,1.07
8,XGBoost,2,0.38,2.16,1.43,0.78,1.31
9,GPR,2,0.22,1.24,0.99,0.63,1.03


In [12]:
# Training-set MAE table for datasets 1-3: slot 1 of model_predict's return value,
# one row per (dataset, model) and one column per heatwave index.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(1, 4):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][1][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.1,0.57,0.46,0.15,0.3
1,RF,1,0.05,0.29,0.25,0.08,0.14
2,SVC,1,0.12,0.67,0.53,0.14,0.28
3,XGBoost,1,0.04,0.23,0.19,0.06,0.12
4,GPR,1,0.14,0.79,0.61,0.18,0.36
5,Linear,2,0.13,0.8,0.54,0.23,0.42
6,RF,2,0.08,0.44,0.32,0.12,0.23
7,SVC,2,0.17,1.04,0.75,0.25,0.54
8,XGBoost,2,0.07,0.35,0.27,0.1,0.19
9,GPR,2,0.2,1.24,0.93,0.27,0.59


In [13]:
# Test-set R^2 table for datasets 1-3: slot 2 of model_predict's return value.
# Note: the frame is still called results_mae_df although it holds R^2 here.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(1, 4):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][2][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.24,0.59,0.28,0.4,0.39
1,RF,1,0.79,0.4,0.29,0.32,0.4
2,SVC,1,0.76,0.36,0.23,0.63,0.64
3,XGBoost,1,0.59,0.44,0.19,0.23,0.02
4,GPR,1,0.56,0.19,0.15,0.38,0.36
5,Linear,2,-0.85,-0.48,-1.25,-1.1,-0.78
6,RF,2,-1.31,-0.77,-0.37,-0.76,-1.08
7,SVC,2,-0.57,-0.37,-0.17,-0.56,-0.32
8,XGBoost,2,-1.16,-0.66,-0.24,-0.85,-0.8
9,GPR,2,-0.05,-0.03,-0.03,-0.39,-0.31


In [14]:
# Training-set R^2 table for datasets 1-3: slot 3 of model_predict's return value.
# Note: the frame is still called results_mae_df although it holds R^2 here.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(1, 4):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][3][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,1,0.76,0.68,0.68,0.72,0.66
1,RF,1,0.94,0.92,0.9,0.91,0.91
2,SVC,1,0.67,0.42,0.41,0.73,0.62
3,XGBoost,1,0.95,0.92,0.92,0.94,0.93
4,GPR,1,0.51,0.4,0.42,0.55,0.52
5,Linear,2,0.79,0.8,0.82,0.78,0.79
6,RF,2,0.92,0.94,0.94,0.93,0.94
7,SVC,2,0.7,0.55,0.61,0.68,0.59
8,XGBoost,2,0.94,0.95,0.94,0.95,0.94
9,GPR,2,0.55,0.38,0.42,0.67,0.57


##  JJ-JA-AS

### Linear Regression

In [25]:
# Linear regression on bi-season datasets 7-9 (last 5 rows held out).
# This rebinds linear_stats_per_index, so any results stored under that name
# for other datasets are replaced.
linear_stats_per_index = {}
for index in indices:
    linear_stats = {
        season: model_predict(
            data_per_index[index][season],
            LinearRegression(),
            5,
            indices_of_interest=[index],
        )
        for season in (7, 8, 9)
    }
    linear_stats_per_index[index] = linear_stats

### RF

In [16]:
# Random forest (100 trees, fixed seed) on bi-season datasets 7-9 (last 5 rows
# held out). This rebinds RF_stats_per_index, replacing earlier contents.
RF_stats_per_index = {}
for index in indices:
    RF_stats = {
        season: model_predict(
            data_per_index[index][season],
            RandomForestRegressor(random_state=42, n_estimators=100),
            5,
            indices_of_interest=[index],
        )
        for season in (7, 8, 9)
    }
    RF_stats_per_index[index] = RF_stats

### Support Vector Regression (SVR)

In [17]:
# RBF-kernel support vector regression (SVR) on bi-season datasets 7-9 (last 5
# rows held out). This rebinds svc_stats_per_index, replacing earlier contents.
svc_stats_per_index = {}
for index in indices:
    svc_stats = {
        season: model_predict(
            data_per_index[index][season],
            SVR(kernel='rbf'),
            5,
            indices_of_interest=[index],
        )
        for season in (7, 8, 9)
    }
    svc_stats_per_index[index] = svc_stats

### XGBoost

In [18]:
# XGBoost (20 trees, learning rate 0.1, fixed seed) on bi-season datasets 7-9
# (last 5 rows held out). This rebinds XGBoost_stats_per_index, replacing
# earlier contents.
XGBoost_stats_per_index = {}
for index in indices:
    XGBoost_stats = {
        season: model_predict(
            data_per_index[index][season],
            XGBRegressor(random_state=42, n_estimators=20, learning_rate=0.1),
            5,
            indices_of_interest=[index],
        )
        for season in (7, 8, 9)
    }
    XGBoost_stats_per_index[index] = XGBoost_stats

### GaussianProcess Regressor

In [19]:
# Gaussian process regression (RBF + white-noise kernel) on bi-season datasets
# 7-9, last 5 rows held out. This rebinds kernel and GPR_stats_per_index,
# replacing earlier contents.
kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1)
GPR_stats_per_index = {}
for index in indices:
    GPR_stats = {
        season: model_predict(
            data_per_index[index][season],
            GaussianProcessRegressor(kernel=kernel, random_state=42, n_restarts_optimizer=10),
            5,
            indices_of_interest=[index],
        )
        for season in (7, 8, 9)
    }
    GPR_stats_per_index[index] = GPR_stats

### Results

In [20]:
# Test-set MAE table for datasets 7-9: slot 0 of model_predict's return value,
# one row per (dataset, model) and one column per heatwave index.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(7, 10):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][0][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.19,0.87,0.7,0.18,0.18
1,RF,7,0.18,0.81,0.53,0.27,0.4
2,SVC,7,0.14,0.61,0.44,0.34,0.45
3,XGBoost,7,0.17,0.71,0.54,0.26,0.46
4,GPR,7,0.19,0.82,0.63,0.25,0.37
5,Linear,8,0.26,0.86,0.76,0.34,0.69
6,RF,8,0.17,0.55,0.42,0.16,0.36
7,SVC,8,0.15,0.66,0.44,0.17,0.37
8,XGBoost,8,0.15,0.59,0.84,0.19,0.42
9,GPR,8,0.17,0.49,0.4,0.17,0.38


In [21]:
# Training-set MAE table for datasets 7-9: slot 1 of model_predict's return value,
# one row per (dataset, model) and one column per heatwave index.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(7, 10):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][1][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.11,0.57,0.49,0.21,0.36
1,RF,7,0.06,0.28,0.22,0.1,0.17
2,SVC,7,0.09,0.47,0.39,0.14,0.26
3,XGBoost,7,0.04,0.19,0.16,0.07,0.12
4,GPR,7,0.14,0.73,0.57,0.25,0.44
5,Linear,8,0.1,0.51,0.41,0.17,0.29
6,RF,8,0.06,0.29,0.24,0.1,0.18
7,SVC,8,0.1,0.56,0.43,0.17,0.31
8,XGBoost,8,0.04,0.21,0.17,0.07,0.12
9,GPR,8,0.12,0.78,0.62,0.22,0.41


In [22]:
# Test-set R^2 table for datasets 7-9: slot 2 of model_predict's return value.
# Note: the frame is still called results_mae_df although it holds R^2 here.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(7, 10):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][2][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.31,0.23,0.15,0.66,0.9
1,RF,7,0.42,0.42,0.61,0.43,0.54
2,SVC,7,0.58,0.66,0.71,0.07,0.35
3,XGBoost,7,0.39,0.46,0.56,0.43,0.37
4,GPR,7,0.37,0.4,0.38,0.53,0.55
5,Linear,8,-1.16,-0.73,-2.8,-1.6,-1.73
6,RF,8,0.02,0.01,-1.02,0.12,-0.05
7,SVC,8,0.24,-0.19,-0.72,0.19,0.04
8,XGBoost,8,0.16,0.02,-3.17,-0.3,-0.39
9,GPR,8,0.17,0.21,-0.51,0.11,-0.04


In [23]:
# Training-set R^2 table for datasets 7-9: slot 3 of model_predict's return value.
# Note: the frame is still called results_mae_df although it holds R^2 here.
indices_of_interest = ["Model", "Season", "HWN", "HWF", "HWD", "HWM", "HWA"]
results_mae_df = pd.DataFrame(columns=indices_of_interest)
stats_by_model = (
    ("Linear", linear_stats_per_index),
    ("RF", RF_stats_per_index),
    ("SVC", svc_stats_per_index),
    ("XGBoost", XGBoost_stats_per_index),
    ("GPR", GPR_stats_per_index),
)
for season in range(7, 10):
    for model_name, stats in stats_by_model:
        values = [stats[index][season][3][0] for index in indices]
        results_mae_df.loc[len(results_mae_df.index)] = [model_name, season] + values

results_mae_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,Season,HWN,HWF,HWD,HWM,HWA
0,Linear,7,0.64,0.62,0.58,0.55,0.59
1,RF,7,0.91,0.91,0.91,0.9,0.89
2,SVC,7,0.81,0.67,0.63,0.73,0.66
3,XGBoost,7,0.96,0.95,0.94,0.94,0.94
4,GPR,7,0.46,0.44,0.4,0.4,0.39
5,Linear,8,0.71,0.72,0.72,0.7,0.71
6,RF,8,0.89,0.91,0.9,0.88,0.87
7,SVC,8,0.74,0.58,0.63,0.59,0.53
8,XGBoost,8,0.94,0.95,0.94,0.94,0.93
9,GPR,8,0.55,0.36,0.4,0.43,0.38
