# Linear prediction

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor


from sklearn.svm import SVR
import os
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor

In [2]:
# Move the working directory one level up (to the project's root) so that the
# relative "data/features/..." paths used below resolve.
# The guard keeps this cell idempotent: re-running it within the same kernel
# session must not climb one more directory each time.
if "_CWD_MOVED_TO_PARENT" not in globals():
    os.chdir("../")
    _CWD_MOVED_TO_PARENT = True
print(os.getcwd())

c:\Users\marti\Desktop\data\hw_extra


In [3]:
def model_predict(data, regressor, len_pred, name_regressor=None, show=True):
    """Fit ``regressor`` on each heatwave index and score it on the trailing rows.

    Rows are split by position: the last ``len_pred`` rows form the test set
    and every preceding row forms the training set. The same ``regressor``
    object is re-fitted once per target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Date`` column parseable by ``pd.to_datetime`` and the
        target columns ``HWN``, ``HWF``, ``HWD``, ``HWM`` and ``HWA``. Every
        other column is used as a feature. The frame is not modified.
        NOTE(review): the positional split presumably expects rows in
        chronological order - confirm against the data files.
    regressor : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    len_pred : int
        Number of trailing rows held out for testing; must satisfy
        ``0 < len_pred < len(data)``.
    name_regressor : str, optional
        Name shown in the figure title. Defaults to the regressor's class name.
    show : bool, default True
        When true, draw one subplot per target with observed and predicted
        values for both splits.

    Returns
    -------
    list
        ``[mae_test, mae_train, r2_test, r2_train]``; each element is a list
        holding one value per target, in the order HWN, HWF, HWD, HWM, HWA.

    Raises
    ------
    ValueError
        If ``len_pred`` does not leave at least one row in each split.
    """
    n_rows = len(data)
    if not 0 < len_pred < n_rows:
        # A value of 0 would silently produce an empty training set
        # (``X[:-0]`` is empty), so reject it up front.
        raise ValueError(
            f"len_pred must satisfy 0 < len_pred < len(data) (= {n_rows}), got {len_pred}"
        )

    # Work on a copy: the caller's frame is often a slice of a larger frame and
    # assigning a column to it would mutate it / raise SettingWithCopyWarning.
    data = data.copy()
    # Dates are reduced to 'YYYY-MM' strings, used as x values in the plots.
    data['Date'] = pd.to_datetime(data['Date']).dt.to_period('M').astype(str)

    indices_of_interest = ["HWN", "HWF", "HWD", "HWM", "HWA"]
    features = data.columns.difference(["Date"] + indices_of_interest)

    # The feature matrix and the dates are the same for every target, so they
    # are split once, outside the loop.
    X = data[features]
    dates = data["Date"]
    X_train, X_test = X[:-len_pred], X[-len_pred:]
    train_dates, test_dates = dates[:-len_pred], dates[-len_pred:]

    if name_regressor is None:
        name_regressor = type(regressor).__name__

    if show:
        fig, axs = plt.subplots(len(indices_of_interest), 1, figsize=(25, 15))

    mae_errors = []
    mae_errors_training = []
    r2_scores = []
    r2_scores_training = []
    for i, index_name in enumerate(indices_of_interest):
        y = data[index_name]
        y_train, y_test = y[:-len_pred], y[-len_pred:]

        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        y_pred_train = regressor.predict(X_train)

        # Evaluation on both splits
        mae_errors.append(mean_absolute_error(y_test, y_pred))
        mae_errors_training.append(mean_absolute_error(y_train, y_pred_train))
        r2_scores.append(r2_score(y_test, y_pred))
        r2_scores_training.append(r2_score(y_train, y_pred_train))

        if show:
            # (x values, y values, legend label, marker, color, line style)
            series = [
                (train_dates, y_train, "Training", 'o', 'green', '-'),
                (train_dates, y_pred_train, "Predicted Training", 'x', 'red', '-'),
                (test_dates, y_test, "Test", 'o', 'blue', '-'),
                (test_dates, y_pred, "Predicted Test", 'x', 'red', '--'),
            ]
            for x_vals, y_vals, label, marker, color, linestyle in series:
                axs[i].plot(
                    x_vals,
                    y_vals,
                    label=label,
                    marker=marker,
                    color=color,
                    linestyle=linestyle,
                    linewidth=1.5
                )
            axs[i].set_title(f"Prediction for {index_name}")
            axs[i].legend()
    if show:
        # Leave room at the top of the figure for the overall title.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.suptitle(f"Model: {name_regressor}")
        plt.show()

    return [mae_errors, mae_errors_training, r2_scores, r2_scores_training]


## Chile

In [8]:
# Feature sets to compare, identified by the suffix of their "from..." folder.
# Integer entries load the *_year.csv file, string entries the
# *_removing_year.csv file.
FIRST_YEAR = [1972, "1972removing", 1990, 1995, 2000, "1995removing"]
data = [
    pd.read_csv(
        f"data/features/chile/from{first}/HWs_chile_features_biseasonly_year.csv"
        if isinstance(first, int)
        else f"data/features/chile/from{first}/HWs_chile_features_biseasonly_removing_year.csv"
    )
    for first in FIRST_YEAR
]
# Keep only the first 7 columns (Date, the five HW* indices and the first
# feature, as shown in the output below). Each slice is copied so downstream
# code works on independent frames rather than on slices of `data`.
data_filtered = [d.iloc[:, :7].copy() for d in data]
data_filtered[0]

Unnamed: 0,Date,HWN,HWF,HWD,HWM,HWA,PC_965-Mode-3
0,1972-01-01,0.05,0.15,0.15,0.225864,0.490855,0.108281
1,1972-02-01,0.10,0.30,0.30,0.290486,0.590487,-0.458013
2,1972-03-01,0.05,0.15,0.15,0.064622,0.099632,-0.085797
3,1972-04-01,0.00,0.00,0.00,0.000000,0.000000,0.150808
4,1972-05-01,0.05,0.15,0.15,0.045400,0.070509,-0.017484
...,...,...,...,...,...,...,...
607,2022-08-01,0.25,0.75,0.75,0.409921,0.663724,0.254501
608,2022-09-01,0.55,2.10,2.10,1.273283,2.113854,-0.178719
609,2022-10-01,0.60,2.25,2.10,1.537712,2.647765,-0.028817
610,2022-11-01,0.60,1.90,1.30,1.014790,1.833975,-0.092528


In [9]:
# Display names and, in the same order, the regressors they refer to.
name_predictors = ["Linear"]
predictors = [LinearRegression()]

In [10]:
# One result per (regressor, dataset) pair, regressors in the outer loop.
# The last 5 rows of every dataset are held out as the test set; each result
# is [mae_test, mae_train, r2_test, r2_train] as returned by model_predict.
stats = [
    model_predict(frame, model, 5, name_regressor=label, show=False)
    for model, label in zip(predictors, name_predictors)
    for frame in data_filtered
]

In [11]:
# Test-set MAE: element 0 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[0] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.25,0.82,0.68,0.55,0.93
1,Linear-1972removing,0.25,0.82,0.68,0.55,0.93
2,Linear-1990,0.24,0.81,0.66,0.52,0.92
3,Linear-1995,0.19,0.61,0.52,0.45,0.79
4,Linear-2000,0.19,0.61,0.52,0.45,0.79
5,Linear-1995removing,0.19,0.61,0.52,0.45,0.79


In [12]:
# Training-set MAE: element 1 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[1] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.19,0.76,0.58,0.33,0.6
1,Linear-1972removing,0.19,0.76,0.58,0.33,0.6
2,Linear-1990,0.22,0.89,0.65,0.36,0.67
3,Linear-1995,0.22,0.9,0.66,0.36,0.67
4,Linear-2000,0.22,0.9,0.65,0.36,0.69
5,Linear-1995removing,0.22,0.9,0.66,0.36,0.67


In [13]:
# Test-set R^2: element 2 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[2] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,-2.78,-1.75,-1.55,-1.78,-1.81
1,Linear-1972removing,-2.78,-1.75,-1.55,-1.78,-1.81
2,Linear-1990,-2.19,-1.3,-1.23,-1.5,-1.49
3,Linear-1995,-0.93,-0.36,-0.37,-0.75,-0.72
4,Linear-2000,-0.88,-0.36,-0.37,-0.73,-0.71
5,Linear-1995removing,-0.93,-0.36,-0.37,-0.75,-0.72


In [14]:
# Training-set R^2: element 3 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[3] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.04,0.04,0.03,0.02,0.02
1,Linear-1972removing,0.04,0.04,0.03,0.02,0.02
2,Linear-1990,0.0,0.0,0.0,0.0,0.0
3,Linear-1995,0.05,0.05,0.05,0.04,0.04
4,Linear-2000,0.06,0.06,0.05,0.04,0.04
5,Linear-1995removing,0.05,0.05,0.05,0.04,0.04


## California

In [15]:
# Feature sets to compare, identified by the suffix of their "from..." folder.
# Integer entries load the *_year.csv file, string entries the
# *_removing_year.csv file.
FIRST_YEAR = [1972, "1972removing", 1990, 1995, 2000]
data = [
    pd.read_csv(
        f"data/features/california/from{first}/HWs_cali_features_biseasonly_year.csv"
        if isinstance(first, int)
        else f"data/features/california/from{first}/HWs_cali_features_biseasonly_removing_year.csv"
    )
    for first in FIRST_YEAR
]
# Keep only the first 7 columns (Date, the five HW* indices and the first
# feature, as shown in the output below). Each slice is copied so downstream
# code works on independent frames rather than on slices of `data`.
data_filtered = [d.iloc[:, :7].copy() for d in data]
data_filtered[0]

Unnamed: 0,Date,HWN,HWF,HWD,HWM,HWA,PC_131-Mode-3
0,1972-01-01,0.00,0.00,0.00,0.000000,0.000000,0.377409
1,1972-02-01,0.35,1.30,0.85,0.393818,0.810149,0.087409
2,1972-03-01,1.10,7.15,4.95,1.940684,4.085165,-0.160896
3,1972-04-01,0.75,5.85,4.10,1.546867,3.275016,-0.168609
4,1972-05-01,0.45,1.75,1.20,0.643407,1.114190,0.076124
...,...,...,...,...,...,...,...
607,2022-08-01,0.20,1.50,1.50,0.503094,0.917230,-0.041337
608,2022-09-01,0.65,3.45,3.30,1.814032,2.993340,-0.232334
609,2022-10-01,0.85,4.15,3.60,1.763006,2.863935,-0.011008
610,2022-11-01,0.35,2.00,1.60,0.358290,0.637526,0.051810


In [16]:
# Display names and, in the same order, the regressors they refer to.
name_predictors = ["Linear"]
predictors = [LinearRegression()]

In [17]:
# One result per (regressor, dataset) pair, regressors in the outer loop.
# The last 5 rows of every dataset are held out as the test set; each result
# is [mae_test, mae_train, r2_test, r2_train] as returned by model_predict.
stats = [
    model_predict(frame, model, 5, name_regressor=label, show=False)
    for model, label in zip(predictors, name_predictors)
    for frame in data_filtered
]

In [18]:
# Test-set MAE: element 0 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[0] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.19,0.76,0.74,0.54,0.84
1,Linear-1972removing,0.19,0.76,0.74,0.54,0.84
2,Linear-1990,0.68,5.12,3.7,0.98,2.15
3,Linear-1995,0.68,5.1,3.68,0.97,2.14
4,Linear-2000,0.65,4.97,3.6,0.96,2.07


In [19]:
# Training-set MAE: element 1 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[1] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.25,1.29,0.98,0.46,0.79
1,Linear-1972removing,0.25,1.29,0.98,0.46,0.79
2,Linear-1990,0.33,1.66,1.22,0.56,0.98
3,Linear-1995,0.33,1.66,1.22,0.57,0.99
4,Linear-2000,0.33,1.72,1.26,0.58,1.01


In [20]:
# Test-set R^2: element 2 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[2] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.37,0.49,0.44,0.36,0.41
1,Linear-1972removing,0.37,0.49,0.44,0.36,0.41
2,Linear-1990,-0.83,-1.45,-1.23,-0.53,-0.72
3,Linear-1995,-0.82,-1.43,-1.21,-0.51,-0.7
4,Linear-2000,-0.72,-1.3,-1.1,-0.44,-0.63


In [21]:
# Training-set R^2: element 3 of every model_predict result.
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# Labels follow the order in which `stats` was built
# (predictors in the outer loop, datasets in the inner loop).
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Build the table in one step instead of growing it row by row.
results_df = pd.DataFrame(
    [[label] + run[3] for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1972,0.18,0.18,0.17,0.14,0.15
1,Linear-1972removing,0.18,0.18,0.17,0.14,0.15
2,Linear-1990,0.05,0.05,0.04,0.03,0.04
3,Linear-1995,0.06,0.06,0.05,0.04,0.05
4,Linear-2000,0.06,0.07,0.05,0.04,0.05
