# Linear prediction

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor


from sklearn.svm import SVR
import os
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor

In [3]:
# Make the project's root folder the working directory so that the relative
# "data/..." paths used below resolve. (This changes the working directory; it
# does not touch the Python import path.)
# The root is resolved only on the first run in a kernel: re-running this cell
# returns to the same directory instead of climbing one more level each time.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../")
os.chdir(PROJECT_ROOT)
print(os.getcwd())

c:\Users\marti\Desktop\data\hw_extra


In [4]:
def model_predict(data, regressor, len_pred, name_regressor=None, show=True):
    """Fit ``regressor`` once per target column and score it on a trailing hold-out.

    The last ``len_pred`` rows of ``data`` form the test set and every earlier
    row the training set. The split is positional (no shuffling), so rows are
    expected to already be ordered in time. Every column other than ``"Date"``
    and the five target columns ``HWN``, ``HWF``, ``HWD``, ``HWM`` and ``HWA``
    is used as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"Date"`` column parseable by ``pd.to_datetime`` and the
        five target columns; the remaining columns are the features. The
        caller's frame is left untouched (a copy is used internally).
    regressor : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted for
        each target in turn, so on return it is fitted on the last one.
    len_pred : int
        Number of trailing rows held out for testing. Must satisfy
        ``0 < len_pred < len(data)``.
    name_regressor : str, optional
        Label shown in the figure title when ``show`` is true.
    show : bool, default True
        If true, draw one subplot per target with the training/test values and
        their predictions, then call ``plt.show()``.

    Returns
    -------
    list
        ``[mae_test, mae_train, r2_test, r2_train]``. Each element is a list
        with one score per target, ordered HWN, HWF, HWD, HWM, HWA.

    Raises
    ------
    ValueError
        If ``len_pred`` would leave the training set or the test set empty.
    """
    indices_of_interest = ["HWN", "HWF", "HWD", "HWM", "HWA"]

    n_rows = len(data)
    split = n_rows - len_pred
    if not 0 < split < n_rows:
        raise ValueError(
            f"len_pred must be between 1 and {n_rows - 1} for {n_rows} rows, got {len_pred!r}"
        )

    # Work on a private copy: callers pass slices of larger frames, and rewriting
    # "Date" on those in place both alters their data and triggers
    # SettingWithCopyWarning.
    data = data.copy()
    data['Date'] = pd.to_datetime(data['Date']).dt.to_period('M').astype(str)

    features = data.columns.difference(["Date"] + indices_of_interest)

    # The feature matrix and the date axis are the same for every target, so
    # they are split once, outside the loop.
    X = data[features]
    dates = data["Date"]
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    train_dates, test_dates = dates.iloc[:split], dates.iloc[split:]

    if show:
        fig, axs = plt.subplots(len(indices_of_interest), 1, figsize=(25, 15))

    mae_errors = []
    mae_errors_training = []
    r2_scores = []
    r2_scores_training = []
    for i, target in enumerate(indices_of_interest):
        y = data[target]
        y_train, y_test = y.iloc[:split], y.iloc[split:]

        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        y_pred_train = regressor.predict(X_train)

        # Evaluation on the held-out rows and on the rows used for fitting
        mae_errors.append(mean_absolute_error(y_test, y_pred))
        mae_errors_training.append(mean_absolute_error(y_train, y_pred_train))
        r2_scores.append(r2_score(y_test, y_pred))
        r2_scores_training.append(r2_score(y_train, y_pred_train))

        if show:
            # (x values, y values, legend label, marker, colour, line style)
            curves = [
                (train_dates, y_train, "Training", 'o', 'green', '-'),
                (train_dates, y_pred_train, "Predicted Training", 'x', 'red', '-'),
                (test_dates, y_test, "Test", 'o', 'blue', '-'),
                (test_dates, y_pred, "Predicted Test", 'x', 'red', '--'),
            ]
            for x_vals, y_vals, label, marker, color, linestyle in curves:
                axs[i].plot(
                    x_vals,
                    y_vals,
                    label=label,
                    marker=marker,
                    color=color,
                    linestyle=linestyle,
                    linewidth=1.5
                )
            axs[i].set_title(f"Prediction for {target}")
            axs[i].legend()
    if show:
        # Leave a margin at the top of the layout for the figure-level title
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.suptitle(f"Model: {name_regressor}")
        plt.show()

    return [mae_errors, mae_errors_training, r2_scores, r2_scores_training]


## Chile

In [59]:
# Each entry is interpolated into the "from{first}" folder name of a feature set.
FIRST_YEAR = [1990, 1995, 2000, "1995removing"]
data = [
    pd.read_csv(f"data/features/chile/from{first}/HWs_chile_features_biseasonly_year.csv")
    for first in FIRST_YEAR
]
# Keep only the first 7 columns (per the output below: Date, the five HW* columns
# and one PC feature). .copy() gives every frame its own data, so downstream
# column assignments never write through a slice of the raw frames
# (avoids SettingWithCopyWarning).
data_filtered = [d.iloc[:, :7].copy() for d in data]
data_filtered[0]

Unnamed: 0,Date,HWN,HWF,HWD,HWM,HWA,PC_720-Mode-3
0,1990-01-01,0.10,0.35,0.20,0.038741,0.094355,-0.454250
1,1990-02-01,0.05,0.15,0.15,0.058557,0.079414,0.146745
2,1990-03-01,0.15,0.45,0.45,0.176087,0.331234,0.143666
3,1990-04-01,0.15,0.45,0.45,0.229103,0.476756,-0.495377
4,1990-05-01,0.10,0.35,0.35,0.180689,0.401781,0.046357
...,...,...,...,...,...,...,...
391,2022-08-01,0.25,0.75,0.75,0.409873,0.663675,0.198691
392,2022-09-01,0.55,2.10,2.10,1.273128,2.113698,0.048432
393,2022-10-01,0.60,2.25,2.10,1.537504,2.647557,0.080836
394,2022-11-01,0.60,1.90,1.30,1.014617,1.833799,0.171453


In [60]:
# Labels and estimators are parallel lists: name_predictors[k] is the display
# name of predictors[k].
name_predictors = ["Linear"]
predictors = [LinearRegression()]

In [61]:
# One model_predict result per (predictor, dataset) pair, holding out the last 5 rows.
# Predictors vary slowest and datasets fastest, and no figures are drawn.
stats = [
    model_predict(frame, estimator, 5, name_regressor=label, show=False)
    for estimator, label in zip(predictors, name_predictors)
    for frame in data_filtered
]

In [62]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 0 of each stats entry is the per-index MAE on the held-out rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[0]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,0.24,0.81,0.66,0.52,0.92
1,Linear-1995,0.19,0.61,0.52,0.45,0.79
2,Linear-2000,0.19,0.61,0.52,0.45,0.79
3,Linear-1995removing,0.19,0.61,0.52,0.45,0.79


In [63]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 1 of each stats entry is the per-index MAE on the training rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[1]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,0.22,0.89,0.65,0.36,0.67
1,Linear-1995,0.22,0.9,0.66,0.36,0.67
2,Linear-2000,0.22,0.9,0.65,0.36,0.69
3,Linear-1995removing,0.22,0.9,0.66,0.36,0.67


In [64]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 2 of each stats entry is the per-index R^2 on the held-out rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[2]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,-2.19,-1.3,-1.23,-1.5,-1.49
1,Linear-1995,-0.93,-0.36,-0.37,-0.75,-0.72
2,Linear-2000,-0.88,-0.36,-0.37,-0.73,-0.71
3,Linear-1995removing,-0.93,-0.36,-0.37,-0.75,-0.72


In [65]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 3 of each stats entry is the per-index R^2 on the training rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[3]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,0.0,0.0,0.0,0.0,0.0
1,Linear-1995,0.05,0.05,0.05,0.04,0.04
2,Linear-2000,0.06,0.06,0.05,0.04,0.04
3,Linear-1995removing,0.05,0.05,0.05,0.04,0.04


## California

In [49]:
# Each entry is interpolated into the "from{first}" folder name of a feature set.
FIRST_YEAR = [1990, 1995, 2000]
data = [
    pd.read_csv(f"data/features/california/from{first}/HWs_cali_features_biseasonly_year.csv")
    for first in FIRST_YEAR
]
# Keep only the first 7 columns (per the output below: Date, the five HW* columns
# and one PC feature). .copy() gives every frame its own data, so downstream
# column assignments never write through a slice of the raw frames
# (avoids SettingWithCopyWarning).
data_filtered = [d.iloc[:, :7].copy() for d in data]
data_filtered[0]

Unnamed: 0,Date,HWN,HWF,HWD,HWM,HWA,PC_555-Mode-2
0,1990-01-01,0.40,1.55,1.25,0.747521,1.497509,-0.281278
1,1990-02-01,0.70,3.05,2.90,1.404837,2.758578,-0.061912
2,1990-03-01,1.15,5.95,5.00,1.913945,3.589558,-0.173153
3,1990-04-01,1.25,5.80,4.55,2.103263,3.712839,-0.638605
4,1990-05-01,0.90,3.30,2.85,2.084124,3.328110,-0.312992
...,...,...,...,...,...,...,...
391,2022-08-01,1.25,8.10,5.55,1.655071,3.394373,0.156185
392,2022-09-01,1.40,9.50,7.40,2.454881,5.167971,0.098006
393,2022-10-01,1.40,9.00,6.95,2.287492,4.551468,-0.162649
394,2022-11-01,0.80,5.55,3.85,0.926211,1.885328,-0.166952


In [50]:
# Labels and estimators are parallel lists: name_predictors[k] is the display
# name of predictors[k].
name_predictors = ["Linear"]
predictors = [LinearRegression()]

In [51]:
# One model_predict result per (predictor, dataset) pair, holding out the last 5 rows.
# Predictors vary slowest and datasets fastest, and no figures are drawn.
stats = [
    model_predict(frame, estimator, 5, name_regressor=label, show=False)
    for estimator, label in zip(predictors, name_predictors)
    for frame in data_filtered
]

In [52]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 0 of each stats entry is the per-index MAE on the held-out rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[0]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,0.68,5.12,3.7,0.98,2.15
1,Linear-1995,0.68,5.1,3.68,0.97,2.14
2,Linear-2000,0.65,4.97,3.6,0.96,2.07


In [56]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 1 of each stats entry is the per-index MAE on the training rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[1]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("MAE Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,0.33,1.66,1.22,0.56,0.98
1,Linear-1995,0.33,1.66,1.22,0.57,0.99
2,Linear-2000,0.33,1.72,1.26,0.58,1.01


In [57]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 2 of each stats entry is the per-index R^2 on the held-out rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[2]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Prediction").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,-0.83,-1.45,-1.23,-0.53,-0.72
1,Linear-1995,-0.82,-1.43,-1.21,-0.51,-0.7
2,Linear-2000,-0.72,-1.3,-1.1,-0.44,-0.63


In [58]:
indices_of_interest = ["Model", "HWN", "HWF", "HWD", "HWM", "HWA"]

# `stats` holds one entry per (predictor, dataset) pair with the predictor varying
# slowest, so the row labels are generated in that same order rather than
# assuming a single predictor.
row_labels = [f"{name}-{first}" for name in name_predictors for first in FIRST_YEAR]
assert len(row_labels) == len(stats), "stats is out of sync with name_predictors / FIRST_YEAR"

# Element 3 of each stats entry is the per-index R^2 on the training rows.
# The table is built in one go instead of being grown row by row.
results_df = pd.DataFrame(
    [[label] + list(run[3]) for label, run in zip(row_labels, stats)],
    columns=indices_of_interest,
)

results_df.style.set_caption("R^2 Model Results in Training").format(precision=2)

Unnamed: 0,Model,HWN,HWF,HWD,HWM,HWA
0,Linear-1990,0.05,0.05,0.04,0.03,0.04
1,Linear-1995,0.06,0.06,0.05,0.04,0.05
2,Linear-2000,0.06,0.07,0.05,0.04,0.05
