# <font color="red">**3. Correlation-Based FS - Fine-tuning Hyperparameters**</font>

**Author:** Osmar Bolivar

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

## **1. Monthly data**

In [46]:
raw = pd.read_csv("./Data/PC_DATASET.csv", index_col=0)
raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3757 entries, 2011-01-03 to 2024-10-31
Data columns (total 85 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ipc_all  166 non-null    float64
 1   lag_1    3757 non-null   float64
 2   lag_2    3757 non-null   float64
 3   lag_3    3757 non-null   float64
 4   lag_6    3757 non-null   float64
 5   lag_9    3757 non-null   float64
 6   lag_12   3757 non-null   float64
 7   PC_1     3757 non-null   float64
 8   PC_2     3757 non-null   float64
 9   PC_3     3757 non-null   float64
 10  PC_4     3757 non-null   float64
 11  PC_5     3757 non-null   float64
 12  PC_6     3757 non-null   float64
 13  PC_7     3757 non-null   float64
 14  PC_8     3757 non-null   float64
 15  PC_9     3757 non-null   float64
 16  PC_10    3757 non-null   float64
 17  PC_11    3757 non-null   float64
 18  PC_12    3757 non-null   float64
 19  PC_13    3757 non-null   float64
 20  PC_14    3757 non-null   float64
 21  PC_1

In [47]:
df = raw.copy().query('freq == "month"')
df.drop(columns=["freq"], inplace=True)
df

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,PC_1,PC_2,PC_3,...,PC_68,PC_69,PC_70,PC_71,PC_72,PC_73,PC_74,PC_75,PC_76,PC_77
2011-01-31,74.207255,73.260267,71.989803,71.196381,69.071086,68.561311,68.467691,-3.227580,-15.719013,-7.670009,...,-0.264026,0.710757,0.997002,0.104920,-0.162398,0.653257,0.502554,-0.861750,0.053930,0.540294
2011-02-28,75.439060,74.207255,73.260267,71.989803,69.800954,68.549203,68.581371,-2.919574,-16.436690,-9.716248,...,0.219816,0.091479,-0.975326,0.291379,-0.094332,-0.274985,0.096940,0.550827,0.612226,0.351364
2011-03-31,76.108818,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,-3.091073,-15.926157,-9.891932,...,0.395526,-0.279226,-0.456199,-0.933345,0.364515,-1.457140,-0.375573,1.002877,-0.430876,-0.732482
2011-04-30,76.125495,76.108818,75.439060,74.207255,71.196381,69.071086,68.561311,-4.388957,-17.340868,-12.485030,...,0.450770,-0.318136,0.533166,-0.001306,-0.381737,0.148083,-0.400274,-0.298039,-0.024627,0.002681
2011-05-31,76.277495,76.125495,76.108818,75.439060,71.989803,69.800954,68.549203,-5.093055,-16.268759,-12.819472,...,-0.544914,0.660364,0.221464,0.812034,0.260043,0.932172,-0.537724,-0.678505,0.236763,0.586495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-30,113.895030,113.288776,112.579661,111.941559,111.123491,110.440281,109.678594,22.763621,1.491855,5.250182,...,-0.633444,0.296681,-0.012045,1.004644,-0.328013,0.167554,0.570326,-0.367190,-1.233679,1.268891
2024-07-31,114.434010,113.895030,113.288776,112.579661,111.211060,110.429431,110.081702,22.773596,1.210424,5.131451,...,-0.006083,1.090327,-1.261868,-0.473735,0.022446,-0.287541,0.150687,0.454159,0.115831,-0.219436
2024-08-31,116.245314,114.434010,113.895030,113.288776,111.433348,110.425657,110.506839,23.879811,1.009565,5.753375,...,-0.044732,-0.611091,0.472469,-0.304947,0.384683,-0.257463,-0.637446,-0.535473,0.399538,-0.139207
2024-09-30,117.267773,116.245314,114.434010,113.895030,111.941559,111.123491,110.440281,24.103446,0.285826,7.352197,...,-0.603562,-0.231411,1.024801,0.282118,0.497147,0.442791,0.560913,0.442355,0.680619,0.775763


In [48]:
isna = df.isna().sum()
isna[isna > 0]

Series([], dtype: int64)

In [49]:
df.shape

(166, 84)

## **2. Train and Val sets**

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [51]:
#train_size = int(len(df) * 0.75)
#train, validation = df[:train_size], df[train_size:]

train, validation = train_test_split(df, test_size=0.2, random_state=42)

print(f'Obs in train set: {train.shape[0]}; variables in train set: {train.shape[1]}')
print(f'Obs in validation set: {validation.shape[0]}; variables in validation set: {validation.shape[1]}')

Obs in train set: 132; variables in train set: 84
Obs in validation set: 34; variables in validation set: 84


In [52]:
validation.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34 entries, 2019-05-31 to 2018-02-28
Data columns (total 84 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ipc_all  34 non-null     float64
 1   lag_1    34 non-null     float64
 2   lag_2    34 non-null     float64
 3   lag_3    34 non-null     float64
 4   lag_6    34 non-null     float64
 5   lag_9    34 non-null     float64
 6   lag_12   34 non-null     float64
 7   PC_1     34 non-null     float64
 8   PC_2     34 non-null     float64
 9   PC_3     34 non-null     float64
 10  PC_4     34 non-null     float64
 11  PC_5     34 non-null     float64
 12  PC_6     34 non-null     float64
 13  PC_7     34 non-null     float64
 14  PC_8     34 non-null     float64
 15  PC_9     34 non-null     float64
 16  PC_10    34 non-null     float64
 17  PC_11    34 non-null     float64
 18  PC_12    34 non-null     float64
 19  PC_13    34 non-null     float64
 20  PC_14    34 non-null     float64
 21  PC_15 

In [53]:
scaler = StandardScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=train.columns, index=train.index)
validation_scaled = pd.DataFrame(scaler.transform(validation), columns=validation.columns, index=validation.index)
#test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)

X_train = train_scaled.drop('ipc_all', axis=1)
y_train = train_scaled['ipc_all']

X_validation = validation_scaled.drop('ipc_all', axis=1)
y_validation = validation_scaled['ipc_all']

#X_test = test_scaled.drop('ipc_all', axis=1)

## **3. Algorithms**

In [54]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

tscv5 = TimeSeriesSplit(n_splits=5)

### **3.1. Ridge**   

**Without tuning (Validation):**
MSE:  0.052; R2:  0.544; MAE:  0.170

In [55]:
ridge = Ridge(random_state=123)
#ridge = Ridge(alpha=0.10569345535579884, fit_intercept=False, positive=True, random_state=123) 
# Fit on training set
ridge.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
ridge_train_pred = ridge.predict(X_train)
ridge_val_pred = ridge.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_ridge = mean_squared_error(y_train, ridge_train_pred)
r2_train_ridge = r2_score(y_train, ridge_train_pred)
mae_train_ridge = mean_absolute_error(y_train, ridge_train_pred)
print("Train MSE: ", mse_train_ridge)
print("Train R2: ", r2_train_ridge)
print("Train MAE: ", mae_train_ridge)
# Calculate Forecast metrics on validation set
mse_val_ridge = mean_squared_error(y_validation, ridge_val_pred)
r2_val_ridge = r2_score(y_validation, ridge_val_pred)
mae_val_ridge = mean_absolute_error(y_validation, ridge_val_pred)
print("Validation MSE: ", mse_val_ridge)
print("Validation R2: ", r2_val_ridge)
print("Validation MAE: ", mae_val_ridge)

Train MSE:  8.731823370986402e-05
Train R2:  0.9999126817662901
Train MAE:  0.0073153944620165555
Validation MSE:  0.0005619029154339739
Validation R2:  0.9993925355866891
Validation MAE:  0.02003978463105835


In [None]:
# Define parameter grid for GridSearchCV
alphas_ridge = np.logspace(-1, 2, num=500)
#alphas_ridge = np.arange(1, 100, 0.05)
param_grid = {'alpha': alphas_ridge,
              'positive': [True, False],
              'fit_intercept': [True, False],
              'random_state': [123]}

# Instantiate Ridge model
ridge = Ridge()
# Define GridSearchCV object
grid_search_ridge = GridSearchCV(ridge, param_grid, cv=tscv5, scoring='neg_mean_squared_error') 
# Fit GridSearchCV on training set
grid_search_ridge.fit(X_train, y_train)
# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_ridge.best_params_)
print("Best score: ", -grid_search_ridge.best_score_)
# Make predictions on validation set using best model from GridSearchCV
best_ridge = grid_search_ridge.best_estimator_
y_val_pred = best_ridge.predict(X_validation)
# Calculate RMSE on validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Make predictions on test set using best model from GridSearchCV
#y_test_pred_ridge = best_ridge.predict(X_test)

Best parameter:  {'alpha': 0.10569345535579884, 'fit_intercept': False, 'positive': True, 'random_state': 123}
Best score:  0.011366644236477848
Validation MSE:  0.009385692484917108
Validation R2:  0.9178248689184463
Validation MAE:  0.07399663198027367


In [56]:
# Get the coefficients from the Ridge model
coef = ridge.coef_
# Create a dataframe of feature importances
feature_importance_ridge = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_ridge = feature_importance_ridge.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_ridge.columns = ['feat_ridge', 'imp_ridge']
# Print the feature importances
feature_importance_ridge.head(15)

Unnamed: 0,feat_ridge,imp_ridge
0,lag_2,0.151052
1,lag_1,0.148284
2,lag_3,0.145189
3,lag_6,0.141421
4,lag_12,0.137412
5,lag_9,0.137211
6,PC_1,0.11766
7,PC_2,0.094534
8,PC_3,0.022627
9,PC_5,0.021511


### **3.2. Lasso**
Without tuning:
Validation MSE:  0.9523622169433499
Validation R2:  -0.0030766763969758415
Validation MAE:  0.8455503308136401

In [57]:
#lasso = Lasso()
lasso = Lasso(alpha=0.1, fit_intercept=False, positive=True, random_state=123)  
# Fit on training set
lasso.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)
# Calculate Forecast metrics on validation set
mse_val_lasso = mean_squared_error(y_validation, lasso_val_pred)
r2_val_lasso = r2_score(y_validation, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_validation, lasso_val_pred)
print("Validation MSE: ", mse_val_lasso)
print("Validation R2: ", r2_val_lasso)
print("Validation MAE: ", mae_val_lasso)

Train MSE:  0.011577523024845571
Train R2:  0.9884224769751544
Train MAE:  0.08526642023449883
Validation MSE:  0.00955925603318146
Validation R2:  0.9896656384966414
Validation MAE:  0.07983777488127498


In [28]:
# Define parameter grid for GridSearchCV
alphas_lasso = np.logspace(-1, 2, num=500)
#alphas_lasso = np.arange(1, 100, 0.05)
param_grid = {'alpha': alphas_lasso,
              'positive': [True, False],
              'fit_intercept': [True, False],
              'random_state': [123]}

# Instantiate lasso model
lasso = Lasso()
# Define GridSearchCV object
grid_search_lasso = GridSearchCV(lasso, param_grid, cv=tscv5, scoring='neg_mean_squared_error')  
# Fit GridSearchCV on training set
grid_search_lasso.fit(X_train, y_train)
# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_lasso.best_params_)
print("Best score: ", -grid_search_lasso.best_score_)
# Make predictions on validation set using best model from GridSearchCV
best_lasso = grid_search_lasso.best_estimator_
y_val_pred = best_lasso.predict(X_validation)
# Calculate RMSE on validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Make predictions on test set using best model from GridSearchCV
#y_test_pred_lasso = best_lasso.predict(X_test)

Best parameter:  {'alpha': 0.1, 'fit_intercept': False, 'positive': True, 'random_state': 123}
Best score:  0.015196429729357317
Validation MSE:  0.04629494785951437
Validation R2:  0.594670993655202
Validation MAE:  0.20094663332870588


In [58]:
# Get the coefficients from the Lasso model
coef = lasso.coef_
# Create a dataframe of feature importances
feature_importance_lasso = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_lasso = feature_importance_lasso.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_lasso.columns = ['feat_lasso', 'imp_lasso']
# Print the feature importances
feature_importance_lasso.head(15)

Unnamed: 0,feat_lasso,imp_lasso
0,lag_1,0.899211
1,PC_57,0.0
2,PC_55,0.0
3,PC_54,0.0
4,PC_53,0.0
5,PC_52,0.0
6,PC_51,0.0
7,PC_50,0.0
8,PC_49,0.0
9,PC_48,0.0


### **3.3. ElasticNet**
Without tuning:
Validation MSE:  0.2878895501474107
Validation R2:  0.6967799772043428
Validation MAE:  0.4602160948451647

In [60]:
#enet = ElasticNet()
enet = ElasticNet(alpha=0.11326825671361537, l1_ratio=0.1, fit_intercept=False, positive=True)  ## cv=5
# Fit on training set
enet.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
enet_train_pred = enet.predict(X_train)
enet_val_pred = enet.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_enet = mean_squared_error(y_train, enet_train_pred)
r2_train_enet = r2_score(y_train, enet_train_pred)
mae_train_enet = mean_absolute_error(y_train, enet_train_pred)
print("Train MSE: ", mse_train_enet)
print("Train R2: ", r2_train_enet)
print("Train MAE: ", mae_train_enet)
# Calculate Forecast metrics on validation set
mse_val_enet = mean_squared_error(y_validation, enet_val_pred)
r2_val_enet = r2_score(y_validation, enet_val_pred)
mae_val_enet = mean_absolute_error(y_validation, enet_val_pred)
print("Validation MSE: ", mse_val_enet)
print("Validation R2: ", r2_val_enet)
print("Validation MAE: ", mae_val_enet)

Train MSE:  0.0030480879116186062
Train R2:  0.9969519120883814
Train MAE:  0.038450032500424926
Validation MSE:  0.002596644144473864
Validation R2:  0.997192808813633
Validation MAE:  0.040450080638443846


In [None]:
# Define parameter grid for GridSearchCV
alphas_enet = np.logspace(-1, 2, num=500)
#alphas_enet = np.arange(1, 100, 0.05)
param_grid = {'alpha': alphas_enet,
              'l1_ratio': np.arange(0.01, 0.95, 0.01),
              'positive': [True, False],
              'fit_intercept': [True, False]}

# Instantiate enet model
enet = ElasticNet()

# Define GridSearchCV object
grid_search_enet = GridSearchCV(enet, param_grid, cv=5, scoring='neg_mean_squared_error')  ## works better with 5 cv
#grid_search_enet = GridSearchCV(enet, param_grid, cv=5, scoring='r2')  ## works better with 5 cv

# Fit GridSearchCV on training set
grid_search_enet.fit(X_train, y_train)

# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_enet.best_params_)
print("Best score: ", -grid_search_enet.best_score_)

# Make predictions on validation set using best model from GridSearchCV
best_enet = grid_search_enet.best_estimator_
y_val_pred = best_enet.predict(X_validation)

# Calculate RMSE on validation set
mse_val = mean_squared_error(y_validation, y_val_pred, squared=True)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Make predictions on test set using best model from GridSearchCV
#y_test_pred_enet = best_enet.predict(X_test)

In [61]:
# Get the coefficients from the enet model
coef = enet.coef_
# Create a dataframe of feature importances
feature_importance_enet = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_enet = feature_importance_enet.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_enet.columns = ['feat_enet', 'imp_enet']
# Print the feature importances
feature_importance_enet.head(15)

Unnamed: 0,feat_enet,imp_enet
0,lag_1,0.163655
1,lag_2,0.155478
2,lag_3,0.147564
3,lag_6,0.139409
4,lag_9,0.134004
5,lag_12,0.132565
6,PC_1,0.090036
7,PC_2,0.06467
8,PC_5,0.008719
9,PC_3,0.006244


### **3.4. ADA**
Without tuning:
Validation MSE:  0.0036814337026600556
Validation R2:  0.9961225254245258
Validation MAE:  0.0469554086526466

In [62]:
ada = AdaBoostRegressor()
#ada = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=6), random_state=123)
# Fit the model on training data
ada.fit(X_train, y_train)
# Make predictions on the validation set
ada_train_pred = ada.predict(X_train)
ada_val_pred = ada.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_ada = mean_squared_error(y_train, ada_train_pred)
r2_train_ada = r2_score(y_train, ada_train_pred)
mae_train_ada = mean_absolute_error(y_train, ada_train_pred)
print("Train MSE: ", mse_train_ada)
print("Train R2: ", r2_train_ada)
print("Train MAE: ", mae_train_ada)
# Evaluate the model on the validation set
mse_val_ada = mean_squared_error(y_validation, ada_val_pred)
r2_val_ada = r2_score(y_validation, ada_val_pred)
mae_val_ada = mean_absolute_error(y_validation, ada_val_pred)
print("Validation MSE: ", mse_val_ada)
print("Validation R2: ", r2_val_ada)
print("Validation MAE: ", mae_val_ada)

Train MSE:  0.0023462984248052918
Train R2:  0.9976537015751947
Train MAE:  0.03656177016179222
Validation MSE:  0.005062782388118694
Validation R2:  0.9945267055061561
Validation MAE:  0.05892086846063003


In [57]:
# Define the AdaBoost Regressor
ada = AdaBoostRegressor()

# Define the range of hyperparameters to search over
param_grid_ada = {
    #'n_estimators': range(50, 200, 5),    ##120 was selected range(50, 200, 5)
    'learning_rate': np.logspace(-5,1, 100),
    #'loss': ['linear', 'square', 'exponential']
    'random_state': [123],
    'estimator': [#DecisionTreeRegressor(max_depth=3),
                  #DecisionTreeRegressor(max_depth=4),
                  #DecisionTreeRegressor(max_depth=5),
                  DecisionTreeRegressor(max_depth=6),
                  #DecisionTreeRegressor(max_depth=7),
                  #DecisionTreeRegressor(max_depth=8),
                  ]
}

# Tune hyperparameters using GridSearchCV with TimeSeriesSplit
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=tscv5, scoring='r2')
#grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=tscv5, scoring='neg_mean_squared_error')
grid_search_ada.fit(X_train, y_train)

# Evaluate the model using the best hyperparameters on the test set
ada_best = AdaBoostRegressor(**grid_search_ada.best_params_)
ada_best.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = ada_best.predict(X_validation)

# Evaluate the model on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Print the best hyperparameters and the best score
print("Best parameters found: ", grid_search_ada.best_params_)
print("Lowest MSE found: ", -grid_search_ada.best_score_)

# Evaluate the model on the test set
#y_test_pred_ada = ada_best.predict(X_test)

Validation MSE:  0.3740074941755078
Validation R2:  -2.274570833078731
Validation MAE:  0.5144651864672598
Best parameters found:  {'estimator': DecisionTreeRegressor(max_depth=6), 'learning_rate': 0.002009233002565048, 'random_state': 123}
Lowest MSE found:  7.190820165324213


In [63]:
# Create a DataFrame with the feature importance values
feature_importance_ada = pd.DataFrame({'Feature': X_train.columns, 'Importance': ada.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_ada = feature_importance_ada.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_ada.columns = ['feat', 'imp_ada']
# Print the feature importance DataFrame
feature_importance_ada.head(15)

Unnamed: 0,feat,imp_ada
0,lag_6,0.291084
1,lag_12,0.188587
2,lag_9,0.130556
3,lag_2,0.118845
4,lag_1,0.108275
5,lag_3,0.07097
6,PC_2,0.035249
7,PC_1,0.029572
8,PC_3,0.012368
9,PC_6,0.002424


### **3.5. GBR**
Without tuning:
Validation MSE:  0.0014618738110729937
Validation R2:  0.998460279610389
Validation MAE:  0.02910552415848012

In [64]:
gbr = GradientBoostingRegressor(random_state=123)
#gbr = GradientBoostingRegressor(learning_rate=0.040949150623804255, n_estimators=290, random_state=0)
# Fit the model on the training set
gbr.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
gbr_train_pred = gbr.predict(X_train)
gbr_val_pred = gbr.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_gbr = mean_squared_error(y_train, gbr_train_pred)
r2_train_gbr = r2_score(y_train, gbr_train_pred)
mae_train_gbr = mean_absolute_error(y_train, gbr_train_pred)
print("Train MSE: ", mse_train_gbr)
print("Train R2: ", r2_train_gbr)
print("Train MAE: ", mae_train_gbr)
# Calculate Forecast metrics on validation set
mse_val_gbr = mean_squared_error(y_validation, gbr_val_pred)
r2_val_gbr = r2_score(y_validation, gbr_val_pred)
mae_val_gbr = mean_absolute_error(y_validation, gbr_val_pred)
print("Validation MSE: ", mse_val_gbr)
print("Validation R2: ", r2_val_gbr)
print("Validation MAE: ", mae_val_gbr)

Train MSE:  2.4076440786368363e-06
Train R2:  0.9999975923559213
Train MAE:  0.0012292039802156475
Validation MSE:  0.0016919662055226718
Validation R2:  0.9981708419192201
Validation MAE:  0.033407693379536134


In [32]:
# Define the model
gbr = GradientBoostingRegressor()

# Define the hyperparameters to be tuned
params = {
    'learning_rate': np.logspace(-2,0.5, 50),
    #'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],   ## 'loss': 'squared_error'
    'n_estimators': range(100, 300, 10),
    'max_depth': range(3,6,1),
    #'min_samples_split': range(2,20,1),
    #'min_samples_leaf': range(1,50, 1),
    #'min_weight_fraction_leaf': np.arange(0.0, 0.5, 0.01),
    #'subsample': np.arange(0.5, 1, 0.05),
    #'max_features': [None, 'sqrt', 'log2'],
    #'max_leaf_nodes': range(2, 200, 1),
    #'criterion': ['friedman_mse', 'squared_error'],
    'random_state': [123]
}

# Create the GridSearchCV object
#grid_gbr = GridSearchCV(gbr, params, cv=tscv5, scoring='r2', n_jobs=-1)
grid_gbr = GridSearchCV(gbr, params, cv=tscv5, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit the model on the training set with GridSearchCV
grid_gbr.fit(X_train, y_train)
# Print the best hyperparameters
print('Best hyperparameters:', grid_gbr.best_params_)
# Use the best model to make predictions on the validation set
y_val_pred = grid_gbr.predict(X_validation)
# Compute the mean squared error of the predictions on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Use the best model to make predictions on the test set
#y_test_pred_gbr = grid_gbr.predict(X_test)

  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Best hyperparameters: {'learning_rate': 1.757510624854793, 'max_depth': 5, 'n_estimators': 100, 'random_state': 123}
Validation MSE:  0.2815678564215272
Validation R2:  -1.4652283831985624
Validation MAE:  0.37399708361903783


In [65]:
# Create a DataFrame with the feature importance values
feature_importance_gbr = pd.DataFrame({'Feature': X_train.columns, 'Importance': gbr.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_gbr = feature_importance_gbr.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_gbr.columns = ['feat_gbr', 'imp_gbr']
# Print the feature importance DataFrame
feature_importance_gbr.head(15)

Unnamed: 0,feat_gbr,imp_gbr
0,lag_2,0.26581
1,lag_9,0.216508
2,lag_3,0.166422
3,lag_6,0.165441
4,lag_1,0.095043
5,lag_12,0.054378
6,PC_1,0.01594
7,PC_2,0.008846
8,PC_3,0.006498
9,PC_6,0.002554


### **3.4. RF**
Without tuning:
Validation MSE:  0.00100147205985504
Validation R2:  0.9989451983211514
Validation MAE:  0.026535714623290185

In [66]:
# Define the Random Forest Regression model
rf = RandomForestRegressor(random_state=123)
#rf = RandomForestRegressor(min_samples_split=2, n_estimators=265, random_state=0)

# Fit the model to the training data and make predictions on the validation set
rf.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_rf = mean_squared_error(y_train, rf_train_pred)
r2_train_rf = r2_score(y_train, rf_train_pred)
mae_train_rf = mean_absolute_error(y_train, rf_train_pred)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)
# Calculate Forecast metrics on validation set
mse_val_rf = mean_squared_error(y_validation, rf_val_pred)
r2_val_rf = r2_score(y_validation, rf_val_pred)
mae_val_rf = mean_absolute_error(y_validation, rf_val_pred)
print("Validation MSE: ", mse_val_rf)
print("Validation R2: ", r2_val_rf)
print("Validation MAE: ", mae_val_rf)

Train MSE:  0.0005249549404387961
Train R2:  0.9994750450595612
Train MAE:  0.01305839442408408
Validation MSE:  0.001750701843331092
Validation R2:  0.9981073437440342
Validation MAE:  0.03362460935074801


In [None]:
# Define the Random Forest Regression model
rf_reg = RandomForestRegressor()

# Define the hyperparameters to tune
param_grid_rf = {
    'n_estimators': range(100, 300, 15),
    #'max_features': [None, 'sqrt', 'sqrt']
    #'max_depth': range(3,7,1),
    'min_samples_split': range(2,20,1),
    #'min_samples_leaf': range(1,50, 1),
    #'min_weight_fraction_leaf': np.arange(0.0, 0.5, 0.01),
    #'bootstrap': [True],
    #'oob_score': [True, False],
    #'warm_start': [True, False],
    #'max_samples': np.arange(0.1, 1.0, 0.01)
    'random_state': [0]
}

# Define the GridSearchCV object
#grid_rf_reg = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=5, scoring='r2')
grid_rf_reg = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the training data
grid_rf_reg.fit(X_train, y_train)

# Extract the best hyperparameters and score
best_params = grid_rf_reg.best_params_
best_score = grid_rf_reg.best_score_

# Print the best hyperparameters found by GridSearchCV
print(f"Best hyperparameters: {best_params}")
print(f"Best score: {best_score}")

# Instantiate a new Random Forest Regression model using the best hyperparameters
rf_reg_best = RandomForestRegressor(**best_params)

# Fit the model to the training data and make predictions on the validation set
rf_reg_best.fit(X_train, y_train)

# Use the best model to make predictions on the validation set
y_val_pred = rf_reg_best.predict(X_validation)

# Compute the mean squared error of the predictions on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_rf = rf_reg_best.predict(X_test)

Best hyperparameters: {'min_samples_split': 2, 'n_estimators': 265, 'random_state': 0}
Best score: -0.0035817383690906874
Validation MSE:  0.0009983900502684609
Validation R2:  0.9989484444515393
Validation MAE:  0.024970310604635504


In [67]:
# Create a DataFrame with the feature importance values
feature_importance_rf = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_rf.columns = ['feat', 'imp_rf']
# Print the feature importance DataFrame
feature_importance_rf.head(30)

Unnamed: 0,feat,imp_rf
0,lag_1,0.174576
1,lag_3,0.170873
2,lag_12,0.144252
3,lag_2,0.143975
4,lag_9,0.133729
5,lag_6,0.122655
6,PC_2,0.041011
7,PC_1,0.032928
8,PC_3,0.022189
9,PC_6,0.002042


### **3.5. ET**

In [68]:
# Define the Extra Trees Regression model
et = ExtraTreesRegressor(random_state=123)
#et = ExtraTreesRegressor(bootstrap=True, max_samples=0.9599999999999995, oob_score=True)
# Fit the model to the training data and make predictions on the validation set
et.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
et_train_pred = et.predict(X_train)
et_val_pred = et.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_et = mean_squared_error(y_train, et_train_pred)
r2_train_et = r2_score(y_train, et_train_pred)
mae_train_et = mean_absolute_error(y_train, et_train_pred)
print("Train MSE: ", mse_train_et)
print("Train R2: ", r2_train_et)
print("Train MAE: ", mae_train_et)
# Calculate Forecast metrics on validation set
mse_val_et = mean_squared_error(y_validation, et_val_pred)
r2_val_et = r2_score(y_validation, et_val_pred)
mae_val_et = mean_absolute_error(y_validation, et_val_pred)
print("Validation MSE: ", mse_val_et)
print("Validation R2: ", r2_val_et)
print("Validation MAE: ", mae_val_et)

Train MSE:  2.453062538447141e-30
Train R2:  1.0
Train MAE:  1.0973505879227378e-15
Validation MSE:  0.000908491379219845
Validation R2:  0.9990178442440548
Validation MAE:  0.0231625334846288


In [None]:
# Define the Extra Trees Regression model
et_reg = ExtraTreesRegressor()

# Define the hyperparameter grid to search over
param_grid = {
    #'n_estimators': range(100, 300, 15),
    #'max_depth': range(3,15,1),
    #'max_features': [None, 'sqrt', 'sqrt']
    'min_samples_split': range(2,20,1),
    'min_samples_leaf': range(1,50, 1),
    #'min_weight_fraction_leaf': np.arange(0.0, 0.5, 0.01),
    'bootstrap': [True],
    'oob_score': [True, False],
    #'warm_start': [True, False],
    'max_samples': np.arange(0.1, 1.0, 0.01)
    #'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
    #'min_impurity_decrease': np.arange(0.0, 0.01, 0.00001),
    #'random_state': [0]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(et_reg, param_grid=param_grid, cv=5, scoring='r2')
#grid_search = GridSearchCV(et_reg, param_grid=param_grid, cv=tscv5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding validation score
print("Best hyperparameters: ", grid_search.best_params_)
#print("Validation score: ", grid_search.best_score_)

# Use the best model to make predictions on the validation set
best_et_reg = grid_search.best_estimator_
y_val_pred = best_et_reg.predict(X_validation)

# Compute the mean squared error of the predictions on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_et = best_et_reg.predict(X_test)

In [69]:
# Create a DataFrame with the feature importance values
feature_importance_et = pd.DataFrame({'Feature': X_train.columns, 'Importance': et.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_et = feature_importance_et.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_et.columns = ['feat', 'imp_et']
# Print the feature importance DataFrame
feature_importance_et.head(30)

Unnamed: 0,feat,imp_et
0,lag_12,0.199932
1,lag_9,0.156215
2,lag_3,0.142346
3,lag_1,0.123836
4,lag_2,0.113748
5,PC_2,0.102128
6,lag_6,0.101285
7,PC_1,0.026649
8,PC_3,0.021784
9,PC_9,0.001696


## **4. Report**

#### End