# <font color="red">**3. L1 FS - Fine-tuning Hyperparameters**</font>

**Author:** Osmar Bolivar

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

## **1. Monthly data**

In [2]:
raw = pd.read_csv("./Data/LASSO_DATASET.csv", index_col=0)
raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5901 entries, 2010-01-01 to 2024-10-31
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ipc_all        178 non-null    float64
 1   lag_1          5901 non-null   float64
 2   lag_2          5901 non-null   float64
 3   lag_3          5901 non-null   float64
 4   lag_6          5901 non-null   float64
 5   lag_9          5901 non-null   float64
 6   lag_12         5901 non-null   float64
 7   oil_cb         5901 non-null   float64
 8   sugar_sc       5901 non-null   float64
 9   milk2_cb       5901 non-null   float64
 10  watermelon_po  5901 non-null   float64
 11  flour_lp       5901 non-null   float64
 12  chicken_tj     5901 non-null   float64
 13  lard_co        5901 non-null   float64
 14  onion2_tj      5901 non-null   float64
 15  ycorn_tj       5901 non-null   float64
 16  platano_co     5901 non-null   float64
 17  oil_su         5901 non-null   float64
 18

In [3]:
df = raw.copy().query('freq == "month"')
df.drop(columns=["freq"], inplace=True)
df

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,oil_cb,sugar_sc,milk2_cb,...,sugar_tj,chicken_cb,squash_co,peas_tj,rice3_po,peas_su,papaya_su,carrot_tj,bean_cb,yuca_lp
2010-01-31,0.074472,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,8.829032,155.000000,128.000000,...,165.000000,8.483871,65.00,53.370968,220.000000,55.000000,500.0,16.955806,23.548387,35.00
2010-02-28,0.312381,0.074472,0.263790,0.457498,1.417620,3.200128,8.128586,8.800000,165.000000,129.214286,...,157.500000,10.685714,65.00,54.285714,220.000000,55.000000,500.0,15.580357,20.575000,35.00
2010-03-31,0.685510,0.312381,0.074472,0.263790,0.643999,2.117642,6.563669,8.800000,165.483871,128.193548,...,159.677419,10.990323,65.00,36.935484,219.677419,55.000000,500.0,12.238387,14.612903,35.00
2010-04-30,1.215574,0.685510,0.312381,0.074472,0.784931,1.447299,5.323961,8.800000,174.333333,128.400000,...,165.000000,11.683333,65.00,55.916667,210.000000,55.000000,500.0,15.375333,21.133333,35.00
2010-05-31,1.384706,1.215574,0.685510,0.312381,0.457498,1.417620,3.200128,8.800000,183.483871,128.258065,...,165.000000,9.777419,65.00,78.306452,210.000000,55.000000,500.0,14.072581,25.129032,35.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-30,3.844356,3.516994,3.460018,3.063085,2.118325,2.847826,2.725282,10.500000,247.500000,123.860000,...,245.000000,13.464000,52.21,143.250000,240.000000,150.833333,500.0,9.955000,54.371333,33.75
2024-07-31,3.953707,3.844356,3.516994,3.460018,1.863150,2.075432,2.700872,10.478710,242.806452,126.717742,...,245.000000,12.922258,52.21,112.258065,240.000000,120.000000,500.0,13.953548,88.540000,33.75
2024-08-31,5.192869,3.953707,3.844356,3.516994,2.516604,1.594145,3.058496,11.132258,240.838710,134.069677,...,245.000000,14.789677,52.21,117.500000,240.000000,120.000000,500.0,14.740000,82.338710,33.75
2024-09-30,6.182067,5.192869,3.953707,3.844356,3.063085,2.118325,2.847826,11.660000,242.833333,140.663667,...,245.000000,12.538667,52.21,117.500000,240.000000,120.000000,500.0,14.740000,49.250000,33.75


In [4]:
isna = df.isna().sum()
isna[isna > 0]

Series([], dtype: int64)

In [5]:
df.shape

(178, 35)

## **2. Train and Val sets**

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
#train_size = int(len(df) * 0.8)
#train, validation = df[:train_size], df[train_size:]
train, validation = train_test_split(df, test_size=0.2, random_state=42)

print(f'Obs in train set: {train.shape[0]}; variables in train set: {train.shape[1]}')
print(f'Obs in validation set: {validation.shape[0]}; variables in validation set: {validation.shape[1]}')

Obs in train set: 142; variables in train set: 35
Obs in validation set: 36; variables in validation set: 35


In [8]:
validation.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36 entries, 2011-08-31 to 2013-07-31
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ipc_all        36 non-null     float64
 1   lag_1          36 non-null     float64
 2   lag_2          36 non-null     float64
 3   lag_3          36 non-null     float64
 4   lag_6          36 non-null     float64
 5   lag_9          36 non-null     float64
 6   lag_12         36 non-null     float64
 7   oil_cb         36 non-null     float64
 8   sugar_sc       36 non-null     float64
 9   milk2_cb       36 non-null     float64
 10  watermelon_po  36 non-null     float64
 11  flour_lp       36 non-null     float64
 12  chicken_tj     36 non-null     float64
 13  lard_co        36 non-null     float64
 14  onion2_tj      36 non-null     float64
 15  ycorn_tj       36 non-null     float64
 16  platano_co     36 non-null     float64
 17  oil_su         36 non-null     float64
 18  

In [9]:
scaler = StandardScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=train.columns, index=train.index)
validation_scaled = pd.DataFrame(scaler.transform(validation), columns=validation.columns, index=validation.index)
#test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)

X_train = train_scaled.drop('ipc_all', axis=1)
#y_train = train_scaled['ipc_all']
y_train = train['ipc_all']

X_validation = validation_scaled.drop('ipc_all', axis=1)
#y_validation = validation_scaled['ipc_all']
y_validation = validation['ipc_all']

#X_test = test_scaled.drop('ipc_all', axis=1)

## **3. Algorithms**

In [31]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

tscv5 = TimeSeriesSplit(n_splits=5)

### **3.1. Ridge**   

**Without tuning (Validation):**
MSE:  0.513; R2:  0.768; MAE:  0.573

In [32]:
#ridge = Ridge(random_state=0)
ridge = Ridge(alpha=5.3, fit_intercept=True, positive=False, random_state=0) 
# Fit on training set
ridge.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
ridge_train_pred = ridge.predict(X_train)
ridge_val_pred = ridge.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_ridge = mean_squared_error(y_train, ridge_train_pred)
r2_train_ridge = r2_score(y_train, ridge_train_pred)
mae_train_ridge = mean_absolute_error(y_train, ridge_train_pred)
print("Train MSE: ", mse_train_ridge)
print("Train R2: ", r2_train_ridge)
print("Train MAE: ", mae_train_ridge)
# Calculate Forecast metrics on validation set
mse_val_ridge = mean_squared_error(y_validation, ridge_val_pred)
r2_val_ridge = r2_score(y_validation, ridge_val_pred)
mae_val_ridge = mean_absolute_error(y_validation, ridge_val_pred)
print("Validation MSE: ", mse_val_ridge)
print("Validation R2: ", r2_val_ridge)
print("Validation MAE: ", mae_val_ridge)

Train MSE:  0.12049009685630677
Train R2:  0.9774497366585313
Train MAE:  0.26481709978142615
Validation MSE:  0.14362510985069066
Validation R2:  0.9843583666049067
Validation MAE:  0.2941826373449692


In [14]:
# Define parameter grid for GridSearchCV
#alphas_ridge = np.logspace(0.1, 5, num=500)
alphas_ridge = np.arange(1.0, 10, 0.5)
param_grid = {
    'alpha': alphas_ridge,  # [1.0] 
    'fit_intercept': [True],  # [True, False] 
    'copy_X': [True], 
    'max_iter': [None], 
    'tol': [0.0001], 
    'solver': ['auto'], 
    'positive': [False],  # [True, False]
    'random_state': [0]    
}

# Instantiate Ridge model
ridge = Ridge()
# Define GridSearchCV object
grid_search_ridge = GridSearchCV(ridge, param_grid, cv=tscv5, scoring='neg_mean_squared_error') 
# Fit GridSearchCV on training set
grid_search_ridge.fit(X_train, y_train)
# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_ridge.best_params_)
print("Best score: ", -grid_search_ridge.best_score_)
# Make predictions on validation set using best model from GridSearchCV
best_ridge = grid_search_ridge.best_estimator_
ridge_val_pred = best_ridge.predict(X_validation)
# Calculate Forecast metrics on validation set
mse_val_ridge = mean_squared_error(y_validation, ridge_val_pred)
r2_val_ridge = r2_score(y_validation, ridge_val_pred)
mae_val_ridge = mean_absolute_error(y_validation, ridge_val_pred)
print("Validation MSE: ", mse_val_ridge)
print("Validation R2: ", r2_val_ridge)
print("Validation MAE: ", mae_val_ridge)

Best parameter:  {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 0, 'solver': 'auto', 'tol': 0.0001}
Best score:  0.2710708702550876
Validation MSE:  0.15792558949477556
Validation R2:  0.9828009588494014
Validation MAE:  0.3048457445029009


In [33]:
# Get the coefficients from the Ridge model
coef = ridge.coef_
# Create a dataframe of feature importances
feature_importance_ridge = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_ridge = feature_importance_ridge.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_ridge.columns = ['feat_ridge', 'imp_ridge']
# Print the feature importances
feature_importance_ridge.head(15)

Unnamed: 0,feat_ridge,imp_ridge
0,lag_1,0.785743
1,milk2_cb,0.369223
2,oil_cb,0.351211
3,flour_lp,0.261752
4,lard_co,0.252792
5,onion2_tj,0.251549
6,sugar_tj,0.219797
7,oil_su,0.198505
8,silver,0.192101
9,sugar_sc,0.185257


### **3.2. Lasso**
Without tuning:
Validation MSE:  0.796
Validation R2:  0.640
Validation MAE:  0.690

In [40]:
#lasso = Lasso(random_state=0)
lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=10000, random_state=0)  
# Fit on training set
lasso.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)
# Calculate Forecast metrics on validation set
mse_val_lasso = mean_squared_error(y_validation, lasso_val_pred)
r2_val_lasso = r2_score(y_validation, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_validation, lasso_val_pred)
print("Validation MSE: ", mse_val_lasso)
print("Validation R2: ", r2_val_lasso)
print("Validation MAE: ", mae_val_lasso)

Train MSE:  0.26525480558196435
Train R2:  0.9503563705688042
Train MAE:  0.39137207800671525
Validation MSE:  0.299688446649142
Validation R2:  0.9673621359099116
Validation MAE:  0.42742352220563906


In [36]:
# Define parameter grid for GridSearchCV
alphas_lasso = np.logspace(-1, 2, num=500)
#alphas_lasso = np.arange(0.1, 100, 0.05)
param_grid = {
    'alpha': alphas_lasso,  # alphas_lasso [1.0]
    'fit_intercept': [True], 
    'precompute': [False], 
    'copy_X': [True], 
    'max_iter': [10000],  # [1000]
    'tol': [0.0001], 
    'warm_start': [False], 
    'positive': [False], 
    'random_state': [0], 
    'selection': ['cyclic']
}

# Instantiate lasso model
lasso = Lasso()
# Define GridSearchCV object
grid_search_lasso = GridSearchCV(lasso, param_grid, cv=tscv5, scoring='neg_mean_squared_error')  
# Fit GridSearchCV on training set
grid_search_lasso.fit(X_train, y_train)
# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_lasso.best_params_)
print("Best score: ", -grid_search_lasso.best_score_)
# Make predictions on validation set using best model from GridSearchCV
best_lasso = grid_search_lasso.best_estimator_
y_val_pred = best_lasso.predict(X_validation)
# Calculate RMSE on validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Make predictions on test set using best model from GridSearchCV
#y_test_pred_lasso = best_lasso.predict(X_test)

Best parameter:  {'alpha': 0.1, 'copy_X': True, 'fit_intercept': True, 'max_iter': 10000, 'positive': False, 'precompute': False, 'random_state': 0, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False}
Best score:  0.405853357754059
Validation MSE:  0.299688446649142
Validation R2:  0.9673621359099116
Validation MAE:  0.42742352220563906


In [41]:
# Get the coefficients from the Lasso model
coef = lasso.coef_
# Create a dataframe of feature importances
feature_importance_lasso = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_lasso = feature_importance_lasso.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_lasso.columns = ['feat_lasso', 'imp_lasso']
# Print the feature importances
feature_importance_lasso.head(15)

Unnamed: 0,feat_lasso,imp_lasso
0,lag_1,1.888944
1,sugar_sc,0.125754
2,rice_cb,0.11217
3,silver,0.07917
4,oil_su,0.073536
5,ycorn_tj,0.052165
6,tomato_tj,0.042336
7,inflacion,0.03624
8,peas_su,0.022003
9,watermelon_po,0.018432


### **3.3. ElasticNet**
Without tuning:
Validation MSE:  0.2237
Validation R2:  0.250
Validation MAE:  0.401

In [42]:
#enet = ElasticNet(random_state=0)
enet = ElasticNet(alpha=0.4, l1_ratio=0.97, fit_intercept=True, max_iter=10000, random_state=0) 
# Fit on training set
enet.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
enet_train_pred = enet.predict(X_train)
enet_val_pred = enet.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_enet = mean_squared_error(y_train, enet_train_pred)
r2_train_enet = r2_score(y_train, enet_train_pred)
mae_train_enet = mean_absolute_error(y_train, enet_train_pred)
print("Train MSE: ", mse_train_enet)
print("Train R2: ", r2_train_enet)
print("Train MAE: ", mae_train_enet)
# Calculate Forecast metrics on validation set
mse_val_enet = mean_squared_error(y_validation, enet_val_pred)
r2_val_enet = r2_score(y_validation, enet_val_pred)
mae_val_enet = mean_absolute_error(y_validation, enet_val_pred)
print("Validation MSE: ", mse_val_enet)
print("Validation R2: ", r2_val_enet)
print("Validation MAE: ", mae_val_enet)

Train MSE:  0.5355292635269001
Train R2:  0.8997732906298821
Train MAE:  0.5373925090451613
Validation MSE:  0.7089692113969585
Validation R2:  0.9227890129754613
Validation MAE:  0.6383506148809828


In [None]:
# Define parameter grid for GridSearchCV
#alphas_enet = np.logspace(-1, 2, num=500)
alphas_enet = np.arange(0.1, 10, 0.05)
param_grid = {
    'alpha': [1.0], # alphas_enet
    'l1_ratio': [0.5], # np.arange(0.1, 0.95, 0.05)
    'fit_intercept': [True], 
    'precompute': [False], 
    'max_iter': [10000], 
    'copy_X': [True], 
    'tol': [0.0001], 
    'warm_start': [False], 
    'positive': [False], 
    'random_state': [0], 
    'selection': ['cyclic']
}

# Instantiate enet model
enet = ElasticNet()
# Define GridSearchCV object
grid_search_enet = GridSearchCV(enet, param_grid, cv=tscv5, scoring='neg_mean_squared_error') 
# Fit GridSearchCV on training set
grid_search_enet.fit(X_train, y_train)
# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_enet.best_params_)
print("Best score: ", -grid_search_enet.best_score_)
# Make predictions on validation set using best model from GridSearchCV
best_enet = grid_search_enet.best_estimator_
y_val_pred = best_enet.predict(X_validation)
# Calculate RMSE on validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Make predictions on test set using best model from GridSearchCV
#y_test_pred_enet = best_enet.predict(X_test)

In [43]:
# Get the coefficients from the enet model
coef = enet.coef_
# Create a dataframe of feature importances
feature_importance_enet = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_enet = feature_importance_enet.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_enet.columns = ['feat_enet', 'imp_enet']
# Print the feature importances
feature_importance_enet.head(15)

Unnamed: 0,feat_enet,imp_enet
0,lag_1,1.816027
1,sugar_sc,0.00719
2,squash_co,0.0
3,tomato_tj,0.0
4,chicken_tr,0.0
5,papa2_tj,0.0
6,silver,0.0
7,sugar_tj,0.0
8,chicken_cb,0.0
9,peas_tj,0.0


### **3.4. ADA**
Without tuning:
Validation MSE:  0.407
Validation R2:  0.816
Validation MAE:  0.490

In [60]:
#ada = AdaBoostRegressor(random_state=0)
ada = AdaBoostRegressor(learning_rate=0.99, estimator=DecisionTreeRegressor(max_depth=7), random_state=0)
# Fit the model on training data
ada.fit(X_train, y_train)
# Make predictions on the validation set
ada_train_pred = ada.predict(X_train)
ada_val_pred = ada.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_ada = mean_squared_error(y_train, ada_train_pred)
r2_train_ada = r2_score(y_train, ada_train_pred)
mae_train_ada = mean_absolute_error(y_train, ada_train_pred)
print("Train MSE: ", mse_train_ada)
print("Train R2: ", r2_train_ada)
print("Train MAE: ", mae_train_ada)
# Evaluate the model on the validation set
mse_val_ada = mean_squared_error(y_validation, ada_val_pred)
r2_val_ada = r2_score(y_validation, ada_val_pred)
mae_val_ada = mean_absolute_error(y_validation, ada_val_pred)
print("Validation MSE: ", mse_val_ada)
print("Validation R2: ", r2_val_ada)
print("Validation MAE: ", mae_val_ada)

Train MSE:  0.003501187646615026
Train R2:  0.9993447369908481
Train MAE:  0.02743245720011363
Validation MSE:  0.4824007893846811
Validation R2:  0.9474636690972563
Validation MAE:  0.5308086758845862


In [None]:
# Define the AdaBoost Regressor
ada = AdaBoostRegressor()
# Define the range of hyperparameters to search over
param_grid_ada = {
    'estimator': [DecisionTreeRegressor(max_depth=3),  # [None]
                  DecisionTreeRegressor(max_depth=4),
                  DecisionTreeRegressor(max_depth=5),
                  DecisionTreeRegressor(max_depth=6),
                  DecisionTreeRegressor(max_depth=7),
                  DecisionTreeRegressor(max_depth=8) ],   
    'n_estimators': [50],  # range(50, 200, 5)
    'learning_rate': [1.0], # np.logspace(-5,1, 100)
    'loss': ['square'], # ['linear']
    'random_state': [0]
}

# Tune hyperparameters using GridSearchCV with TimeSeriesSplit
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=tscv5, scoring='neg_mean_squared_error')
grid_search_ada.fit(X_train, y_train)
# Evaluate the model using the best hyperparameters on the test set
ada_best = AdaBoostRegressor(**grid_search_ada.best_params_)
ada_best.fit(X_train, y_train)
# Make predictions on the validation set
y_val_pred = ada_best.predict(X_validation)
# Evaluate the model on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Print the best hyperparameters and the best score
print("Best parameters found: ", grid_search_ada.best_params_)
print("Lowest MSE found: ", -grid_search_ada.best_score_)
# Evaluate the model on the test set
#y_test_pred_ada = ada_best.predict(X_test)

In [61]:
# Create a DataFrame with the feature importance values
feature_importance_ada = pd.DataFrame({'Feature': X_train.columns, 'Importance': ada.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_ada = feature_importance_ada.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_ada.columns = ['feat', 'imp_ada']
# Print the feature importance DataFrame
feature_importance_ada.head(15)

Unnamed: 0,feat,imp_ada
0,lag_1,0.828091
1,sugar_tj,0.034874
2,chicken_tj,0.031449
3,ycorn_tj,0.008959
4,squash_co,0.007964
5,flour_lp,0.007102
6,lag_2,0.006183
7,tomato_tj,0.005996
8,sugar_sc,0.005812
9,platano_co,0.005631


### **3.5. GBR**
Without tuning:
Validation MSE:  0.407
Validation R2:  0.816
Validation MAE:  0.481

In [107]:
#gbr = GradientBoostingRegressor(random_state=0)
gbr = GradientBoostingRegressor(learning_rate=0.3, n_estimators=120, random_state=0)
# Fit the model on the training set
gbr.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
gbr_train_pred = gbr.predict(X_train)
gbr_val_pred = gbr.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_gbr = mean_squared_error(y_train, gbr_train_pred)
r2_train_gbr = r2_score(y_train, gbr_train_pred)
mae_train_gbr = mean_absolute_error(y_train, gbr_train_pred)
print("Train MSE: ", mse_train_gbr)
print("Train R2: ", r2_train_gbr)
print("Train MAE: ", mae_train_gbr)
# Calculate Forecast metrics on validation set
mse_val_gbr = mean_squared_error(y_validation, gbr_val_pred)
r2_val_gbr = r2_score(y_validation, gbr_val_pred)
mae_val_gbr = mean_absolute_error(y_validation, gbr_val_pred)
print("Validation MSE: ", mse_val_gbr)
print("Validation R2: ", r2_val_gbr)
print("Validation MAE: ", mae_val_gbr)

Train MSE:  1.0554826952531321e-05
Train R2:  0.9999980246166821
Train MAE:  0.0025301681097614995
Validation MSE:  0.30887472237516017
Validation R2:  0.9663616955459531
Validation MAE:  0.43475446738807083


In [89]:
# Define the model
gbr = GradientBoostingRegressor()
# Define the hyperparameters to be tuned
params = {
    'loss': ['squared_error'], # ['squared_error', 'absolute_error', 'huber', 'quantile']
    'learning_rate': [0.3], # [0.1] 
    'n_estimators': range(100, 300, 5),  # [110]
    'subsample': [1.0], 
    'criterion': ['friedman_mse'], # ['friedman_mse', 'squared_error']
    'min_samples_split': [2], 
    'min_samples_leaf': [1], 
    'min_weight_fraction_leaf': [0.0], 
    'max_depth': [4], # range(3,6,1)
    'min_impurity_decrease': [0.0], 
    'init': [None], 
    'random_state': [0], 
    'max_features': [None], 
    'alpha': [0.9], 
    'verbose': [0], 
    'max_leaf_nodes': [None], 
    'warm_start': [False], 
    'validation_fraction': [0.1], 
    'n_iter_no_change': [None], 
    'tol': [0.0001], 
    'ccp_alpha': [0.0]
}

# Create the GridSearchCV object
grid_gbr = GridSearchCV(gbr, params, cv=tscv5, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit the model on the training set with GridSearchCV
grid_gbr.fit(X_train, y_train)
# Print the best hyperparameters
print('Best hyperparameters:', grid_gbr.best_params_)
# Use the best model to make predictions on the validation set
y_val_pred = grid_gbr.predict(X_validation)
# Compute the mean squared error of the predictions on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)
# Use the best model to make predictions on the test set
#y_test_pred_gbr = grid_gbr.predict(X_test)

Best hyperparameters: {'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.3, 'loss': 'squared_error', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 180, 'n_iter_no_change': None, 'random_state': 0, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Validation MSE:  0.3791277621112379
Validation R2:  0.958710719379007
Validation MAE:  0.464979691442616


In [108]:
# Create a DataFrame with the feature importance values
feature_importance_gbr = pd.DataFrame({'Feature': X_train.columns, 'Importance': gbr.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_gbr = feature_importance_gbr.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_gbr.columns = ['feat_gbr', 'imp_gbr']
# Print the feature importance DataFrame
feature_importance_gbr.head(15)

Unnamed: 0,feat_gbr,imp_gbr
0,lag_1,0.798719
1,chicken_tj,0.099332
2,flour_lp,0.033513
3,sugar_sc,0.009717
4,lag_12,0.008335
5,silver,0.00642
6,tomato_tj,0.006328
7,chicken_tr,0.00434
8,oil_cb,0.004155
9,peas_tj,0.003668


### **3.4. RF**
Without tuning:
Validation MSE:  0.306
Validation R2:  0.861
Validation MAE:  0.420

In [126]:
#rf = RandomForestRegressor(random_state=0)
rf = RandomForestRegressor(criterion="absolute_error", max_depth=5, n_estimators=130, random_state=0)

# Fit the model to the training data and make predictions on the validation set
rf.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_rf = mean_squared_error(y_train, rf_train_pred)
r2_train_rf = r2_score(y_train, rf_train_pred)
mae_train_rf = mean_absolute_error(y_train, rf_train_pred)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)
# Calculate Forecast metrics on validation set
mse_val_rf = mean_squared_error(y_validation, rf_val_pred)
r2_val_rf = r2_score(y_validation, rf_val_pred)
mae_val_rf = mean_absolute_error(y_validation, rf_val_pred)
print("Validation MSE: ", mse_val_rf)
print("Validation R2: ", r2_val_rf)
print("Validation MAE: ", mae_val_rf)

Train MSE:  0.07303579074934144
Train R2:  0.9863310233975976
Train MAE:  0.20304947032092058
Validation MSE:  0.4043625326285658
Validation R2:  0.95596250195622
Validation MAE:  0.4992689131608102


In [127]:
# Define the Random Forest Regression model
rf_reg = RandomForestRegressor()

# Define the hyperparameters to tune
param_grid_rf = {
    'n_estimators': range(100, 300, 10),  # [100]
    'criterion': ['absolute_error'], 
    'max_depth': [7], # [None] 
    'min_samples_split': [2],  # range(2,20,1) 
    'min_samples_leaf': [1], 
    'min_weight_fraction_leaf': [0.0], 
    'max_features': [1.0], 
    'max_leaf_nodes': [None], 
    'min_impurity_decrease': [0.0], 
    'bootstrap': [True], 
    'oob_score': [False], 
    'n_jobs':[None], 
    'random_state': [0], 
    'verbose': [0], 
    'warm_start': [False], 
    'ccp_alpha': [0.0], 
    'max_samples': [None], 
    'monotonic_cst': [None]
}

# Define the GridSearchCV object
grid_rf_reg = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=tscv5, scoring='neg_mean_squared_error')
# Fit the GridSearchCV object to the training data
grid_rf_reg.fit(X_train, y_train)
# Extract the best hyperparameters and score
best_params = grid_rf_reg.best_params_
best_score = grid_rf_reg.best_score_
# Print the best hyperparameters found by GridSearchCV
print(f"Best hyperparameters: {best_params}")
print(f"Best score: {best_score}")
# Instantiate a new Random Forest Regression model using the best hyperparameters
rf_reg_best = RandomForestRegressor(**best_params)
# Fit the model to the training data and make predictions on the validation set
rf_reg_best.fit(X_train, y_train)
# Use the best model to make predictions on the validation set
y_val_pred = rf_reg_best.predict(X_validation)
# Compute the mean squared error of the predictions on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_rf = rf_reg_best.predict(X_test)

Best hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'absolute_error', 'max_depth': 7, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 160, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
Best score: -0.9843834316265578
Validation MSE:  0.29444041514778374
Validation R2:  0.8667731882502967
Validation MAE:  0.4229197462439819


In [127]:
# Create a DataFrame with the feature importance values
feature_importance_rf = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_rf.columns = ['feat', 'imp_rf']
# Print the feature importance DataFrame
feature_importance_rf.head(30)

Unnamed: 0,feat,imp_rf
0,lag_1,0.744405
1,sugar_tj,0.021616
2,chicken_tj,0.019536
3,ycorn_tj,0.015704
4,tomato_tj,0.013193
5,flour_lp,0.012898
6,yuca_lp,0.010333
7,lag_2,0.010108
8,sugar_sc,0.009855
9,lag_6,0.009828


### **3.5. ET**   
Without tuning:
Validation MSE:  0.587
Validation R2:  0.734
Validation MAE:  0.537

In [145]:
#et = ExtraTreesRegressor(random_state=0)
et = ExtraTreesRegressor(max_depth=10, random_state=0)
# Fit the model to the training data and make predictions on the validation set
et.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
et_train_pred = et.predict(X_train)
et_val_pred = et.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_et = mean_squared_error(y_train, et_train_pred)
r2_train_et = r2_score(y_train, et_train_pred)
mae_train_et = mean_absolute_error(y_train, et_train_pred)
print("Train MSE: ", mse_train_et)
print("Train R2: ", r2_train_et)
print("Train MAE: ", mae_train_et)
# Calculate Forecast metrics on validation set
mse_val_et = mean_squared_error(y_validation, et_val_pred)
r2_val_et = r2_score(y_validation, et_val_pred)
mae_val_et = mean_absolute_error(y_validation, et_val_pred)
print("Validation MSE: ", mse_val_et)
print("Validation R2: ", r2_val_et)
print("Validation MAE: ", mae_val_et)

Train MSE:  0.0006568715985793962
Train R2:  0.9998770635270784
Train MAE:  0.015543962672007165
Validation MSE:  0.21570377404436566
Validation R2:  0.9765085690166051
Validation MAE:  0.36623364454569796


In [None]:
# Define the Extra Trees Regression model
et_reg = ExtraTreesRegressor()
# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': [100], # range(100, 300, 15)
     'criterion': ['squared_error'], 
     'max_depth':[None], 
     'min_samples_split':[2], 
     'min_samples_leaf':[1], 
     'min_weight_fraction_leaf':[0.0], 
     'max_features':[1.0], 
     'max_leaf_nodes':[None], 
     'min_impurity_decrease':[0.0], 
     'bootstrap':[True], # [False] 
     'oob_score':[True], # [False]
     'n_jobs':[None], 
     'random_state':[0], 
     'verbose':[0], 
     'warm_start':[False], 
     'ccp_alpha':[0.0], 
     'max_samples':[None],   
     'monotonic_cst':[None]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(et_reg, param_grid=param_grid, cv=tscv5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
# Print the best hyperparameters and the corresponding validation score
print("Best hyperparameters: ", grid_search.best_params_)
#print("Validation score: ", grid_search.best_score_)
# Use the best model to make predictions on the validation set
best_et_reg = grid_search.best_estimator_
y_val_pred = best_et_reg.predict(X_validation)
# Compute the mean squared error of the predictions on the validation set
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_et = best_et_reg.predict(X_test)

In [146]:
# Create a DataFrame with the feature importance values
feature_importance_et = pd.DataFrame({'Feature': X_train.columns, 'Importance': et.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_et = feature_importance_et.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_et.columns = ['feat', 'imp_et']
# Print the feature importance DataFrame
feature_importance_et.head(30)

Unnamed: 0,feat,imp_et
0,lag_1,0.381876
1,lag_2,0.193318
2,lag_3,0.067342
3,chicken_tj,0.036333
4,oil_cb,0.03627
5,papa2_tj,0.033896
6,sugar_tj,0.031134
7,sugar_sc,0.027371
8,squash_co,0.026992
9,lard_co,0.018165


## **4. Report**

#### End