# <font color="red">**4. L1 FS - High-freq Forecast**</font>

**Author:** Osmar Bolivar

In [6]:
# Optional one-off install: kaleido is the static-image export engine used by
# plotly's `write_image`, which this notebook calls when saving the forecast figures.
# Left commented out; uncomment only if the package is missing from the kernel environment.
#%pip install kaleido

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go
import plotly.express as px

In [8]:
# List the matplotlib style sheets available in this environment.
print(plt.style.available)

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


## **1. Monthly data**

In [9]:
# Load the full panel; the first CSV column becomes the index.
# No `parse_dates` is passed, so the date labels are kept as plain strings.
raw = pd.read_csv("./Data/LASSO_DATASET.csv", index_col=0)
# Print column names, non-null counts and dtypes.
raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5901 entries, 2010-01-01 to 2024-10-31
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ipc_all        178 non-null    float64
 1   lag_1          5901 non-null   float64
 2   lag_2          5901 non-null   float64
 3   lag_3          5901 non-null   float64
 4   lag_6          5901 non-null   float64
 5   lag_9          5901 non-null   float64
 6   lag_12         5901 non-null   float64
 7   oil_cb         5901 non-null   float64
 8   sugar_sc       5901 non-null   float64
 9   milk2_cb       5901 non-null   float64
 10  watermelon_po  5901 non-null   float64
 11  flour_lp       5901 non-null   float64
 12  chicken_tj     5901 non-null   float64
 13  lard_co        5901 non-null   float64
 14  onion2_tj      5901 non-null   float64
 15  ycorn_tj       5901 non-null   float64
 16  platano_co     5901 non-null   float64
 17  oil_su         5901 non-null   float64
 18

In [10]:
# Per-column count of missing values; show only the columns that have gaps.
isna = raw.isna().sum()
isna.loc[isna > 0]

ipc_all      5723
inflacion    5418
dtype: int64

In [11]:
# Fill the gaps in `inflacion` by linear interpolation between observed values.
# `limit_direction='both'` also fills NaNs located before the first / after the last observation.
# NOTE(review): interior gaps are filled using the *next* observed value as well as the
# previous one — confirm this look-ahead is acceptable for a forecasting exercise.
raw['inflacion'] = raw['inflacion'].interpolate(method='linear', limit_direction='both')
# Display the filled series.
raw['inflacion']

2010-01-01    47.0
2010-01-02    47.0
2010-01-03    47.0
2010-01-04    47.0
2010-01-05    47.0
              ... 
2024-10-28    34.5
2024-10-29    32.0
2024-10-30    29.5
2024-10-31    27.0
2024-10-31    27.0
Name: inflacion, Length: 5901, dtype: float64

In [12]:
# Split the panel by sampling frequency and discard the `freq` marker column.
# Each result is a new frame, so `raw` itself is left untouched.
df = raw.query('freq == "month"').drop(columns=["freq"])      # monthly rows
df_w = raw.query('freq == "week"').drop(columns=["freq"])     # weekly rows
df_d = raw.query('freq == "day"').drop(columns=["freq"])      # daily rows

In [13]:
# Missing-value check on the monthly frame (an empty result means no gaps).
isna = df.isna().sum()
isna.loc[isna > 0]

Series([], dtype: int64)

In [14]:
# (rows, columns) of the monthly frame.
df.shape

(178, 35)

In [15]:
# Missing-value check on the daily frame; show only the columns with gaps.
isna_d = df_d.isna().sum()
isna_d.loc[isna_d > 0]

ipc_all    5418
dtype: int64

## **2. Train and Val sets**

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [17]:
# Hold out 20% of the monthly observations for validation.
# `train_test_split` shuffles rows by default, so with `random_state=42` the validation
# months are a reproducible random sample scattered across the whole period.
# NOTE(review): the data are time-ordered and include lagged regressors (`lag_*`); a random
# split is not an out-of-sample-in-time evaluation. The chronological alternative is kept
# commented out below — confirm the shuffled split is intended.
#train_size = int(len(df) * 0.8)
#train, validation = df[:train_size], df[train_size:]
train, validation = train_test_split(df, test_size=0.2, random_state=42)

# Report the size of each partition.
print(f'Obs in train set: {train.shape[0]}; variables in train set: {train.shape[1]}')
print(f'Obs in validation set: {validation.shape[0]}; variables in validation set: {validation.shape[1]}')

Obs in train set: 142; variables in train set: 35
Obs in validation set: 36; variables in validation set: 35


In [18]:
# The weekly and daily frames are copied in full to serve as the high-frequency test sets.
test_w, test_d = df_w.copy(), df_d.copy()

# Report the size of each test set.
print(f'Obs in weekly test set: {test_w.shape[0]}; variables in weekly test set: {test_w.shape[1]}')
print(f'Obs in daily test set: {test_d.shape[0]}; variables in daily test set: {test_d.shape[1]}')

Obs in weekly test set: 305; variables in weekly test set: 35
Obs in daily test set: 5418; variables in daily test set: 35


In [19]:
def _scaled_like(frame, fit=False):
    """Return `frame` standardised by `scaler`, keeping its column labels and index.

    With ``fit=True`` the scaler statistics are estimated from `frame` before transforming;
    otherwise the statistics already stored in `scaler` are applied.
    """
    values = scaler.fit_transform(frame) if fit else scaler.transform(frame)
    return pd.DataFrame(values, columns=frame.columns, index=frame.index)


# Scaling statistics come from the training partition only and are re-used everywhere else.
scaler = StandardScaler()
train_scaled = _scaled_like(train, fit=True)
validation_scaled = _scaled_like(validation)
test_scaled_w = _scaled_like(test_w)
test_scaled_d = _scaled_like(test_d)

# Regressors are the standardised columns without the target.
# The target itself is taken from the unscaled frames, so metrics stay in original units.
X_train = train_scaled.drop(columns='ipc_all')
y_train = train['ipc_all']

X_validation = validation_scaled.drop(columns='ipc_all')
y_validation = validation['ipc_all']

# High-frequency sets only need regressors.
X_test_w = test_scaled_w.drop(columns='ipc_all')
X_test_d = test_scaled_d.drop(columns='ipc_all')

## **3. Algorithms**

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# 5-fold expanding-window splitter for time-ordered cross-validation.
# NOTE(review): not referenced by any of the cells visible in this part of the notebook.
tscv5 = TimeSeriesSplit(n_splits=5)

### **3.1. Ridge**   

**Without tuning (Validation):**
MSE:  2.209; R2:  0.001; MAE:  1.256

In [21]:
# Ridge with fixed hyper-parameters (untuned baseline for reference: Ridge(random_state=0)).
ridge = Ridge(alpha=5.3, fit_intercept=True, positive=False, random_state=0).fit(X_train, y_train)

# Predictions for every data set: in-sample, validation, weekly and daily.
ridge_train_pred, ridge_val_pred, ridge_test_pred_w, ridge_test_pred_d = (
    ridge.predict(features) for features in (X_train, X_validation, X_test_w, X_test_d)
)

# In-sample fit metrics.
mse_train_ridge, r2_train_ridge, mae_train_ridge = (
    metric(y_train, ridge_train_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_ridge)
print("Train R2: ", r2_train_ridge)
print("Train MAE: ", mae_train_ridge)

# Validation metrics.
mse_val_ridge, r2_val_ridge, mae_val_ridge = (
    metric(y_validation, ridge_val_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_ridge)
print("Validation R2: ", r2_val_ridge)
print("Validation MAE: ", mae_val_ridge)

Train MSE:  0.12049009685630677
Train R2:  0.9774497366585313
Train MAE:  0.26481709978142615
Validation MSE:  0.14362510985069066
Validation R2:  0.9843583666049067
Validation MAE:  0.2941826373449692


In [22]:
# Get the coefficients from the Ridge model
coef = ridge.coef_
feature_importance_ridge = pd.DataFrame({'Feature': X_train.columns, 'Importance': coef})
feature_importance_ridge = feature_importance_ridge.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_ridge.columns = ['feat_ridge', 'imp_ridge']
feature_importance_ridge.head(15)

Unnamed: 0,feat_ridge,imp_ridge
0,lag_1,0.785743
1,milk2_cb,0.369223
2,oil_cb,0.351211
3,flour_lp,0.261752
4,lard_co,0.252792
5,onion2_tj,0.251549
6,sugar_tj,0.219797
7,oil_su,0.198505
8,silver,0.192101
9,sugar_sc,0.185257


### **3.2. Lasso**
Without tuning:
Validation MSE:  0.796
Validation R2:  0.640
Validation MAE:  0.690

In [23]:
#lasso = Lasso(random_state=0)
lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=10000, random_state=0)  
# Fit on training set
lasso.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_validation)
# Calculate Forecast metrics on train set
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)
# Calculate Forecast metrics on validation set
mse_val_lasso = mean_squared_error(y_validation, lasso_val_pred)
r2_val_lasso = r2_score(y_validation, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_validation, lasso_val_pred)
print("Validation MSE: ", mse_val_lasso)
print("Validation R2: ", r2_val_lasso)
print("Validation MAE: ", mae_val_lasso)

lasso_test_pred_w = lasso.predict(X_test_w)
lasso_test_pred_d = lasso.predict(X_test_d)

Train MSE:  0.26525480558196435
Train R2:  0.9503563705688042
Train MAE:  0.39137207800671525
Validation MSE:  0.299688446649142
Validation R2:  0.9673621359099116
Validation MAE:  0.42742352220563906


In [24]:
# Get the coefficients from the Lasso model
# Rank the Lasso coefficients by magnitude.
# The regressors in `X_train` were standardised, so |coefficient| is comparable across
# features. Ranking on the signed value would place negative coefficients below the
# zeroed-out ones; the sign is still reported in `imp_lasso`.
coef = lasso.coef_
feature_importance_lasso = (
    pd.DataFrame({'feat_lasso': X_train.columns, 'imp_lasso': coef})
    .sort_values('imp_lasso', key=lambda s: s.abs(), ascending=False)
    .reset_index(drop=True)
)
# Show the 15 most influential features.
feature_importance_lasso.head(15)

Unnamed: 0,feat_lasso,imp_lasso
0,lag_1,1.888944
1,sugar_sc,0.125754
2,rice_cb,0.11217
3,silver,0.07917
4,oil_su,0.073536
5,ycorn_tj,0.052165
6,tomato_tj,0.042336
7,inflacion,0.03624
8,peas_su,0.022003
9,watermelon_po,0.018432


### **3.4. ADA**
Without tuning:
Validation MSE:  0.324
Validation R2:  0.853
Validation MAE:  0.429

In [25]:
#ada = AdaBoostRegressor(random_state=0)
# AdaBoost over depth-7 trees with fixed hyper-parameters
# (untuned baseline for reference: AdaBoostRegressor(random_state=0)).
ada = AdaBoostRegressor(
    learning_rate=0.99,
    estimator=DecisionTreeRegressor(max_depth=7),
    random_state=0,
).fit(X_train, y_train)

# Predictions for every data set: in-sample, validation, weekly and daily.
ada_train_pred, ada_val_pred, ada_test_pred_w, ada_test_pred_d = (
    ada.predict(features) for features in (X_train, X_validation, X_test_w, X_test_d)
)

# In-sample fit metrics.
mse_train_ada, r2_train_ada, mae_train_ada = (
    metric(y_train, ada_train_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_ada)
print("Train R2: ", r2_train_ada)
print("Train MAE: ", mae_train_ada)

# Validation metrics.
mse_val_ada, r2_val_ada, mae_val_ada = (
    metric(y_validation, ada_val_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_ada)
print("Validation R2: ", r2_val_ada)
print("Validation MAE: ", mae_val_ada)

Train MSE:  0.003501187646615026
Train R2:  0.9993447369908481
Train MAE:  0.02743245720011363
Validation MSE:  0.4824007893846811
Validation R2:  0.9474636690972563
Validation MAE:  0.5308086758845862


In [26]:
# Create a DataFrame with the feature importance values
feature_importance_ada = pd.DataFrame({'Feature': X_train.columns, 'Importance': ada.feature_importances_})
feature_importance_ada = feature_importance_ada.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_ada.columns = ['feat', 'imp_ada']
feature_importance_ada.head(15)

Unnamed: 0,feat,imp_ada
0,lag_1,0.828091
1,sugar_tj,0.034874
2,chicken_tj,0.031449
3,ycorn_tj,0.008959
4,squash_co,0.007964
5,flour_lp,0.007102
6,lag_2,0.006183
7,tomato_tj,0.005996
8,sugar_sc,0.005812
9,platano_co,0.005631


### **3.5. GBR**
Without tuning:
Validation MSE:  0.277
Validation R2:  0.874
Validation MAE:  0.414

In [27]:
#gbr = GradientBoostingRegressor(random_state=0)
# Gradient boosting with fixed hyper-parameters
# (untuned baseline for reference: GradientBoostingRegressor(random_state=0)).
gbr = GradientBoostingRegressor(learning_rate=0.3, n_estimators=120, random_state=0).fit(X_train, y_train)

# Predictions for every data set: in-sample, validation, weekly and daily.
gbr_train_pred, gbr_val_pred, gbr_test_pred_w, gbr_test_pred_d = (
    gbr.predict(features) for features in (X_train, X_validation, X_test_w, X_test_d)
)

# In-sample fit metrics.
mse_train_gbr, r2_train_gbr, mae_train_gbr = (
    metric(y_train, gbr_train_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_gbr)
print("Train R2: ", r2_train_gbr)
print("Train MAE: ", mae_train_gbr)

# Validation metrics.
mse_val_gbr, r2_val_gbr, mae_val_gbr = (
    metric(y_validation, gbr_val_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_gbr)
print("Validation R2: ", r2_val_gbr)
print("Validation MAE: ", mae_val_gbr)

Train MSE:  1.0554826952531321e-05
Train R2:  0.9999980246166821
Train MAE:  0.0025301681097614995
Validation MSE:  0.30887472237516017
Validation R2:  0.9663616955459531
Validation MAE:  0.43475446738807083


In [28]:
# Create a DataFrame with the feature importance values
feature_importance_gbr = pd.DataFrame({'Feature': X_train.columns, 'Importance': gbr.feature_importances_})
feature_importance_gbr = feature_importance_gbr.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_gbr.columns = ['feat_gbr', 'imp_gbr']
feature_importance_gbr.head(15)

Unnamed: 0,feat_gbr,imp_gbr
0,lag_1,0.798719
1,chicken_tj,0.099332
2,flour_lp,0.033513
3,sugar_sc,0.009717
4,lag_12,0.008335
5,silver,0.00642
6,tomato_tj,0.006328
7,chicken_tr,0.00434
8,oil_cb,0.004155
9,peas_tj,0.003668


### **3.6. RF**
Without tuning:
Validation MSE:  0.324
Validation R2:  0.853
Validation MAE:  0.434

In [29]:
#rf = RandomForestRegressor(random_state=0)
# Random forest with fixed hyper-parameters
# (untuned baseline for reference: RandomForestRegressor(random_state=0)).
rf = RandomForestRegressor(
    criterion="absolute_error",
    max_depth=5,
    n_estimators=130,
    random_state=0,
).fit(X_train, y_train)

# Predictions for every data set: in-sample, validation, weekly and daily.
rf_train_pred, rf_val_pred, rf_test_pred_w, rf_test_pred_d = (
    rf.predict(features) for features in (X_train, X_validation, X_test_w, X_test_d)
)

# In-sample fit metrics.
mse_train_rf, r2_train_rf, mae_train_rf = (
    metric(y_train, rf_train_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)

# Validation metrics.
mse_val_rf, r2_val_rf, mae_val_rf = (
    metric(y_validation, rf_val_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_rf)
print("Validation R2: ", r2_val_rf)
print("Validation MAE: ", mae_val_rf)

Train MSE:  0.07303579074934144
Train R2:  0.9863310233975976
Train MAE:  0.20304947032092058
Validation MSE:  0.4043625326285658
Validation R2:  0.95596250195622
Validation MAE:  0.4992689131608102


In [30]:
# Create a DataFrame with the feature importance values
feature_importance_rf = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_rf.columns = ['feat', 'imp_rf']
feature_importance_rf.head(30)

Unnamed: 0,feat,imp_rf
0,lag_1,0.744405
1,sugar_tj,0.021616
2,chicken_tj,0.019536
3,ycorn_tj,0.015704
4,tomato_tj,0.013193
5,flour_lp,0.012898
6,yuca_lp,0.010333
7,lag_2,0.010108
8,sugar_sc,0.009855
9,lag_6,0.009828


### **3.7. ET**
Without tuning:
Validation MSE:  0.322
Validation R2:  0.854
Validation MAE:  0.423

In [31]:
#et = ExtraTreesRegressor(random_state=0)
# Extra-trees with fixed hyper-parameters
# (untuned baseline for reference: ExtraTreesRegressor(random_state=0)).
et = ExtraTreesRegressor(max_depth=10, random_state=0).fit(X_train, y_train)

# Predictions for every data set: in-sample, validation, weekly and daily.
et_train_pred, et_val_pred, et_test_pred_w, et_test_pred_d = (
    et.predict(features) for features in (X_train, X_validation, X_test_w, X_test_d)
)

# In-sample fit metrics.
mse_train_et, r2_train_et, mae_train_et = (
    metric(y_train, et_train_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_et)
print("Train R2: ", r2_train_et)
print("Train MAE: ", mae_train_et)

# Validation metrics.
mse_val_et, r2_val_et, mae_val_et = (
    metric(y_validation, et_val_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_et)
print("Validation R2: ", r2_val_et)
print("Validation MAE: ", mae_val_et)

Train MSE:  0.0006568715985793962
Train R2:  0.9998770635270784
Train MAE:  0.015543962672007165
Validation MSE:  0.21570377404436566
Validation R2:  0.9765085690166051
Validation MAE:  0.36623364454569796


In [32]:
# Extra-trees feature importances, largest first.
feature_importance_et = (
    pd.DataFrame({'feat': X_train.columns, 'imp_et': et.feature_importances_})
    .sort_values(by='imp_et', ascending=False)
    .reset_index(drop=True)
)
feature_importance_et.head(30)

Unnamed: 0,feat,imp_et
0,lag_1,0.381876
1,lag_2,0.193318
2,lag_3,0.067342
3,chicken_tj,0.036333
4,oil_cb,0.03627
5,papa2_tj,0.033896
6,sugar_tj,0.031134
7,sugar_sc,0.027371
8,squash_co,0.026992
9,lard_co,0.018165


## **4. Report**

In [33]:
def _forecast_frame(cpi, predictions, index, label):
    """Assemble one forecast table.

    `cpi` fills the first column, `predictions` maps model name -> predicted values
    (column order follows the mapping), `index` labels the rows, and `label` is
    written to the trailing `set` column.
    """
    frame = pd.DataFrame({'cpi': cpi, **predictions}, index=index)
    frame['set'] = label
    return frame


# Validation months: observed target next to each model's prediction.
val_forecast = _forecast_frame(
    y_validation,
    {'ridge': ridge_val_pred,
     'lasso': lasso_val_pred,
     'ada': ada_val_pred,
     'gbr': gbr_val_pred,
     'rf': rf_val_pred,
     'et': et_val_pred},
    validation.index,
    'validation',
)

# Weekly rows: predictions only, `cpi` is filled with NaN.
week_forecast = _forecast_frame(
    np.nan,
    {'ridge': ridge_test_pred_w,
     'lasso': lasso_test_pred_w,
     'ada': ada_test_pred_w,
     'gbr': gbr_test_pred_w,
     'rf': rf_test_pred_w,
     'et': et_test_pred_w},
    test_w.index,
    'week',
)

# Daily rows: predictions only, `cpi` is filled with NaN.
day_forecast = _forecast_frame(
    np.nan,
    {'ridge': ridge_test_pred_d,
     'lasso': lasso_test_pred_d,
     'ada': ada_test_pred_d,
     'gbr': gbr_test_pred_d,
     'rf': rf_test_pred_d,
     'et': et_test_pred_d},
    test_d.index,
    'day',
)

# Stack the three tables and order the rows by index label.
all_forecast = pd.concat([val_forecast, week_forecast, day_forecast], axis=0).sort_index(ascending=True)
all_forecast.tail(40)

Unnamed: 0,cpi,ridge,lasso,ada,gbr,rf,et,set
2024-09-27,,6.488237,5.761154,6.182067,6.314872,5.859222,6.374218,day
2024-09-28,,6.513361,5.759994,6.182067,6.378789,5.850298,6.456159,day
2024-09-29,,6.484345,5.752832,6.182067,6.326907,5.8307,6.335165,week
2024-09-29,,6.505916,5.757269,6.182067,6.378789,5.850298,6.456159,day
2024-09-30,,6.50326,5.763164,6.182067,6.374956,5.886209,6.46889,day
2024-10-01,,7.054246,6.589385,7.938964,7.45974,7.113879,7.610004,day
2024-10-02,,7.061926,6.592374,7.938964,7.45974,7.128758,7.610004,day
2024-10-03,,7.081695,6.600344,7.938964,7.762455,7.128758,7.627573,day
2024-10-04,,7.081185,6.599957,7.938964,7.762455,7.128758,7.627573,day
2024-10-05,,7.078694,6.598753,7.938964,7.45974,7.128758,7.627573,day


In [34]:
# Validation scoreboard: one row per model, best (lowest) MSE first.
metrics = pd.DataFrame(
    [
        [mse_val_ridge, r2_val_ridge, mae_val_ridge],
        [mse_val_lasso, r2_val_lasso, mae_val_lasso],
        [mse_val_ada, r2_val_ada, mae_val_ada],
        [mse_val_gbr, r2_val_gbr, mae_val_gbr],
        [mse_val_rf, r2_val_rf, mae_val_rf],
        [mse_val_et, r2_val_et, mae_val_et],
    ],
    columns=['MSE', 'R2', 'MAE'],
    index=['ridge', 'lasso', 'ada', 'gbr', 'rf', 'et'],
)
metrics = metrics.sort_values(['MSE'], ascending=True)
metrics

Unnamed: 0,MSE,R2,MAE
ridge,0.143625,0.984358,0.294183
et,0.215704,0.976509,0.366234
lasso,0.299688,0.967362,0.427424
gbr,0.308875,0.966362,0.434754
rf,0.404363,0.955963,0.499269
ada,0.482401,0.947464,0.530809


In [35]:
# Row-wise envelope of the six model forecasts.
_forecast_cols = ['ridge', 'lasso', 'ada', 'gbr', 'rf', 'et']
all_forecast['min'] = all_forecast[_forecast_cols].min(axis=1)
all_forecast['max'] = all_forecast[_forecast_cols].max(axis=1)

# Inverse-validation-MSE weights, one per model in the order of `_forecast_cols`.
# The common 1/1000 factor cancels in the ratios below.
inv1, inv2, inv3, inv4, inv5, inv6 = (
    (1 / mse) / 1000
    for mse in (mse_val_ridge, mse_val_lasso, mse_val_ada, mse_val_gbr, mse_val_rf, mse_val_et)
)

# Weighted average across all six models (terms accumulated in column order).
num_1 = all_forecast['ridge'] * inv1
for _col, _weight in zip(_forecast_cols[1:], (inv2, inv3, inv4, inv5, inv6)):
    num_1 = num_1 + all_forecast[_col] * _weight
den_1 = inv1 + inv2 + inv3 + inv4 + inv5 + inv6
all_forecast['w_avg'] = num_1 / den_1

# Weighted average restricted to ridge, extra-trees and lasso.
num_2 = (
    all_forecast['ridge'] * inv1
    + all_forecast['et'] * inv6
    + all_forecast['lasso'] * inv2
)
den_2 = inv1 + inv6 + inv2
all_forecast['w_avg_best'] = num_2 / den_2

all_forecast

Unnamed: 0,cpi,ridge,lasso,ada,gbr,rf,et,set,min,max,w_avg,w_avg_best
2010-01-01,,-0.304772,0.610665,0.074472,-0.108073,0.053954,0.186372,day,-0.304772,0.610665,0.031586,0.052204
2010-01-02,,-0.236446,0.602463,0.074472,-0.206282,0.053954,0.147775,day,-0.236446,0.602463,0.029450,0.070243
2010-01-03,,-0.234024,0.603461,0.074472,-0.206282,0.054912,0.147775,day,-0.234024,0.603461,0.030443,0.071595
2010-01-04,,-0.493750,0.590062,0.074472,-0.195710,0.054912,0.111753,day,-0.493750,0.590062,-0.056966,-0.063659
2010-01-05,,-0.541573,0.593752,0.074472,-0.185909,0.051194,0.081405,day,-0.541573,0.593752,-0.076281,-0.094549
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-27,,7.427080,6.753992,7.938964,7.797957,7.175745,7.686946,week,6.753992,7.938964,7.453449,7.357365
2024-10-28,,7.616825,6.854450,7.938964,10.122065,7.763411,8.017403,day,6.854450,10.122065,7.988953,7.570839
2024-10-29,,7.569679,6.838239,7.938964,10.112478,7.771712,8.050816,day,6.838239,10.112478,7.978479,7.555610
2024-10-30,,7.711327,6.773495,7.938964,9.759882,7.741934,7.972323,day,6.773495,9.759882,7.942875,7.582814


In [36]:
# Overwrite `cpi` with the observed monthly target looked up by row label.
# Every row of `all_forecast` (validation, week or day) whose label matches a label in the
# monthly frame receives that month's value; all other rows become NaN.
all_forecast['cpi'] = df['ipc_all'].reindex(all_forecast.index)
# Inspect the most recent rows.
all_forecast.tail(35)

Unnamed: 0,cpi,ridge,lasso,ada,gbr,rf,et,set,min,max,w_avg,w_avg_best
2024-10-01,,7.054246,6.589385,7.938964,7.45974,7.113879,7.610004,day,6.589385,7.938964,7.244379,7.122898
2024-10-02,,7.061926,6.592374,7.938964,7.45974,7.128758,7.610004,day,6.592374,7.938964,7.248791,7.127146
2024-10-03,,7.081695,6.600344,7.938964,7.762455,7.128758,7.627573,day,6.600344,7.938964,7.302744,7.143596
2024-10-04,,7.081185,6.599957,7.938964,7.762455,7.128758,7.627573,day,6.599957,7.938964,7.302531,7.143272
2024-10-05,,7.078694,6.598753,7.938964,7.45974,7.128758,7.627573,day,6.598753,7.938964,7.258452,7.141841
2024-10-06,,7.076202,6.597549,7.938964,7.461773,7.128758,7.627573,day,6.597549,7.938964,7.257801,7.140411
2024-10-06,,7.075353,6.597022,7.938964,7.461773,7.128758,7.610004,week,6.597022,7.938964,7.253879,7.134444
2024-10-07,,7.067404,6.594366,7.938964,7.45974,7.128758,7.610004,day,6.594366,7.938964,7.250763,7.130145
2024-10-08,,6.835542,6.509779,7.938964,7.463353,7.092062,7.552508,day,6.509779,7.938964,7.152069,6.98531
2024-10-09,,6.831547,6.508753,7.938964,7.463353,7.095948,7.570077,day,6.508753,7.938964,7.154702,6.988672


In [37]:
# Score the two forecast combinations on the validation rows only.
only_validation = all_forecast.query('set == "validation"')

mse_val_w_avg, r2_val_w_avg, mae_val_w_avg = (
    metric(only_validation['cpi'], only_validation['w_avg'])
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

mse_val_w_avg_best, r2_val_w_avg_best, mae_val_w_avg_best = (
    metric(only_validation['cpi'], only_validation['w_avg_best'])
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

metrics_b = pd.DataFrame(
    [
        [mse_val_w_avg, r2_val_w_avg, mae_val_w_avg],
        [mse_val_w_avg_best, r2_val_w_avg_best, mae_val_w_avg_best],
    ],
    columns=['MSE', 'R2', 'MAE'],
    index=['w_avg', 'w_avg_best'],
)

# Single scoreboard with individual models and combinations, lowest MSE first.
metrics_all = pd.concat([metrics, metrics_b], axis=0).sort_values(['MSE'], ascending=True)
metrics_all

Unnamed: 0,MSE,R2,MAE
ridge,0.143625,0.984358,0.294183
w_avg_best,0.155714,0.983042,0.305367
w_avg,0.193061,0.978974,0.360927
et,0.215704,0.976509,0.366234
lasso,0.299688,0.967362,0.427424
gbr,0.308875,0.966362,0.434754
rf,0.404363,0.955963,0.499269
ada,0.482401,0.947464,0.530809


In [38]:
# Create a scatter plot
fig = px.scatter(only_validation, x='cpi', y='ridge')
# Fit a line to the data
fit = np.polyfit(only_validation['cpi'], only_validation['ridge'], deg=1)
fit_fn = np.poly1d(fit)
# Add the fitted line to the plot
fig.add_trace(go.Scatter(
    x=only_validation['cpi'], 
    y=fit_fn(only_validation['cpi']), 
    mode='lines', 
    name='Fitted Line',
    line=dict(color='red', width=2)
))

# Update layout for better visualization
fig.update_layout(
    xaxis_title='Observed y-o-y Inflation',
    yaxis_title='Best Forecast',
    template='plotly_white',
    width=800, 
    height=600,
    legend=dict(
        x=0.8,
        y=0.1,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='black',
        borderwidth=1
    ),
    title=dict(
        x=0.5,
        xanchor='center'
    )
)

# Show the plot
fig.show()

In [39]:
# Independent copies of the weekly and daily rows for plotting.
weekly_forecast = all_forecast.query('set == "week"').copy()
daily_forecast = all_forecast.query('set == "day"').copy()

#### Weekly Forecast

In [40]:
# Shaded band between the lowest and highest model forecast: the polygon follows the
# lower bound left-to-right and returns along the upper bound right-to-left.
_week_band = go.Scatter(
    x=weekly_forecast.index.tolist() + weekly_forecast.index[::-1].tolist(),
    y=weekly_forecast["min"].tolist() + weekly_forecast["max"][::-1].tolist(),
    fill='toself',
    fillcolor='#ffdabe',
    line={'color': 'rgba(255,255,255,0.5)'},
    hoverinfo="skip",
    showlegend=True,
    name="Forecast Interval",
)
# Weekly forecast line (ridge predictions).
_week_line = go.Scatter(
    x=weekly_forecast.index,
    y=weekly_forecast["ridge"],
    mode='lines+markers',
    name="Weekly Forecast",
    line={'color': '#ff7d00', 'width': 1.0},
    marker={'size': 2.5},
)
# Observed monthly values from 2018-12-30 onwards, drawn as markers.
_week_observed = df.loc['2018-12-30':, "ipc_all"]
_week_points = go.Scatter(
    x=_week_observed.index,
    y=_week_observed,
    mode='markers',
    name="Observed y-o-y Inflation",
    line={'color': '#073763', 'width': 0.8},
    marker={'size': 4},
)

fig1 = go.Figure(data=[_week_band, _week_line, _week_points])

# Figure size and boxed legend; title and axis titles are left unset
# (previously sketched options: title='High-frequency Inflation Forecast and y-o-y Inflation',
#  xaxis_title='Date', yaxis_title='%', legend_title='Legend').
fig1.update_layout(
    template='plotly_white',
    width=1200,
    height=600,
    legend={
        'x': 0.6,
        'y': 0.99,
        'bgcolor': 'rgba(255, 255, 255, 0.8)',
        'font': {'size': 15},
        'bordercolor': 'black',
        'borderwidth': 1,
    },
)
# Month-year tick labels on the date axis.
fig1.update_xaxes(tickformat='%b %Y')
fig1.show()

# Export a high-resolution PNG.
fig1.write_image('./Images/F3_Weekly_Forecast.png', format='png', scale=3)

#### Daily Forecast

In [41]:
# Shaded band between the lowest and highest model forecast: the polygon follows the
# lower bound left-to-right and returns along the upper bound right-to-left.
_day_band = go.Scatter(
    x=daily_forecast.index.tolist() + daily_forecast.index[::-1].tolist(),
    y=daily_forecast["min"].tolist() + daily_forecast["max"][::-1].tolist(),
    fill='toself',
    fillcolor='#ffdabe',
    line={'color': 'rgba(255,255,255,0.5)'},
    hoverinfo="skip",
    showlegend=True,
    name="Forecast Interval",
)
# Daily forecast line.
# NOTE(review): this chart plots `w_avg_best` whereas the weekly chart plots `ridge` —
# confirm the two are meant to show different series.
_day_line = go.Scatter(
    x=daily_forecast.index,
    y=daily_forecast["w_avg_best"],
    mode='lines',
    name="Daily Forecast",
    line={'color': '#ff7d00', 'width': 1.0},
)
# All observed monthly values, drawn as markers.
_day_points = go.Scatter(
    x=df.index,
    y=df["ipc_all"],
    mode='markers',
    name="Observed y-o-y Inflation",
    line={'color': '#073763', 'width': 0.8},
    marker={'size': 4},
)

fig2 = go.Figure(data=[_day_band, _day_line, _day_points])

# Figure size and boxed legend; title and axis titles are left unset
# (previously sketched options: title='High-frequency Inflation Forecast and y-o-y Inflation',
#  xaxis_title='Date', yaxis_title='%', legend_title='Legend').
fig2.update_layout(
    template='plotly_white',
    width=1200,
    height=600,
    legend={
        'x': 0.75,
        'y': 0.99,
        'bgcolor': 'rgba(255, 255, 255, 0.8)',
        'font': {'size': 15},
        'bordercolor': 'black',
        'borderwidth': 1,
    },
)
# Month-year tick labels on the date axis.
fig2.update_xaxes(tickformat='%b %Y')
fig2.show()

# Export a high-resolution PNG.
fig2.write_image('./Images/F3_Daily_Forecast.png', format='png', scale=3)

In [42]:
# Persist the combined forecast table (all models, envelope and weighted averages) to CSV.
# The target directory must already exist; `to_csv` does not create it.
all_forecast.to_csv('./Forecast/F3_FORECAST.csv')

## Bridge Equation

In [43]:
# `sm` is already imported in the notebook's first import cell; this re-import is
# redundant but harmless.
import statsmodels.api as sm

# Benchmark: plain OLS on the same standardised regressors used by the ML models.
# statsmodels does not add an intercept automatically, so prepend a constant column.
X_train_const = sm.add_constant(X_train)
# Fit the OLS model on the (unscaled) target.
bridge_model = sm.OLS(y_train, X_train_const).fit()
# Print the full regression summary table.
print(bridge_model.summary())

                            OLS Regression Results                            
Dep. Variable:                ipc_all   R-squared:                       0.981
Model:                            OLS   Adj. R-squared:                  0.975
Method:                 Least Squares   F-statistic:                     165.5
Date:                Wed, 26 Mar 2025   Prob (F-statistic):           5.10e-78
Time:                        14:49:40   Log-Likelihood:                -37.808
No. Observations:                 142   AIC:                             145.6
Df Residuals:                     107   BIC:                             249.1
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             3.3717      0.031    110.444

In [44]:
# Add a constant to the validation set
X_validation_const = sm.add_constant(X_validation)
# Make predictions on the validation set
bridge_val_pred = bridge_model.predict(X_validation_const)

# Calculate Forecast metrics on validation set
mse_val_bridge = mean_squared_error(y_validation, bridge_val_pred)
r2_val_bridge = r2_score(y_validation, bridge_val_pred)
mae_val_bridge = mean_absolute_error(y_validation, bridge_val_pred)
print("Validation MSE: ", mse_val_bridge)
print("Validation R2: ", r2_val_bridge)
print("Validation MAE: ", mae_val_bridge)

Validation MSE:  0.17394719498576086
Validation R2:  0.9810561101961865
Validation MAE:  0.32019816556726555


In [45]:
# Validation MSE and R2 of each approach expressed relative to the ridge model
# (so the ridge row is 1.0 by construction).
tab_comparison = pd.DataFrame(
    [
        [mse_val_ridge / mse_val_ridge, r2_val_ridge / r2_val_ridge],
        [mse_val_bridge / mse_val_ridge, r2_val_bridge / r2_val_ridge],
    ],
    columns=["Rel-MSE", "Rel-R2"],
    index=["Two-Step ML Final Model", "Bridge Equation"],
)

tab_comparison

Unnamed: 0,Rel-MSE,Rel-R2
Two-Step ML Final Model,1.0,1.0
Bridge Equation,1.21112,0.996645


## MIDAS Regression