# <font color="red">**4. NO FS - High-freq Forecast**</font>

**Author:** Osmar Bolivar

In [2]:
#%pip install kaleido

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go
import plotly.express as px

In [4]:
# List the matplotlib style sheets available in this environment.
print(plt.style.available)

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']


## **1. Monthly data**

In [5]:
# Load the combined dataset; the first CSV column becomes the row index.
raw = pd.read_csv("./Data/FINAL_DATASET_G12.csv", index_col=0)
# Summarise index range, column count, dtypes and memory usage.
raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5901 entries, 2010-01-01 to 2024-10-31
Columns: 461 entries, ipc_all to freq
dtypes: float64(460), object(1)
memory usage: 20.8+ MB


In [6]:
# Missing values per column, showing only the columns that have any.
isna = raw.isna().sum()
isna.loc[isna.gt(0)]

ipc_all                 5723
Inflación               5418
demanda                 5418
desempleo               5418
dinero                  5418
economia                5418
ine                     5418
ine bolivia             5418
inflacion               5418
inflacion en bolivia    5418
inflación bolivia       5418
ipc                     5418
la inflacion            5418
la inflación            5418
pib                     5418
pib bolivia             5418
que es inflacion        5418
que es pib              5418
dtype: int64

In [7]:
# Fill gaps in the predictors only. The first column (the target) is skipped so
# its NaNs are preserved, and non-numeric columns are excluded because
# DataFrame.interpolate on object-dtype columns is deprecated since pandas 2.1.
# Linear interpolation follows the row order (the index values are not used);
# limit_direction='both' also fills leading and trailing gaps.
numeric_features = raw.iloc[:, 1:].select_dtypes(include="number").columns
raw[numeric_features] = raw[numeric_features].interpolate(method='linear', limit_direction='both')

In [8]:
# One frame per sampling frequency, selected by the value of the 'freq' column;
# the marker column itself is dropped from each result.
df = raw.query('freq == "month"').drop(columns=["freq"])
df_w = raw.query('freq == "week"').drop(columns=["freq"])
df_d = raw.query('freq == "day"').drop(columns=["freq"])

In [9]:
# Remaining missing values in the monthly frame (columns with at least one NaN).
isna = df.isna().sum()
isna.loc[isna.gt(0)]

Series([], dtype: int64)

In [10]:
# (rows, columns) of the monthly frame.
df.shape

(178, 460)

In [11]:
# Remaining missing values in the daily frame (columns with at least one NaN).
isna_d = df_d.isna().sum()
isna_d.loc[isna_d.gt(0)]

ipc_all    5418
dtype: int64

## **2. Train and Val sets**

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [13]:
# Alternative kept for reference: chronological 80/20 split.
#train_size = int(len(df) * 0.8)
#train, validation = df[:train_size], df[train_size:]
# Random 80/20 split of the monthly rows with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default, so validation months
# are interleaved with training months; the validation metrics then measure fit
# inside the training period rather than out-of-time forecasting. Confirm that
# this is intended before comparing against the chronological split above.
train, validation = train_test_split(df, test_size=0.2, random_state=5)

# Report the size of each split.
print(f'Obs in train set: {train.shape[0]}; variables in train set: {train.shape[1]}')
print(f'Obs in validation set: {validation.shape[0]}; variables in validation set: {validation.shape[1]}')

Obs in train set: 142; variables in train set: 460
Obs in validation set: 36; variables in validation set: 460


In [14]:
# The weekly and daily frames are used whole as prediction ("test") inputs;
# copies are taken so the originals stay untouched.
test_w = df_w.copy()
test_d = df_d.copy()
# Report the size of each prediction set.
print(f'Obs in weekly test set: {test_w.shape[0]}; variables in weekly test set: {test_w.shape[1]}')
print(f'Obs in daily test set: {test_d.shape[0]}; variables in daily test set: {test_d.shape[1]}')

Obs in weekly test set: 305; variables in weekly test set: 460
Obs in daily test set: 5418; variables in daily test set: 460


In [15]:
# Learn the standardisation from the training rows only, then apply the same
# transform to the validation, weekly and daily frames.
scaler = StandardScaler().fit(train)


def _scaled(frame):
    """Return `frame` transformed by the fitted scaler, keeping its labels."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


train_scaled = _scaled(train)
validation_scaled = _scaled(validation)
test_scaled_w = _scaled(test_w)
test_scaled_d = _scaled(test_d)

# Predictors are the scaled columns without the target. The target itself is
# read from the unscaled frames, so models are fitted and scored in original
# units (use the *_scaled frames instead to model the standardised target).
X_train = train_scaled.drop(columns='ipc_all')
y_train = train['ipc_all']

X_validation = validation_scaled.drop(columns='ipc_all')
y_validation = validation['ipc_all']

X_test_w = test_scaled_w.drop(columns='ipc_all')
X_test_d = test_scaled_d.drop(columns='ipc_all')

## **3. Algorithms**

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# 5-fold time-series cross-validation splitter.
# NOTE(review): not referenced by the other cells shown in this notebook section.
tscv5 = TimeSeriesSplit(n_splits=5)

### **3.1. Ridge**   

**Without tuning (Validation):**
MSE:  2.209; R2:  0.001; MAE:  1.256

In [17]:
# Ridge regression; only the random seed is set.
# Alternative configuration kept for reference:
#   Ridge(alpha=5.3, fit_intercept=True, positive=False, random_state=0)
ridge = Ridge(random_state=0)
ridge.fit(X_train, y_train)

# Predictions of the fitted model on the training and validation predictors.
ridge_train_pred = ridge.predict(X_train)
ridge_val_pred = ridge.predict(X_validation)

# Training-set error metrics.
mse_train_ridge, r2_train_ridge, mae_train_ridge = (
    score(y_train, ridge_train_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_ridge)
print("Train R2: ", r2_train_ridge)
print("Train MAE: ", mae_train_ridge)

# Validation-set error metrics.
mse_val_ridge, r2_val_ridge, mae_val_ridge = (
    score(y_validation, ridge_val_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_ridge)
print("Validation R2: ", r2_val_ridge)
print("Validation MAE: ", mae_val_ridge)

# Predictions for the weekly and daily predictor sets.
ridge_test_pred_w = ridge.predict(X_test_w)
ridge_test_pred_d = ridge.predict(X_test_d)

Train MSE:  0.0007557469688483131
Train R2:  0.9998754929958378
Train MAE:  0.016398256320598405
Validation MSE:  0.43342498261910106
Validation R2:  0.9367413634279705
Validation MAE:  0.472041037909121


In [18]:
# Rank predictors by their signed Ridge coefficient, largest first; strongly
# negative coefficients therefore end up at the bottom of the table.
coef = ridge.coef_
feature_importance_ridge = (
    pd.DataFrame({'feat_ridge': X_train.columns, 'imp_ridge': coef})
    .sort_values('imp_ridge', ascending=False)
    .reset_index(drop=True)
)
feature_importance_ridge.head(15)

Unnamed: 0,feat_ridge,imp_ridge
0,lag_1,0.197362
1,lag_2,0.129095
2,orange2_cb,0.110233
3,oil_cb,0.106868
4,carrot_tj,0.103232
5,orange_sc,0.101964
6,sugar_lp,0.096775
7,watermelon_po,0.090723
8,flour_lp,0.089361
9,ycorn_cb,0.088808


### **3.2. Lasso**
Without tuning:
Validation MSE:  0.796
Validation R2:  0.640
Validation MAE:  0.690

In [19]:
# Lasso with alpha=0.1 and max_iter=10000.
# Alternative configuration kept for reference: Lasso(random_state=0)
lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=10000, random_state=0)
lasso.fit(X_train, y_train)

# Predictions of the fitted model on the training and validation predictors.
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_validation)

# Training-set error metrics.
mse_train_lasso, r2_train_lasso, mae_train_lasso = (
    score(y_train, lasso_train_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)

# Validation-set error metrics.
mse_val_lasso, r2_val_lasso, mae_val_lasso = (
    score(y_validation, lasso_val_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_lasso)
print("Validation R2: ", r2_val_lasso)
print("Validation MAE: ", mae_val_lasso)

# Predictions for the weekly and daily predictor sets.
lasso_test_pred_w = lasso.predict(X_test_w)
lasso_test_pred_d = lasso.predict(X_test_d)

Train MSE:  0.21202317737538237
Train R2:  0.9650698292999013
Train MAE:  0.3535134653605935
Validation MSE:  0.46631260536969726
Validation R2:  0.9319413951318969
Validation MAE:  0.5274531363125456


In [20]:
# Rank predictors by their signed Lasso coefficient, largest first; negative
# coefficients sort to the bottom of the table.
coef = lasso.coef_
feature_importance_lasso = (
    pd.DataFrame({'feat_lasso': X_train.columns, 'imp_lasso': coef})
    .sort_values('imp_lasso', ascending=False)
    .reset_index(drop=True)
)
feature_importance_lasso.head(15)

Unnamed: 0,feat_lasso,imp_lasso
0,lag_1,1.932217
1,sugar_lp,0.259622
2,oil_su,0.071985
3,rice4_su,0.065762
4,tomato_cb,0.064752
5,rice_sc,0.060773
6,chicken_sc,0.057197
7,silver,0.02815
8,rice2_cb,0.025933
9,rice_lp,0.018307


### **3.4. ADA**
Without tuning:
Validation MSE:  0.324
Validation R2:  0.853
Validation MAE:  0.429

In [21]:
# AdaBoost over regression trees of depth 7 with learning_rate=0.99.
# Alternative configuration kept for reference: AdaBoostRegressor(random_state=0)
ada = AdaBoostRegressor(learning_rate=0.99, estimator=DecisionTreeRegressor(max_depth=7), random_state=0)
ada.fit(X_train, y_train)

# Predictions of the fitted model on the training and validation predictors.
ada_train_pred = ada.predict(X_train)
ada_val_pred = ada.predict(X_validation)

# Training-set error metrics.
mse_train_ada, r2_train_ada, mae_train_ada = (
    score(y_train, ada_train_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_ada)
print("Train R2: ", r2_train_ada)
print("Train MAE: ", mae_train_ada)

# Validation-set error metrics.
mse_val_ada, r2_val_ada, mae_val_ada = (
    score(y_validation, ada_val_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_ada)
print("Validation R2: ", r2_val_ada)
print("Validation MAE: ", mae_val_ada)

# Predictions for the weekly and daily predictor sets.
ada_test_pred_w = ada.predict(X_test_w)
ada_test_pred_d = ada.predict(X_test_d)

Train MSE:  0.002313466076886468
Train R2:  0.9996188635319267
Train MAE:  0.020597987172050726
Validation MSE:  0.5700176900520886
Validation R2:  0.9168055757267655
Validation MAE:  0.4932026597849101


In [22]:
# Predictors ranked by AdaBoost feature importance, highest first.
feature_importance_ada = (
    pd.DataFrame({'feat': X_train.columns, 'imp_ada': ada.feature_importances_})
    .sort_values(by='imp_ada', ascending=False)
    .reset_index(drop=True)
)
feature_importance_ada.head(15)

Unnamed: 0,feat,imp_ada
0,lag_1,0.764468
1,sugar_cb,0.022136
2,oil2_co,0.020783
3,chicken_bol,0.017671
4,chicken_tj,0.016444
5,silver,0.015235
6,lag_2,0.008292
7,sugar_lp,0.007415
8,rice2_or,0.007342
9,economia,0.007009


### **3.5. GBR**
Without tuning:
Validation MSE:  0.277
Validation R2:  0.874
Validation MAE:  0.414

In [23]:
# Gradient boosting with learning_rate=0.15.
# Alternative configuration kept for reference: GradientBoostingRegressor(random_state=0)
gbr = GradientBoostingRegressor(learning_rate=0.15, random_state=0)
gbr.fit(X_train, y_train)

# Predictions of the fitted model on the training and validation predictors.
gbr_train_pred = gbr.predict(X_train)
gbr_val_pred = gbr.predict(X_validation)

# Training-set error metrics.
mse_train_gbr, r2_train_gbr, mae_train_gbr = (
    score(y_train, gbr_train_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_gbr)
print("Train R2: ", r2_train_gbr)
print("Train MAE: ", mae_train_gbr)

# Validation-set error metrics.
mse_val_gbr, r2_val_gbr, mae_val_gbr = (
    score(y_validation, gbr_val_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_gbr)
print("Validation R2: ", r2_val_gbr)
print("Validation MAE: ", mae_val_gbr)

# Predictions for the weekly and daily predictor sets.
gbr_test_pred_w = gbr.predict(X_test_w)
gbr_test_pred_d = gbr.predict(X_test_d)

Train MSE:  0.00023733416378551482
Train R2:  0.9999608999216189
Train MAE:  0.012843130182475165
Validation MSE:  0.5176496669517197
Validation R2:  0.9244487201557823
Validation MAE:  0.5088207794278508


In [24]:
# Predictors ranked by gradient-boosting feature importance, highest first.
feature_importance_gbr = (
    pd.DataFrame({'feat_gbr': X_train.columns, 'imp_gbr': gbr.feature_importances_})
    .sort_values(by='imp_gbr', ascending=False)
    .reset_index(drop=True)
)
feature_importance_gbr.head(15)

Unnamed: 0,feat_gbr,imp_gbr
0,lag_1,0.825576
1,oil2_co,0.048335
2,chicken_tj,0.016534
3,milk_lp,0.012221
4,lemon_co,0.01039
5,chicken_bol,0.009307
6,sugar_lp,0.009258
7,sugar_cb,0.008354
8,veglard_tr,0.005042
9,tomato_tj,0.003603


### **3.6. RF**
Without tuning:
Validation MSE:  0.324
Validation R2:  0.853
Validation MAE:  0.434

In [25]:
# Random forest: 130 trees of depth 5 grown with the absolute-error criterion.
# Alternative configuration kept for reference: RandomForestRegressor(random_state=0)
rf = RandomForestRegressor(criterion="absolute_error", max_depth=5, n_estimators=130, random_state=0)
rf.fit(X_train, y_train)

# Predictions of the fitted model on the training and validation predictors.
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_validation)

# Training-set error metrics.
mse_train_rf, r2_train_rf, mae_train_rf = (
    score(y_train, rf_train_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)

# Validation-set error metrics.
mse_val_rf, r2_val_rf, mae_val_rf = (
    score(y_validation, rf_val_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_rf)
print("Validation R2: ", r2_val_rf)
print("Validation MAE: ", mae_val_rf)

# Predictions for the weekly and daily predictor sets.
rf_test_pred_w = rf.predict(X_test_w)
rf_test_pred_d = rf.predict(X_test_d)

Train MSE:  0.07310038474924326
Train R2:  0.9879569349486108
Train MAE:  0.1968934955865922
Validation MSE:  0.6223168804489697
Validation R2:  0.9091724774721707
Validation MAE:  0.5565825105662928


In [26]:
# Predictors ranked by random-forest feature importance, highest first.
feature_importance_rf = (
    pd.DataFrame({'feat': X_train.columns, 'imp_rf': rf.feature_importances_})
    .sort_values(by='imp_rf', ascending=False)
    .reset_index(drop=True)
)
feature_importance_rf.head(30)

Unnamed: 0,feat,imp_rf
0,lag_1,0.642546
1,lag_2,0.013629
2,lemon_co,0.008263
3,sugar_or,0.007912
4,sugar_bol,0.006627
5,onion_bol,0.006478
6,oil2_co,0.006243
7,milk_lp,0.006123
8,rice2_po,0.005362
9,chicken_tj,0.005065


### **3.7. ET**
Without tuning:
Validation MSE:  0.322
Validation R2:  0.854
Validation MAE:  0.423

In [27]:
# Extra-trees ensemble; only the random seed is set.
# Alternative configuration kept for reference:
#   ExtraTreesRegressor(max_depth=9, random_state=0)
et = ExtraTreesRegressor(random_state=0)
et.fit(X_train, y_train)

# Predictions of the fitted model on the training and validation predictors.
et_train_pred = et.predict(X_train)
et_val_pred = et.predict(X_validation)

# Training-set error metrics.
mse_train_et, r2_train_et, mae_train_et = (
    score(y_train, et_train_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Train MSE: ", mse_train_et)
print("Train R2: ", r2_train_et)
print("Train MAE: ", mae_train_et)

# Validation-set error metrics.
mse_val_et, r2_val_et, mae_val_et = (
    score(y_validation, et_val_pred)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)
print("Validation MSE: ", mse_val_et)
print("Validation R2: ", r2_val_et)
print("Validation MAE: ", mae_val_et)

# Predictions for the weekly and daily predictor sets.
et_test_pred_w = et.predict(X_test_w)
et_test_pred_d = et.predict(X_test_d)

Train MSE:  2.665852155948152e-29
Train R2:  1.0
Train MAE:  3.802318397541051e-15
Validation MSE:  0.46966343816223355
Validation R2:  0.931452339072978
Validation MAE:  0.4698861285064184


In [28]:
# Predictors ranked by extra-trees feature importance, highest first.
feature_importance_et = (
    pd.DataFrame({'feat': X_train.columns, 'imp_et': et.feature_importances_})
    .sort_values(by='imp_et', ascending=False)
    .reset_index(drop=True)
)
feature_importance_et.head(30)

Unnamed: 0,feat,imp_et
0,lag_1,0.344648
1,lag_2,0.125857
2,chicken_tj,0.056012
3,sugar_or,0.039064
4,lag_3,0.023043
5,sugar_lp,0.022332
6,sugar_po,0.020952
7,sugar_tj,0.018952
8,onion2_su,0.018264
9,sugar_cb,0.015095


## **4. Report**

In [29]:
# One frame of model predictions per data set, tagged with a 'set' label.
# 'cpi' holds the observed target for validation rows and NaN for the
# weekly/daily rows.
val_forecast = pd.DataFrame(
    {
        'cpi': y_validation,
        'ridge': ridge_val_pred,
        'lasso': lasso_val_pred,
        'ada': ada_val_pred,
        'gbr': gbr_val_pred,
        'rf': rf_val_pred,
        'et': et_val_pred,
    },
    index=validation.index,
).assign(set='validation')

week_forecast = pd.DataFrame(
    {
        'cpi': np.nan,
        'ridge': ridge_test_pred_w,
        'lasso': lasso_test_pred_w,
        'ada': ada_test_pred_w,
        'gbr': gbr_test_pred_w,
        'rf': rf_test_pred_w,
        'et': et_test_pred_w,
    },
    index=test_w.index,
).assign(set='week')

day_forecast = pd.DataFrame(
    {
        'cpi': np.nan,
        'ridge': ridge_test_pred_d,
        'lasso': lasso_test_pred_d,
        'ada': ada_test_pred_d,
        'gbr': gbr_test_pred_d,
        'rf': rf_test_pred_d,
        'et': et_test_pred_d,
    },
    index=test_d.index,
).assign(set='day')

# Stack the three frames and order the rows by index label; the same date can
# appear more than once (one row per 'set').
all_forecast = pd.concat([val_forecast, week_forecast, day_forecast], axis=0).sort_index(ascending=True)
all_forecast.tail(40)

Unnamed: 0,cpi,ridge,lasso,ada,gbr,rf,et,set
2024-09-28,,7.317078,5.845885,5.93673,7.57836,6.125185,6.634576,day
2024-09-29,,7.323735,5.84525,5.93673,7.57836,6.125185,6.634576,day
2024-09-29,,7.03637,5.859032,5.93673,7.57836,6.076921,6.654365,week
2024-09-30,6.182067,6.332732,5.818411,5.93673,7.688684,6.120003,6.519333,validation
2024-09-30,,7.309072,5.772228,5.93673,7.564416,6.078783,6.602387,day
2024-10-01,,7.285241,6.63243,7.938964,7.926829,7.550651,7.504474,day
2024-10-02,,7.318743,6.642639,7.938964,7.931249,7.582398,7.509206,day
2024-10-03,,7.20936,6.663247,7.938964,7.931249,7.586982,7.629252,day
2024-10-04,,7.07966,6.663298,7.938964,7.917443,7.589035,7.718324,day
2024-10-05,,7.034565,6.653707,7.938964,7.913023,7.589035,7.759203,day


In [30]:
# Validation metrics per model (one row per model), best (lowest) MSE first.
metrics = pd.DataFrame(
    {
        'ridge': [mse_val_ridge, r2_val_ridge, mae_val_ridge],
        'lasso': [mse_val_lasso, r2_val_lasso, mae_val_lasso],
        'ada': [mse_val_ada, r2_val_ada, mae_val_ada],
        'gbr': [mse_val_gbr, r2_val_gbr, mae_val_gbr],
        'rf': [mse_val_rf, r2_val_rf, mae_val_rf],
        'et': [mse_val_et, r2_val_et, mae_val_et],
    },
    index=['MSE', 'R2', 'MAE'],
).T.sort_values(['MSE'], ascending=True)
metrics

Unnamed: 0,MSE,R2,MAE
ridge,0.433425,0.936741,0.472041
lasso,0.466313,0.931941,0.527453
et,0.469663,0.931452,0.469886
gbr,0.51765,0.924449,0.508821
ada,0.570018,0.916806,0.493203
rf,0.622317,0.909172,0.556583


In [31]:
# Spread of the six model forecasts on each row: lowest and highest prediction.
all_forecast['min'] = all_forecast[['ridge', 'lasso', 'ada', 'gbr', 'rf', 'et']].min(axis=1)
all_forecast['max'] = all_forecast[['ridge', 'lasso', 'ada', 'gbr', 'rf', 'et']].max(axis=1)

# Inverse-MSE weights taken from the validation metrics: the lower a model's
# validation MSE, the larger its weight. The common 1/1000 factor appears in
# both numerator and denominator below, so it cancels in the ratios.
inv1 = (1/mse_val_ridge)  / 1000
inv2 = (1/mse_val_lasso) / 1000
inv3 = (1/mse_val_ada) / 1000
inv4 = (1/mse_val_gbr) / 1000
inv5 = (1/mse_val_rf) / 1000
inv6 = (1/mse_val_et) / 1000

# Weighted average of all six model forecasts.
num_1 = (all_forecast['ridge'] * inv1) + (all_forecast['lasso'] * inv2) + (all_forecast['ada'] * inv3) + (all_forecast['gbr'] * inv4) + (all_forecast['rf'] * inv5) + (all_forecast['et'] * inv6)
den_1 = inv1+inv2+inv3+inv4+inv5+inv6
all_forecast['w_avg'] = num_1 / den_1

# Weighted average restricted to ridge, ET and lasso (a hard-coded selection).
# NOTE(review): both the weights and this selection are derived from the same
# validation rows on which the combined forecasts are scored afterwards, so
# their validation metrics are likely optimistic — confirm this is acceptable.
num_2 = (all_forecast['ridge'] * inv1) + (all_forecast['et'] * inv6) + (all_forecast['lasso'] * inv2)
den_2 = inv1+inv6+inv2
all_forecast['w_avg_best'] = num_2 / den_2

all_forecast

Unnamed: 0,cpi,ridge,lasso,ada,gbr,rf,et,set,min,max,w_avg,w_avg_best
2010-01-01,,0.022975,0.842256,0.187137,0.607619,0.605668,0.386747,day,0.022975,0.842256,0.434399,0.407646
2010-01-02,,0.169182,0.867482,0.685510,0.544726,0.648256,0.404012,day,0.169182,0.867482,0.539625,0.472712
2010-01-03,,0.171419,0.867809,0.685510,0.544726,0.648256,0.410122,day,0.171419,0.867809,0.541214,0.475579
2010-01-04,,0.144027,0.875274,0.685510,0.457196,0.648256,0.352587,day,0.144027,0.875274,0.512682,0.449793
2010-01-05,,0.102399,0.875095,0.698787,0.636387,0.651473,0.373119,day,0.102399,0.875095,0.539793,0.441784
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-27,,8.352075,6.879072,7.938964,7.894714,7.617267,7.735931,week,6.879072,8.352075,7.740672,7.672725
2024-10-28,,9.084624,6.964501,7.938964,8.420635,7.728616,7.866239,day,6.964501,9.084624,8.022441,7.999550
2024-10-29,,8.675871,6.965996,7.938964,8.406264,7.725202,7.821579,day,6.965996,8.675871,7.932478,7.842282
2024-10-30,,9.508229,6.946407,7.938964,8.406264,7.733056,7.869193,day,6.946407,9.508229,8.100275,8.143122


In [32]:
# Replace 'cpi' with the observed monthly series looked up by index label:
# every row whose label matches a monthly observation gets that value,
# whatever its 'set'; labels without a monthly observation become NaN.
all_forecast['cpi'] = df['ipc_all'].reindex(all_forecast.index)
all_forecast.tail(35)

Unnamed: 0,cpi,ridge,lasso,ada,gbr,rf,et,set,min,max,w_avg,w_avg_best
2024-10-01,,7.285241,6.63243,7.938964,7.926829,7.550651,7.504474,day,6.63243,7.938964,7.443538,7.143443
2024-10-02,,7.318743,6.642639,7.938964,7.931249,7.582398,7.509206,day,6.642639,7.938964,7.457755,7.160046
2024-10-03,,7.20936,6.663247,7.938964,7.931249,7.586982,7.629252,day,6.663247,7.938964,7.46237,7.167253
2024-10-04,,7.07966,6.663298,7.938964,7.917443,7.589035,7.718324,day,6.663298,7.938964,7.451181,7.150616
2024-10-05,,7.034565,6.653707,7.938964,7.913023,7.589035,7.759203,day,6.653707,7.938964,7.447298,7.144907
2024-10-06,,7.044099,6.653181,7.938964,7.913023,7.589035,7.759203,day,6.653181,7.938964,7.449055,7.148078
2024-10-06,,7.271213,6.639225,7.938964,7.913023,7.591976,7.621089,week,6.639225,7.938964,7.466297,7.178468
2024-10-07,,7.095355,6.682604,7.938964,7.913023,7.602667,7.755997,day,6.682604,7.938964,7.465598,7.174598
2024-10-08,,7.363605,6.694286,7.938964,7.901807,7.607084,7.856953,day,6.694286,7.938964,7.536708,7.305115
2024-10-09,,7.393957,6.688289,7.938964,7.901807,7.602485,7.848827,day,6.688289,7.938964,7.539443,7.311173


In [33]:
# Score the two weighted-average forecasts on the validation rows.
only_validation = all_forecast.query('set == "validation"')

mse_val_w_avg, r2_val_w_avg, mae_val_w_avg = (
    score(only_validation['cpi'], only_validation['w_avg'])
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)

mse_val_w_avg_best, r2_val_w_avg_best, mae_val_w_avg_best = (
    score(only_validation['cpi'], only_validation['w_avg_best'])
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)

metrics_b = pd.DataFrame(
    {
        'MSE': [mse_val_w_avg, mse_val_w_avg_best],
        'R2': [r2_val_w_avg, r2_val_w_avg_best],
        'MAE': [mae_val_w_avg, mae_val_w_avg_best],
    },
    index=['w_avg', 'w_avg_best'],
)

# Individual models and combined forecasts in one table, lowest MSE first.
metrics_all = pd.concat([metrics, metrics_b], axis=0).sort_values(['MSE'], ascending=True)
metrics_all

Unnamed: 0,MSE,R2,MAE
w_avg_best,0.384719,0.94385,0.432434
w_avg,0.42982,0.937268,0.455581
ridge,0.433425,0.936741,0.472041
lasso,0.466313,0.931941,0.527453
et,0.469663,0.931452,0.469886
gbr,0.51765,0.924449,0.508821
ada,0.570018,0.916806,0.493203
rf,0.622317,0.909172,0.556583


In [34]:
# Scatter of observed inflation against the Ridge forecast on the validation rows.
# NOTE(review): the y-axis is labelled 'Best Forecast' but the plotted column is
# 'ridge', while the metrics table ranks 'w_avg_best' first — confirm which
# series is meant.
fig = px.scatter(only_validation, x='cpi', y='ridge')
# Least-squares straight line (degree 1) of the forecast on the observed values
fit = np.polyfit(only_validation['cpi'], only_validation['ridge'], deg=1)
fit_fn = np.poly1d(fit)
# Overlay the fitted line, evaluated at the observed values
fig.add_trace(go.Scatter(
    x=only_validation['cpi'], 
    y=fit_fn(only_validation['cpi']), 
    mode='lines', 
    name='Fitted Line',
    line=dict(color='red', width=2)
))

# Axis titles, figure size and a boxed legend; the title dict sets only the
# title position, no title text.
fig.update_layout(
    xaxis_title='Observed y-o-y Inflation',
    yaxis_title='Best Forecast',
    template='plotly_white',
    width=800, 
    height=600,
    legend=dict(
        x=0.8,
        y=0.1,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='black',
        borderwidth=1
    ),
    title=dict(
        x=0.5,
        xanchor='center'
    )
)

# Show the plot
fig.show()

In [35]:
# Rows of the combined forecast table belonging to the weekly and daily sets.
weekly_forecast = all_forecast.query('set == "week"')
daily_forecast = all_forecast.query('set == "day"')

#### Weekly Forecast

In [36]:
# Weekly forecast figure: model-spread band, forecast line and observed values.
fig1 = go.Figure()
# Band between the lowest and highest model forecast, drawn as one closed
# polygon: the 'min' values left to right, then the 'max' values right to left.
fig1.add_trace(go.Scatter(
    x=weekly_forecast.index.tolist() + weekly_forecast.index[::-1].tolist(),
    y=weekly_forecast["min"].tolist() + weekly_forecast["max"][::-1].tolist(),
    fill='toself',
    fillcolor='#ffdabe',
    line=dict(color='rgba(255,255,255,0.5)'),
    hoverinfo="skip",
    showlegend=True,
    name="Forecast Interval"
))
# Weekly forecast line, taken from the Ridge model.
# NOTE(review): the daily figure plots 'w_avg_best' instead of 'ridge' —
# confirm the difference between the two figures is intended.
fig1.add_trace(go.Scatter(
    x=weekly_forecast.index, 
    y=weekly_forecast["ridge"], 
    mode='lines+markers', 
    name="Weekly Forecast", 
    line=dict(color='#ff7d00', width=1.0),
    marker=dict(size=2.5)
))
# Observed monthly values from index label '2018-12-30' onward, drawn as
# markers (the 'line' settings presumably have no visible effect in this mode).
fig1.add_trace(go.Scatter(
    x=df.loc['2018-12-30':, ].index, 
    y=df.loc['2018-12-30':, "ipc_all"], 
    mode='markers', 
    name="Observed y-o-y Inflation", 
    line=dict(color='#073763', width=0.8),
    marker=dict(size=4)
))
# Figure size and boxed legend; title and axis titles are deliberately disabled.
fig1.update_layout(
    #title='High-frequency Inflation Forecast and y-o-y Inflation',
    #xaxis_title='Date',
    #yaxis_title='%',
    #legend_title='Legend',
    template='plotly_white',
    width=1200, 
    height=600,
    legend=dict(
        x=0.6,
        y=0.99,
        bgcolor='rgba(255, 255, 255, 0.8)',
        font=dict(size=15),
        bordercolor='black',
        borderwidth=1
    )
)
# Show x-axis ticks as abbreviated month and year
fig1.update_xaxes(tickformat='%b %Y')
# Show the plot
fig1.show()

# Save the plot as a PNG at 3x scale. The target folder must already exist;
# static export presumably relies on the kaleido package mentioned in the
# commented-out install cell at the top of the notebook.
fig1.write_image('./Images/F0_Weekly_Forecast.png', format='png', scale=3)

#### Daily Forecast

In [37]:
# Daily forecast figure: model-spread band, forecast line and observed values.
fig2 = go.Figure()
# Band between the lowest and highest model forecast, drawn as one closed
# polygon: the 'min' values left to right, then the 'max' values right to left.
fig2.add_trace(go.Scatter(
    x=daily_forecast.index.tolist() + daily_forecast.index[::-1].tolist(),
    y=daily_forecast["min"].tolist() + daily_forecast["max"][::-1].tolist(),
    fill='toself',
    fillcolor='#ffdabe',
    line=dict(color='rgba(255,255,255,0.5)'),
    hoverinfo="skip",
    showlegend=True,
    name="Forecast Interval"
))
# Daily forecast line, taken from the inverse-MSE weighted average of ridge,
# ET and lasso ('w_avg_best').
fig2.add_trace(go.Scatter(
    x=daily_forecast.index, 
    y=daily_forecast["w_avg_best"], 
    mode='lines', 
    name="Daily Forecast", 
    line=dict(color='#ff7d00', width=1.0),
    #marker=dict(size=1.5)
))
# All observed monthly values, drawn as markers (the 'line' settings
# presumably have no visible effect in this mode).
fig2.add_trace(go.Scatter(
    x=df.index, 
    y=df["ipc_all"], 
    mode='markers', 
    name="Observed y-o-y Inflation", 
    line=dict(color='#073763', width=0.8),
    marker=dict(size=4)
))
# Figure size and boxed legend; title and axis titles are deliberately disabled.
fig2.update_layout(
    #title='High-frequency Inflation Forecast and y-o-y Inflation',
    #xaxis_title='Date',
    #yaxis_title='%',
    #legend_title='Legend',
    template='plotly_white',
    width=1200, 
    height=600,
    legend=dict(
        x=0.75,
        y=0.99,
        bgcolor='rgba(255, 255, 255, 0.8)',
        font=dict(size=15),
        bordercolor='black',
        borderwidth=1
    )
)
# Show x-axis ticks as abbreviated month and year
fig2.update_xaxes(tickformat='%b %Y')
# Show the plot
fig2.show()

# Save the plot as a PNG at 3x scale. The target folder must already exist;
# static export presumably relies on the kaleido package mentioned in the
# commented-out install cell at the top of the notebook.
fig2.write_image('./Images/F0_Daily_Forecast.png', format='png', scale=3)

In [38]:
# Persist the combined forecast table (all sets, model columns, min/max and
# weighted averages); the target folder must already exist.
all_forecast.to_csv('./Forecast/F0_FORECAST.csv')

#### End