# <font color="red">**STEP 2: FEATURE SELECTION**</font>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

In [3]:
# Correlation matrix
def lagged_correlation_color(df, specific_var, l=0):
    lags = [l]
    # Create lagged versions of the DataFrame
    lagged_df = pd.concat([df.shift(lag) for lag in lags], axis=1, keys=[f'Lag{lag}' for lag in lags])
    # Calculate correlation coefficients
    correlations = lagged_df.corrwith(df[specific_var])
    # Sort correlations from highest to lowest
    correlations_sorted = correlations.sort_values(ascending=False)
    # Convert the sorted correlations to a data frame and reset index
    df_correlations_sorted = pd.DataFrame(correlations_sorted, columns=['corr']).reset_index()

    lagged_df.columns = df.columns
    return df_correlations_sorted, lagged_df

col_range = range(0,1)


# Growth Rate datafrmaes
def preprocess_data(df, period):
    df_copy = df.copy().pct_change(period).dropna(axis=0)
    df_copy = df_copy.loc[:, np.isfinite(df_copy).all(axis=0)]
    return df_copy.dropna(axis=1)


## **1. Feature Selection (FS)**

In [4]:
dataset = pd.read_csv('./Data/FINAL_DATASET.csv', index_col=0)

dataset_m = dataset.copy().query('freq == "month"').drop(columns=['freq'])
dataset_w = dataset.copy().query('freq == "week"').drop(columns=['freq'])

In [5]:
dataset_m.tail(20)

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,...,inflacion,inflacion en bolivia,inflación bolivia,ipc,la inflacion,la inflación,pib,pib bolivia,que es inflacion,que es pib
2022-07-31,107.186726,106.768842,106.358789,105.958607,105.848463,105.148521,105.04501,21.988462,22.5,18.0,...,73.0,55.0,72.0,46.0,79.0,67.0,47.0,43.0,66.0,39.0
2022-08-31,107.227296,107.186726,106.768842,106.358789,105.9791,105.352557,105.592889,22.299231,22.5,18.0,...,46.0,42.0,53.0,47.0,56.0,67.0,52.0,40.0,56.0,43.0
2022-09-30,107.382222,107.227296,107.186726,106.768842,105.929316,105.523019,105.386115,22.721154,22.5,18.269231,...,43.0,42.0,46.0,47.0,59.0,57.0,62.0,43.0,47.0,64.0
2022-10-31,108.184143,107.382222,107.227296,107.186726,105.958607,105.848463,105.148521,23.015769,22.576923,20.0,...,46.0,49.0,49.0,46.0,59.0,52.0,58.0,44.0,48.0,53.0
2022-11-30,108.692935,108.184143,107.382222,107.227296,106.358789,105.9791,105.352557,23.02,24.24,21.84,...,36.0,53.0,51.0,48.0,49.0,43.0,48.0,36.0,40.0,46.0
2022-12-31,108.818364,108.692935,108.184143,107.382222,106.768842,105.929316,105.523019,22.97,23.5,22.0,...,34.0,47.0,52.0,60.0,46.0,52.0,36.0,24.0,57.0,45.0
2023-01-31,109.17693,108.818364,108.692935,108.184143,107.186726,105.958607,105.848463,24.761364,24.590909,21.5,...,28.0,50.0,46.0,62.0,54.0,45.0,41.0,28.0,56.0,49.0
2023-02-28,108.697854,109.17693,108.818364,108.692935,107.227296,106.358789,105.9791,24.447368,22.631579,21.0,...,32.0,64.0,45.0,64.0,44.0,47.0,43.0,34.0,47.0,48.0
2023-03-31,108.614602,108.697854,109.17693,108.818364,107.382222,106.768842,105.929316,22.944444,23.055556,21.0,...,53.0,52.0,45.0,56.0,67.0,58.0,60.0,45.0,69.0,64.0
2023-05-31,109.439785,108.814654,108.614602,108.697854,108.692935,107.227296,106.358789,23.153846,23.134615,21.0,...,45.0,40.0,54.0,69.0,54.0,53.0,61.0,43.0,52.0,52.0


In [6]:
dataset_w.tail(20)

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,...,inflacion,inflacion en bolivia,inflación bolivia,ipc,la inflacion,la inflación,pib,pib bolivia,que es inflacion,que es pib
2023-11-05,,110.429431,110.440281,110.506839,109.439785,108.697854,108.692935,23.2,23.0,21.0,...,45.0,39.0,47.0,61.0,69.0,45.0,47.0,38.0,37.0,39.0
2023-11-12,,110.429431,110.440281,110.506839,109.439785,108.697854,108.692935,23.2,23.0,21.0,...,46.0,39.0,100.0,63.0,58.0,52.0,43.0,31.0,40.0,38.0
2023-11-19,,110.429431,110.440281,110.506839,109.439785,108.697854,108.692935,23.2,23.0,21.0,...,43.0,39.0,39.0,43.0,47.0,44.0,55.0,46.0,44.0,38.0
2023-11-26,,110.429431,110.440281,110.506839,109.439785,108.697854,108.692935,23.2,23.0,21.0,...,31.0,39.0,50.0,49.0,38.0,36.0,46.0,34.0,47.0,40.0
2023-12-03,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,23.27,23.0,21.0,...,36.0,39.0,62.0,55.0,60.0,40.0,42.0,29.0,51.0,43.0
2023-12-10,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,23.27,23.0,21.0,...,46.0,39.0,73.0,61.0,58.0,58.0,54.0,36.0,54.0,45.0
2023-12-17,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,23.27,23.0,21.0,...,32.0,39.0,71.0,67.0,55.0,56.0,35.0,19.0,57.0,45.0
2023-12-24,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,23.18,23.0,21.0,...,17.0,39.0,70.0,74.0,53.0,54.0,12.0,18.0,56.0,44.0
2024-01-07,,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,23.25,23.0,21.0,...,28.0,38.0,67.0,86.0,48.0,49.0,35.0,38.0,55.0,44.0
2024-01-14,,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,23.624,23.0,21.0,...,35.0,38.0,65.0,92.0,45.0,47.0,34.0,26.0,54.0,43.0


### **1.1. Correlation-based FS**

It is a filter technique.

In [7]:
ldf_corr = dataset_m.dropna(axis=1)

g12df_corr = preprocess_data(ldf_corr, 12)
g1df_corr = preprocess_data(ldf_corr, 1)

In [8]:
lcorr_lag0, ldf_lag0 = lagged_correlation_color(ldf_corr, 'ipc_all', 0)
lcorr_lag0.head(10)

corr_plus05 = np.array(lcorr_lag0.query('corr > 0.5')['level_1'])
#corr_minus05 = np.array(lcorr_lag0.query('corr < -0.5')['level_1'])
corr_plus05 = np.append(corr_plus05, 'freq')

corr_plus05

  c /= stddev[:, None]
  c /= stddev[None, :]


array(['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'ufv', 'lag_6', 'lag_9',
       'lag_12', 'milk_lp', 'milk_or', 'milk_bol', 'paprika_tr',
       'milk_sc', 'milk_su', 'beef_lp', 'milk_po', 'squash_tr',
       'banana_co', 'milk_cb', 'corn_co', 'papaya_tr', 'milk2_po',
       'milk2_or', 'apple_sc', 'wheat_sc', 'beef_su', 'beef_bol',
       'rice3_co', 'onion2_po', 'banana_bol', 'sorghum_lp',
       'redpepper_tr', 'flour_tj', 'rice2_co', 'banana_tj', 'apple_or',
       'rice_co', 'beef_or', 'dinero', 'flour_po', 'beef_sc',
       'grapefruit_po', 'bean_tr', 'greenbean_tr', 'beef_cb', 'libor',
       'grapefruit_bol', 'banana_tr', 'rice4_or', 'zinc', 'grapefruit_cb',
       'banana_sc', 'pineapple_or', 'banana_su', 'redpepper_po',
       'milk2_lp', 'wheat_po', 'veglard_co', 'banana_lp', 'grapefruit_or',
       'peas_tr', 'beef_tr', 'rice2_or', 'oil_co', 'grapefruit_su',
       'soy_po', 'freq'], dtype=object)

In [9]:
CORR_DATASET = dataset[corr_plus05]
CORR_DATASET.info()

<class 'pandas.core.frame.DataFrame'>
Index: 424 entries, 2011-01-31 to 2024-03-31
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ipc_all         158 non-null    float64
 1   lag_1           424 non-null    float64
 2   lag_2           424 non-null    float64
 3   lag_3           424 non-null    float64
 4   ufv             424 non-null    float64
 5   lag_6           424 non-null    float64
 6   lag_9           424 non-null    float64
 7   lag_12          424 non-null    float64
 8   milk_lp         424 non-null    float64
 9   milk_or         424 non-null    float64
 10  milk_bol        424 non-null    float64
 11  paprika_tr      424 non-null    float64
 12  milk_sc         424 non-null    float64
 13  milk_su         424 non-null    float64
 14  beef_lp         424 non-null    float64
 15  milk_po         424 non-null    float64
 16  squash_tr       424 non-null    float64
 17  banana_co       424 non-

In [10]:
CORR_DATASET.to_csv('./Data/CORR_DATASET.csv')

### **1.2. Principal Component FS**

Scale the features, otherwise accuracy may drop.

In [11]:
target = 'ipc_all'
target_lags = ['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_9', 'lag_12']

X = dataset.drop(columns=target_lags + ['freq'])
y = dataset[target]

In [12]:
print(f'Target variable:', y.shape, f'; Features', X.shape)


Target variable: (424,) ; Features (424, 485)


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-2.5518729 , -0.5592163 , -0.84940486, ..., -0.84355335,
         2.58238005,  0.0037038 ],
       [-2.37168157, -0.90249425, -0.90555186, ...,  0.58425751,
         2.58238005,  0.0037038 ],
       [-2.02014421, -0.88927169, -1.1168629 , ...,  2.27167034,
         2.58238005,  0.0037038 ],
       ...,
       [ 0.66336714,  1.59656924,  1.35328482, ..., -0.77865285,
        -0.48696309,  2.03215336],
       [ 0.58451764,  1.59656924,  1.35328482, ..., -1.10315532,
        -0.54598892, -0.25803162],
       [ 0.71067684,  1.59656924,  1.35328482, ..., -0.38924989,
        -0.54598892,  0.4617408 ]])

In [14]:
from sklearn.decomposition import PCA

# It selects the number of components such that the amount of variance that needs to be explained is greater than 95%s
pca = PCA(0.95, svd_solver='full')
X_pca = pca.fit_transform(X_scaled)
X_pca.shape


(424, 54)

54 components were identified.

In [15]:
pca.explained_variance_ratio_

array([0.21870675, 0.11974718, 0.08598362, 0.07148314, 0.04755106,
       0.04131229, 0.03410571, 0.02925262, 0.02788804, 0.02414513,
       0.02376331, 0.01639214, 0.0152539 , 0.01473797, 0.01368769,
       0.01149441, 0.01095208, 0.01044513, 0.01015067, 0.00917791,
       0.00795184, 0.00724378, 0.00664535, 0.00632599, 0.0055678 ,
       0.00536715, 0.00508531, 0.00459676, 0.00430498, 0.00412271,
       0.00406898, 0.00376742, 0.00351575, 0.00337755, 0.00314655,
       0.00293456, 0.00288108, 0.00278109, 0.00264311, 0.00255644,
       0.00238201, 0.00233525, 0.0022165 , 0.00205197, 0.00188864,
       0.00182057, 0.00181259, 0.00170544, 0.00167166, 0.00160149,
       0.00154667, 0.00148643, 0.00144715, 0.00137813])

In [16]:
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC_{i+1}' for i in range(X_pca.shape[1])])
X_pca_df.index = dataset.index
X_pca_df

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_45,PC_46,PC_47,PC_48,PC_49,PC_50,PC_51,PC_52,PC_53,PC_54
2011-01-31,1.618272,21.399397,8.147795,-10.219601,-0.707132,8.810415,-3.900702,10.181453,4.492543,7.106562,...,0.298699,-1.867338,0.647402,1.469152,-1.759837,2.428206,-0.589483,0.819520,-4.059087,2.177350
2011-02-28,2.635480,21.130432,8.879051,-13.329218,-2.347886,9.066456,-2.424623,10.319748,3.706225,7.990768,...,-1.211004,-0.773811,0.336185,0.221073,0.171034,0.104572,-0.034142,0.664742,-2.034270,1.355404
2011-03-31,2.520260,20.362836,8.649516,-14.723832,-1.513458,8.109200,-1.846639,9.217193,3.573121,8.869831,...,-1.422534,-0.656897,0.259347,-1.304001,0.736011,-1.096544,1.018928,-1.160390,2.292700,-0.599128
2011-04-30,2.628249,20.058338,10.860027,-18.995607,-0.885793,5.862776,-2.844614,5.678375,3.459106,9.287995,...,-1.268370,0.188996,-0.253840,-1.201931,0.220813,-1.189772,0.510255,-0.935842,4.148804,-2.151768
2011-05-31,1.953217,19.082113,10.097210,-19.587024,0.976199,4.383969,-2.715734,3.602964,2.920417,7.902015,...,-0.324715,-0.143133,-1.588114,-1.680717,0.726330,-1.533480,-0.742266,-0.085081,3.316563,-3.510836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-03,15.634746,-1.188915,-8.197315,0.408506,1.511528,9.314679,6.364997,-3.172502,6.584371,-3.291637,...,-0.502128,0.253666,0.380488,0.055817,1.860663,-0.322729,0.611453,0.937095,-0.345179,0.263440
2024-03-10,15.516065,-1.120126,-8.056191,-0.071418,1.748747,8.976042,6.434908,-3.741832,7.248441,-3.146427,...,-0.006984,0.424370,0.400337,-0.620378,2.055106,0.411677,0.794320,1.440046,-0.379653,0.508299
2024-03-17,15.772056,-1.100795,-8.083854,-0.105591,2.199829,9.377069,6.409880,-3.700712,7.337779,-3.251936,...,0.156617,0.243650,0.281633,-1.067548,2.447660,1.360366,0.742355,1.567588,-0.691553,0.700105
2024-03-24,15.438536,-1.149701,-7.495522,0.736987,1.644222,9.042150,5.897276,-4.077331,6.205651,-3.432321,...,-0.367230,0.664909,0.540041,0.029874,1.760883,-0.176651,0.469203,0.694438,-1.020309,0.813734


In [17]:
PC_DATASET = dataset[target_lags].join(X_pca_df)
PC_DATASET["freq"] = dataset["freq"]
PC_DATASET.tail(20)

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,PC_1,PC_2,PC_3,...,PC_46,PC_47,PC_48,PC_49,PC_50,PC_51,PC_52,PC_53,PC_54,freq
2023-12-03,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,15.949204,-2.725247,-10.610411,...,-0.126053,0.822463,-0.496561,-0.109133,-0.364257,-0.021453,0.198877,-0.125846,-0.367924,week
2023-12-10,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,15.990535,-2.725276,-11.258556,...,-0.175893,0.789568,0.037452,-1.541396,-0.196886,-0.231297,-0.038309,0.402831,0.226956,week
2023-12-17,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,16.315276,-2.604024,-12.106647,...,-0.362346,0.584184,-0.074462,-0.950702,-0.262739,-0.482425,-0.49467,-0.049554,0.413114,week
2023-12-24,,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,16.711435,-2.209146,-12.158933,...,-0.081828,0.030914,0.057585,-1.282159,-0.547339,-1.160835,-0.817605,0.292639,0.878564,week
2023-12-31,111.123491,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,16.432148,-2.401648,-11.71025,...,-0.148902,0.390676,-0.073415,-1.272103,-0.367204,-0.554913,-0.472879,0.209194,0.380973,month
2024-01-07,,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,16.913693,-1.587718,-10.857358,...,-0.444013,-0.365531,-0.353143,-2.319572,0.310509,-0.481257,-0.471948,0.691171,-0.258853,week
2024-01-14,,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,17.023034,-1.331947,-10.384093,...,-0.649796,-0.271715,-0.256806,-2.452464,0.47146,-0.6681,-0.378481,0.626475,-0.590564,week
2024-01-21,,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,16.919577,-1.112481,-9.959996,...,-0.464756,-0.301449,-0.259016,-2.213853,0.742886,-0.404601,-0.373968,0.603807,-0.636179,week
2024-01-28,,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,17.058285,-1.767327,-10.420215,...,-0.010475,-0.505029,0.136477,-1.686857,0.541325,-0.185723,-0.571936,0.863712,-0.429786,week
2024-01-31,111.21106,111.123491,110.425657,110.429431,110.081702,108.814654,109.17693,17.036254,-1.530279,-10.438091,...,-0.238097,-0.406037,-0.025649,-2.045824,0.531945,-0.351038,-0.579723,0.750484,-0.473649,month


In [18]:
PC_DATASET.to_csv('./Data/PC_DATASET.csv')

### **1.3. L1-LR FS**

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df_ml = dataset.query('freq == "month"')
X_ml = df_ml.drop(columns=target_lags + ['freq'])
y_ml = df_ml[target]
X_ml_scaled = scaler.fit_transform(X_ml)

X_train, X_test, y_train, y_test = train_test_split(X_ml_scaled, y_ml, test_size=0.2, random_state=42)

In [20]:
#lasso = Lasso()
lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=10000, tol=0.0001, positive=False, random_state=42, selection='cyclic')
# Fit on training set
lasso.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_test)
# Calculate Forecast metrics on train set
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)
# Calculate Forecast metrics on validation set
mse_val_lasso = mean_squared_error(y_test, lasso_val_pred)
r2_val_lasso = r2_score(y_test, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_test, lasso_val_pred)
print("Test MSE: ", mse_val_lasso)
print("Test R2: ", r2_val_lasso)
print("Test MAE: ", mae_val_lasso)

Train MSE:  0.09679851190549649
Train R2:  0.9990904917386021
Train MAE:  0.25077726956958246
Test MSE:  0.17198211994208862
Test R2:  0.9985095550930295
Test MAE:  0.3393414466183371


In [80]:
# Define parameter grid for GridSearchCV
alphas_lasso = np.logspace(-1, 2, num=500)
#alphas_lasso = np.arange(1, 100, 0.05)
param_grid = {'alpha': alphas_lasso,
              'positive': [True, False],
              'fit_intercept': [True, False]}

# Instantiate lasso model
lasso = Lasso()

# Define GridSearchCV object
grid_search_lasso = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')  ## works better with 5 cv
# Fit GridSearchCV on training set
grid_search_lasso.fit(X_train, y_train)
# Print best parameter and score from GridSearchCV
print("Best parameter: ", grid_search_lasso.best_params_)
print("Best score: ", -grid_search_lasso.best_score_)
# Make predictions on validation set using best model from GridSearchCV
best_lasso = grid_search_lasso.best_estimator_
y_val_pred = best_lasso.predict(X_test)
# Calculate RMSE on validation set
mse_val = mean_squared_error(y_test, y_val_pred)
r2_val = r2_score(y_test, y_val_pred)
mae_val = mean_absolute_error(y_test, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best parameter:  {'alpha': 0.1, 'fit_intercept': True, 'positive': False}
Best score:  0.1699959478073344
Validation MSE:  0.17198211994208862
Validation R2:  0.9985095550930295
Validation MAE:  0.3393414466183371


In [21]:
# Get the coefficients from the Lasso model
coef = lasso.coef_
# Create a dataframe of feature importances
feature_importance_lasso = pd.DataFrame({'Feature': X_ml.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_lasso = feature_importance_lasso.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_lasso.columns = ['feat', 'imp_lasso']
# Print the feature importances
feature_importance_lasso[feature_importance_lasso['imp_lasso'] != 0]

Unnamed: 0,feat,imp_lasso
0,ufv,9.314622
1,banana_co,0.19169
2,beef_lp,0.176471
3,papa2_or,0.149298
4,squash_tr,0.136158
5,noodle_sc,0.13439
6,greenbean_tr,0.120859
7,peas_lp,0.106239
8,beef_or,0.10531
9,ycorn_tr,0.080515


In [22]:
vars_lasso = feature_importance_lasso.query('imp_lasso != 0')['feat'].values
vars_lasso

array(['ufv', 'banana_co', 'beef_lp', 'papa2_or', 'squash_tr',
       'noodle_sc', 'greenbean_tr', 'peas_lp', 'beef_or', 'ycorn_tr',
       'paprika_tr', 'noodle_tj', 'tomato_lp', 'papa2_tr', 'peas_or',
       'banana_tj', 'tomato_tr', 'milk_or', 'libor', 'yuca_sc',
       'papa2_sc', 'peas_po', 'oil2_sc', 'lard_co', 'tomato_co'],
      dtype=object)

In [23]:
lasso_df_1 = dataset[target_lags]
lasso_df_2 = dataset[vars_lasso]

In [24]:
LASSO_DATASET = pd.merge(lasso_df_1, lasso_df_2, left_index=True, right_index=True)
LASSO_DATASET['freq'] = dataset['freq']
LASSO_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,ufv,banana_co,beef_lp,...,tomato_tr,milk_or,libor,yuca_sc,papa2_sc,peas_po,oil2_sc,lard_co,tomato_co,freq
2011-01-31,74.207255,73.260267,71.989803,71.196381,69.071086,68.561311,68.467691,1.568637,33.5,18.492308,...,119.955000,4.6,0.455426,39.903846,33.423077,66.923077,9.211538,290.0,108.0,month
2011-02-28,75.439060,74.207255,73.260267,71.989803,69.800954,68.549203,68.581371,1.577936,33.5,18.720833,...,105.000000,4.6,0.464045,34.140833,35.208333,58.864583,9.395833,290.0,108.0,month
2011-03-31,76.108818,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.588942,33.5,19.166667,...,87.601852,4.6,0.460783,23.962963,34.351852,74.537037,9.500000,290.0,108.0,month
2011-04-30,76.125495,76.108818,75.439060,74.207255,71.196381,69.071086,68.561311,1.601850,33.5,19.076923,...,72.238462,4.6,0.440875,17.548077,30.173077,71.298077,9.500000,290.0,108.0,month
2011-05-31,76.277495,76.125495,76.108818,75.439060,71.989803,69.800954,68.549203,1.615897,33.5,19.000000,...,57.788462,4.6,0.414302,16.057692,30.307692,52.500000,9.721154,290.0,108.0,month
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-03,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.482441,120.0,23.183333,...,83.483333,5.8,5.713008,30.000000,44.375000,91.666667,9.230000,210.0,110.0,week
2024-03-10,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.483290,120.0,22.713333,...,84.776667,5.8,5.670106,30.000000,35.166667,89.166667,9.230000,210.0,110.0,week
2024-03-17,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.484330,120.0,22.570000,...,91.650000,5.8,5.671370,30.000000,34.541667,80.000000,9.230000,210.0,110.0,week
2024-03-24,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.485350,120.0,22.470000,...,93.280000,5.8,5.688478,30.000000,34.550000,80.000000,9.230000,210.0,110.0,week


In [25]:
LASSO_DATASET.to_csv('./Data/LASSO_DATASET.csv')

### **1.4. Random Forest FS**

In [26]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
# Fit the model to the training data and make predictions on the validation set
rf.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_test)
# Calculate Forecast metrics on train set
mse_train_rf = mean_squared_error(y_train, rf_train_pred)
r2_train_rf = r2_score(y_train, rf_train_pred)
mae_train_rf = mean_absolute_error(y_train, rf_train_pred)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)
# Calculate Forecast metrics on validation set
mse_val_rf = mean_squared_error(y_test, rf_val_pred)
r2_val_rf = r2_score(y_test, rf_val_pred)
mae_val_rf = mean_absolute_error(y_test, rf_val_pred)
print("Test MSE: ", mse_val_rf)
print("Test R2: ", r2_val_rf)
print("Test MAE: ", mae_val_rf)

Train MSE:  0.05666049914440738
Train R2:  0.999467624129211
Train MAE:  0.15373526784367525
Test MSE:  0.38756524065236453
Test R2:  0.9966412517810362
Test MAE:  0.43776365575590725


In [27]:
# Create a DataFrame with the feature importance values
feature_importance_rf = pd.DataFrame({'Feature': X_ml.columns, 'Importance': rf.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_rf.columns = ['feat', 'imp_rf']
# Print the feature importance DataFrame
feature_importance_rf[feature_importance_rf['imp_rf'] != 0]

Unnamed: 0,feat,imp_rf
0,ufv,2.608241e-01
1,lard_co,1.718390e-01
2,flour_tj,6.776175e-02
3,wheat_lp,4.517957e-02
4,oil2_su,3.856744e-02
...,...,...
474,onion2_tr,1.922446e-07
475,que es pib,1.818888e-07
476,redpepper_co,4.027837e-08
477,onion2_or,8.348764e-09


In [28]:
# Calculate deciles
feature_importance_rf['decile'] = pd.qcut(feature_importance_rf['imp_rf'], 10, labels=False) + 1

In [29]:
vars_rf = feature_importance_rf.query('decile > 8')['feat'].values
vars_rf

array(['ufv', 'lard_co', 'flour_tj', 'wheat_lp', 'oil2_su', 'oil2_or',
       'quinoa_tj', 'milk_su', 'rice4_su', 'milk_bol', 'milk_po',
       'milk_sc', 'gold', 'paprika_tr', 'milk_cb', 'milk_lp', 'noodle_or',
       'milk_or', 'milk2_or', 'ycorn_co', 'wheat_sc', 'milk2_cb',
       'squash_co', 'milk_tj', 'apple_co', 'flour_tr', 'quinoa_po',
       'noodle_cb', 'rice4_po', 'noodle_tj', 'noodle_sc', 'copper',
       'la inflación', 'papa2_tj', 'flour_or', 'silver', 'oil2_sc',
       'redpepper_lp', 'noodle_tr', 'noodle_su', 'rice3_su', 'squash_tr',
       'exchange', 'orange2_su', 'lard_sc', 'rice3_bol', 'rice3_tj',
       'rice3_or', 'lard_po', 'noodle_bol', 'flour_bol', 'papaya_su',
       'corn_co', 'flour_lp', 'noodle_lp', 'orange2_bol', 'soy_oil',
       'milk2_tr', 'papa1_co', 'rice_tr', 'flour2_co', 'veglard_sc',
       'noodle_po', 'quinoa_sc', 'rice_cb', 'rice3_co', 'oil2_po',
       'lard_cb', 'quinoa_bol', 'oil2_co', 'veglard_tr', 'libor',
       'beef_lp', 'milk2_po', 'red

In [30]:
rf_df_1 = dataset[target_lags]
rf_df_2 = dataset[vars_rf]

RF_DATASET = pd.merge(rf_df_1, rf_df_2, left_index=True, right_index=True)
RF_DATASET['freq'] = dataset['freq']
RF_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,ufv,lard_co,flour_tj,...,sugar_sc,flour2_or,rice2_tj,sorghum_su,sugar_cb,sugar_po,sugar_lp,onion2_cb,redpepper_or,freq
2011-01-31,74.207255,73.260267,71.989803,71.196381,69.071086,68.561311,68.467691,1.568637,290.0,136.0,...,384.615385,192.307692,226.730769,87.5,353.653846,390.153846,380.230769,22.86,442.5,month
2011-02-28,75.439060,74.207255,73.260267,71.989803,69.800954,68.549203,68.581371,1.577936,290.0,136.0,...,449.583333,186.291667,224.583333,87.5,412.500000,413.125000,403.541667,22.86,442.5,month
2011-03-31,76.108818,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.588942,290.0,136.0,...,422.222222,190.000000,230.000000,87.5,432.962963,405.555556,439.888889,22.86,442.5,month
2011-04-30,76.125495,76.108818,75.439060,74.207255,71.196381,69.071086,68.561311,1.601850,290.0,136.0,...,378.653846,195.692308,216.153846,87.5,381.538462,398.653846,374.461538,22.86,442.5,month
2011-05-31,76.277495,76.125495,76.108818,75.439060,71.989803,69.800954,68.549203,1.615897,290.0,136.0,...,343.230769,195.769231,200.000000,87.5,345.192308,382.500000,363.230769,22.86,442.5,month
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-03,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.482441,210.0,170.2,...,222.500000,230.000000,190.000000,100.0,231.000000,240.000000,239.000000,37.76,380.0,week
2024-03-10,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.483290,210.0,170.2,...,222.500000,236.666667,190.000000,100.0,231.000000,240.000000,239.000000,37.76,380.0,week
2024-03-17,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.484330,210.0,170.2,...,222.500000,230.000000,190.000000,100.0,231.000000,240.000000,239.000000,37.76,380.0,week
2024-03-24,,111.433348,111.211060,111.123491,110.440281,109.678594,108.614602,2.485350,210.0,170.2,...,222.500000,230.000000,199.000000,100.0,231.000000,240.000000,239.000000,37.76,380.0,week


In [31]:
RF_DATASET.to_csv('./Data/RF_DATASET.csv')