In [153]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# <font color="red">**STEP 2: FEATURE SELECTION**</font>

In [154]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

In [155]:
# Correlation matrix
def lagged_correlation_color(df, specific_var, l=0):
    lags = [l]
    # Create lagged versions of the DataFrame
    lagged_df = pd.concat([df.shift(lag) for lag in lags], axis=1, keys=[f'Lag{lag}' for lag in lags])
    # Calculate correlation coefficients
    correlations = lagged_df.corrwith(df[specific_var])
    # Sort correlations from highest to lowest
    correlations_sorted = correlations.sort_values(ascending=False)
    # Convert the sorted correlations to a data frame and reset index
    df_correlations_sorted = pd.DataFrame(correlations_sorted, columns=['corr']).reset_index()

    lagged_df.columns = df.columns
    return df_correlations_sorted, lagged_df

col_range = range(0,1)


# Growth Rate datafrmaes
def preprocess_data(df, period):
    df_copy = df.copy().pct_change(period).dropna(axis=0)
    df_copy = df_copy.loc[:, np.isfinite(df_copy).all(axis=0)]
    return df_copy.dropna(axis=1)


## **1. Feature Selection (FS)**

In [156]:
dataset = pd.read_csv('./Data/FINAL_DATASET.csv', index_col=0)
dataset.index = pd.to_datetime(dataset.index)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6277 entries, 2009-01-02 to 2024-10-31
Columns: 461 entries, ipc_all to freq
dtypes: float64(460), object(1)
memory usage: 22.1+ MB


In [157]:
total_rows = len(dataset)
month_rows = len(dataset[dataset['freq'] == 'month'])
week_rows = len(dataset[dataset['freq'] == 'week'])
day_rows = len(dataset[dataset['freq'] == 'day'])
total_feats = len(dataset.columns)

print(f'Total rows: {total_rows}')
print(f'Month rows: {month_rows}')
print(f'Week rows: {week_rows}')
print(f'Day rows: {day_rows}')
print(f'Total features: {total_feats}')

Total rows: 6277
Month rows: 190
Week rows: 305
Day rows: 5782
Total features: 461


In [158]:
dataset_g12 = pd.read_csv('./Data/FINAL_DATASET_G12.csv', index_col=0)
dataset_g12.index = pd.to_datetime(dataset_g12.index)
dataset_g12.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5901 entries, 2010-01-01 to 2024-10-31
Columns: 461 entries, ipc_all to freq
dtypes: float64(460), object(1)
memory usage: 20.8+ MB


In [159]:
total_rows_g12 = len(dataset_g12)
month_rows_g12 = len(dataset_g12[dataset_g12['freq'] == 'month'])
week_rows_g12 = len(dataset_g12[dataset_g12['freq'] == 'week'])
day_rows_g12 = len(dataset_g12[dataset_g12['freq'] == 'day'])
total_feats_g12 = len(dataset_g12.columns)

print(f'Total rows: {total_rows_g12}')
print(f'Month rows: {month_rows_g12}')
print(f'Week rows: {week_rows_g12}')
print(f'Day rows: {day_rows_g12}')
print(f'Total features: {total_feats_g12}')

Total rows: 5901
Month rows: 178
Week rows: 305
Day rows: 5418
Total features: 461


In [160]:
dataset_m = dataset.copy().query('freq == "month"').drop(columns=['freq'])
dataset_m_g12 = dataset_g12.copy().query('freq == "month"').drop(columns=['freq'])

feats_w = dataset.copy().query('freq == "week"').drop(columns=['ipc_all', 'freq'])

feats_d = dataset.copy().query('freq == "day"').drop(columns=['ipc_all', 'freq'])
names_daily = feats_d.columns[:-17]
feats_d = feats_d.loc[:, names_daily]

In [161]:
dataset_m_g12.shape

(178, 460)

In [162]:
names_daily

Index(['lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_9', 'lag_12', 'beef_lp',
       'beef_cb', 'beef_sc', 'beef_or',
       ...
       'gold', 'silver', 'zinc', 'tin', 'soybean', 'soy_flour', 'soy_oil',
       'lead', 'copper', 'libor'],
      dtype='object', length=442)

### **1.1. Correlation-based FS**

It is a filter technique.

In [163]:
#ldf_corr = dataset_m.copy().dropna(axis=1)
ldf_corr = dataset_m_g12.copy().dropna(axis=1)

lcorr_lag0, ldf_lag0 = lagged_correlation_color(ldf_corr, 'ipc_all', 0)
lcorr_lag0.head(10)

Unnamed: 0,level_0,level_1,corr
0,Lag0,ipc_all,1.0
1,Lag0,lag_1,0.968994
2,Lag0,lag_2,0.912273
3,Lag0,lag_3,0.846406
4,Lag0,sugar_tj,0.736723
5,Lag0,sugar_bol,0.731961
6,Lag0,chicken_tj,0.73153
7,Lag0,sugar_po,0.721028
8,Lag0,sugar_cb,0.700789
9,Lag0,sugar_lp,0.699784


In [164]:
corr_plus05 = np.array(lcorr_lag0.query('corr > 0.5')['level_1'])
#corr_minus05 = np.array(lcorr_lag0.query('corr < -0.5')['level_1'])
corr_plus05 = np.append(corr_plus05, 'freq')

corr_plus05

array(['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'sugar_tj', 'sugar_bol',
       'chicken_tj', 'sugar_po', 'sugar_cb', 'sugar_lp', 'sugar_or',
       'oil2_bol', 'sugar_sc', 'lag_6', 'lard_co', 'oil2_tj', 'rice4_su',
       'squash_co', 'watermelon_po', 'oil2_or', 'peas_co', 'onion2_tj',
       'oil2_su', 'oil_su', 'sugar_su', 'apple_tr', 'chicken_sc',
       'lard_or', 'oil2_po', 'oil_bol', 'oil_po', 'oil2_sc',
       'redpepper_co', 'silver', 'rice4_lp', 'rice4_tj', 'platano_co',
       'oil_sc', 'chicken_co', 'pineapple_co', 'ycorn_tj', 'freq'],
      dtype=object)

In [165]:
len(corr_plus05)

42

In [166]:
CORR_DATASET = dataset_g12[corr_plus05]
CORR_DATASET.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5901 entries, 2010-01-01 to 2024-10-31
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ipc_all        178 non-null    float64
 1   lag_1          5901 non-null   float64
 2   lag_2          5901 non-null   float64
 3   lag_3          5901 non-null   float64
 4   sugar_tj       5901 non-null   float64
 5   sugar_bol      5901 non-null   float64
 6   chicken_tj     5901 non-null   float64
 7   sugar_po       5901 non-null   float64
 8   sugar_cb       5901 non-null   float64
 9   sugar_lp       5901 non-null   float64
 10  sugar_or       5901 non-null   float64
 11  oil2_bol       5901 non-null   float64
 12  sugar_sc       5901 non-null   float64
 13  lag_6          5901 non-null   float64
 14  lard_co        5901 non-null   float64
 15  oil2_tj        5901 non-null   float64
 16  rice4_su       5901 non-null   float64
 17  squash_co      5901 non-null   flo

In [167]:
CORR_DATASET.to_csv('./Data/CORR_DATASET.csv')

### **1.2. Principal Component FS**

Scale the features, otherwise accuracy may drop.

In [168]:
target = 'ipc_all'
target_lags = ['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_9', 'lag_12']

dataset_g12['new_index'] = dataset_g12.index.astype(str) + '_' + dataset_g12['freq']

X = dataset_g12.drop(columns=target_lags + ['freq', 'new_index']).dropna(axis=1)
y = dataset_g12[target]

In [169]:
print(f'Target variable:', y.shape, f'; Features', X.shape)

Target variable: (5901,) ; Features (5901, 436)


In [170]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-2.58742898, -2.26192688, -2.28516214, ...,  1.46076987,
         0.03142967, -0.70367543],
       [-2.58742898, -2.42449855, -2.25868094, ...,  1.58360414,
         0.05912217, -0.70280043],
       [-2.58742898, -2.42449855, -2.25868094, ...,  1.70643842,
         0.08681467, -0.70192542],
       ...,
       [ 4.833077  ,  6.95046762,  3.24278841, ..., -0.25299022,
         1.51187064,  1.67642999],
       [ 3.74766666,  2.47537655,  3.3762622 , ..., -0.04933262,
         1.58661895,  1.67657407],
       [ 4.74708613,  5.59570373,  3.24278841, ..., -0.32180701,
         1.47132882,  1.67642999]])

In [171]:
from sklearn.decomposition import PCA

# It selects the number of components such that the amount of variance that needs to be explained is greater than 95%s
pca = PCA(0.95, svd_solver='full')
X_pca = pca.fit_transform(X_scaled)
X_pca.shape


(5901, 79)

79 components were identified.

In [172]:
pca.explained_variance_ratio_

array([0.18859154, 0.12749233, 0.09021952, 0.05291936, 0.04726816,
       0.04030414, 0.03634113, 0.03025506, 0.02671613, 0.02362305,
       0.02065484, 0.01843391, 0.01732581, 0.01573493, 0.01275343,
       0.01167893, 0.01126393, 0.00944945, 0.00909397, 0.00832898,
       0.00811167, 0.0075462 , 0.00682879, 0.00635221, 0.00594859,
       0.00553161, 0.00515039, 0.0050129 , 0.00485257, 0.00477517,
       0.00433798, 0.00406171, 0.00380269, 0.003666  , 0.0035699 ,
       0.00335571, 0.00302302, 0.00298503, 0.00288305, 0.00269824,
       0.00260717, 0.00259726, 0.00246268, 0.00235899, 0.0023121 ,
       0.00220399, 0.00202826, 0.00198283, 0.00194565, 0.00181304,
       0.00175278, 0.00171082, 0.00168135, 0.00160296, 0.00152011,
       0.001512  , 0.00150034, 0.00144756, 0.00141887, 0.00132098,
       0.00127883, 0.00123518, 0.00122281, 0.0011903 , 0.00115439,
       0.0011262 , 0.00110055, 0.00107558, 0.00102066, 0.00101164,
       0.00098299, 0.00096271, 0.0009482 , 0.00092819, 0.00089

In [173]:
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC_{i+1}' for i in range(X_pca.shape[1])])
X_pca_df.index = dataset_g12.index
X_pca_df['new_index'] = dataset_g12['new_index']
X_pca_df

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_71,PC_72,PC_73,PC_74,PC_75,PC_76,PC_77,PC_78,PC_79,new_index
2010-01-01,-20.970171,-4.625756,-4.168631,-0.254589,10.319964,1.134896,-1.110726,6.894987,0.520822,-0.495343,...,0.624202,-0.566304,-0.133735,-0.338199,-0.669237,0.359229,1.298030,0.334040,-0.004246,2010-01-01_day
2010-01-02,-20.868442,-4.598408,-4.102453,-0.239429,10.238183,1.196914,-1.040889,6.846254,0.528276,-0.519677,...,0.534415,-0.670979,-0.147108,-0.361855,-0.606749,0.418742,1.288367,0.369653,0.084450,2010-01-02_day
2010-01-03,-20.867455,-4.609475,-4.115188,-0.249937,10.240055,1.187149,-1.043273,6.837460,0.513682,-0.519133,...,0.528630,-0.674637,-0.129897,-0.372026,-0.621321,0.404548,1.296662,0.380093,0.087736,2010-01-03_day
2010-01-04,-20.967043,-4.600405,-4.204139,-0.226998,10.306419,1.172637,-1.132652,7.008741,0.536901,-0.552218,...,0.391153,-0.576913,-0.019974,-0.453570,-0.668582,0.345343,1.391529,0.436312,0.118345,2010-01-04_day
2010-01-05,-21.038689,-4.664602,-4.159526,-0.365742,10.322708,1.054909,-0.869388,7.029127,0.536302,-0.488472,...,0.328792,-0.404120,0.161493,-0.488204,-0.701417,0.406644,1.367918,0.704911,0.198210,2010-01-05_day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,28.430384,-7.722128,5.563239,0.844455,12.648468,-10.666714,3.477344,-2.310923,-3.381367,2.104472,...,1.185404,-1.426368,-0.352951,-0.234083,-0.237996,0.484841,-1.967916,1.051485,-0.007844,2024-10-28_day
2024-10-29,28.751151,-7.683969,5.403818,1.026629,12.682780,-10.723074,3.346323,-2.262193,-3.631787,2.301760,...,1.276731,-1.372216,-0.421101,-0.294809,0.111416,0.701181,-2.011542,1.203387,-0.281491,2024-10-29_day
2024-10-30,29.262108,-7.422008,5.754088,1.567181,13.206778,-10.316025,3.320686,-3.500139,-3.648899,2.513973,...,1.439288,-1.410250,-0.119333,0.374774,0.731990,1.506659,-2.204932,1.961685,-0.314710,2024-10-30_day
2024-10-31,26.542171,-7.682526,4.842983,1.068551,11.079343,-10.904347,3.095026,-1.366122,-2.858670,-0.015268,...,-0.190123,-0.514462,0.011807,-0.460678,0.022921,-0.101392,-0.416919,0.074665,-0.181766,2024-10-31_month


In [174]:
PC_DATASET = pd.merge(dataset_g12[target_lags + ['freq', 'new_index']], X_pca_df, on='new_index')
PC_DATASET.index = dataset_g12.index
PC_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,freq,new_index,PC_1,...,PC_70,PC_71,PC_72,PC_73,PC_74,PC_75,PC_76,PC_77,PC_78,PC_79
2010-01-01,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,day,2010-01-01_day,-20.970171,...,-0.467779,0.624202,-0.566304,-0.133735,-0.338199,-0.669237,0.359229,1.298030,0.334040,-0.004246
2010-01-02,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,day,2010-01-02_day,-20.868442,...,-0.481176,0.534415,-0.670979,-0.147108,-0.361855,-0.606749,0.418742,1.288367,0.369653,0.084450
2010-01-03,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,day,2010-01-03_day,-20.867455,...,-0.473800,0.528630,-0.674637,-0.129897,-0.372026,-0.621321,0.404548,1.296662,0.380093,0.087736
2010-01-04,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,day,2010-01-04_day,-20.967043,...,-0.302514,0.391153,-0.576913,-0.019974,-0.453570,-0.668582,0.345343,1.391529,0.436312,0.118345
2010-01-05,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,day,2010-01-05_day,-21.038689,...,-0.461695,0.328792,-0.404120,0.161493,-0.488204,-0.701417,0.406644,1.367918,0.704911,0.198210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,day,2024-10-28_day,28.430384,...,2.251461,1.185404,-1.426368,-0.352951,-0.234083,-0.237996,0.484841,-1.967916,1.051485,-0.007844
2024-10-29,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,day,2024-10-29_day,28.751151,...,2.513371,1.276731,-1.372216,-0.421101,-0.294809,0.111416,0.701181,-2.011542,1.203387,-0.281491
2024-10-30,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,day,2024-10-30_day,29.262108,...,1.515888,1.439288,-1.410250,-0.119333,0.374774,0.731990,1.506659,-2.204932,1.961685,-0.314710
2024-10-31,7.938964,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,month,2024-10-31_month,26.542171,...,1.257521,-0.190123,-0.514462,0.011807,-0.460678,0.022921,-0.101392,-0.416919,0.074665,-0.181766


In [175]:
dataset_g12 = dataset_g12.drop(columns=['new_index'])
dataset_g12

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,...,inflacion en bolivia,inflación bolivia,ipc,la inflacion,la inflación,pib,pib bolivia,que es inflacion,que es pib,freq
2010-01-01,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,16.000000,17.000000,15.650000,...,,,,,,,,,,day
2010-01-02,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,16.000000,16.700000,15.690000,...,,,,,,,,,,day
2010-01-03,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,16.000000,16.700000,15.690000,...,,,,,,,,,,day
2010-01-04,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,16.000000,16.700000,15.690000,...,,,,,,,,,,day
2010-01-05,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,16.000000,16.700000,15.690000,...,,,,,,,,,,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,29.000000,34.500000,24.000000,...,,,,,,,,,,day
2024-10-29,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,30.000000,34.500000,24.000000,...,,,,,,,,,,day
2024-10-30,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,30.670000,34.000000,24.000000,...,,,,,,,,,,day
2024-10-31,7.938964,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,28.524194,25.741935,24.201613,...,93.0,51.0,55.0,35.0,57.0,42.0,29.0,60.0,39.0,month


In [176]:
PC_DATASET = PC_DATASET.drop(columns=['new_index'])

PC_DATASET.to_csv('./Data/PC_DATASET.csv')

### **1.3. L1-LR FS**

In [177]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df_ml = dataset_g12.query('freq == "month"')
X_ml = df_ml.drop(columns=target_lags + ['freq'])
y_ml = df_ml[target]
X_ml_scaled = scaler.fit_transform(X_ml)

X_train, X_test, y_train, y_test = train_test_split(X_ml_scaled, y_ml, test_size=0.2, random_state=42)

In [178]:
#lasso = Lasso()
lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=10000, tol=0.0001, positive=True, random_state=42, selection='cyclic')
# Fit on training set
lasso.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_test)
# Calculate Forecast metrics on train set
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)
# Calculate Forecast metrics on validation set
mse_val_lasso = mean_squared_error(y_test, lasso_val_pred)
r2_val_lasso = r2_score(y_test, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_test, lasso_val_pred)
print("Test MSE: ", mse_val_lasso)
print("Test R2: ", r2_val_lasso)
print("Test MAE: ", mae_val_lasso)

Train MSE:  0.4312193907829029
Train R2:  0.919295352283608
Train MAE:  0.4809178444693284
Test MSE:  0.7241313078420651
Test R2:  0.9211377699975312
Test MAE:  0.5934779408480425


In [179]:
# Get the coefficients from the Lasso model
coef = lasso.coef_
# Create a dataframe of feature importances
feature_importance_lasso = pd.DataFrame({'Feature': X_ml.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_lasso = feature_importance_lasso.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_lasso.columns = ['feat', 'imp_lasso']
# Print the feature importances
feature_importance_lasso[feature_importance_lasso['imp_lasso'] != 0]

Unnamed: 0,feat,imp_lasso
0,oil_cb,0.496768
1,sugar_sc,0.391551
2,milk2_cb,0.370174
3,watermelon_po,0.268509
4,flour_lp,0.252637
5,chicken_tj,0.234608
6,lard_co,0.224555
7,onion2_tj,0.20456
8,ycorn_tj,0.174047
9,platano_co,0.16529


In [180]:
vars_lasso = feature_importance_lasso.query('imp_lasso != 0')['feat'].values
vars_lasso

array(['oil_cb', 'sugar_sc', 'milk2_cb', 'watermelon_po', 'flour_lp',
       'chicken_tj', 'lard_co', 'onion2_tj', 'ycorn_tj', 'platano_co',
       'oil_su', 'chicken_co', 'rice_cb', 'inflacion', 'tomato_tj',
       'chicken_tr', 'papa2_tj', 'silver', 'sugar_tj', 'chicken_cb',
       'squash_co', 'peas_tj', 'rice3_po', 'peas_su', 'papaya_su',
       'carrot_tj', 'bean_cb', 'yuca_lp'], dtype=object)

In [181]:
lasso_df_1 = dataset_g12[target_lags]
lasso_df_2 = dataset_g12[vars_lasso]
lasso_df_2['freq'] = dataset_g12['freq']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lasso_df_2['freq'] = dataset_g12['freq']


In [182]:
lasso_df_1

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12
2010-01-01,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182
2010-01-02,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182
2010-01-03,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182
2010-01-04,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182
2010-01-05,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182
...,...,...,...,...,...,...,...
2024-10-28,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432
2024-10-29,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432
2024-10-30,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432
2024-10-31,7.938964,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432


In [183]:
lasso_df_2

Unnamed: 0,oil_cb,sugar_sc,milk2_cb,watermelon_po,flour_lp,chicken_tj,lard_co,onion2_tj,ycorn_tj,platano_co,...,chicken_cb,squash_co,peas_tj,rice3_po,peas_su,papaya_su,carrot_tj,bean_cb,yuca_lp,freq
2010-01-01,9.10000,145.000000,128.00,150.0,160.000000,13.000000,290.0,35.0,67.0,75.00,...,8.300000,65.00,60.000000,220.0,55.000000,500.0,21.88,32.000000,35.00,day
2010-01-02,9.10000,145.000000,128.00,150.0,174.000000,13.000000,290.0,35.0,67.0,75.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,22.50,30.000000,35.00,day
2010-01-03,9.10000,145.000000,128.00,150.0,174.000000,13.000000,290.0,35.0,67.0,75.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,22.50,30.000000,35.00,day
2010-01-04,8.80000,145.000000,128.00,150.0,173.000000,13.000000,290.0,35.0,67.0,75.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,20.00,30.000000,35.00,day
2010-01-05,8.80000,145.000000,128.00,150.0,160.000000,13.000000,290.0,35.0,67.0,75.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,22.50,28.000000,35.00,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,11.66000,250.000000,140.91,140.0,312.500000,17.000000,210.0,17.5,85.0,66.67,...,22.680000,52.21,117.500000,240.0,67.500000,500.0,14.74,47.500000,33.75,day
2024-10-29,11.66000,250.000000,140.91,140.0,312.500000,17.000000,210.0,17.5,85.0,66.67,...,22.430000,52.21,117.500000,240.0,88.000000,500.0,14.74,47.500000,33.75,day
2024-10-30,12.25000,250.000000,140.91,140.0,312.500000,17.000000,210.0,17.5,85.0,66.67,...,22.560000,52.21,44.500000,240.0,82.500000,500.0,14.74,46.250000,33.75,day
2024-10-31,11.70129,244.032258,140.91,140.0,301.343871,14.387097,210.0,17.5,85.0,66.67,...,15.745161,52.21,112.790323,240.0,78.497419,500.0,14.74,38.467742,33.75,month


In [184]:
LASSO_DATASET = pd.concat([lasso_df_1, lasso_df_2], axis=1)
LASSO_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,oil_cb,sugar_sc,milk2_cb,...,chicken_cb,squash_co,peas_tj,rice3_po,peas_su,papaya_su,carrot_tj,bean_cb,yuca_lp,freq
2010-01-01,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,9.10000,145.000000,128.00,...,8.300000,65.00,60.000000,220.0,55.000000,500.0,21.88,32.000000,35.00,day
2010-01-02,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,9.10000,145.000000,128.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,22.50,30.000000,35.00,day
2010-01-03,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,9.10000,145.000000,128.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,22.50,30.000000,35.00,day
2010-01-04,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,8.80000,145.000000,128.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,20.00,30.000000,35.00,day
2010-01-05,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,8.80000,145.000000,128.00,...,8.000000,65.00,60.000000,220.0,55.000000,500.0,22.50,28.000000,35.00,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,11.66000,250.000000,140.91,...,22.680000,52.21,117.500000,240.0,67.500000,500.0,14.74,47.500000,33.75,day
2024-10-29,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,11.66000,250.000000,140.91,...,22.430000,52.21,117.500000,240.0,88.000000,500.0,14.74,47.500000,33.75,day
2024-10-30,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,12.25000,250.000000,140.91,...,22.560000,52.21,44.500000,240.0,82.500000,500.0,14.74,46.250000,33.75,day
2024-10-31,7.938964,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,11.70129,244.032258,140.91,...,15.745161,52.21,112.790323,240.0,78.497419,500.0,14.74,38.467742,33.75,month


In [185]:
LASSO_DATASET.to_csv('./Data/LASSO_DATASET.csv')

### **1.4. Random Forest FS**

In [186]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
# Fit the model to the training data and make predictions on the validation set
rf.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_test)
# Calculate Forecast metrics on train set
mse_train_rf = mean_squared_error(y_train, rf_train_pred)
r2_train_rf = r2_score(y_train, rf_train_pred)
mae_train_rf = mean_absolute_error(y_train, rf_train_pred)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)
# Calculate Forecast metrics on validation set
mse_val_rf = mean_squared_error(y_test, rf_val_pred)
r2_val_rf = r2_score(y_test, rf_val_pred)
mae_val_rf = mean_absolute_error(y_test, rf_val_pred)
print("Test MSE: ", mse_val_rf)
print("Test R2: ", r2_val_rf)
print("Test MAE: ", mae_val_rf)

Train MSE:  0.08742858383362608
Train R2:  0.983637347462911
Train MAE:  0.19147750872149402
Test MSE:  0.7097750019345995
Test R2:  0.9227012575669807
Test MAE:  0.605694715260626


In [187]:
# Create a DataFrame with the feature importance values
feature_importance_rf = pd.DataFrame({'Feature': X_ml.columns, 'Importance': rf.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_rf.columns = ['feat', 'imp_rf']
# Print the feature importance DataFrame
feature_importance_rf[feature_importance_rf['imp_rf'] != 0]

Unnamed: 0,feat,imp_rf
0,oil_cb,2.412214e-01
1,sugar_bol,7.512543e-02
2,chicken_tj,5.542224e-02
3,oil2_tj,3.628344e-02
4,oil2_bol,3.528505e-02
...,...,...
446,onion2_cb,2.385244e-07
447,oil_co,7.149701e-08
448,milk_bol,5.712453e-08
449,milk_co,5.404016e-08


In [188]:
# Calculate deciles
feature_importance_rf['decile'] = pd.qcut(feature_importance_rf['imp_rf'], 10, labels=False) + 1

In [189]:
vars_rf = feature_importance_rf.query('decile > 9')['feat'].values
vars_rf

array(['oil_cb', 'sugar_bol', 'chicken_tj', 'oil2_tj', 'oil2_bol',
       'sugar_tj', 'rice4_tj', 'sugar_cb', 'rice3_po', 'sugar_or',
       'sugar_sc', 'rice4_su', 'ycorn_tj', 'chicken_bol', 'papa2_tj',
       'oil2_po', 'sugar_lp', 'papaya_tr', 'oil_sc', 'banana_co',
       'rice_bol', 'milk_sc', 'chicken_tr', 'pineapple_tr', 'rice2_lp',
       'rice3_su', 'ufv', 'paprika_or', 'squash_co', 'papa1_po',
       'rice2_po', 'flour_lp', 'chicken_po', 'silver', 'chicken_lp',
       'ycorn_sc', 'chicken_sc', 'tomato_co', 'libor', 'platano_co',
       'sugar_po', 'carrot_co', 'tomato_tj', 'oil_tr', 'lard_lp',
       'rice4_or'], dtype=object)

In [190]:
len(vars_rf) 

46

In [191]:
rf_df_1 = dataset_g12[target_lags]
rf_df_2 = dataset_g12[vars_rf]

RF_DATASET = pd.concat([rf_df_1, rf_df_2], axis=1)
RF_DATASET['freq'] = dataset_g12['freq']
RF_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,oil_cb,sugar_bol,chicken_tj,...,tomato_co,libor,platano_co,sugar_po,carrot_co,tomato_tj,oil_tr,lard_lp,rice4_or,freq
2010-01-01,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,9.10000,169.000000,13.000000,...,174.0,0.429690,75.00,170.0,62.5,32.140000,9.17,185.0,250.00,day
2010-01-02,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,9.10000,169.000000,13.000000,...,174.0,0.431253,75.00,180.0,62.5,28.000000,9.17,185.0,250.00,day
2010-01-03,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,9.10000,169.000000,13.000000,...,174.0,0.432817,75.00,180.0,62.5,28.000000,9.17,185.0,250.00,day
2010-01-04,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,8.80000,170.000000,13.000000,...,174.0,0.434380,75.00,180.0,62.5,20.000000,9.17,185.0,250.00,day
2010-01-05,,0.263790,0.457498,0.784931,1.447299,5.323961,11.041182,8.80000,175.000000,13.000000,...,174.0,0.427500,75.00,180.0,62.5,22.500000,9.17,185.0,250.00,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-28,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,11.66000,245.000000,17.000000,...,110.0,4.682130,66.67,240.0,55.0,94.300000,10.80,232.5,10.17,day
2024-10-29,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,11.66000,245.000000,17.000000,...,110.0,4.682130,66.67,240.0,55.0,71.850000,10.80,232.5,10.17,day
2024-10-30,,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,12.25000,245.000000,17.000000,...,110.0,4.682130,66.67,240.0,55.0,71.850000,10.80,232.5,10.17,day
2024-10-31,7.938964,6.182067,5.192869,3.953707,3.460018,1.863150,2.075432,11.70129,242.580645,14.387097,...,110.0,4.682387,66.67,240.0,55.0,92.127419,10.80,232.5,10.17,month


In [192]:
RF_DATASET.to_csv('./Data/RF_DATASET.csv')