In [80]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# <font color="red">**STEP 2: FEATURE SELECTION**</font>

In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

In [82]:
# Correlation matrix
def lagged_correlation_color(df, specific_var, l=0):
    lags = [l]
    # Create lagged versions of the DataFrame
    lagged_df = pd.concat([df.shift(lag) for lag in lags], axis=1, keys=[f'Lag{lag}' for lag in lags])
    # Calculate correlation coefficients
    correlations = lagged_df.corrwith(df[specific_var])
    # Sort correlations from highest to lowest
    correlations_sorted = correlations.sort_values(ascending=False)
    # Convert the sorted correlations to a data frame and reset index
    df_correlations_sorted = pd.DataFrame(correlations_sorted, columns=['corr']).reset_index()

    lagged_df.columns = df.columns
    return df_correlations_sorted, lagged_df

col_range = range(0,1)


# Growth Rate datafrmaes
def preprocess_data(df, period):
    df_copy = df.copy().pct_change(period).dropna(axis=0)
    df_copy = df_copy.loc[:, np.isfinite(df_copy).all(axis=0)]
    return df_copy.dropna(axis=1)


## **1. Feature Selection (FS)**

In [83]:
dataset = pd.read_csv('./Data/FINAL_DATASET.csv', index_col=0)

aux_lag1 = dataset.copy()
aux_lag1.loc[dataset['freq'] != 'month', aux_lag1.columns != 'freq'] = np.nan
aux_lag1 = aux_lag1.ffill()

In [84]:
dataset_m = dataset.copy().query('freq == "month"').drop(columns=['freq'])

g1_m = dataset_m.copy().pct_change(1).dropna(axis=0)
g1_m

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,...,inflacion,inflacion en bolivia,inflación bolivia,ipc,la inflacion,la inflación,pib,pib bolivia,que es inflacion,que es pib
2011-04-30,0.000219,0.008878,0.016600,0.012926,0.012240,0.006183,0.000906,-0.004682,-0.066509,-0.016791,...,0.219512,0.153846,0.000000,0.722222,0.754386,0.000000,0.065217,0.000000,0.000000,0.092593
2011-05-31,0.001997,0.000219,0.008878,0.016600,0.011144,0.010567,-0.000177,-0.004032,-0.014056,-0.000453,...,-0.250000,-0.244444,0.000000,-0.053763,-0.270000,0.000000,-0.204082,-0.103896,-0.110000,-0.050847
2011-07-31,0.006763,0.003435,0.002216,0.009099,0.030802,0.019992,0.007613,-0.032389,-0.013035,-0.054496,...,-0.253333,-0.147059,-0.040000,-0.204545,-0.136986,0.000000,-0.282051,-0.362319,-0.033708,0.375000
2011-09-30,0.006848,0.009132,0.006763,0.003435,0.025625,0.028989,0.018306,0.014017,0.045398,0.040296,...,0.285714,0.293103,-0.041667,-0.185714,0.095238,0.000000,0.375000,0.636364,-0.023256,-0.337662
2011-10-31,0.004732,0.003044,0.003792,0.005320,0.000219,0.012926,0.012240,-0.007634,0.004343,-0.007677,...,0.083333,-0.040000,-0.021739,-0.070175,0.159420,0.000000,-0.064935,-0.333333,-0.011905,0.039216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-30,0.005351,0.006299,0.005700,0.004561,0.006319,-0.000602,0.002182,0.017373,0.000000,0.000000,...,0.026316,-0.035088,0.240000,-0.035714,0.088889,-0.049180,-0.111111,-0.275000,0.229167,0.260870
2024-07-31,0.004732,0.005351,0.006299,0.005700,0.000788,-0.000098,0.003675,-0.006774,0.007819,-0.005892,...,0.051282,-0.072727,0.274194,-0.074074,0.224490,-0.137931,-0.125000,0.034483,0.135593,-0.137931
2024-08-31,0.015828,0.004732,0.005351,0.006299,0.001999,-0.000034,0.003862,0.015056,0.032949,0.004784,...,0.024390,-0.235294,-0.113924,0.100000,0.150000,1.220000,0.214286,0.300000,0.014925,-0.160000
2024-09-30,0.008796,0.015828,0.004732,0.005351,0.004561,0.006319,-0.000602,0.019719,0.006273,0.012514,...,-0.333333,0.358974,-0.300000,0.054545,-0.144928,-0.468468,-0.039216,0.000000,-0.058824,-0.095238


In [85]:
feats_w = dataset.copy().query('freq == "week"').drop(columns=['ipc_all', 'freq'])

feats_lag1_w = aux_lag1.query('freq == "week"').drop(columns=['ipc_all', 'freq'])

g1_w = (feats_w.copy() / feats_lag1_w.copy()) - 1
g1_w.dropna(axis=0, inplace=True)
g1_w

Unnamed: 0,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,beef_or,...,inflacion,inflacion en bolivia,inflación bolivia,ipc,la inflacion,la inflación,pib,pib bolivia,que es inflacion,que es pib
2018-12-30,0.006066,0.004362,0.000897,0.001597,-0.002699,0.006427,2.220446e-16,0.000000,0.019487,0.000000,...,-0.511628,1.888889,2.103448,1.147059,1.714286,0.909091,-0.534483,0.000000,0.581395,1.303030
2019-01-06,0.006066,0.004362,0.000897,0.001597,-0.002699,0.006427,2.220446e-16,-0.009524,0.019487,0.000000,...,-0.511628,1.888889,2.103448,1.147059,1.714286,0.909091,-0.310345,0.073171,0.581395,1.303030
2019-01-13,0.006066,0.004362,0.000897,0.001597,-0.002699,0.006427,-3.164091e-02,-0.023810,0.019487,0.000000,...,0.302326,1.888889,2.103448,1.147059,1.714286,0.909091,-0.500000,0.146341,0.581395,1.303030
2019-01-20,0.006066,0.004362,0.000897,0.001597,-0.002699,0.006427,-9.492274e-02,-0.007937,0.019487,0.000000,...,0.116279,1.888889,2.103448,1.147059,1.714286,0.909091,-0.344828,-0.292683,0.581395,1.303030
2019-01-27,0.006066,0.004362,0.000897,0.001597,-0.002699,0.006427,-9.492274e-02,0.000000,0.019487,0.000000,...,-0.069767,1.888889,2.103448,1.147059,1.714286,0.909091,-0.637931,-0.170732,0.581395,1.303030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-29,0.015828,0.004732,0.005351,0.004561,0.006319,-0.000602,3.499021e-02,0.026409,0.004930,0.045119,...,-0.428571,0.948718,-0.285714,0.145455,-0.333333,-0.297297,-0.196078,0.051282,-0.073529,-0.071429
2024-10-06,0.008796,0.015828,0.004732,0.005700,0.000788,-0.000098,5.951648e-02,-0.024183,0.061798,0.023573,...,0.071429,0.660377,0.040816,0.017241,-0.372881,0.474576,-0.224490,-0.307692,-0.031250,-0.105263
2024-10-13,0.008796,0.015828,0.004732,0.005700,0.000788,-0.000098,1.208605e-01,-0.024183,0.104869,0.033912,...,-0.214286,0.716981,0.040816,0.086207,-0.423729,-0.169492,-0.102041,-0.256410,-0.046875,-0.026316
2024-10-20,0.008796,0.015828,0.004732,0.005700,0.000788,-0.000098,1.502840e-01,-0.024183,0.078652,0.033912,...,-0.214286,0.792453,0.061224,-0.241379,-0.457627,-0.084746,-0.102041,-0.435897,-0.062500,0.078947


In [86]:
feats_d = dataset.copy().query('freq == "day"').drop(columns=['ipc_all', 'freq'])
names_daily = feats_d.columns[:-17]
names_daily

Index(['lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_9', 'lag_12', 'beef_lp',
       'beef_cb', 'beef_sc', 'beef_or',
       ...
       'gold', 'silver', 'zinc', 'tin', 'soybean', 'soy_flour', 'soy_oil',
       'lead', 'copper', 'libor'],
      dtype='object', length=474)

In [87]:
feats_d = feats_d.loc[:, names_daily]

feats_lag1_d = aux_lag1.query('freq == "day"').drop(columns=['ipc_all', 'freq'])
feats_lag1_d = feats_lag1_d.loc[:, names_daily]

g1_d = (feats_d.copy() / feats_lag1_d.copy()) - 1
g1_d.dropna(axis=0, inplace=True)
g1_d

Unnamed: 0,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,beef_or,...,gold,silver,zinc,tin,soybean,soy_flour,soy_oil,lead,copper,libor
2011-04-01,0.008878,0.016600,0.012926,0.01224,0.006183,0.000906,0.043478,-0.074007,-0.007738,0.008117,...,0.004099,-0.007264,0.010998,0.080100,-0.016751,-0.019508,-0.016198,0.036453,-0.016227,-0.002784
2011-04-04,0.008878,0.016600,0.012926,0.01224,0.006183,0.000906,-0.008696,-0.098375,-0.007738,0.008117,...,0.007824,-0.035636,0.022420,0.100514,-0.026041,-0.031912,-0.034742,0.057197,-0.019380,-0.004411
2011-04-05,0.008878,0.016600,0.012926,0.01224,0.006183,0.000906,-0.008696,-0.098375,-0.007738,0.008117,...,0.023159,-0.063425,0.025804,0.117217,-0.030959,-0.033886,-0.035079,0.071795,-0.013074,-0.009294
2011-04-06,0.008878,0.016600,0.012926,0.01224,0.006183,0.000906,-0.008696,-0.098375,-0.007738,0.008117,...,0.025900,-0.082884,0.036379,0.119258,-0.027316,-0.027402,-0.036765,0.083320,0.009523,-0.013635
2011-04-07,0.008878,0.016600,0.012926,0.01224,0.006183,0.000906,-0.008696,-0.049639,-0.007738,0.008117,...,0.024811,-0.072916,0.037225,0.137631,-0.023309,-0.029939,-0.031370,0.071795,0.016355,-0.021491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-24,0.008796,0.015828,0.004732,0.00570,0.000788,-0.000098,0.275469,0.389478,0.078652,0.033912,...,0.064063,0.125403,0.072163,-0.018312,-0.019488,0.004229,0.040533,0.010400,0.025269,-0.053032
2024-10-25,0.008796,0.015828,0.004732,0.00570,0.000788,-0.000098,0.275469,0.389478,0.078652,0.033912,...,0.068492,0.124871,0.093777,-0.021762,-0.037020,-0.014754,0.036369,0.016541,0.027843,-0.053032
2024-10-28,0.008796,0.015828,0.004732,0.00570,0.000788,-0.000098,0.174241,0.463725,0.078652,0.033912,...,0.066509,0.132263,0.096043,-0.021794,-0.023933,-0.015999,0.051800,0.013593,0.021247,-0.053032
2024-10-29,0.008796,0.015828,0.004732,0.00570,0.000788,-0.000098,0.214732,0.463725,0.078652,0.033912,...,0.079062,0.146916,0.106676,-0.014546,-0.016525,-0.015377,0.064293,0.019243,0.019692,-0.053032


### **1.1. Correlation-based FS**

It is a filter technique.

In [88]:
ldf_corr = dataset_m.copy().dropna(axis=1)

lcorr_lag0, ldf_lag0 = lagged_correlation_color(ldf_corr, 'ipc_all', 0)
lcorr_lag0.head(10)

Unnamed: 0,level_0,level_1,corr
0,Lag0,ipc_all,1.0
1,Lag0,lag_1,0.999334
2,Lag0,lag_2,0.998119
3,Lag0,lag_3,0.996729
4,Lag0,ufv,0.994591
5,Lag0,lag_6,0.994187
6,Lag0,lag_9,0.992064
7,Lag0,lag_12,0.990194
8,Lag0,paprika_tr,0.887889
9,Lag0,milk_lp,0.877696


In [89]:
corr_plus05 = np.array(lcorr_lag0.query('corr > 0.5')['level_1'])
#corr_minus05 = np.array(lcorr_lag0.query('corr < -0.5')['level_1'])
corr_plus05 = np.append(corr_plus05, 'freq')

corr_plus05

array(['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'ufv', 'lag_6', 'lag_9',
       'lag_12', 'paprika_tr', 'milk_lp', 'milk_or', 'beef_lp',
       'milk_bol', 'milk_sc', 'milk_su', 'squash_tr', 'banana_co',
       'milk_po', 'apple_sc', 'corn_co', 'wheat_sc', 'papaya_tr',
       'milk2_po', 'beef_bol', 'milk_cb', 'beef_su', 'flour_tj',
       'apple_or', 'beef_sc', 'rice2_co', 'libor', 'milk2_or', 'rice3_co',
       'beef_or', 'redpepper_tr', 'greenbean_tr', 'onion2_po',
       'banana_bol', 'wheat_po', 'banana_tr', 'bean_tr', 'grapefruit_po',
       'beef_cb', 'banana_sc', 'soy_po', 'onion2_tr', 'beef_po', 'zinc',
       'apple_cb', 'redpepper_po', 'flour_po', 'dinero', 'quinoa_tr',
       'sorghum_lp', 'rice2_or', 'rice_co', 'yuca_co', 'grapefruit_cb',
       'grapefruit_or', 'platano_tr', 'papa2_co', 'grapefruit_bol',
       'pineapple_or', 'banana_lp', 'la inflación', 'veglard_co',
       'peas_tr', 'banana_su', 'ycorn_lp', 'corn_tr', 'lard_cb',
       'noodle_tj', 'chili_co', 'ycorn_su'

In [90]:
CORR_DATASET = dataset[corr_plus05]
CORR_DATASET.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3757 entries, 2011-01-03 to 2024-10-31
Data columns (total 84 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ipc_all         105 non-null    float64
 1   lag_1           3757 non-null   float64
 2   lag_2           3757 non-null   float64
 3   lag_3           3757 non-null   float64
 4   ufv             3757 non-null   float64
 5   lag_6           3757 non-null   float64
 6   lag_9           3757 non-null   float64
 7   lag_12          3757 non-null   float64
 8   paprika_tr      3757 non-null   float64
 9   milk_lp         3757 non-null   float64
 10  milk_or         3757 non-null   float64
 11  beef_lp         3757 non-null   float64
 12  milk_bol        3757 non-null   float64
 13  milk_sc         3757 non-null   float64
 14  milk_su         3757 non-null   float64
 15  squash_tr       3757 non-null   float64
 16  banana_co       3757 non-null   float64
 17  milk_po         3757 no

In [91]:
CORR_DATASET.to_csv('./Data/CORR_DATASET.csv')

### **1.2. Principal Component FS**

Scale the features, otherwise accuracy may drop.

In [92]:
target = 'ipc_all'
target_lags = ['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_9', 'lag_12']

X = dataset.drop(columns=target_lags + ['freq']).dropna(axis=1)
y = dataset[target]

In [93]:
print(f'Target variable:', y.shape, f'; Features', X.shape)


Target variable: (3757,) ; Features (3757, 468)


In [94]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.48409418,  0.03780744, -0.47320819, ...,  2.04446777,
         1.49041897, -0.73120815],
       [-1.17982146, -0.90382155, -0.47320819, ...,  2.31306107,
         1.47728549, -0.73127868],
       [-1.48409418, -0.58994522, -0.77381646, ...,  2.54523494,
         1.45758527, -0.7319569 ],
       ...,
       [ 5.20990559,  8.19859198,  3.54742756, ..., -0.12021212,
         1.42934829,  1.5618224 ],
       [ 5.61763103,  7.88471565,  3.54742756, ..., -0.24995634,
         1.49206066,  1.5618224 ],
       [ 4.32481006,  2.68831866,  3.67963953, ..., -0.04480015,
         1.57043328,  1.56201065]])

In [95]:
from sklearn.decomposition import PCA

# It selects the number of components such that the amount of variance that needs to be explained is greater than 95%s
pca = PCA(0.95, svd_solver='full')
X_pca = pca.fit_transform(X_scaled)
X_pca.shape


(3757, 77)

77 components were identified.

In [96]:
pca.explained_variance_ratio_

array([0.17740379, 0.13462365, 0.09816807, 0.05624416, 0.04500762,
       0.04083712, 0.03711639, 0.03031635, 0.02707575, 0.02353592,
       0.02075264, 0.01756023, 0.01597952, 0.01388466, 0.01315455,
       0.01216847, 0.00996198, 0.00964   , 0.00935676, 0.00874322,
       0.00840212, 0.00758552, 0.00710944, 0.00615206, 0.00600259,
       0.0059539 , 0.00536106, 0.00513153, 0.00483374, 0.00453576,
       0.00439149, 0.00409442, 0.00380675, 0.00349125, 0.00334395,
       0.00321399, 0.00302898, 0.00294633, 0.00287837, 0.00275038,
       0.00247431, 0.00245305, 0.00231386, 0.00225913, 0.0022236 ,
       0.00201044, 0.00192206, 0.00189123, 0.0018807 , 0.00186726,
       0.00171886, 0.00170602, 0.00162194, 0.00151762, 0.00148527,
       0.00143297, 0.00140011, 0.00136705, 0.00130949, 0.00127668,
       0.00125322, 0.00121517, 0.00117102, 0.00113056, 0.00111634,
       0.00107728, 0.00106773, 0.00104453, 0.00102875, 0.00099912,
       0.00097742, 0.00092514, 0.00091941, 0.00089284, 0.00088

In [97]:
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC_{i+1}' for i in range(X_pca.shape[1])])
X_pca_df.index = dataset.index
X_pca_df

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_68,PC_69,PC_70,PC_71,PC_72,PC_73,PC_74,PC_75,PC_76,PC_77
2011-01-03,-2.769783,-15.188152,-7.303219,1.490688,8.133160,9.104162,-4.096026,-7.341045,-4.970154,-5.052483,...,0.450177,0.206438,1.143297,0.786626,0.163045,0.410666,-0.101489,-1.405066,-0.477243,0.874123
2011-01-04,-2.862583,-15.422483,-7.669879,1.118487,8.077138,8.948151,-3.844780,-6.837277,-4.881342,-5.062311,...,0.290863,0.395187,0.923809,0.357512,0.382293,0.502012,0.344309,-1.296853,-0.239003,0.771657
2011-01-05,-3.025670,-15.593150,-7.755800,0.872163,8.167308,8.391132,-3.560674,-6.734429,-4.660109,-5.430694,...,0.124513,0.824373,1.187616,0.125493,0.055869,0.657812,0.624486,-1.703750,-0.106610,0.709411
2011-01-06,-3.281324,-15.585601,-7.523135,1.046702,8.307727,8.256143,-3.583503,-6.749074,-4.456714,-5.787042,...,0.129361,1.121220,0.979177,0.279287,-0.006881,0.753840,1.006927,-1.605939,0.258455,0.388302
2011-01-07,-3.195777,-15.668130,-7.446395,0.667970,8.648115,8.204476,-3.122946,-6.666520,-4.320171,-5.892593,...,0.111468,1.069738,1.010171,0.207283,-0.358434,0.719150,1.221932,-1.422106,-0.012374,0.554953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-27,26.853984,0.063782,12.347713,4.271933,12.901570,0.172636,0.399574,-2.210673,2.164387,-2.326722,...,-0.314005,-0.429839,-0.513206,-0.480847,-0.926114,-0.285959,-1.238354,-0.551397,-1.057862,-0.852292
2024-10-28,27.607146,0.343800,12.684646,4.562160,13.385492,0.571019,-0.270165,-3.036708,0.672280,-2.212246,...,-0.920257,-0.852335,-0.874992,-0.963551,-0.540721,-1.215571,-1.353910,-0.923086,-1.267799,-1.880056
2024-10-29,27.920470,0.521563,12.681126,4.733526,13.404389,0.334105,-0.265591,-3.253962,0.576941,-1.947450,...,-0.736153,-0.872385,-0.813294,-0.694281,-0.503104,-1.665896,-1.386322,-1.132153,-1.418627,-1.543913
2024-10-30,28.151784,0.691861,13.189997,5.305712,13.395277,0.575805,-1.444137,-3.508921,0.348138,-2.114645,...,-0.899239,-0.672187,-0.871165,0.208582,-0.139113,-1.971016,-1.060804,-1.641966,-0.355819,-1.475582


In [98]:
PC_DATASET = dataset[target_lags].join(X_pca_df)
PC_DATASET["freq"] = dataset["freq"]
PC_DATASET.tail(20)

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,PC_1,PC_2,PC_3,...,PC_69,PC_70,PC_71,PC_72,PC_73,PC_74,PC_75,PC_76,PC_77,freq
2024-10-09,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.269057,-0.629126,10.149368,...,-0.383736,-0.258413,-0.0382,-0.283075,-0.205903,0.063997,0.174755,0.037031,-0.126106,day
2024-10-10,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.488132,-0.704358,10.613199,...,-0.355665,-0.315244,0.090844,-0.432207,-0.178748,-0.220849,-0.065916,-0.308907,-0.436324,day
2024-10-11,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.509418,-0.66976,10.703798,...,-0.212758,-0.352257,0.090033,-0.412342,-0.119177,-0.139543,-0.017201,-0.262157,-0.365278,day
2024-10-13,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.349306,-0.598522,10.331243,...,-0.309598,-0.195485,0.112879,-0.332036,-0.112963,0.14794,0.043872,-0.11185,-0.14789,week
2024-10-14,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.575099,-0.687487,10.880914,...,-0.353868,-0.23901,0.446963,-0.73234,0.017602,-0.257438,-0.179571,-0.312844,-0.292519,day
2024-10-15,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.624159,-0.472134,10.928438,...,-0.48846,-0.410899,0.384705,-0.66663,-0.179919,-0.724138,-0.07886,-0.359565,-0.505174,day
2024-10-16,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.662572,-0.342079,11.154266,...,-0.064738,-0.511511,0.231349,-1.022788,0.306306,-0.830892,-0.201403,-0.434504,-0.635712,day
2024-10-17,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.626657,-0.52462,11.124834,...,-0.254041,-0.430813,0.246538,-0.9286,0.119761,-0.696048,-0.149238,-0.326879,-0.479313,day
2024-10-18,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.91863,-0.541163,11.693377,...,-0.232739,-0.566838,0.091942,-1.036526,0.124735,-0.771749,-0.383663,-0.617938,-0.524311,day
2024-10-20,,117.267773,116.245314,114.43401,112.579661,111.21106,110.429431,25.726894,-0.535555,11.266439,...,-0.280065,-0.490791,0.214313,-0.910135,0.077914,-0.721277,-0.215535,-0.484441,-0.519696,week


In [99]:
PC_DATASET.to_csv('./Data/PC_DATASET.csv')

### **1.3. L1-LR FS**

In [100]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df_ml = dataset.query('freq == "month"')
X_ml = df_ml.drop(columns=target_lags + ['freq'])
y_ml = df_ml[target]
X_ml_scaled = scaler.fit_transform(X_ml)

X_train, X_test, y_train, y_test = train_test_split(X_ml_scaled, y_ml, test_size=0.2, random_state=42)

In [101]:
#lasso = Lasso()
lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=10000, tol=0.0001, positive=True, random_state=42, selection='cyclic')
# Fit on training set
lasso.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_test)
# Calculate Forecast metrics on train set
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)
# Calculate Forecast metrics on validation set
mse_val_lasso = mean_squared_error(y_test, lasso_val_pred)
r2_val_lasso = r2_score(y_test, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_test, lasso_val_pred)
print("Test MSE: ", mse_val_lasso)
print("Test R2: ", r2_val_lasso)
print("Test MAE: ", mae_val_lasso)

Train MSE:  0.10375188812457764
Train R2:  0.9990814752759071
Train MAE:  0.2548324759883312
Test MSE:  0.4989459467669776
Test R2:  0.9966660501793304
Test MAE:  0.5201231402723756


In [102]:
# Get the coefficients from the Lasso model
coef = lasso.coef_
# Create a dataframe of feature importances
feature_importance_lasso = pd.DataFrame({'Feature': X_ml.columns, 'Importance': coef})
# Sort the features by importance
feature_importance_lasso = feature_importance_lasso.sort_values('Importance', ascending=False).reset_index(drop=True)
feature_importance_lasso.columns = ['feat', 'imp_lasso']
# Print the feature importances
feature_importance_lasso[feature_importance_lasso['imp_lasso'] != 0]

Unnamed: 0,feat,imp_lasso
0,ufv,9.661716
1,beef_lp,0.316264
2,squash_tr,0.266759
3,exchange,0.260027
4,papa1_or,0.167608
5,apple_cb,0.123806
6,noodle_sc,0.10982
7,ycorn_tr,0.096463
8,yuca_sc,0.092374
9,squash_bol,0.083357


In [103]:
vars_lasso = feature_importance_lasso.query('imp_lasso != 0')['feat'].values
vars_lasso

array(['ufv', 'beef_lp', 'squash_tr', 'exchange', 'papa1_or', 'apple_cb',
       'noodle_sc', 'ycorn_tr', 'yuca_sc', 'squash_bol', 'milk_or',
       'corn_sc', 'noodle_tj', 'peas_lp', 'peas_po', 'beef_or',
       'paprika_tr', 'ycorn_su', 'oil2_cb', 'tomato_po', 'watermelon_su',
       'bean_po', 'orange_bol', 'ycorn_bol', 'tomato_tr', 'banana_sc',
       'orange2_sc', 'squash_su', 'apple_or'], dtype=object)

In [104]:
lasso_df_1 = dataset[target_lags]
lasso_df_2 = dataset[vars_lasso]

In [105]:
LASSO_DATASET = pd.merge(lasso_df_1, lasso_df_2, left_index=True, right_index=True)
LASSO_DATASET['freq'] = dataset['freq']
LASSO_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,ufv,beef_lp,squash_tr,...,watermelon_su,bean_po,orange_bol,ycorn_bol,tomato_tr,banana_sc,orange2_sc,squash_su,apple_or,freq
2011-01-03,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56520,19.000000,17.5,...,135.0,38.750000,36.0,132.0,120.00,30.000000,45.0,25.0,119.47,day
2011-01-04,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56543,19.500000,17.5,...,135.0,33.750000,36.0,135.0,120.00,23.300000,45.0,25.0,109.65,day
2011-01-05,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56566,19.000000,17.5,...,135.0,28.750000,36.0,135.0,120.00,22.500000,41.0,25.0,108.70,day
2011-01-06,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56589,19.000000,15.0,...,135.0,23.750000,36.0,135.0,120.00,24.000000,40.0,25.0,107.76,day
2011-01-07,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56612,18.500000,15.0,...,135.0,26.250000,36.0,135.0,120.00,28.500000,43.0,25.0,106.84,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-27,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54193,30.420000,42.5,...,150.0,43.583333,52.5,120.0,116.55,28.520000,75.0,37.5,233.33,week
2024-10-28,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54357,29.000000,42.5,...,150.0,49.000000,52.5,120.0,116.55,28.520000,75.0,37.5,233.33,day
2024-10-29,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54398,30.000000,42.5,...,150.0,49.000000,52.5,120.0,116.55,28.520000,75.0,37.5,233.33,day
2024-10-30,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54439,30.670000,42.5,...,150.0,49.000000,52.5,120.0,116.55,28.520000,75.0,37.5,233.33,day


In [106]:
LASSO_DATASET.to_csv('./Data/LASSO_DATASET.csv')

### **1.4. Random Forest FS**

In [107]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
# Fit the model to the training data and make predictions on the validation set
rf.fit(X_train, y_train)
# Make predictions on validation set using best model from GridSearchCV
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_test)
# Calculate Forecast metrics on train set
mse_train_rf = mean_squared_error(y_train, rf_train_pred)
r2_train_rf = r2_score(y_train, rf_train_pred)
mae_train_rf = mean_absolute_error(y_train, rf_train_pred)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)
# Calculate Forecast metrics on validation set
mse_val_rf = mean_squared_error(y_test, rf_val_pred)
r2_val_rf = r2_score(y_test, rf_val_pred)
mae_val_rf = mean_absolute_error(y_test, rf_val_pred)
print("Test MSE: ", mse_val_rf)
print("Test R2: ", r2_val_rf)
print("Test MAE: ", mae_val_rf)

Train MSE:  0.17039404238601857
Train R2:  0.9984914863372726
Train MAE:  0.27666004107816244
Test MSE:  2.0832126695911772
Test R2:  0.9860800021501251
Test MAE:  0.9318758375871946


In [108]:
# Create a DataFrame with the feature importance values
feature_importance_rf = pd.DataFrame({'Feature': X_ml.columns, 'Importance': rf.feature_importances_})
# Sort the DataFrame by importance values in descending order
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False).reset_index(drop=True)
feature_importance_rf.columns = ['feat', 'imp_rf']
# Print the feature importance DataFrame
feature_importance_rf[feature_importance_rf['imp_rf'] != 0]

Unnamed: 0,feat,imp_rf
0,ufv,1.938413e-01
1,squash_co,1.271882e-01
2,quinoa_tj,1.062751e-01
3,oil2_or,8.347975e-02
4,lard_co,7.382840e-02
...,...,...
471,wheat_su,9.104068e-07
472,chili_or,7.036635e-07
473,flour_co,6.102950e-07
474,peas_cb,4.469765e-07


In [109]:
# Calculate deciles
feature_importance_rf['decile'] = pd.qcut(feature_importance_rf['imp_rf'], 10, labels=False) + 1

In [110]:
vars_rf = feature_importance_rf.query('decile > 9')['feat'].values
vars_rf

array(['ufv', 'squash_co', 'quinoa_tj', 'oil2_or', 'lard_co', 'peas_co',
       'wheat_lp', 'oil2_su', 'rice4_su', 'milk2_bol', 'milk2_tj',
       'lard_or', 'milk2_tr', 'oil2_tj', 'oil_su', 'zinc', 'onion2_tj',
       'flour_tj', 'flour_tr', 'oil2_tr', 'redpepper_lp', 'paprika_tr',
       'milk2_cb', 'noodle_or', 'lard_su', 'noodle_tr', 'lard_sc',
       'lard_po', 'ycorn_co', 'gold', 'flour_or', 'oil_tj',
       'inflación bolivia', 'noodle_sc', 'papaya_su', 'noodle_tj',
       'orange2_su', 'noodle_bol', 'rice_bol', 'apple_co', 'libor',
       'copper', 'oil2_sc', 'papa1_co', 'silver', 'veglard_tj',
       'orange2_cb', 'veglard_lp', 'lard_cb'], dtype=object)

In [111]:
rf_df_1 = dataset[target_lags]
rf_df_2 = dataset[vars_rf]

RF_DATASET = pd.merge(rf_df_1, rf_df_2, left_index=True, right_index=True)
RF_DATASET['freq'] = dataset['freq']
RF_DATASET

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,ufv,squash_co,quinoa_tj,...,libor,copper,oil2_sc,papa1_co,silver,veglard_tj,orange2_cb,veglard_lp,lard_cb,freq
2011-01-03,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56520,65.00,755.0,...,0.455940,9600.000000,9.00,66.67,37.046200,230.00,32.0,215.00,200.0,day
2011-01-04,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56543,65.00,755.0,...,0.455810,9580.000000,9.00,66.67,37.467500,230.00,32.0,222.00,200.0,day
2011-01-05,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56566,65.00,755.0,...,0.454560,9550.000000,9.00,66.67,37.665000,232.00,32.0,227.00,200.0,day
2011-01-06,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56589,65.00,755.0,...,0.457190,9475.000000,9.25,66.67,35.915596,230.00,33.0,221.00,200.0,day
2011-01-07,,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,1.56612,65.00,755.0,...,0.457310,9415.000000,9.25,66.67,37.812500,230.00,33.0,224.00,200.0,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-27,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54193,52.21,650.0,...,4.682130,9568.400000,9.90,76.11,34.106400,308.13,85.0,273.59,280.0,week
2024-10-28,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54357,52.21,650.0,...,4.682130,9521.500000,9.90,76.11,34.001000,308.13,85.0,273.59,280.0,day
2024-10-29,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54398,52.21,650.0,...,4.682130,9507.000000,9.90,76.11,34.441000,308.13,85.0,273.59,280.0,day
2024-10-30,,117.267773,116.245314,114.434010,112.579661,111.211060,110.429431,2.54439,52.21,650.0,...,4.682130,9602.500000,9.90,76.11,34.075000,308.13,85.0,273.59,280.0,day


In [112]:
RF_DATASET.to_csv('./Data/RF_DATASET.csv')