# <font color="red">**STEP 2: FEATURE SELECTION**</font>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

In [3]:
# Lagged correlation of every column with a single target variable
def lagged_correlation_color(df, specific_var, l=0):
    """Rank the columns of ``df``, shifted by ``l`` rows, by correlation with a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated against the target.
    specific_var : label
        Column of ``df`` used as the target; the *unshifted* column is used.
    l : int, default 0
        Number of rows handed to ``DataFrame.shift`` before correlating.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * The correlations sorted from highest to lowest. The two levels of
          the lagged column index are reset into regular columns (named
          ``level_0`` -- the ``'Lag{l}'`` key -- and ``level_1`` -- the
          original column label -- when ``df.columns`` is unnamed), followed
          by a ``corr`` column.
        * The shifted frame, relabelled with the original columns of ``df``.
    """
    lag_key = f'Lag{l}'

    # A one-element concat: its only effect is to prefix each column with the lag key
    lagged_df = pd.concat([df.shift(l)], axis=1, keys=[lag_key])

    # Correlate each shifted column with the unshifted target, best first
    target = df[specific_var]
    ranked = lagged_df.corrwith(target).sort_values(ascending=False)
    df_correlations_sorted = ranked.to_frame(name='corr').reset_index()

    # Put the flat, original labels back on the shifted frame before returning it
    lagged_df.columns = df.columns
    return df_correlations_sorted, lagged_df

# NOTE(review): `col_range` is not referenced by any cell visible here -- confirm it is still needed.
col_range = range(1)


# Growth Rate dataframes
def preprocess_data(df, period):
    """Turn a DataFrame of levels into ``period``-row growth rates.

    The growth rate is ``DataFrame.pct_change(period)``. Two clean-up steps
    follow:

    1. The ``period`` rows that have no reference observation (the first rows
       for a positive ``period``, the last rows for a negative one) are
       removed.
    2. Every column that still contains a non-finite value (NaN from ``0/0``,
       or +/-inf from a division by zero) is removed.

    Columns, not rows, are dropped in step 2: removing a row because a single
    column is degenerate would delete that period for every other column and
    leave gaps in the series.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data in levels.
    period : int
        Number of rows over which the percentage change is computed.

    Returns
    -------
    pandas.DataFrame
        Growth rates containing only finite values.
    """
    # pct_change already returns a new frame, so no defensive copy is needed
    growth = df.pct_change(period)

    # Trim only the rows that cannot have a growth rate by construction
    if period > 0:
        growth = growth.iloc[period:]
    elif period < 0:
        growth = growth.iloc[:period]

    # np.isfinite is False for NaN as well as +/-inf, so one mask covers both
    finite_columns = np.isfinite(growth).all(axis=0)
    return growth.loc[:, finite_columns]


## **1. Feature Selection (FS)**

In [4]:
# Load the dataset; the first CSV column becomes the row index.
dataset_m = pd.read_csv("./Data/M_DATASET.csv", index_col=0).assign(
    # Replace 'exchange' by its reciprocal, to ensure a positive correlation
    exchange=lambda frame: 1 / frame['exchange']
)
dataset_m

Unnamed: 0,ipc_all,lag_1,lag_2,lag_3,lag_6,lag_9,lag_12,beef_lp,beef_cb,beef_sc,...,inflacion,inflacion en bolivia,inflación bolivia,ipc,la inflacion,la inflación,pib,pib bolivia,que es inflacion,que es pib
2011-01-31,74.207255,73.260267,71.989803,71.196381,69.071086,68.561311,68.467691,18.492308,20.980769,18.450000,...,58,59,100,79,44,33,50,29,100,54
2011-02-28,75.439060,74.207255,73.260267,71.989803,69.800954,68.549203,68.581371,18.720833,20.500000,18.385000,...,63,44,100,79,45,33,59,51,100,54
2011-03-31,76.108818,75.439060,74.207255,73.260267,70.335479,68.646613,68.499278,19.166667,20.518519,18.140370,...,82,78,100,54,57,33,92,77,100,54
2011-04-30,76.125495,76.108818,75.439060,74.207255,71.196381,69.071086,68.561311,19.076923,19.153846,17.835769,...,100,90,100,93,100,33,98,77,100,59
2011-05-31,76.277495,76.125495,76.108818,75.439060,71.989803,69.800954,68.549203,19.000000,18.884615,17.827692,...,75,68,100,88,73,33,78,69,89,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-30,110.425657,110.429431,110.440281,110.506839,109.439785,108.697854,108.692935,23.211200,23.000000,21.000000,...,41,39,59,54,53,44,47,37,42,38
2023-12-31,111.123491,110.425657,110.429431,110.440281,109.678594,108.614602,108.818364,23.223333,23.000000,21.000000,...,31,39,68,67,55,51,35,24,54,44
2024-01-31,111.211060,111.123491,110.425657,110.429431,110.081702,108.814654,109.176930,24.120909,23.965909,20.954545,...,29,38,64,86,43,46,30,27,53,43
2024-02-29,111.433348,111.211060,111.123491,110.425657,110.506839,109.439785,108.697854,23.987391,24.956522,21.000000,...,31,38,58,66,38,38,37,34,51,41


### **1.1. Correlation-based FS**

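Correlation-based feature selection is a filter technique: each candidate feature is scored by its correlation with the target (`ipc_all`), without fitting any model, and only the features whose score clears a threshold (here, a correlation above 0.5) are kept.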

In [5]:
# Level data for the correlation analysis: columns containing any NaN are excluded.
ldf_corr = dataset_m.dropna(axis='columns')

# Growth-rate versions of the same data, over 12 rows and over 1 row.
g12df_corr, g1df_corr = (preprocess_data(ldf_corr, period) for period in (12, 1))

In [12]:
# Lag-0 correlation of every column of the level data with 'ipc_all'
lcorr_lag0, ldf_lag0 = lagged_correlation_color(ldf_corr, 'ipc_all', l=0)
# NOTE(review): not the last expression of the cell, so this preview is evaluated but never displayed.
lcorr_lag0.head(10)

# Labels of the columns whose correlation is above 0.5 / below -0.5
corr_plus05 = np.array(lcorr_lag0.loc[lcorr_lag0['corr'] > 0.5, 'level_1'])
corr_minus05 = np.array(lcorr_lag0.loc[lcorr_lag0['corr'] < -0.5, 'level_1'])

corr_plus05, corr_minus05

  c /= stddev[:, None]
  c /= stddev[None, :]


(array(['ipc_all', 'lag_1', 'lag_2', 'lag_3', 'ufv', 'lag_6', 'lag_9',
        'lag_12', 'milk_lp', 'milk_or', 'milk_bol', 'paprika_tr',
        'milk_sc', 'milk_su', 'beef_lp', 'milk_po', 'squash_tr',
        'banana_co', 'milk_cb', 'corn_co', 'papaya_tr', 'milk2_po',
        'milk2_or', 'apple_sc', 'wheat_sc', 'beef_su', 'beef_bol',
        'rice3_co', 'onion2_po', 'sorghum_lp', 'redpepper_tr', 'flour_tj',
        'banana_bol', 'rice2_co', 'apple_or', 'rice_co', 'banana_tj',
        'beef_or', 'dinero', 'flour_po', 'beef_sc', 'grapefruit_po',
        'bean_tr', 'greenbean_tr', 'beef_cb', 'libor', 'banana_tr',
        'rice4_or', 'zinc', 'grapefruit_bol', 'banana_sc', 'pineapple_or',
        'grapefruit_cb', 'redpepper_po', 'banana_su', 'wheat_po',
        'milk2_lp', 'veglard_co', 'banana_lp', 'peas_tr', 'beef_tr',
        'grapefruit_or', 'oil_co', 'rice2_or', 'soy_po', 'grapefruit_su',
        'platano_tr', 'beef_po'], dtype=object),
 array(['platano_co', 'silver', 'oil_po', 'sugar

In [13]:
# Keep only the columns listed in `corr_plus05`, then summarise the resulting frame.
CORR_M_DATASET = dataset_m.loc[:, corr_plus05]
CORR_M_DATASET.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159 entries, 2011-01-31 to 2024-03-31
Data columns (total 68 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ipc_all         159 non-null    float64
 1   lag_1           159 non-null    float64
 2   lag_2           159 non-null    float64
 3   lag_3           159 non-null    float64
 4   ufv             159 non-null    float64
 5   lag_6           159 non-null    float64
 6   lag_9           159 non-null    float64
 7   lag_12          159 non-null    float64
 8   milk_lp         159 non-null    float64
 9   milk_or         159 non-null    float64
 10  milk_bol        159 non-null    float64
 11  paprika_tr      159 non-null    float64
 12  milk_sc         159 non-null    float64
 13  milk_su         159 non-null    float64
 14  beef_lp         159 non-null    float64
 15  milk_po         159 non-null    float64
 16  squash_tr       159 non-null    float64
 17  banana_co       159 non-

### **1.2. Principal Component FS**

### **1.3. L1-LR FS**

### **1.4. Random Forest FS**