# <font color="red">**INFLATION FORECAST: Weekly estimates**</font>

**Author:** Osmar Bolivar

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.graph_objects as go

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def lagged_correlation_color(df, specific_var, l=0):
    """Rank the columns of ``df``, shifted by ``l`` rows, by correlation with a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are shifted and then correlated with the target.
    specific_var : str
        Column of ``df`` that is used, unshifted, as the correlation target.
    l : int, default 0
        Number of rows every column is shifted by (passed to ``DataFrame.shift``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First, the ranking table with columns ``level_0`` (the ``'Lag<l>'``
        label), ``level_1`` (the original column name) and ``corr``, ordered
        from the highest to the lowest correlation. Second, the shifted frame
        carrying the flat column labels of ``df``.
    """
    lag_label = f'Lag{l}'
    # The single-key concat gives the shifted frame (lag label, column) headers;
    # these become level_0 / level_1 of the ranking table.
    shifted = pd.concat([df.shift(l)], axis=1, keys=[lag_label])
    ranking = shifted.corrwith(df[specific_var]).sort_values(ascending=False)
    ranking_table = ranking.rename('corr').reset_index()
    # Restore the flat column labels before handing the shifted data back.
    shifted.columns = df.columns
    return ranking_table, shifted

# NOTE(review): `col_range` is not referenced in the visible part of this notebook — confirm it is still needed.
col_range = range(0,1)

## **1. Monthly data**

In [3]:
# CPI history workbook; a later cell reads its 'ipc_nal' column for 2010 dates.
ipc_series = pd.read_excel(
    '/content/drive/MyDrive/Research/CEMLA 2024/IPC_monthly.xlsx',
    index_col=0,
)
# Full predictor workbook, indexed by its first column.
dataset = pd.read_excel(
    '/content/drive/MyDrive/Research/CEMLA 2024/DATASET.xlsx',
    index_col=0,
)
# Replace 'exchange' by its reciprocal (author's note: to ensure a positive correlation).
dataset['exchange'] = dataset['exchange'].rdiv(1)
# Keep only the rows where 'ipc_nal' is observed and discard the 'week_set' column.
dataset_m = dataset.dropna(subset=['ipc_nal']).drop(columns=['week_set'])
dataset_m

Unnamed: 0_level_0,Bien económico - Tema,Contabilidad - Campo de estudio,Coste - Tema,Cuenta - Tema,Deflación - Tema,Deflactor - Tema,Demanda - Economía,Desempleo - Tema,Dinero - Tema,Economía - Campo de estudio,...,libor,compra,venta,spread,ufv,exchange,ipc_nal,ipc_food,ipc_nofood,ipc_ali
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-31,0,0,0,43,0,0,21,0,38,0,...,0.455426,7.040000,6.940000,0.1,1.568637,0.143147,74.207255,70.885954,77.240962,68.870036
2011-02-28,0,0,0,43,0,0,27,62,36,0,...,0.464045,7.026429,6.926429,0.1,1.577936,0.142624,75.439060,72.663719,77.980264,70.869435
2011-03-31,0,0,100,43,0,0,44,48,45,0,...,0.460783,7.007097,6.907097,0.1,1.588942,0.143280,76.108818,73.107625,78.854304,71.367307
2011-04-30,0,0,0,48,100,0,58,87,44,0,...,0.440875,6.992667,6.892667,0.1,1.601850,0.143785,76.125495,72.727340,79.229494,70.686865
2011-05-31,0,0,0,44,0,0,55,56,48,0,...,0.414302,6.990000,6.890000,0.1,1.615897,0.143392,76.277495,72.728528,79.517765,70.585534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,0,0,25,55,0,0,37,37,80,0,...,5.871573,6.960000,6.860000,0.1,2.451340,0.144785,110.506839,113.928148,108.038858,114.236076
2023-09-30,0,0,33,55,13,0,41,44,78,0,...,5.894194,6.960000,6.860000,0.1,2.457040,0.144916,110.440281,113.646141,108.127716,113.722092
2023-10-31,26,0,15,59,0,0,38,34,77,0,...,5.886919,6.960000,6.860000,0.1,2.463165,0.144805,110.429431,113.606645,108.137529,113.582224
2023-11-30,0,0,33,51,0,0,51,40,87,0,...,5.823614,6.960000,6.860000,0.1,2.468625,0.144942,110.425657,113.596885,108.138074,113.601992


In [4]:
# Keep only the headline CPI ('ipc_nal') among the CPI series.
# The drop is non-mutating and uses errors='ignore' so this cell can be re-run:
# the previous in-place drop raised KeyError on a second execution because the
# three columns were already gone.
dataset_m = dataset_m.drop(columns=['ipc_food', 'ipc_nofood', 'ipc_ali'], errors='ignore')

# Add one column per lag of 'ipc_nal': ipc_nal_l1, _l2, _l3, _l6, _l9, _l12.
# shift(k) leaves the first k rows empty.
dataset_m = dataset_m.assign(
    **{f'ipc_nal_l{k}': dataset_m['ipc_nal'].shift(k) for k in (1, 2, 3, 6, 9, 12)}
)
dataset_m

Unnamed: 0_level_0,Bien económico - Tema,Contabilidad - Campo de estudio,Coste - Tema,Cuenta - Tema,Deflación - Tema,Deflactor - Tema,Demanda - Economía,Desempleo - Tema,Dinero - Tema,Economía - Campo de estudio,...,spread,ufv,exchange,ipc_nal,ipc_nal_l1,ipc_nal_l2,ipc_nal_l3,ipc_nal_l6,ipc_nal_l9,ipc_nal_l12
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-31,0,0,0,43,0,0,21,0,38,0,...,0.1,1.568637,0.143147,74.207255,,,,,,
2011-02-28,0,0,0,43,0,0,27,62,36,0,...,0.1,1.577936,0.142624,75.439060,74.207255,,,,,
2011-03-31,0,0,100,43,0,0,44,48,45,0,...,0.1,1.588942,0.143280,76.108818,75.439060,74.207255,,,,
2011-04-30,0,0,0,48,100,0,58,87,44,0,...,0.1,1.601850,0.143785,76.125495,76.108818,75.439060,74.207255,,,
2011-05-31,0,0,0,44,0,0,55,56,48,0,...,0.1,1.615897,0.143392,76.277495,76.125495,76.108818,75.439060,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,0,0,25,55,0,0,37,37,80,0,...,0.1,2.451340,0.144785,110.506839,110.081702,109.678594,109.439785,108.697854,108.692935,107.227296
2023-09-30,0,0,33,55,13,0,41,44,78,0,...,0.1,2.457040,0.144916,110.440281,110.506839,110.081702,109.678594,108.614602,108.818364,107.382222
2023-10-31,26,0,15,59,0,0,38,34,77,0,...,0.1,2.463165,0.144805,110.429431,110.440281,110.506839,110.081702,108.814654,109.176930,108.184143
2023-11-30,0,0,33,51,0,0,51,40,87,0,...,0.1,2.468625,0.144942,110.425657,110.429431,110.440281,110.506839,109.439785,108.697854,108.692935


In [5]:
# The lag columns start with empty rows (one per lag step). Fill them with the
# 'ipc_nal' values that `ipc_series` holds for 2010.
# Mapping: lag -> (last dataset_m row to fill, first ipc_series row to copy).
# Every fill starts at '2011-01-31' and every source window ends at '2010-12-01'.
backfill_windows = {
    1: ('2011-01-31', '2010-12-01'),
    2: ('2011-02-28', '2010-11-01'),
    3: ('2011-03-31', '2010-10-01'),
    6: ('2011-06-30', '2010-07-01'),
    9: ('2011-09-30', '2010-04-01'),
    12: ('2011-12-31', '2010-01-01'),
}
for lag, (last_row, first_source) in backfill_windows.items():
    dataset_m.loc['2011-01-31':last_row, f'ipc_nal_l{lag}'] = (
        ipc_series.loc[first_source:'2010-12-01', 'ipc_nal'].values
    )
# Show the target and its lags for the first rows to check the fill.
dataset_m.loc[:, 'ipc_nal':].head(15)

Unnamed: 0_level_0,ipc_nal,ipc_nal_l1,ipc_nal_l2,ipc_nal_l3,ipc_nal_l6,ipc_nal_l9,ipc_nal_l12
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-01-31,74.207255,73.260267,71.989803,71.196381,69.071086,68.561311,68.467691
2011-02-28,75.43906,74.207255,73.260267,71.989803,69.800954,68.549203,68.581371
2011-03-31,76.108818,75.43906,74.207255,73.260267,70.335479,68.646613,68.499278
2011-04-30,76.125495,76.108818,75.43906,74.207255,71.196381,69.071086,68.561311
2011-05-31,76.277495,76.125495,76.108818,75.43906,71.989803,69.800954,68.549203
2011-06-30,76.387019,76.277495,76.125495,76.108818,73.260267,70.335479,68.646613
2011-07-31,76.793365,76.387019,76.277495,76.125495,74.207255,71.196381,69.071086
2011-08-31,77.0846,76.793365,76.387019,76.277495,75.43906,71.989803,69.800954
2011-09-30,77.319221,77.0846,76.793365,76.387019,76.108818,73.260267,70.335479
2011-10-31,77.685075,77.319221,77.0846,76.793365,76.125495,74.207255,71.196381


In [6]:
def _finite_growth(levels, periods, first_row):
    """Return the ``periods``-row percentage change of ``levels`` from ``first_row`` on.

    Columns containing any non-finite value (NaN or +/-inf) in the retained rows
    are discarded.
    """
    growth = levels.copy().pct_change(periods).loc[first_row:]
    finite_columns = np.isfinite(growth).all(axis=0)
    return growth.loc[:, finite_columns].dropna(axis=1)


# Levels: every column of dataset_m that has no missing value.
ldf_corr = dataset_m.copy().dropna(axis=1)

# 12-row growth rates, kept from 2012-01-31 onwards.
g12df_corr = _finite_growth(ldf_corr, 12, '2012-01-31')

# 1-row growth rates, kept from 2011-02-28 onwards.
g1df_corr = _finite_growth(ldf_corr, 1, '2011-02-28')

In [7]:
# Summary (index range, column count, dtypes) of the level frame used for the correlation screening.
ldf_corr.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 156 entries, 2011-01-31 to 2023-12-31
Columns: 656 entries, Bien económico - Tema to ipc_nal_l12
dtypes: float64(579), int64(77)
memory usage: 804.8 KB


In [8]:
# Summary of the 12-row growth frame; fewer columns than the level frame are expected
# because columns with non-finite growth rates were removed.
g12df_corr.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 144 entries, 2012-01-31 to 2023-12-31
Columns: 600 entries, Cuenta - Tema to ipc_nal_l12
dtypes: float64(600)
memory usage: 676.1 KB


In [9]:
# Summary of the 1-row growth frame (columns with non-finite growth rates were removed).
g1df_corr.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 155 entries, 2011-02-28 to 2023-12-31
Columns: 600 entries, Cuenta - Tema to ipc_nal_l12
dtypes: float64(600)
memory usage: 727.8 KB


In [10]:
# Correlation rankings against 'ipc_nal' for shifts of 0 to 3 rows, on three
# versions of the data: levels (l*), 12-row growth (g12*) and 1-row growth (g1*).
# Each call returns (ranking table, shifted frame).
(
    (lcorr_lag0, ldf_lag0),
    (lcorr_lag1, ldf_lag1),
    (lcorr_lag2, ldf_lag2),
    (lcorr_lag3, ldf_lag3),
) = [lagged_correlation_color(ldf_corr, 'ipc_nal', lag) for lag in range(4)]

(
    (g12corr_lag0, g12df_lag0),
    (g12corr_lag1, g12df_lag1),
    (g12corr_lag2, g12df_lag2),
    (g12corr_lag3, g12df_lag3),
) = [lagged_correlation_color(g12df_corr, 'ipc_nal', lag) for lag in range(4)]

(
    (g1corr_lag0, g1df_lag0),
    (g1corr_lag1, g1df_lag1),
    (g1corr_lag2, g1df_lag2),
    (g1corr_lag3, g1df_lag3),
) = [lagged_correlation_color(g1df_corr, 'ipc_nal', lag) for lag in range(4)]

In [11]:
# Level series (no shift) whose correlation with 'ipc_nal' exceeds 0.5, colour-graded.
# NOTE(review): in Styler.background_gradient, `low`/`high` extend the colour range by a
# fraction of the data range; they are not value bounds. If 0.5 and 1 were meant as the
# ends of the colour scale, `vmin=0.5, vmax=1` are the matching arguments — confirm intent.
lcorr_lag0.query('corr > 0.5').style.background_gradient(cmap='coolwarm', low=0.5, high=1)

Unnamed: 0,level_0,level_1,corr
0,Lag0,ipc_nal,1.0
1,Lag0,ipc_nal_l1,0.999394
2,Lag0,ipc_nal_l2,0.99863
3,Lag0,ipc_nal_l3,0.997943
4,Lag0,ufv,0.997515
5,Lag0,ipc_nal_l6,0.996944
6,Lag0,ipc_nal_l9,0.995991
7,Lag0,ipc_nal_l12,0.995039
8,Lag0,milk_lp,0.916145
9,Lag0,milk_dlp,0.914967


In [12]:
# Level series shifted by 1 row whose correlation with 'ipc_nal' exceeds 0.5.
# NOTE(review): `low`/`high` are range-extension fractions, not value bounds (`vmin`/`vmax`) — confirm intent.
lcorr_lag1.query('corr > 0.5').style.background_gradient(cmap='coolwarm', low=0.5, high=1)

Unnamed: 0,level_0,level_1,corr
0,Lag1,ipc_nal,0.999383
1,Lag1,ipc_nal_l1,0.998638
2,Lag1,ipc_nal_l2,0.997952
3,Lag1,ipc_nal_l3,0.997502
4,Lag1,ufv,0.997123
5,Lag1,ipc_nal_l6,0.996595
6,Lag1,ipc_nal_l9,0.995564
7,Lag1,ipc_nal_l12,0.994617
8,Lag1,milk_lp,0.915631
9,Lag1,milk_dlp,0.914571


In [13]:
# Level series shifted by 2 rows whose correlation with 'ipc_nal' exceeds 0.5.
# NOTE(review): `low`/`high` are range-extension fractions, not value bounds (`vmin`/`vmax`) — confirm intent.
lcorr_lag2.query('corr > 0.5').style.background_gradient(cmap='coolwarm', low=0.5, high=1)

Unnamed: 0,level_0,level_1,corr
0,Lag2,ipc_nal,0.998653
1,Lag2,ipc_nal_l1,0.99803
2,Lag2,ipc_nal_l2,0.997604
3,Lag2,ipc_nal_l3,0.997384
4,Lag2,ufv,0.996836
5,Lag2,ipc_nal_l6,0.996273
6,Lag2,ipc_nal_l9,0.995152
7,Lag2,ipc_nal_l12,0.994416
8,Lag2,milk_lp,0.914819
9,Lag2,milk_dlp,0.913909


In [14]:
# Level series shifted by 3 rows whose correlation with 'ipc_nal' exceeds 0.5.
# NOTE(review): `low`/`high` are range-extension fractions, not value bounds (`vmin`/`vmax`) — confirm intent.
lcorr_lag3.query('corr > 0.5').style.background_gradient(cmap='coolwarm', low=0.5, high=1)

Unnamed: 0,level_0,level_1,corr
0,Lag3,ipc_nal,0.998055
1,Lag3,ipc_nal_l1,0.997716
2,Lag3,ipc_nal_l2,0.997534
3,Lag3,ipc_nal_l3,0.997303
4,Lag3,ufv,0.996588
5,Lag3,ipc_nal_l6,0.996089
6,Lag3,ipc_nal_l9,0.994695
7,Lag3,ipc_nal_l12,0.994286
8,Lag3,milk_lp,0.914368
9,Lag3,milk_dlp,0.913621


In [15]:
# 12-row growth rates (no shift) whose correlation with 'ipc_nal' growth exceeds 0.5.
# NOTE(review): `low`/`high` are range-extension fractions, not value bounds (`vmin`/`vmax`) — confirm intent.
g12corr_lag0.query('corr > 0.5').style.background_gradient(cmap='coolwarm', low=0.5, high=1)

Unnamed: 0,level_0,level_1,corr
0,Lag0,ipc_nal,1.0
1,Lag0,ipc_nal_l1,0.952142
2,Lag0,ipc_nal_l2,0.878593
3,Lag0,ipc_nal_l3,0.813581
4,Lag0,ufv,0.768689
5,Lag0,ipc_nal_l6,0.763182
6,Lag0,ipc_nal_l9,0.694394
7,Lag0,quinoa_sc,0.635107
8,Lag0,ipc_nal_l12,0.58128
9,Lag0,milk2_tr,0.567554


In [16]:
# 1-row growth rates (no shift) whose correlation with 'ipc_nal' growth exceeds 0.3
# (a lower cut-off than the 0.5 used for the other tables).
# NOTE(review): `low`/`high` are range-extension fractions, not value bounds; with the 0.3
# cut-off used here, `low=0.5` cannot be acting as a lower value bound — if bounds were
# intended, use `vmin`/`vmax`. Confirm intent.
g1corr_lag0.query('corr > 0.3').style.background_gradient(cmap='coolwarm', low=0.5, high=1)

Unnamed: 0,level_0,level_1,corr
0,Lag0,ipc_nal,1.0
1,Lag0,tomato_lp,0.482031
2,Lag0,tomato_dlp,0.477674
3,Lag0,tomato_bol,0.451963
4,Lag0,tomato_su,0.442053
5,Lag0,tomato_cb,0.439112
6,Lag0,papa2_or,0.420551
7,Lag0,tomato_po,0.414654
8,Lag0,chicken_dlp,0.396956
9,Lag0,chicken_lp,0.39458


In [17]:
# Working copy of the level frame; the plotting and modelling cells below read `df`.
df = ldf_corr.copy()

In [18]:
# Line chart of the CPI level ('ipc_nal'), rounded to one decimal for the hover labels.
x = df.index
y1 = df['ipc_nal'].round(1)

fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=x, y=y1, mode='lines', line=dict(color='#e8702a', width=2, shape='spline')))
fig1.update_layout(
    font_family='Arial', font_color="#000000", plot_bgcolor='white', separators=',',
    title_text='CPI',
    title_font=dict(color='#000000', size=30),
    # Axis title sizes are set through `title.font`. The previous `titlefont_size`
    # shorthand relies on the deprecated `titlefont` attribute, which newer Plotly
    # releases no longer accept.
    xaxis=dict(title=dict(text='Month', font=dict(size=20)), tickfont_size=15),
    yaxis=dict(title=dict(text='Index units', font=dict(size=20)), tickfont_size=18,
               linecolor="#000000", ticks='outside', nticks=10,
               zeroline=True, zerolinewidth=1.5, zerolinecolor='black'),
    hoverlabel=dict(font_size=20, font_family="Arial"),
    hovermode="x",
)
fig1.show()

In [19]:
# One horizontal box per calendar month, built from the CPI values observed in that month.
x = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio', 'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre']
fig = go.Figure()
for month_number, month_label in enumerate(x, start=1):
    in_month = df.index.month == month_number
    fig.add_trace(go.Box(x=df.loc[in_month, 'ipc_nal'].round(1), name=month_label))
fig.update_layout(
    font_family='Arial', font_color="#000000", plot_bgcolor='white', separators=',',
    title='CPI: Boxplot by month', xaxis_title='Index units', yaxis_title='', showlegend=False,
)
fig.show()

In [20]:
# Names of the level series whose unshifted correlation with 'ipc_nal' exceeds 0.5,
# in descending order of correlation.
np.array(lcorr_lag0.query('corr > 0.5')['level_1'])

array(['ipc_nal', 'ipc_nal_l1', 'ipc_nal_l2', 'ipc_nal_l3', 'ufv',
       'ipc_nal_l6', 'ipc_nal_l9', 'ipc_nal_l12', 'milk_lp', 'milk_dlp',
       'milk_or', 'milk_bol', 'paprika_tr', 'milk_sc', 'rice3_ea_y',
       'rice3_ea_x', 'beef_dlp', 'beef_lp', 'milk_po', 'milk_su',
       'squash_tr', 'milk_cb', 'banana_co', 'banana_ea', 'pineapple_ea',
       'Precio - Tema', 'beef_bol', 'corn_co', 'milk2_or',
       'Interés - Tema', 'milk2_po', 'ycorn_ea', 'papaya_tr', 'milk_ea',
       'wheat_sc', 'apple_sc', 'dinero', 'Dinero - Tema', 'beef_su',
       'sorghum_lp', 'onion2_po', 'rice_ea', 'banana_bol', 'redpepper_tr',
       'Política - Tema', 'banana_tj', 'flour_tj', 'platano_ea',
       'milk2_dlp', 'la inflación', 'sorghum_bol', 'Inflación', 'beef_or',
       'flour_po', 'sorghum_dlp', 'exchange', 'grapefruit_po', 'beef_sc',
       'bean_tr', 'corn_ea', 'rice2_co', 'zinc', 'Inflación - Tema',
       'apple_or', 'flour2_ea', 'banana_tr', 'beef_cb', 'milk2_lp',
       'greenbean_tr', 'l

In [21]:
# Hand-maintained list of modelling columns: the target 'ipc_nal' first, then its lags,
# then the selected predictors.
# NOTE(review): the list is hard-coded rather than derived from `lcorr_lag0`, so it will
# not follow changes in the data. Also, 'ipc_nal_l9' is created and back-filled earlier
# in the notebook but is absent here — confirm the omission is intentional.
predictors1 = ['ipc_nal', 'ipc_nal_l1', 'ipc_nal_l2', 'ipc_nal_l3', 'ipc_nal_l6', 'ipc_nal_l12',
               'ufv', 'milk_lp', 'milk_dlp', 'milk_or', 'milk_bol', 'paprika_tr', 'milk_sc', 'rice3_ea_y', 'rice3_ea_x', 'beef_dlp',
               'beef_lp', 'milk_po', 'milk_su', 'squash_tr', 'milk_cb', 'banana_co', 'banana_ea', 'pineapple_ea', 'Precio - Tema',
               'beef_bol', 'corn_co', 'milk2_or', 'Interés - Tema', 'milk2_po', 'ycorn_ea', 'papaya_tr', 'milk_ea', 'wheat_sc', 'apple_sc',
               'dinero', 'Dinero - Tema', 'beef_su', 'sorghum_lp', 'onion2_po', 'rice_ea', 'banana_bol', 'redpepper_tr', 'Política - Tema',
               'banana_tj', 'flour_tj', 'platano_ea', 'milk2_dlp', 'la inflación', 'sorghum_bol', 'Inflación', 'beef_or', 'flour_po',
               'sorghum_dlp', 'exchange', 'grapefruit_po', 'beef_sc', 'bean_tr', 'corn_ea', 'rice2_co', 'zinc', 'Inflación - Tema', 'apple_or', 'flour2_ea',
               'banana_tr', 'beef_cb', 'milk2_lp', 'greenbean_tr', 'libor', 'banana_sc', 'peas_tr', 'apple_bol', 'grapefruit_cb',
               'pineapple_or', 'que es inflación', 'wheat_bol', 'redpepper_po', 'peas_ea', 'papa1_ea', 'banana_su', 'lemon_cb', 'rice2_or',
               'banana_lp', 'quinoa_ea', 'Salario - Tema', 'oil_co', 'beef_tr']


# Restrict the working frame to the target plus the selected predictors (column order follows the list).
df = df[predictors1]

## **2. Train and Val sets**

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [23]:
# 80/20 split with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default, so validation dates are
# interleaved with training dates instead of being held out chronologically. With lagged
# CPI values among the predictors this can flatter the validation metrics — confirm the
# random split is intended (pass shuffle=False for a chronological hold-out).
train, validation = train_test_split(df, test_size=0.2, random_state=42)
for label, subset in (('train', train), ('validation', validation)):
    print(f'Obs in {label} set: {subset.shape[0]}; variables in {label} set: {subset.shape[1]}')

Obs in train set: 124; variables in train set: 87
Obs in validation set: 32; variables in validation set: 87


In [24]:
# Standardise every column using statistics learned from the training rows only.
# The target 'ipc_nal' is part of the frame, so it is standardised as well and all
# metrics computed on y_train / y_validation are in standardised units.
scaler = StandardScaler().fit(train)
train_scaled = pd.DataFrame(scaler.transform(train), index=train.index, columns=train.columns)
validation_scaled = pd.DataFrame(scaler.transform(validation), index=validation.index, columns=validation.columns)

# Split each scaled frame into predictors (X) and target (y).
y_train = train_scaled['ipc_nal']
X_train = train_scaled.drop(columns='ipc_nal')

y_validation = validation_scaled['ipc_nal']
X_validation = validation_scaled.drop(columns='ipc_nal')

## **3. Algorithms**

In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Five-split time-series cross-validator.
# NOTE(review): `tscv5` is not referenced in the visible cells; the grid searches pass
# `cv=5` instead — confirm which splitter is intended.
tscv5 = TimeSeriesSplit(n_splits=5)

### **3.1. Ridge**
Without tuning:
Validation MSE:  0.001453348422371209
Validation R2:  0.9984692589865253
Validation MAE:  0.030880852442606015

In [33]:
# Ridge regression with fixed hyper-parameters (author's note: cv=5; presumably taken
# from an earlier grid-search run — confirm against the tuning cell).
# The untuned baseline reported in the section header corresponds to Ridge().
ridge = Ridge(alpha=0.11326825671361537, fit_intercept=False, positive=True, random_state=0)
ridge.fit(X_train, y_train)

# In-sample and validation predictions from this fitted model.
ridge_train_pred = ridge.predict(X_train)
ridge_val_pred = ridge.predict(X_validation)

# Forecast metrics on the training set. mean_squared_error returns the (non-root) MSE by
# default; the `squared` keyword is deprecated since scikit-learn 1.4, so it is not passed.
mse_train_ridge = mean_squared_error(y_train, ridge_train_pred)
r2_train_ridge = r2_score(y_train, ridge_train_pred)
mae_train_ridge = mean_absolute_error(y_train, ridge_train_pred)
print("Train MSE: ", mse_train_ridge)
print("Train R2: ", r2_train_ridge)
print("Train MAE: ", mae_train_ridge)

# Forecast metrics on the validation set.
mse_val_ridge = mean_squared_error(y_validation, ridge_val_pred)
r2_val_ridge = r2_score(y_validation, ridge_val_pred)
mae_val_ridge = mean_absolute_error(y_validation, ridge_val_pred)
print("Validation MSE: ", mse_val_ridge)
print("Validation R2: ", r2_val_ridge)
print("Validation MAE: ", mae_val_ridge)

Train MSE:  0.0009685506142711325
Train R2:  0.9990314493857289
Train MAE:  0.02399079998055779
Validation MSE:  0.0008789962502787182
Validation R2:  0.9990741961182323
Validation MAE:  0.02468902048744895


In [27]:
# Search space: 500 log-spaced alphas in [0.1, 100], with and without the positivity
# constraint and the intercept.
alphas_ridge = np.logspace(-1, 2, num=500)
param_grid = {'alpha': alphas_ridge,
              'positive': [True, False],
              'fit_intercept': [True, False]}

# A fresh Ridge() is passed inline instead of being assigned to `ridge`: rebinding
# `ridge` to an unfitted estimator here made the later `ridge.coef_` cell fail when the
# notebook is run top to bottom (GridSearchCV fits clones, not the object it is given).
grid_search_ridge = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search_ridge.fit(X_train, y_train)

# The scorer is the negated MSE, so the sign is flipped to report a plain MSE.
print("Best parameter: ", grid_search_ridge.best_params_)
print("Best score: ", -grid_search_ridge.best_score_)

# Validation predictions from the best model, refitted on the whole training set.
best_ridge = grid_search_ridge.best_estimator_
y_val_pred = best_ridge.predict(X_validation)

# Validation metrics (MSE, not RMSE). `squared=True` is the default and the keyword is
# deprecated since scikit-learn 1.4, so it is not passed.
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

Best parameter:  {'alpha': 0.11017528137883871, 'fit_intercept': False, 'positive': True}
Best score:  0.0025758252589013564
Validation MSE:  0.0008932617208701132
Validation R2:  0.9990591709937855
Validation MAE:  0.024832503304094394


In [None]:
# Rank the predictors by their Ridge coefficient, largest first.
# Expects `ridge` to be a fitted estimator (it must expose `coef_`).
coef = ridge.coef_
feature_importance_ridge = (
    pd.DataFrame({'feat_ridge': X_train.columns, 'imp_ridge': coef})
    .sort_values('imp_ridge', ascending=False)
    .reset_index(drop=True)
)
feature_importance_ridge.head(15)

Unnamed: 0,feat_ridge,imp_ridge
0,ipc_nal_l1,0.171552
1,ipc_nal_l2,0.140606
2,ufv,0.119632
3,ipc_nal_l3,0.117031
4,ipc_nal_l6,0.115594
5,ipc_nal_l12,0.107948
6,rice2_or,0.030317
7,Precio - Tema,0.025489
8,greenbean_tr,0.020595
9,milk_po,0.018265


### **3.2. Lasso**
Without tuning:
Validation MSE:  0.9523622169433499
Validation R2:  -0.0030766763969758415
Validation MAE:  0.8455503308136401

In [None]:
# Lasso regression with fixed hyper-parameters (author's note: cv=5; presumably taken
# from an earlier grid-search run — confirm against the tuning cell).
# The untuned baseline reported in the section header corresponds to Lasso().
lasso = Lasso(alpha=0.1, fit_intercept=False, positive=True)
lasso.fit(X_train, y_train)

# In-sample and validation predictions from this fitted model.
lasso_train_pred = lasso.predict(X_train)
lasso_val_pred = lasso.predict(X_validation)

# Forecast metrics on the training set. mean_squared_error returns the (non-root) MSE by
# default; the `squared` keyword is deprecated since scikit-learn 1.4, so it is not passed.
mse_train_lasso = mean_squared_error(y_train, lasso_train_pred)
r2_train_lasso = r2_score(y_train, lasso_train_pred)
mae_train_lasso = mean_absolute_error(y_train, lasso_train_pred)
print("Train MSE: ", mse_train_lasso)
print("Train R2: ", r2_train_lasso)
print("Train MAE: ", mae_train_lasso)

# Forecast metrics on the validation set.
mse_val_lasso = mean_squared_error(y_validation, lasso_val_pred)
r2_val_lasso = r2_score(y_validation, lasso_val_pred)
mae_val_lasso = mean_absolute_error(y_validation, lasso_val_pred)
print("Validation MSE: ", mse_val_lasso)
print("Validation R2: ", r2_val_lasso)
print("Validation MAE: ", mae_val_lasso)

Train MSE:  0.01135096248401723
Train R2:  0.9886490375159828
Train MAE:  0.0886406245973438
Validation MSE:  0.009937567975745763
Validation R2:  0.9895332443063803
Validation MAE:  0.08505641169339459


In [None]:
# Search space: 500 log-spaced alphas in [0.1, 100], with and without the positivity
# constraint and the intercept.
# NOTE(review): in the recorded run the selected alpha was 0.1, i.e. the lower edge of
# this grid — consider extending the range downwards.
alphas_lasso = np.logspace(-1, 2, num=500)
param_grid = {'alpha': alphas_lasso,
              'positive': [True, False],
              'fit_intercept': [True, False]}

# A fresh Lasso() is passed inline instead of being assigned to `lasso`: rebinding
# `lasso` to an unfitted estimator here made the later `lasso.coef_` cell fail when the
# notebook is run top to bottom (GridSearchCV fits clones, not the object it is given).
grid_search_lasso = GridSearchCV(Lasso(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search_lasso.fit(X_train, y_train)

# The scorer is the negated MSE, so the sign is flipped to report a plain MSE.
print("Best parameter: ", grid_search_lasso.best_params_)
print("Best score: ", -grid_search_lasso.best_score_)

# Validation predictions from the best model, refitted on the whole training set.
best_lasso = grid_search_lasso.best_estimator_
y_val_pred = best_lasso.predict(X_validation)

# Validation metrics (MSE, not RMSE). `squared=True` is the default and the keyword is
# deprecated since scikit-learn 1.4, so it is not passed.
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

Best parameter:  {'alpha': 0.1, 'fit_intercept': False, 'positive': True}
Best score:  0.012353938460066747
Validation MSE:  0.009937567975745763
Validation R2:  0.9895332443063803
Validation MAE:  0.08505641169339459


In [None]:
# Rank the predictors by their Lasso coefficient, largest first.
# Expects `lasso` to be a fitted estimator (it must expose `coef_`).
coef = lasso.coef_
feature_importance_lasso = (
    pd.DataFrame({'feat_lasso': X_train.columns, 'imp_lasso': coef})
    .sort_values('imp_lasso', ascending=False)
    .reset_index(drop=True)
)
feature_importance_lasso.head(15)

Unnamed: 0,feat_lasso,imp_lasso
0,ipc_nal_l1,0.899295
1,ipc_nal_l6,5.3e-05
2,beef_sc,0.0
3,banana_tr,0.0
4,flour2_ea,0.0
5,apple_or,0.0
6,Inflación - Tema,0.0
7,zinc,0.0
8,rice2_co,0.0
9,corn_ea,0.0


### **3.3. ElasticNet**
Without tuning:
Validation MSE:  0.2878895501474107
Validation R2:  0.6967799772043428
Validation MAE:  0.4602160948451647

In [None]:
# ElasticNet with fixed hyper-parameters (author's note: cv=5; presumably taken from an
# earlier grid-search run — confirm against the tuning cell).
# The untuned baseline reported in the section header corresponds to ElasticNet().
enet = ElasticNet(alpha=0.11326825671361537, l1_ratio=0.1, fit_intercept=False, positive=True)
enet.fit(X_train, y_train)

# In-sample and validation predictions from this fitted model.
enet_train_pred = enet.predict(X_train)
enet_val_pred = enet.predict(X_validation)

# Forecast metrics on the training set. mean_squared_error returns the (non-root) MSE by
# default; the `squared` keyword is deprecated since scikit-learn 1.4, so it is not passed.
mse_train_enet = mean_squared_error(y_train, enet_train_pred)
r2_train_enet = r2_score(y_train, enet_train_pred)
mae_train_enet = mean_absolute_error(y_train, enet_train_pred)
print("Train MSE: ", mse_train_enet)
print("Train R2: ", r2_train_enet)
print("Train MAE: ", mae_train_enet)

# Forecast metrics on the validation set.
mse_val_enet = mean_squared_error(y_validation, enet_val_pred)
r2_val_enet = r2_score(y_validation, enet_val_pred)
mae_val_enet = mean_absolute_error(y_validation, enet_val_pred)
print("Validation MSE: ", mse_val_enet)
print("Validation R2: ", r2_val_enet)
print("Validation MAE: ", mae_val_enet)

Train MSE:  0.0023597541727734526
Train R2:  0.9976402458272265
Train MAE:  0.03813004212820444
Validation MSE:  0.002107268648489714
Validation R2:  0.9977805167040469
Validation MAE:  0.03561740379378739


In [None]:
# Search space: 500 log-spaced alphas in [0.1, 100] crossed with l1_ratio in steps of
# 0.01 from 0.01 up to (but excluding) 0.95, with and without the positivity constraint
# and the intercept.
# NOTE(review): this is a very large grid (500 alphas x the l1_ratio steps x 2 x 2, each
# candidate fitted 5 times) and no output is recorded for this cell — consider a coarser
# grid or a randomised search.
alphas_enet = np.logspace(-1, 2, num=500)
param_grid = {'alpha': alphas_enet,
              'l1_ratio': np.arange(0.01, 0.95, 0.01),
              'positive': [True, False],
              'fit_intercept': [True, False]}

# A fresh ElasticNet() is passed inline instead of being assigned to `enet`: rebinding
# `enet` to an unfitted estimator here made the later `enet.coef_` cell fail when the
# notebook is run top to bottom (GridSearchCV fits clones, not the object it is given).
grid_search_enet = GridSearchCV(ElasticNet(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search_enet.fit(X_train, y_train)

# The scorer is the negated MSE, so the sign is flipped to report a plain MSE.
print("Best parameter: ", grid_search_enet.best_params_)
print("Best score: ", -grid_search_enet.best_score_)

# Validation predictions from the best model, refitted on the whole training set.
best_enet = grid_search_enet.best_estimator_
y_val_pred = best_enet.predict(X_validation)

# Validation metrics (MSE, not RMSE). `squared=True` is the default and the keyword is
# deprecated since scikit-learn 1.4, so it is not passed.
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

In [None]:
# Rank the predictors by their ElasticNet coefficient, largest first.
# Expects `enet` to be a fitted estimator (it must expose `coef_`).
coef = enet.coef_
feature_importance_enet = (
    pd.DataFrame({'feat_enet': X_train.columns, 'imp_enet': coef})
    .sort_values('imp_enet', ascending=False)
    .reset_index(drop=True)
)
feature_importance_enet.head(15)

Unnamed: 0,feat_enet,imp_enet
0,ipc_nal_l1,0.086441
1,ipc_nal_l2,0.08388
2,ipc_nal_l3,0.081695
3,ufv,0.080271
4,ipc_nal_l6,0.079604
5,ipc_nal_l12,0.07685
6,paprika_tr,0.036142
7,Precio - Tema,0.033811
8,rice2_or,0.032323
9,beef_dlp,0.024307


### **3.4. ADA**
Without tuning:
Validation MSE:  0.0036814337026600556
Validation R2:  0.9961225254245258
Validation MAE:  0.0469554086526466

In [None]:
# AdaBoost over depth-5 regression trees with fixed hyper-parameters; they match the
# best parameters recorded in the grid-search output below.
# The untuned baseline reported in the section header corresponds to AdaBoostRegressor().
ada = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=5), n_estimators=115, learning_rate=1.235482888256747, random_state=0)
ada.fit(X_train, y_train)

# In-sample and validation predictions from this fitted model.
ada_train_pred = ada.predict(X_train)
ada_val_pred = ada.predict(X_validation)

# Forecast metrics on the training set. mean_squared_error returns the (non-root) MSE by
# default; the `squared` keyword is deprecated since scikit-learn 1.4, so it is not passed.
mse_train_ada = mean_squared_error(y_train, ada_train_pred)
r2_train_ada = r2_score(y_train, ada_train_pred)
mae_train_ada = mean_absolute_error(y_train, ada_train_pred)
print("Train MSE: ", mse_train_ada)
print("Train R2: ", r2_train_ada)
print("Train MAE: ", mae_train_ada)

# Forecast metrics on the validation set.
mse_val_ada = mean_squared_error(y_validation, ada_val_pred)
r2_val_ada = r2_score(y_validation, ada_val_pred)
mae_val_ada = mean_absolute_error(y_validation, ada_val_pred)
print("Validation MSE: ", mse_val_ada)
print("Validation R2: ", r2_val_ada)
print("Validation MAE: ", mae_val_ada)

Train MSE:  0.0001518067359025188
Train R2:  0.9998481932640975
Train MAE:  0.008301392934766337
Validation MSE:  0.0014618738110729937
Validation R2:  0.998460279610389
Validation MAE:  0.02910552415848012


In [None]:
# Search space: number of boosting rounds, 50 log-spaced learning rates in
# [10**-2, 10**0.5], and base trees of depth 3 to 6. random_state is pinned through the
# grid so every candidate (and the refit) is reproducible.
param_grid_ada = {
    'n_estimators': range(50, 200, 5),
    'learning_rate': np.logspace(-2, 0.5, 50),
    'random_state': [0],
    'estimator': [DecisionTreeRegressor(max_depth=3),
                  DecisionTreeRegressor(max_depth=4),
                  DecisionTreeRegressor(max_depth=5),
                  DecisionTreeRegressor(max_depth=6),
                  ]
}

# 5-fold cross-validated search (cv=5, not a TimeSeriesSplit), scored on negated MSE.
# A fresh AdaBoostRegressor() is passed inline instead of being assigned to `ada`:
# rebinding `ada` to an unfitted estimator here made the later `ada.feature_importances_`
# cell fail when the notebook is run top to bottom (GridSearchCV fits clones).
grid_search_ada = GridSearchCV(estimator=AdaBoostRegressor(), param_grid=param_grid_ada, cv=5, scoring='neg_mean_squared_error')
grid_search_ada.fit(X_train, y_train)

# GridSearchCV already refits the best configuration on the whole training set, so the
# refitted model is reused instead of building and fitting a second identical one.
ada_best = grid_search_ada.best_estimator_

# Validation predictions and metrics. `squared=True` is the default of
# mean_squared_error and the keyword is deprecated since scikit-learn 1.4.
y_val_pred = ada_best.predict(X_validation)
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# The scorer is the negated MSE, so the sign is flipped to report a plain MSE.
print("Best parameters found: ", grid_search_ada.best_params_)
print("Lowest MSE found: ", -grid_search_ada.best_score_)

Validation MSE:  0.0014618738110729937
Validation R2:  0.998460279610389
Validation MAE:  0.02910552415848012
Best parameters found:  {'estimator': DecisionTreeRegressor(max_depth=5), 'learning_rate': 1.235482888256747, 'n_estimators': 115, 'random_state': 0}
Lowest MSE found:  -0.9981856980134719


In [None]:
# Rank the training features by the importance reported by the AdaBoost model `ada`:
# build the table, order it from most to least important, renumber the rows from 0
# and give the columns their short final names.
feature_importance_ada = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': ada.feature_importances_})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'Feature': 'feat', 'Importance': 'imp_ada'})
)
# Display the 15 highest-ranked features
feature_importance_ada.head(15)

Unnamed: 0,feat,imp_ada
0,ipc_nal_l3,0.09055
1,ufv,0.084265
2,ipc_nal_l1,0.071661
3,ipc_nal_l2,0.066128
4,ipc_nal_l6,0.06267
5,flour_tj,0.062154
6,ipc_nal_l12,0.045471
7,milk2_or,0.036227
8,milk2_lp,0.036167
9,milk2_po,0.033045


### **3.5. GBR (Gradient Boosting Regressor)**
Without tuning:
Validation MSE:  0.0014618738110729937
Validation R2:  0.998460279610389
Validation MAE:  0.02910552415848012

In [None]:
# Gradient Boosting model with fixed hyperparameters (learning_rate and n_estimators
# match the best values printed by the grid-search cell in this section).
#gbr = GradientBoostingRegressor(random_state=0)
gbr = GradientBoostingRegressor(learning_rate=0.040949150623804255, n_estimators=290, random_state=0)
# Fit the model on the training set
gbr.fit(X_train, y_train)
# Predict on the training and validation sets with this fitted model
gbr_train_pred = gbr.predict(X_train)
gbr_val_pred = gbr.predict(X_validation)
# Calculate Forecast metrics on train set.
# mean_squared_error returns the MSE by default; the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is not passed.
mse_train_gbr = mean_squared_error(y_train, gbr_train_pred)
r2_train_gbr = r2_score(y_train, gbr_train_pred)
mae_train_gbr = mean_absolute_error(y_train, gbr_train_pred)
print("Train MSE: ", mse_train_gbr)
print("Train R2: ", r2_train_gbr)
print("Train MAE: ", mae_train_gbr)
# Calculate Forecast metrics on validation set
mse_val_gbr = mean_squared_error(y_validation, gbr_val_pred)
r2_val_gbr = r2_score(y_validation, gbr_val_pred)
mae_val_gbr = mean_absolute_error(y_validation, gbr_val_pred)
print("Validation MSE: ", mse_val_gbr)
print("Validation R2: ", r2_val_gbr)
print("Validation MAE: ", mae_val_gbr)

Train MSE:  2.4216058773004873e-06
Train R2:  0.9999975783941227
Train MAE:  0.0011680068652647226
Validation MSE:  0.0009299617979484944
Validation R2:  0.9990205165924618
Validation MAE:  0.02345452785681064


In [None]:
# Base (unfitted) Gradient Boosting estimator; GridSearchCV clones it per candidate.
# It is bound to `gbr_reg` (same convention as `rf_reg` / `et_reg` in the RF and ET
# sections) rather than `gbr`: GridSearchCV never fits the object it is given, so
# rebinding `gbr` here would replace the fitted model from the previous cell with an
# unfitted one and break the `gbr.feature_importances_` lookup in the next cell.
gbr_reg = GradientBoostingRegressor()

# Define the hyperparameters to be tuned
# NOTE(review): the learning-rate grid reaches 10**0.5 (about 3.16); rates above 1 are
# the likely source of the "overflow encountered in square" warning - confirm.
params = {
    'learning_rate': np.logspace(-2,0.5, 50),
    #'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],   ## 'loss': 'squared_error'
    'n_estimators': range(100, 300, 10),
    'max_depth': range(3,6,1),
    #'min_samples_split': range(2,20,1),
    #'min_samples_leaf': range(1,50, 1),
    #'min_weight_fraction_leaf': np.arange(0.0, 0.5, 0.01),
    #'subsample': np.arange(0.5, 1, 0.05),
    #'max_features': [None, 'sqrt', 'log2'],
    #'max_leaf_nodes': range(2, 200, 1),
    #'criterion': ['friedman_mse', 'squared_error'],
    'random_state': [0]
}

# Create the GridSearchCV object (5-fold CV, candidates evaluated in parallel)
#grid_gbr = GridSearchCV(gbr_reg, params, cv=5, scoring='r2', n_jobs=-1)
grid_gbr = GridSearchCV(gbr_reg, params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model on the training set with GridSearchCV
grid_gbr.fit(X_train, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', grid_gbr.best_params_)

# Use the best model (refitted by GridSearchCV on the whole training set)
# to make predictions on the validation set
y_val_pred = grid_gbr.predict(X_validation)

# Compute the forecast metrics of the predictions on the validation set.
# mean_squared_error returns the MSE by default; the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is not passed.
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_gbr = grid_gbr.predict(X_test)


overflow encountered in square



Best hyperparameters: {'learning_rate': 0.040949150623804255, 'max_depth': 3, 'n_estimators': 290, 'random_state': 0}
Validation MSE:  0.0009299617979484944
Validation R2:  0.9990205165924618
Validation MAE:  0.02345452785681064


In [None]:
# Rank the training features by the importance reported by the Gradient Boosting
# model `gbr`: build the table, order it from most to least important, renumber the
# rows from 0 and give the columns their short final names.
feature_importance_gbr = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': gbr.feature_importances_})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'Feature': 'feat_gbr', 'Importance': 'imp_gbr'})
)
# Display the 15 highest-ranked features
feature_importance_gbr.head(15)

Unnamed: 0,feat_gbr,imp_gbr
0,ipc_nal_l6,0.181419
1,ufv,0.161772
2,ipc_nal_l3,0.130876
3,ipc_nal_l1,0.121839
4,ipc_nal_l12,0.118458
5,flour_tj,0.099594
6,ipc_nal_l2,0.065873
7,ycorn_ea,0.053459
8,paprika_tr,0.010656
9,milk_or,0.008036


### **3.4. RF (Random Forest)**
Without tuning:
Validation MSE:  0.00100147205985504
Validation R2:  0.9989451983211514
Validation MAE:  0.026535714623290185

In [None]:
# Define the Random Forest Regression model with fixed hyperparameters
# (they match the best values printed by the grid-search cell in this section)
#rf = RandomForestRegressor(random_state=0)
rf = RandomForestRegressor(min_samples_split=2, n_estimators=265, random_state=0)

# Fit the model to the training data
rf.fit(X_train, y_train)
# Predict on the training and validation sets with this fitted model
rf_train_pred = rf.predict(X_train)
rf_val_pred = rf.predict(X_validation)
# Calculate Forecast metrics on train set.
# mean_squared_error returns the MSE by default; the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is not passed.
mse_train_rf = mean_squared_error(y_train, rf_train_pred)
r2_train_rf = r2_score(y_train, rf_train_pred)
mae_train_rf = mean_absolute_error(y_train, rf_train_pred)
print("Train MSE: ", mse_train_rf)
print("Train R2: ", r2_train_rf)
print("Train MAE: ", mae_train_rf)
# Calculate Forecast metrics on validation set
mse_val_rf = mean_squared_error(y_validation, rf_val_pred)
r2_val_rf = r2_score(y_validation, rf_val_pred)
mae_val_rf = mean_absolute_error(y_validation, rf_val_pred)
print("Validation MSE: ", mse_val_rf)
print("Validation R2: ", r2_val_rf)
print("Validation MAE: ", mae_val_rf)

Train MSE:  0.0003599513395981111
Train R2:  0.9996400486604019
Train MAE:  0.01178117103932864
Validation MSE:  0.0009983900502684609
Validation R2:  0.9989484444515393
Validation MAE:  0.024970310604635504


In [None]:
# Define the base (unfitted) Random Forest Regression model cloned by GridSearchCV
rf_reg = RandomForestRegressor()

# Define the hyperparameters to tune
param_grid_rf = {
    'n_estimators': range(100, 300, 15),
    #'max_features': [None, 'sqrt', 'log2'],
    #'max_depth': range(3,7,1),
    'min_samples_split': range(2,20,1),
    #'min_samples_leaf': range(1,50, 1),
    #'min_weight_fraction_leaf': np.arange(0.0, 0.5, 0.01),
    #'bootstrap': [True],
    #'oob_score': [True, False],
    #'warm_start': [True, False],
    #'max_samples': np.arange(0.1, 1.0, 0.01),
    'random_state': [0]
}

# Define the GridSearchCV object (5-fold CV, scored by negated MSE)
#grid_rf_reg = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=5, scoring='r2')
grid_rf_reg = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the training data
grid_rf_reg.fit(X_train, y_train)

# Extract the best hyperparameters and score
# (best_score_ is the negated MSE, so values closer to 0 are better)
best_params = grid_rf_reg.best_params_
best_score = grid_rf_reg.best_score_

# Print the best hyperparameters found by GridSearchCV
print(f"Best hyperparameters: {best_params}")
print(f"Best score: {best_score}")

# Instantiate a new Random Forest Regression model using the best hyperparameters
rf_reg_best = RandomForestRegressor(**best_params)

# Fit the model to the full training data
rf_reg_best.fit(X_train, y_train)

# Use the best model to make predictions on the validation set
y_val_pred = rf_reg_best.predict(X_validation)

# Compute the forecast metrics of the predictions on the validation set.
# mean_squared_error returns the MSE by default; the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is not passed.
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_rf = rf_reg_best.predict(X_test)

Best hyperparameters: {'min_samples_split': 2, 'n_estimators': 265, 'random_state': 0}
Best score: -0.0035817383690906874
Validation MSE:  0.0009983900502684609
Validation R2:  0.9989484444515393
Validation MAE:  0.024970310604635504


In [None]:
# Rank the training features by the importance reported by the Random Forest model
# `rf`: build the table, order it from most to least important, renumber the rows
# from 0 and give the columns their short final names.
feature_importance_rf = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'Feature': 'feat', 'Importance': 'imp_rf'})
)
# Display the 30 highest-ranked features
feature_importance_rf.head(30)

Unnamed: 0,feat,imp_rf
0,ipc_nal_l6,0.137714
1,ipc_nal_l2,0.118994
2,ufv,0.117774
3,ipc_nal_l12,0.110887
4,ipc_nal_l1,0.104852
5,ipc_nal_l3,0.094636
6,flour_tj,0.090228
7,milk2_po,0.023239
8,milk2_or,0.023221
9,milk_bol,0.015986


### **3.5. ET (Extra Trees)**

In [None]:
# Define the Extra Trees Regression model.
# random_state=0 (as in the AdaBoost, GBR and RF cells) makes the fitted trees, the
# printed metrics and the feature importances reproducible across runs.
et = ExtraTreesRegressor(random_state=0)
#et = ExtraTreesRegressor(bootstrap=True, max_samples=0.9599999999999995, oob_score=True)
# Fit the model to the training data
et.fit(X_train, y_train)
# Predict on the training and validation sets with this fitted model
et_train_pred = et.predict(X_train)
et_val_pred = et.predict(X_validation)
# Calculate Forecast metrics on train set.
# mean_squared_error returns the MSE by default; the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is not passed.
mse_train_et = mean_squared_error(y_train, et_train_pred)
r2_train_et = r2_score(y_train, et_train_pred)
mae_train_et = mean_absolute_error(y_train, et_train_pred)
print("Train MSE: ", mse_train_et)
print("Train R2: ", r2_train_et)
print("Train MAE: ", mae_train_et)
# Calculate Forecast metrics on validation set
mse_val_et = mean_squared_error(y_validation, et_val_pred)
r2_val_et = r2_score(y_validation, et_val_pred)
mae_val_et = mean_absolute_error(y_validation, et_val_pred)
print("Validation MSE: ", mse_val_et)
print("Validation R2: ", r2_val_et)
print("Validation MAE: ", mae_val_et)

Train MSE:  2.0896434057490927e-30
Train R2:  1.0
Train MAE:  1.0328599534787715e-15
Validation MSE:  0.0004652877455378555
Validation R2:  0.9995099351097103
Validation MAE:  0.016635965301522112


In [None]:
# Define the base (unfitted) Extra Trees Regression model cloned by GridSearchCV.
# The seed is fixed on the estimator itself so every candidate is evaluated with the
# same randomness and the search is reproducible; best_params_ is unaffected because
# random_state is not part of the grid.
et_reg = ExtraTreesRegressor(random_state=0)

# Define the hyperparameter grid to search over
# NOTE(review): this grid is very large (roughly 18 x 49 x 2 x 90 candidates, each
# fitted on 5 folds); consider narrowing it or using a randomized search.
param_grid = {
    #'n_estimators': range(100, 300, 15),
    #'max_depth': range(3,15,1),
    #'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': range(2,20,1),
    'min_samples_leaf': range(1,50, 1),
    #'min_weight_fraction_leaf': np.arange(0.0, 0.5, 0.01),
    'bootstrap': [True],
    'oob_score': [True, False],
    #'warm_start': [True, False],
    'max_samples': np.arange(0.1, 1.0, 0.01)
    #'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
    #'min_impurity_decrease': np.arange(0.0, 0.01, 0.00001),
    #'random_state': [0]
}

# Use GridSearchCV to find the best hyperparameters (5-fold CV, scored by negated MSE)
#grid_search = GridSearchCV(et_reg, param_grid=param_grid, cv=5, scoring='r2')
grid_search = GridSearchCV(et_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)
#print("Validation score: ", grid_search.best_score_)

# Use the best model (refitted by GridSearchCV on the whole training set)
# to make predictions on the validation set
best_et_reg = grid_search.best_estimator_
y_val_pred = best_et_reg.predict(X_validation)

# Compute the forecast metrics of the predictions on the validation set.
# mean_squared_error returns the MSE by default; the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it is not passed.
mse_val = mean_squared_error(y_validation, y_val_pred)
r2_val = r2_score(y_validation, y_val_pred)
mae_val = mean_absolute_error(y_validation, y_val_pred)
print("Validation MSE: ", mse_val)
print("Validation R2: ", r2_val)
print("Validation MAE: ", mae_val)

# Use the best model to make predictions on the test set
#y_test_pred_et = best_et_reg.predict(X_test)

In [None]:
# Rank the training features by the importance reported by the Extra Trees model
# `et`: build the table, order it from most to least important, renumber the rows
# from 0 and give the columns their short final names.
feature_importance_et = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': et.feature_importances_})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'Feature': 'feat', 'Importance': 'imp_et'})
)
# Display the 30 highest-ranked features
feature_importance_et.head(30)

Unnamed: 0,feat,imp_et
0,ycorn_ea,0.128451
1,ipc_nal_l1,0.115
2,ufv,0.081138
3,ipc_nal_l2,0.073717
4,ipc_nal_l12,0.073355
5,flour_tj,0.068439
6,ipc_nal_l6,0.058653
7,milk_dlp,0.053411
8,milk_lp,0.047272
9,milk_bol,0.045265


## **4. Report**

#### End