In [1]:
import sys

sys.path.insert(0, "..")

In [2]:
import pandas as pd
from mlr import perform_mlr
import plotly.graph_objects as go
from forward_backward import forward_selection, backward_selection
import plotly.express as px

In [3]:
# Load the pre-processed table that feeds the regressions below.
merged_csv_path = "../../processed_tables/merged_no_damns_standardized.csv"
df = pd.read_csv(merged_csv_path)

## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [4]:
# Sweep the lag count from 1 to 11 and keep the test-set metrics that
# perform_mlr returns for each setting (one entry per lag, in order).
mape_test, mse_test = [], []

for n_lags in range(1, 12):
    print(f"Lags: {n_lags}")
    lag_results = perform_mlr({"df": df, "lags": n_lags})
    mape_test.append(lag_results["mape_test"])
    mse_test.append(lag_results["mse_test"])
    print("-----------------------")

Lags: 1
Mean Squared Error on Test Data: 4590.231728734916
Mean Absolute Percentage Error: 0.09698183327122548
-----------------------
Lags: 2
Mean Squared Error on Test Data: 4488.26027812589
Mean Absolute Percentage Error: 0.0976079025645777
-----------------------
Lags: 3
Mean Squared Error on Test Data: 4498.4269647473275
Mean Absolute Percentage Error: 0.09627604130701675
-----------------------
Lags: 4
Mean Squared Error on Test Data: 4494.097011910895
Mean Absolute Percentage Error: 0.09619779024089208
-----------------------
Lags: 5
Mean Squared Error on Test Data: 4565.825498839827
Mean Absolute Percentage Error: 0.09650747751726266
-----------------------
Lags: 6
Mean Squared Error on Test Data: 4659.251311518415
Mean Absolute Percentage Error: 0.09764519401345485
-----------------------
Lags: 7
Mean Squared Error on Test Data: 4698.809949443617
Mean Absolute Percentage Error: 0.09793933544284252
-----------------------
Lags: 8
Mean Squared Error on Test Data: 4886.6444678729

In [5]:
# Gather the per-lag test metrics into one table; the lag axis is 1..11,
# the same range the sweep loop iterates over.
lag_values = list(range(1, 12))
df_pruebas = pd.DataFrame(
    {"lag": lag_values, "mse_test": mse_test, "mape_test": mape_test}
)

# Test MSE as a function of the number of lags.
fig_mse = px.line(
    df_pruebas,
    x="lag",
    y="mse_test",
    title="MSE de la data de test vs Lags",
)
fig_mse.show()

In [6]:
# Test MAPE as a function of the number of lags (same table as the MSE plot).
fig_mape = px.line(
    df_pruebas,
    x="lag",
    y="mape_test",
    title="MAPE de la data de test vs Lags",
)
fig_mape.show()

Usando como métrica de elección el MAPE, vemos que el mejor modelo es con lag = 4


In [7]:
# Fit the MLR on the full table with 4 lags and unpack everything the later
# cells need. `fecha_corte` is passed straight through to perform_mlr
# (presumably the train/test cutoff date -- confirm in mlr.py).
fecha_corte = "2023-07-01"

results = perform_mlr({"df": df, "fecha_corte": fecha_corte, "lags": 4})

# Design matrices and their date columns.
X_train, X_test = results["X_train"], results["X_test"]
X_train_dates, X_test_dates = results["X_train_dates"], results["X_test_dates"]

# Targets and predictions for both splits.
y_train, y_test = results["y_train"], results["y_test"]
y_pred_train, y_pred = results["y_pred_train"], results["y_pred"]

# Test-set error metrics.
mse, mape = results["mse_test"], results["mape_test"]

# Variable list and lag-augmented frame reused by the selection cells below.
significant_variables = results["significant_variables"]
df_with_lags = results["df_with_lags"]

Mean Squared Error on Test Data: 4494.097011910895
Mean Absolute Percentage Error: 0.09619779024089208


In [8]:
# Show the `significant_variables` list returned by the full-model perform_mlr call.
significant_variables

['precipitacion_vaupes', 'energy_price_lag_1', 'energy_price_lag_3']

In [9]:
# Add the date and target columns to the selected predictors so the subset
# frame built from this list still carries them. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate names, which
# would otherwise produce duplicated columns when indexing the DataFrame.
for required_column in ("Date", "energy_price"):
    if required_column not in significant_variables:
        significant_variables.append(required_column)

## MLR with selected variables


In [10]:
# Select the significant columns (plus the date/target entries appended to the
# list) from the lag-augmented DataFrame, as an independent copy.
df_selected_variables = df_with_lags.loc[:, significant_variables].copy()

fecha_corte = "2023-07-01"

# Refit the MLR on the reduced set of columns.
results = perform_mlr(
    {"df": df_selected_variables, "fecha_corte": fecha_corte, "lags": 4}
)

# Test-set error metrics of the reduced model.
mse_MRL_selection, mape_MRL_selection = results["mse_test"], results["mape_test"]

# NOTE: this rebinds `significant_variables` to the refit's own list, so the
# list used to build `df_selected_variables` above is no longer available.
significant_variables = results["significant_variables"]

# Dates, targets and predictions for both splits (used by the plot further down).
X_train_dates_selection = results["X_train_dates"]
X_test_dates_selection = results["X_test_dates"]
y_train_selection, y_test_selection = results["y_train"], results["y_test"]
y_pred_train_selection = results["y_pred_train"]
y_pred_test_selection = results["y_pred"]

# Test predictions with the first element dropped (shifted one step back).
y_pred_test_selection_with_shift = y_pred_test_selection[1:]

Mean Squared Error on Test Data: 4452.794998235194
Mean Absolute Percentage Error: 0.09564485089968977


In [11]:
# Show the `significant_variables` list returned by the refit on the selected columns.
significant_variables

['const', 'precipitacion_vaupes', 'energy_price_lag_1', 'energy_price_lag_3']

## Forward selection


In [12]:
# Run forward selection on the training split produced by the full 4-lag MLR
# fit (X_train / y_train come from that perform_mlr call).
selected_features_forward = forward_selection(X_train, y_train)

In [13]:
# Report how many columns forward selection returned, then list them.
n_forward_features = len(selected_features_forward)
print(n_forward_features)
print(f"Forward Selection Result: {selected_features_forward}")

19
Forward Selection Result: ['precipitacion_guainia', 'temp_ARAUCA', 'precipitacion_cesar', 'brent_value', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'temp_CAQUETA', 'TRM', 'precipitacion_bogota', 'precipitacion_san andres providencia', 'precipitacion_amazonas', 'precipitacion_casanare', 'precipitacion_guaviare', 'precipitacion_magdalena', 'precipitacion_vichada', 'precipitacion_caqueta', 'precipitacion_putumayo', 'energy_price_lag_1', 'Date', 'energy_price']


In [14]:
# Restrict the lag-augmented frame to the forward-selected columns. The
# explicit copy mirrors the selected-variables cell and keeps perform_mlr from
# operating on a frame derived from `df_with_lags`.
df_selected_variables_forward = df_with_lags[selected_features_forward].copy()

fecha_corte = "2023-07-01"

# Refit the MLR on the forward-selected columns.
results = perform_mlr(
    {"df": df_selected_variables_forward, "fecha_corte": fecha_corte, "lags": 4}
)
y_pred_f = results["y_pred"]
# Shifted predictions must come from THIS model's output (`y_pred_f`); the
# original sliced `y_pred`, i.e. the full model's predictions.
y_pred_1_f = y_pred_f[1:]
y_test_f = results["y_test"]
X_test_dates_f = results["X_test_dates"]
X_train_f = results["X_train"]
X_train_dates_f = results["X_train_dates"]
y_train_f = results["y_train"]
y_pred_train_f = results["y_pred_train"]
mse_f = results["mse_test"]
mape_f = results["mape_test"]

Mean Squared Error on Test Data: 4743.77253703601
Mean Absolute Percentage Error: 0.09969678906235703


## Backward selection


In [15]:
# Run backward selection on the same training split used for forward selection
# (X_train / y_train from the full 4-lag MLR fit).
selected_features_backward = backward_selection(X_train, y_train)

In [16]:
# Report how many columns backward selection returned, then list them.
n_backward_features = len(selected_features_backward)
print(n_backward_features)
print(f"Backward Selection Result: {selected_features_backward}")

20
Backward Selection Result: ['precipitacion_cauca', 'precipitacion_guainia', 'precipitacion_la guajira', 'temp_ARAUCA', 'precipitacion_cesar', 'brent_value', 'temp_CAQUETA', 'precipitacion_cordoba', 'precipitacion_guaviare', 'precipitacion_vichada', 'precipitacion_bolivar', 'temp_BOYACA', 'precipitacion_putumayo', 'precipitacion_vaupes', 'precipitacion_arauca', 'precipitacion_choco', 'energy_price_lag_1', 'energy_price_lag_4', 'Date', 'energy_price']


In [17]:
# Restrict the lag-augmented frame to the backward-selected columns. The
# explicit copy mirrors the selected-variables cell and keeps perform_mlr from
# operating on a frame derived from `df_with_lags`.
df_selected_variables_backward = df_with_lags[selected_features_backward].copy()

fecha_corte = "2023-07-01"

# Refit the MLR on the backward-selected columns and keep its test metrics.
results = perform_mlr(
    {"df": df_selected_variables_backward, "fecha_corte": fecha_corte, "lags": 4}
)
mse_b = results["mse_test"]
mape_b = results["mape_test"]

Mean Squared Error on Test Data: 4890.2517366290485
Mean Absolute Percentage Error: 0.10070134506736954


In [18]:
# Side-by-side comparison of the four 4-lag models: one (name, MSE, MAPE)
# record per model, in the order they were fitted above.
model_metrics = [
    ("MLR", mse, mape),
    ("MLR_selection", mse_MRL_selection, mape_MRL_selection),
    ("MLR_forward", mse_f, mape_f),
    ("MLR_backward", mse_b, mape_b),
]
results_MRL_with_lags = pd.DataFrame(
    model_metrics, columns=["Modelo", "MSE", "MAPE"]
)

results_MRL_with_lags

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,4494.097012,0.096198
1,MLR_selection,4452.794998,0.095645
2,MLR_forward,4743.772537,0.099697
3,MLR_backward,4890.251737,0.100701


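Se identifica que la selección de variables que obtuvo el mejor desempeño en cuanto a la métrica seleccionada (MAPE) es la de variables significativas (MLR_selection), con un MAPE de 0.0956; las selecciones Forward y Backward obtienen un MAPE mayor que el del MLR completo.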


In [19]:
# Actual vs. predicted energy price for the MLR refit on the significant
# variables: every series plotted here is one of the `*_selection` results,
# so the title names that model (it previously said "Forward MLR").
fig_lag4 = px.line(
    x=X_train_dates_selection,
    y=y_train_selection,
    title="Forecasting of Energy Price with MLR (significant variables) with 4 lags",
)
fig_lag4.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")
# px.line creates its single trace unnamed and hidden from the legend; label
# the training actuals so all five series can be told apart.
fig_lag4.update_traces(name="Valores reales de train", showlegend=True)
# Fitted values on the training split.
fig_lag4.add_trace(
    go.Scatter(
        x=X_train_dates_selection,
        y=y_pred_train_selection,
        mode="lines",
        name="Valores predichos de train",
    )
)
# Actual values on the test split.
fig_lag4.add_trace(
    go.Scatter(
        x=X_test_dates_selection,
        y=y_test_selection,
        mode="lines",
        name="Valores reales de test",
    )
)
# Predictions on the test split.
fig_lag4.add_trace(
    go.Scatter(
        x=X_test_dates_selection,
        y=y_pred_test_selection,
        mode="lines",
        name="Valores predichos de test",
    )
)
# Test predictions shifted one step back (first prediction dropped). This
# series is one element shorter than the date axis it is plotted against.
fig_lag4.add_trace(
    go.Scatter(
        x=X_test_dates_selection,
        y=y_pred_test_selection_with_shift,
        mode="lines",
        name="Valores predichos de test -1",
    )
)
fig_lag4.show()

Matriz de correlación de las variables seleccionadas por la selección Forward


In [20]:
# Pairwise correlations between the float64/int64 columns of the
# forward-selected frame; columns of any other dtype are excluded first.
numeric_dtypes = ["float64", "int64"]
numeric_columns = df_selected_variables_forward.select_dtypes(include=numeric_dtypes)

correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,precipitacion_guainia,temp_ARAUCA,precipitacion_cesar,brent_value,temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA,temp_CAQUETA,TRM,precipitacion_bogota,precipitacion_san andres providencia,precipitacion_amazonas,precipitacion_casanare,precipitacion_guaviare,precipitacion_magdalena,precipitacion_vichada,precipitacion_caqueta,precipitacion_putumayo,energy_price_lag_1,energy_price
precipitacion_guainia,1.0,0.061261,0.095274,0.133163,0.017146,-0.150496,-0.086251,0.058107,0.11377,-0.06099,0.141844,0.114005,0.038178,0.064322,0.145535,0.053669,-0.078413,-0.078516
temp_ARAUCA,0.061261,1.0,0.046196,-0.127091,0.095159,0.182023,-0.168558,-0.053426,0.002041,0.087854,-0.104227,-0.011584,-0.063021,-0.100798,0.02206,-0.03205,-0.16017,-0.160594
precipitacion_cesar,0.095274,0.046196,1.0,0.204447,-0.033574,-0.068623,-0.113215,0.109009,0.003234,-0.015654,0.018671,0.16825,0.185898,0.015089,0.085002,0.005954,-0.133672,-0.13647
brent_value,0.133163,-0.127091,0.204447,1.0,-0.090662,-0.262866,-0.328136,0.250227,0.135416,0.012178,0.0055,0.182742,0.361648,-0.01433,0.048732,-0.157034,-0.36635,-0.363586
temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA,0.017146,0.095159,-0.033574,-0.090662,1.0,0.14117,-0.183483,-0.008025,-0.195152,-0.057892,0.148221,-0.017279,-0.003035,0.122271,0.020436,0.015998,0.224531,0.231223
temp_CAQUETA,-0.150496,0.182023,-0.068623,-0.262866,0.14117,1.0,0.175222,-0.119527,-0.107654,0.057531,-0.163412,-0.164676,-0.058575,-0.187662,-0.297716,-0.174746,0.223119,0.233247
TRM,-0.086251,-0.168558,-0.113215,-0.328136,-0.183483,0.175222,1.0,-0.046777,-0.072813,-0.01661,-0.043775,-0.024023,-0.007488,-0.076597,-0.011917,0.030287,-0.01488,-0.017256
precipitacion_bogota,0.058107,-0.053426,0.109009,0.250227,-0.008025,-0.119527,-0.046777,1.0,-0.014237,0.104938,0.117512,0.215049,0.24375,0.036956,0.103425,0.031672,-0.064258,-0.078994
precipitacion_san andres providencia,0.11377,0.002041,0.003234,0.135416,-0.195152,-0.107654,-0.072813,-0.014237,1.0,-0.062397,0.042215,0.133882,0.078084,0.070174,0.127337,0.0628,-0.108681,-0.109135
precipitacion_amazonas,-0.06099,0.087854,-0.015654,0.012178,-0.057892,0.057531,-0.01661,0.104938,-0.062397,1.0,-0.013944,0.042756,-0.000225,0.007979,-0.024298,-0.002941,0.058035,0.055893
