In [1]:
import sys

# Put the parent directory at the front of the module search path so that
# modules living there can be imported by the following cells
# (presumably `mlr` and `forward_backward` - confirm against the repo layout).
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate ".." entries in sys.path.
PARENT_DIR = ".."
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)

In [2]:
import pandas as pd
from mlr import perform_mlr
import plotly.graph_objects as go
from forward_backward import forward_selection, backward_selection
import plotly.express as px

In [3]:
# Load the merged table (no lag columns yet) used by every model in this notebook.
DATA_PATH = "../../processed_tables/merged_without_lags_represas_criterio.csv"
df = pd.read_csv(DATA_PATH)

## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [4]:
# Sweep the number of lags from 1 to 11, fitting one MLR per setting and
# collecting the test-set MAPE and MSE of each fit (index k holds lag k + 1).
mape_test, mse_test = [], []

for i in range(1, 12):
    print(f"Lags: {i}")
    results_i = perform_mlr({"df": df, "lags": i})
    mape_test.append(results_i["mape_test"])
    mse_test.append(results_i["mse_test"])
    print("-----------------------")

Lags: 1
Mean Squared Error on Test Data: 4551.823987062198
Mean Absolute Percentage Error: 0.09707363295808719
-----------------------
Lags: 2
Mean Squared Error on Test Data: 4479.034284299025
Mean Absolute Percentage Error: 0.09780186792007217
-----------------------
Lags: 3
Mean Squared Error on Test Data: 4484.427136838624
Mean Absolute Percentage Error: 0.09610509530748834
-----------------------
Lags: 4
Mean Squared Error on Test Data: 4478.845868616389
Mean Absolute Percentage Error: 0.09601620014841875
-----------------------
Lags: 5
Mean Squared Error on Test Data: 4519.179345032139
Mean Absolute Percentage Error: 0.09612714669114251
-----------------------
Lags: 6
Mean Squared Error on Test Data: 4601.175224694923
Mean Absolute Percentage Error: 0.09704955701483083
-----------------------
Lags: 7
Mean Squared Error on Test Data: 4629.229904960638
Mean Absolute Percentage Error: 0.09727254583666965
-----------------------
Lags: 8
Mean Squared Error on Test Data: 4755.422725715

In [5]:
# Tabulate the metrics gathered in the lag sweep, one row per lag value.
lag_values = list(range(1, 12))
df_pruebas = pd.DataFrame(
    dict(lag=lag_values, mse_test=mse_test, mape_test=mape_test)
)

# Test MSE as a function of the number of lags.
mse_title = "MSE de la data de test vs Lags"
fig_mse = px.line(df_pruebas, x="lag", y="mse_test", title=mse_title)
fig_mse.show()

In [6]:
# Test MAPE as a function of the number of lags.
mape_title = "MAPE de la data de test vs Lags"
fig_mape = px.line(df_pruebas, x="lag", y="mape_test", title=mape_title)
fig_mape.show()

Usando como métrica de elección el MAPE, vemos que el mejor modelo es con lag = 4


In [7]:
# Refit the model with the chosen configuration (4 lags, explicit cut-off date)
# and unpack every artifact that later cells rely on.
fecha_corte = "2023-07-01"
results = perform_mlr({"df": df, "fecha_corte": fecha_corte, "lags": 4})

# Training split and in-sample predictions.
X_train = results["X_train"]
X_train_dates = results["X_train_dates"]
y_train = results["y_train"]
y_pred_train = results["y_pred_train"]

# Test split and out-of-sample predictions.
X_test = results["X_test"]
X_test_dates = results["X_test_dates"]
y_test = results["y_test"]
y_pred = results["y_pred"]

# Test-set error metrics.
mse = results["mse_test"]
mape = results["mape_test"]

# Inputs for the variable-selection cells that follow.
significant_variables = results["significant_variables"]
df_with_lags = results["df_with_lags"]

Mean Squared Error on Test Data: 4478.845868616389
Mean Absolute Percentage Error: 0.09601620014841875


In [8]:
# Turn the list of significant regressors into a column selection for the
# next model: drop the "const" entry and make sure the "Date" and
# "energy_price" columns are included.
#
# A new list is built instead of mutating in place, so that
#   * the cell can be re-run (list.remove raises ValueError once "const" is
#     already gone), and
#   * the list object still referenced by `results` is left untouched.
significant_variables = [
    column for column in significant_variables if column != "const"
]
for required_column in ("Date", "energy_price"):
    # Membership check avoids duplicated column labels on re-execution.
    if required_column not in significant_variables:
        significant_variables.append(required_column)

## MLR with selected variables


In [9]:
# Keep only the significant columns of the lagged DataFrame and refit the MLR
# on that reduced feature set.
fecha_corte = "2023-07-01"
df_selected_variables = df_with_lags.loc[:, significant_variables].copy()

results = perform_mlr(
    {"df": df_selected_variables, "fecha_corte": fecha_corte, "lags": 4}
)

# Test-set error metrics of the reduced model.
mse_MRL_selection, mape_MRL_selection = results["mse_test"], results["mape_test"]

# NOTE: this rebinds `significant_variables` to the list reported by this refit.
significant_variables = results["significant_variables"]

# Dates, targets and predictions for both splits.
X_train_dates_selection = results["X_train_dates"]
X_test_dates_selection = results["X_test_dates"]
y_train_selection = results["y_train"]
y_test_selection = results["y_test"]
y_pred_train_selection = results["y_pred_train"]
y_pred_test_selection = results["y_pred"]

# Test predictions with the first element dropped (one element shorter).
y_pred_test_selection_with_shift = y_pred_test_selection[1:]

Mean Squared Error on Test Data: 4452.7949982352
Mean Absolute Percentage Error: 0.09564485089968987


In [10]:
# Show the variables reported as significant by the refit on the reduced feature set.
significant_variables

['const', 'precipitacion_vaupes', 'energy_price_lag_1', 'energy_price_lag_3']

## Forward selection


In [11]:
# Run forward selection on the training split of the full 4-lag model and
# print the resulting list of column names.
selected_features_forward = forward_selection(X_train, y_train)
print("Forward Selection Result:", selected_features_forward)

Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_bogota', 'precipitacion_caqueta', 'precipitacion_casanare', 'precipitacion_cesar', 'precipitacion_guainia', 'precipitacion_guaviare', 'precipitacion_la guajira', 'precipitacion_magdalena', 'precipitacion_putumayo', 'precipitacion_san andres providencia', 'precipitacion_vichada', 'temp_ARAUCA', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'temp_CAQUETA', 'temp_VICHADA', 'brent_value', 'energy_price_lag_1', 'Date', 'energy_price']


In [12]:
# Report how many columns forward selection kept, followed by their names.
n_selected_forward = len(selected_features_forward)
print(n_selected_forward)
print("Forward Selection Result:", selected_features_forward)

20
Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_bogota', 'precipitacion_caqueta', 'precipitacion_casanare', 'precipitacion_cesar', 'precipitacion_guainia', 'precipitacion_guaviare', 'precipitacion_la guajira', 'precipitacion_magdalena', 'precipitacion_putumayo', 'precipitacion_san andres providencia', 'precipitacion_vichada', 'temp_ARAUCA', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'temp_CAQUETA', 'temp_VICHADA', 'brent_value', 'energy_price_lag_1', 'Date', 'energy_price']


In [13]:
# Refit the MLR using only the columns chosen by forward selection.
df_selected_variables_forward = df_with_lags[selected_features_forward]

fecha_corte = "2023-07-01"

results = perform_mlr(
    {"df": df_selected_variables_forward, "fecha_corte": fecha_corte, "lags": 4}
)

# Test-set predictions of the forward-selection model.
y_pred_f = results["y_pred"]
# Same predictions with the first element dropped.
# Fix: this used to slice `y_pred`, i.e. the predictions of the full model
# fitted in an earlier cell, instead of this model's own `y_pred_f`.
y_pred_1_f = y_pred_f[1:]
y_test_f = results["y_test"]
X_test_dates_f = results["X_test_dates"]

# Training split and in-sample predictions.
X_train_f = results["X_train"]
X_train_dates_f = results["X_train_dates"]
y_train_f = results["y_train"]
y_pred_train_f = results["y_pred_train"]

# Test-set error metrics.
mse_f = results["mse_test"]
mape_f = results["mape_test"]

Mean Squared Error on Test Data: 4794.3043972280275
Mean Absolute Percentage Error: 0.1003673844754969


## Backward selection


In [14]:
# Run backward selection on the training split of the full 4-lag model.
selected_features_backward = backward_selection(X_train, y_train)

In [15]:
# Report how many columns backward selection kept, followed by their names.
n_selected_backward = len(selected_features_backward)
print(n_selected_backward)
print("Backward Selection Result:", selected_features_backward)

20
Backward Selection Result: ['precipitacion_arauca', 'precipitacion_bolivar', 'precipitacion_caqueta', 'precipitacion_cesar', 'precipitacion_choco', 'precipitacion_guainia', 'precipitacion_guaviare', 'precipitacion_la guajira', 'precipitacion_vaupes', 'precipitacion_vichada', 'precipitacion_departamentos_represa', 'temp_ARAUCA', 'temp_CAQUETA', 'temp_CHOCO', 'temp_VICHADA', 'brent_value', 'energy_price_lag_1', 'energy_price_lag_4', 'Date', 'energy_price']


In [16]:
# Refit the MLR using only the columns chosen by backward selection and keep
# its test-set error metrics.
fecha_corte = "2023-07-01"
df_selected_variables_backward = df_with_lags[selected_features_backward]

results = perform_mlr(
    {"df": df_selected_variables_backward, "fecha_corte": fecha_corte, "lags": 4}
)
mse_b, mape_b = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4717.247650668018
Mean Absolute Percentage Error: 0.09950334510547455


In [17]:
# Side-by-side comparison of the four 4-lag models: one row per model with
# its test MSE and test MAPE.
model_rows = [
    ("MLR", mse, mape),
    ("MLR_selection", mse_MRL_selection, mape_MRL_selection),
    ("MLR_forward", mse_f, mape_f),
    ("MLR_backward", mse_b, mape_b),
]
results_MRL_with_lags = pd.DataFrame(model_rows, columns=["Modelo", "MSE", "MAPE"])

results_MRL_with_lags

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,4478.845869,0.096016
1,MLR_selection,4452.794998,0.095645
2,MLR_forward,4794.304397,0.100367
3,MLR_backward,4717.247651,0.099503


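Se identifica que la selección de variables que obtuvo el mejor desempeño en cuanto a la métrica seleccionada (MAPE) es la selección por variables significativas (MLR_selection), con un MAPE de 0.0956; la selección Forward obtuvo el MAPE más alto (0.1004).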


In [18]:
# Real vs. predicted energy price for the model refit on the significant
# variables: every series plotted here is one of the `*_selection` variables.
# Fix: the title used to say "Forward MLR", which does not match the data shown.
fig_lag4 = px.line(
    x=X_train_dates_selection,
    y=y_train_selection,
    title="Forecasting of Energy Price with MLR on significant variables with 4 lags",
)
fig_lag4.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")

# (x values, y values, legend name) of each line added on top of the real
# training values drawn by px.line above.
extra_traces = [
    (X_train_dates_selection, y_pred_train_selection, "Valores predichos de train"),
    (X_test_dates_selection, y_test_selection, "Valores reales de test"),
    (X_test_dates_selection, y_pred_test_selection, "Valores predichos de test"),
    # NOTE: this series was built as y_pred_test_selection[1:], so it is one
    # element shorter than the date axis it is plotted against.
    (
        X_test_dates_selection,
        y_pred_test_selection_with_shift,
        "Valores predichos de test -1",
    ),
]
for trace_x, trace_y, trace_name in extra_traces:
    fig_lag4.add_trace(
        go.Scatter(x=trace_x, y=trace_y, mode="lines", name=trace_name)
    )

fig_lag4.show()

Matriz de correlación de las variables seleccionadas por el modelo de selección Forward


In [19]:
# Pairwise correlation between the float64/int64 columns of the
# forward-selection feature set; other dtypes are excluded by select_dtypes.
numeric_dtypes = ["float64", "int64"]
numeric_columns = df_selected_variables_forward.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,precipitacion_amazonas,precipitacion_bogota,precipitacion_caqueta,precipitacion_casanare,precipitacion_cesar,precipitacion_guainia,precipitacion_guaviare,precipitacion_la guajira,precipitacion_magdalena,precipitacion_putumayo,precipitacion_san andres providencia,precipitacion_vichada,temp_ARAUCA,temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA,temp_CAQUETA,temp_VICHADA,brent_value,energy_price_lag_1,energy_price
precipitacion_amazonas,1.0,0.104938,-0.024298,-0.013944,-0.015654,-0.06102,0.042756,-0.105555,-0.000225,-0.002941,-0.062397,0.007979,0.087854,-0.057892,0.057531,-0.014141,0.012178,0.058035,0.055893
precipitacion_bogota,0.104938,1.0,0.103425,0.117512,0.109009,0.058128,0.215049,0.002706,0.24375,0.031672,-0.014237,0.036956,-0.053426,-0.008025,-0.119527,-0.080469,0.250227,-0.064258,-0.078994
precipitacion_caqueta,-0.024298,0.103425,1.0,0.228872,0.085002,0.14552,0.222458,-0.008815,0.112019,0.29783,0.127337,0.118293,0.02206,0.020436,-0.297716,-0.032143,0.048732,-0.028794,-0.042744
precipitacion_casanare,-0.013944,0.117512,0.228872,1.0,0.018671,0.141829,0.223168,0.023348,0.123362,0.155049,0.042215,0.258984,-0.104227,0.148221,-0.163412,-0.076758,0.0055,0.140932,0.136466
precipitacion_cesar,-0.015654,0.109009,0.085002,0.018671,1.0,0.095287,0.16825,0.042679,0.185898,0.005954,0.003234,0.015089,0.046196,-0.033574,-0.068623,0.03766,0.204447,-0.133672,-0.13647
precipitacion_guainia,-0.06102,0.058128,0.14552,0.141829,0.095287,1.0,0.114033,0.000616,0.038147,0.05366,0.113797,0.064316,0.061194,0.01715,-0.150503,-0.044262,0.133179,-0.078373,-0.078486
precipitacion_guaviare,0.042756,0.215049,0.222458,0.223168,0.16825,0.114033,1.0,0.033916,0.259762,0.112019,0.133882,0.056771,-0.011584,-0.017279,-0.164676,-0.112172,0.182742,-0.112851,-0.11075
precipitacion_la guajira,-0.105555,0.002706,-0.008815,0.023348,0.042679,0.000616,0.033916,1.0,0.23396,-0.019645,0.0473,-0.00271,0.005616,0.084333,0.035398,0.081587,0.155019,-0.14594,-0.143783
precipitacion_magdalena,-0.000225,0.24375,0.112019,0.123362,0.185898,0.038147,0.259762,0.23396,1.0,-0.034144,0.078084,0.007996,-0.063021,-0.003035,-0.058575,-0.100221,0.361648,-0.235261,-0.238755
precipitacion_putumayo,-0.002941,0.031672,0.29783,0.155049,0.005954,0.05366,0.112019,-0.019645,-0.034144,1.0,0.0628,0.110634,-0.03205,0.015998,-0.174746,-0.04896,-0.157034,0.084284,0.076281
