In [10]:
import sys

sys.path.insert(0, "..")

In [11]:
import pandas as pd
from mlr import perform_mlr
import plotly.graph_objects as go
from forward_backward import forward_selection, backward_selection
import plotly.express as px

In [12]:
# Load the input table from merged_standardized.csv; the path is relative, so it
# resolves against the directory the notebook kernel is running in.
df = pd.read_csv(
    filepath_or_buffer="../../processed_tables/merged_standardized.csv"
)

## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [13]:
# Sweep the number of lags from 1 to 11, fitting one model per setting through
# perform_mlr and collecting the test-set MAPE and MSE it returns.
mape_test, mse_test = [], []

for n_lags in range(1, 12):
    print(f"Lags: {n_lags}")
    lag_results = perform_mlr({"df": df, "lags": n_lags})
    mape_test.append(lag_results["mape_test"])
    mse_test.append(lag_results["mse_test"])
    # Visual separator between the per-lag blocks of printed output.
    print("-----------------------")

Lags: 1
Mean Squared Error on Test Data: 4778.562363648315
Mean Absolute Percentage Error: 0.09907461574794403
-----------------------
Lags: 2
Mean Squared Error on Test Data: 4786.332444181156
Mean Absolute Percentage Error: 0.10031577056964938
-----------------------
Lags: 3
Mean Squared Error on Test Data: 4770.522412238567
Mean Absolute Percentage Error: 0.09837102875040102
-----------------------
Lags: 4
Mean Squared Error on Test Data: 4772.759086273159
Mean Absolute Percentage Error: 0.098268196371518
-----------------------
Lags: 5
Mean Squared Error on Test Data: 4941.182555505074
Mean Absolute Percentage Error: 0.10095231758660823
-----------------------
Lags: 6
Mean Squared Error on Test Data: 5069.357787957889
Mean Absolute Percentage Error: 0.10254645265701369
-----------------------
Lags: 7
Mean Squared Error on Test Data: 5122.493808012305
Mean Absolute Percentage Error: 0.10303449878803168
-----------------------
Lags: 8
Mean Squared Error on Test Data: 5424.87988832933

In [14]:
# Tabulate the per-lag test metrics collected in mse_test / mape_test.
# The lag axis is derived from how many results were collected (lags start at 1)
# instead of repeating a hard-coded range, so the table stays aligned with the
# metric lists if the bounds of the lag sweep are ever changed.
df_pruebas = pd.DataFrame(
    {
        "lag": list(range(1, len(mse_test) + 1)),
        "mse_test": mse_test,
        "mape_test": mape_test,
    }
)

# Test-set MSE as a function of the number of lags.
fig_mse = px.line(
    df_pruebas, x="lag", y="mse_test", title="MSE de la data de test vs Lags"
)

fig_mse.show()

In [15]:
# Test-set MAPE as a function of the number of lags, drawn from the same
# per-lag table (df_pruebas) used for the MSE figure.
fig_mape = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mape_test",
    title="MAPE de la data de test vs Lags",
)
fig_mape.show()

Usando el MAPE como métrica de elección, vemos que el mejor modelo es el de lag = 4.


In [16]:
# Refit on the full table with 4 lags and an explicit "fecha_corte"
# (presumably the train/test cut-off date -- confirm in mlr.perform_mlr),
# then unpack the entries of the returned mapping that later cells use.
fecha_corte = "2023-07-01"
results = perform_mlr({"df": df, "fecha_corte": fecha_corte, "lags": 4})

# Test-split entries.
X_test, X_test_dates = results["X_test"], results["X_test_dates"]
y_test, y_pred = results["y_test"], results["y_pred"]

# Train-split entries.
X_train, X_train_dates = results["X_train"], results["X_train_dates"]
y_train, y_pred_train = results["y_train"], results["y_pred_train"]

# Test-set error metrics of this (all-variables) model.
mse, mape = results["mse_test"], results["mape_test"]

# Variable names flagged as significant, and the table including the lag columns.
significant_variables = results["significant_variables"]
df_with_lags = results["df_with_lags"]

Mean Squared Error on Test Data: 4772.759086273159
Mean Absolute Percentage Error: 0.098268196371518


In [17]:
# Turn the list of significant variables into the column list used to subset
# df_with_lags: drop the "const" entry and make sure "Date" and "energy_price"
# are present.
# The list is rebuilt rather than mutated with .remove()/.append(), so running
# this cell more than once neither raises ValueError (once "const" is already
# gone) nor appends duplicate column names.
significant_variables = [
    name for name in significant_variables if name != "const"
]
for required_column in ("Date", "energy_price"):
    if required_column not in significant_variables:
        significant_variables.append(required_column)

## MLR with selected variables


In [18]:
# Keep only the significant columns of the lagged table (df_with_lags);
# .copy() gives perform_mlr an independent frame to work on.
df_selected_variables = df_with_lags.loc[:, significant_variables].copy()

fecha_corte = "2023-07-01"
results = perform_mlr(
    {"df": df_selected_variables, "fecha_corte": fecha_corte, "lags": 4}
)

# Test-set metrics of the model restricted to the significant variables.
mse_MRL_selection, mape_MRL_selection = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4754.938830279311
Mean Absolute Percentage Error: 0.09923206234780495


## Forward selection


In [19]:
# Column names of the full input table (not passed to the selection call below).
all_features = list(df.columns)

# Forward selection runs on X_train / y_train, the training split unpacked from
# the perform_mlr call on the full table with 4 lags.
selected_features_forward = forward_selection(X_train, y_train)
print("Forward Selection Result:", selected_features_forward)

Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_boyaca', 'precipitacion_casanare', 'precipitacion_cauca', 'precipitacion_cesar', 'precipitacion_cordoba', 'precipitacion_guainia', 'precipitacion_guaviare', 'precipitacion_huila', 'precipitacion_la guajira', 'precipitacion_putumayo', 'precipitacion_quindio', 'precipitacion_san andres providencia', 'precipitacion_valle del cauca', 'precipitacion_vichada', 'precipitacion_departamentos_represa', 'temp_ARAUCA', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'temp_ATLANTICO', 'temp_BOGOTA', 'temp_BOLIVAR', 'temp_BOYACA', 'temp_CALDAS', 'temp_CAQUETA', 'temp_CASANARE', 'temp_CAUCA', 'temp_CESAR', 'temp_CHOCO', 'temp_GUAVIARE', 'temp_HUILA', 'temp_MAGDALENA', 'temp_META', 'temp_PUTUMAYO', 'temp_SANTANDER', 'temp_SUCRE', 'TRM', 'energy_price_lag_1', 'Date', 'energy_price']


In [20]:
# Report how many names forward selection returned, followed by the list itself.
print(len(selected_features_forward))
print("Forward Selection Result:", selected_features_forward)

39
Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_boyaca', 'precipitacion_casanare', 'precipitacion_cauca', 'precipitacion_cesar', 'precipitacion_cordoba', 'precipitacion_guainia', 'precipitacion_guaviare', 'precipitacion_huila', 'precipitacion_la guajira', 'precipitacion_putumayo', 'precipitacion_quindio', 'precipitacion_san andres providencia', 'precipitacion_valle del cauca', 'precipitacion_vichada', 'precipitacion_departamentos_represa', 'temp_ARAUCA', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'temp_ATLANTICO', 'temp_BOGOTA', 'temp_BOLIVAR', 'temp_BOYACA', 'temp_CALDAS', 'temp_CAQUETA', 'temp_CASANARE', 'temp_CAUCA', 'temp_CESAR', 'temp_CHOCO', 'temp_GUAVIARE', 'temp_HUILA', 'temp_MAGDALENA', 'temp_META', 'temp_PUTUMAYO', 'temp_SANTANDER', 'temp_SUCRE', 'TRM', 'energy_price_lag_1', 'Date', 'energy_price']


In [21]:
# Refit the regression using only the columns chosen by forward selection.
df_selected_variables_forward = df_with_lags[selected_features_forward]

fecha_corte = "2023-07-01"

results = perform_mlr(
    {"df": df_selected_variables_forward, "fecha_corte": fecha_corte, "lags": 4}
)
y_pred_f = results["y_pred"]
# Forward-model test predictions with the first element dropped; plotted in the
# forecast figure as "Valores predichos de test -1".
# Fix: this previously sliced y_pred (the predictions of the all-variables model
# from the earlier cell) instead of this model's own y_pred_f.
y_pred_1_f = y_pred_f[1:]
y_test_f = results["y_test"]
X_test_dates_f = results["X_test_dates"]
X_train_f = results["X_train"]
X_train_dates_f = results["X_train_dates"]
y_train_f = results["y_train"]
y_pred_train_f = results["y_pred_train"]
# Test-set metrics of the forward-selection model.
mse_f = results["mse_test"]
mape_f = results["mape_test"]

Mean Squared Error on Test Data: 4706.253505340503
Mean Absolute Percentage Error: 0.09862196517697605


## Backward selection


In [22]:
# Run backward selection on X_train / y_train, i.e. the training split unpacked
# from the perform_mlr call on the full table with 4 lags (not the split of the
# forward-selection refit, which is stored under the *_f names).
selected_features_backward = backward_selection(X_train, y_train)

In [23]:
# Report how many names backward selection returned, followed by the list itself.
print(len(selected_features_backward))
print("Backward Selection Result:", selected_features_backward)

40
Backward Selection Result: ['precipitacion_amazonas', 'precipitacion_arauca', 'precipitacion_atlantico', 'precipitacion_bogota', 'precipitacion_bolivar', 'precipitacion_caqueta', 'precipitacion_casanare', 'precipitacion_cauca', 'precipitacion_cesar', 'precipitacion_choco', 'precipitacion_cundinamarca', 'precipitacion_guainia', 'precipitacion_la guajira', 'precipitacion_magdalena', 'precipitacion_meta', 'precipitacion_narino', 'precipitacion_norte de santander', 'precipitacion_putumayo', 'precipitacion_quindio', 'precipitacion_risaralda', 'precipitacion_san andres providencia', 'precipitacion_sucre', 'precipitacion_vaupes', 'precipitacion_vichada', 'precipitacion_colombia', 'precipitacion_departamentos_represa', 'temp_ATLANTICO', 'temp_BOGOTA', 'temp_BOLIVAR', 'temp_CALDAS', 'temp_CASANARE', 'temp_MAGDALENA', 'temp_PUTUMAYO', 'temp_QUINDIO', 'temp_RISARALDA', 'temp_avg_pais', 'brent_value', 'energy_price_lag_1', 'Date', 'energy_price']


In [24]:
# Refit the regression using only the columns chosen by backward selection.
df_selected_variables_backward = df_with_lags[selected_features_backward]

fecha_corte = "2023-07-01"
results = perform_mlr(
    {
        "df": df_selected_variables_backward,
        "fecha_corte": fecha_corte,
        "lags": 4,
    }
)

# Test-set metrics of the backward-selection model.
mse_b, mape_b = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4746.924574290919
Mean Absolute Percentage Error: 0.09929987556055843


In [25]:
# Comparison table of the four 4-lag models: one row per model, holding its
# test-set MSE and MAPE.
results_MRL_with_lags = pd.DataFrame(
    [
        ("MLR", mse, mape),
        ("MLR_selection", mse_MRL_selection, mape_MRL_selection),
        ("MLR_forward", mse_f, mape_f),
        ("MLR_backward", mse_b, mape_b),
    ],
    columns=["Modelo", "MSE", "MAPE"],
)

results_MRL_with_lags

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,4772.759086,0.098268
1,MLR_selection,4754.93883,0.099232
2,MLR_forward,4706.253505,0.098622
3,MLR_backward,4746.924574,0.0993


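Se identifica que la selección de variables que obtuvo mejor desempeño en cuanto a la métrica seleccionada (MAPE) es Forward selection. Nótese que el MLR con todas las variables tiene un MAPE ligeramente menor (0.0983 frente a 0.0986), aunque Forward selection obtiene el menor MSE.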


In [26]:
# Base figure: the training-split target values of the forward-selection model
# over their dates.
fig_lag4 = px.line(
    x=X_train_dates_f,
    y=y_train_f,
    title="Forecasting of Energy Price with Forward MLR with 4 lags",
)
fig_lag4.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")

# Overlay traces as (x values, y values, legend name), added in this order.
overlay_traces = [
    (X_train_dates_f, y_pred_train_f, "Valores predichos de train"),
    (X_test_dates_f, y_test_f, "Valores reales de test"),
    (X_test_dates_f, y_pred_f, "Valores predichos de test"),
    # Predictions with the first element dropped, drawn against the same dates.
    (X_test_dates_f, y_pred_1_f, "Valores predichos de test -1"),
]
for trace_x, trace_y, trace_name in overlay_traces:
    fig_lag4.add_trace(
        go.Scatter(x=trace_x, y=trace_y, mode="lines", name=trace_name)
    )

fig_lag4.show()

Matriz de correlación de las variables seleccionadas por el mejor modelo con selección de variables (selección Forward).


In [27]:
# Restrict the forward-selection table to its float64/int64 columns and compute
# their pairwise correlations (.corr() is called with its default arguments).
numeric_dtypes = ["float64", "int64"]
numeric_columns = df_selected_variables_forward.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,precipitacion_amazonas,precipitacion_boyaca,precipitacion_casanare,precipitacion_cauca,precipitacion_cesar,precipitacion_cordoba,precipitacion_guainia,precipitacion_guaviare,precipitacion_huila,precipitacion_la guajira,...,temp_GUAVIARE,temp_HUILA,temp_MAGDALENA,temp_META,temp_PUTUMAYO,temp_SANTANDER,temp_SUCRE,TRM,energy_price_lag_1,energy_price
precipitacion_amazonas,1.0,0.003377,-0.013944,0.092787,-0.015654,0.003463,-0.06099,0.042756,-0.012342,-0.105555,...,0.028708,0.085728,-0.011852,0.12745,0.045184,-0.006199,0.109788,-0.01661,0.058035,0.055893
precipitacion_boyaca,0.003377,1.0,0.46923,0.217183,0.07452,0.270436,0.096883,0.31698,0.328735,0.154142,...,-0.209166,-0.093823,-0.002537,-0.112117,-0.003793,-0.166184,-0.145058,-0.074198,-0.063845,-0.070192
precipitacion_casanare,-0.013944,0.46923,1.0,0.029455,0.018671,0.130931,0.141844,0.223168,0.094156,0.023348,...,-0.129983,0.006804,0.15348,-0.277689,-0.128337,0.096555,0.014259,-0.043775,0.140932,0.136466
precipitacion_cauca,0.092787,0.217183,0.029455,1.0,0.018585,0.157075,-0.068266,0.294807,0.439126,0.097578,...,-0.124964,-0.044479,-0.140244,0.256023,0.119101,-0.275566,0.018197,0.13239,-0.090129,-0.097477
precipitacion_cesar,-0.015654,0.07452,0.018671,0.018585,1.0,0.075474,0.095274,0.16825,0.030442,0.042679,...,-0.138443,-0.091645,-0.080625,0.020425,-0.02217,-0.146217,-0.072357,-0.113215,-0.133672,-0.13647
precipitacion_cordoba,0.003463,0.270436,0.130931,0.157075,0.075474,1.0,0.073366,0.239387,0.008482,0.043279,...,-0.241132,-0.040046,-0.044516,0.061909,0.032285,-0.289121,-0.178928,-0.403029,-0.258182,-0.26405
precipitacion_guainia,-0.06099,0.096883,0.141844,-0.068266,0.095274,0.073366,1.0,0.114005,0.059689,0.00067,...,-0.130006,-0.069301,0.019486,-0.062573,-0.080382,-0.038425,-0.011868,-0.086251,-0.078413,-0.078516
precipitacion_guaviare,0.042756,0.31698,0.223168,0.294807,0.16825,0.239387,0.114005,1.0,0.220012,0.033916,...,-0.249592,-0.108831,-0.027935,-0.086157,-0.091764,-0.182547,-0.100444,-0.024023,-0.112851,-0.11075
precipitacion_huila,-0.012342,0.328735,0.094156,0.439126,0.030442,0.008482,0.059689,0.220012,1.0,0.108056,...,-0.224848,-0.312918,-0.12829,-0.043109,-0.061101,-0.200322,-0.034461,0.178945,-0.10961,-0.125479
precipitacion_la guajira,-0.105555,0.154142,0.023348,0.097578,0.042679,0.043279,0.00067,0.033916,0.108056,1.0,...,-0.081025,-0.06298,-0.031452,0.065108,0.075077,-0.177474,-0.16919,0.093914,-0.14594,-0.143783
