In [22]:
import sys

# Put the parent directory first on the module search path.
# NOTE(review): presumably this is where the local `mlr`, `forward_backward`
# and `best_model` modules imported below live — confirm.
sys.path.insert(0, "..")

In [23]:
import pandas as pd
from mlr import perform_mlr
import plotly.graph_objects as go
from forward_backward import forward_selection, backward_selection
import plotly.express as px
from best_model import create_best_model_df

In [24]:
# Input table for the regressions in this notebook.
dataset_path = "../../processed_tables/merged_no_damns_standardized.csv"
df = pd.read_csv(dataset_path)

## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [25]:
# Sweep the number of lags from 1 to 11 and collect the test-set metrics that
# perform_mlr returns for each setting.
mape_test = []
mse_test = []

for n_lags in range(1, 12):
    print(f"Lags: {n_lags}")
    lag_results = perform_mlr({"df": df, "lags": n_lags})
    mape_test.append(lag_results["mape_test"])
    mse_test.append(lag_results["mse_test"])
    print("-----------------------")

Lags: 1
Mean Squared Error on Test Data: 4511.783441314193
Mean Absolute Percentage Error: 0.09744538067050071
-----------------------
Lags: 2
Mean Squared Error on Test Data: 4470.853778173426
Mean Absolute Percentage Error: 0.09865146797885199
-----------------------
Lags: 3
Mean Squared Error on Test Data: 4480.874259294345
Mean Absolute Percentage Error: 0.09685444536275664
-----------------------
Lags: 4
Mean Squared Error on Test Data: 4477.461768548015
Mean Absolute Percentage Error: 0.09682880379011129
-----------------------
Lags: 5


Mean Squared Error on Test Data: 4541.440652590759
Mean Absolute Percentage Error: 0.09711666363062581
-----------------------
Lags: 6
Mean Squared Error on Test Data: 4622.198520695146
Mean Absolute Percentage Error: 0.09814913446270389
-----------------------
Lags: 7
Mean Squared Error on Test Data: 4646.935869914854
Mean Absolute Percentage Error: 0.09835127135628645
-----------------------
Lags: 8
Mean Squared Error on Test Data: 4788.261564701469
Mean Absolute Percentage Error: 0.09954359972116233
-----------------------
Lags: 9
Mean Squared Error on Test Data: 4691.757757205302
Mean Absolute Percentage Error: 0.09893199237911643
-----------------------
Lags: 10
Mean Squared Error on Test Data: 4626.239455175907
Mean Absolute Percentage Error: 0.0984151335618146
-----------------------
Lags: 11
Mean Squared Error on Test Data: 4650.887452997909
Mean Absolute Percentage Error: 0.09883848986113487
-----------------------


In [26]:
# One row per lag setting, pairing each lag with the test metrics collected in
# the sweep.
df_pruebas = pd.DataFrame(
    {
        "lag": list(range(1, 12)),
        "mse_test": mse_test,
        "mape_test": mape_test,
    }
)

# Test-set MSE for each lag setting.
fig_mse = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mse_test",
    title="MSE de la data de test vs Lags",
)
fig_mse.show()

In [27]:
# Test-set MAPE for each lag setting.
fig_mape = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mape_test",
    title="MAPE de la data de test vs Lags",
)
fig_mape.show()

Usando el MAPE como métrica de elección, vemos que el mejor modelo es el de lag = 4 (MAPE ≈ 0.0968, el mínimo entre los 11 valores de lag probados).


In [28]:
# Refit with 4 lags (the setting picked from the MAPE sweep) and keep every
# entry of the result dict under its own name.
fecha_corte = "2023-07-01"
results = perform_mlr({"df": df, "fecha_corte": fecha_corte, "lags": 4})

# These two feed the variable-selection cells further down.
df_with_lags = results["df_with_lags"]
significant_variables = results["significant_variables"]

# Values returned for the training split.
X_train, X_train_dates = results["X_train"], results["X_train_dates"]
y_train, y_pred_train = results["y_train"], results["y_pred_train"]

# Values returned for the test split.
X_test, X_test_dates = results["X_test"], results["X_test_dates"]
y_test, y_pred = results["y_test"], results["y_pred"]

# Test-set error metrics.
mse, mape = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4477.461768548015
Mean Absolute Percentage Error: 0.09682880379011129


In [29]:
# Display the list stored under results["significant_variables"] by the 4-lag fit.
significant_variables

['precipitacion_vaupes',
 'precipitacion_guaviare',
 'energy_price_lag_1',
 'energy_price_lag_3']

In [30]:
# `significant_variables` is used next as a column selector on df_with_lags, so
# it also needs the "Date" and "energy_price" columns. Add each one only when it
# is missing: re-running this cell must not append duplicate column names, which
# would duplicate columns in the selected frame.
for required_column in ("Date", "energy_price"):
    if required_column not in significant_variables:
        significant_variables.append(required_column)

## MLR with selected variables


In [31]:
# Keep only the columns named in significant_variables, as an independent copy.
df_selected_variables = df_with_lags[significant_variables].copy()

fecha_corte = "2023-07-01"
selection_config = {
    "df": df_selected_variables,
    "fecha_corte": fecha_corte,
    "lags": 4,
}
results = perform_mlr(selection_config)

# NOTE: this rebinds significant_variables to the list returned by this refit,
# replacing the list that was used above as the column selector.
significant_variables = results["significant_variables"]

# Test-set error metrics of the refit.
mse_MLR_selection, mape_MLR_selection = results["mse_test"], results["mape_test"]

# Values returned for the training split.
X_train_dates_selection = results["X_train_dates"]
y_train_selection, y_pred_train_selection = results["y_train"], results["y_pred_train"]

# Values returned for the test split.
X_test_dates_selection = results["X_test_dates"]
y_test_selection, y_pred_test_selection = results["y_test"], results["y_pred"]

# Test predictions with the first element dropped.
y_pred_test_selection_with_shift = y_pred_test_selection[1:]

Mean Squared Error on Test Data: 4462.064239733968
Mean Absolute Percentage Error: 0.0956110079513892


In [32]:
# Display the significant variables returned by the refit on the selected columns.
significant_variables

['const', 'precipitacion_vaupes', 'energy_price_lag_1', 'energy_price_lag_3']

## Forward selection


In [33]:
# Run forward selection on the X_train / y_train returned by the 4-lag fit.
selected_features_forward = forward_selection(X_train, y_train)

In [34]:
# Report how many entries forward selection returned, then list them.
n_selected_forward = len(selected_features_forward)
print(n_selected_forward)
print("Forward Selection Result:", selected_features_forward)

19
Forward Selection Result: ['TRM', 'precipitacion_putumayo', 'precipitacion_caqueta', 'precipitacion_san andres providencia', 'precipitacion_cesar', 'temp_CAQUETA', 'precipitacion_magdalena', 'precipitacion_bogota', 'precipitacion_vichada', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'brent_value', 'precipitacion_guaviare', 'temp_ARAUCA', 'precipitacion_guainia', 'precipitacion_amazonas', 'precipitacion_casanare', 'energy_price_lag_1', 'Date', 'energy_price']


In [35]:
# Refit the regression using only the columns returned by forward selection.
df_selected_variables_forward = df_with_lags[selected_features_forward]

fecha_corte = "2023-07-01"

results = perform_mlr(
    {"df": df_selected_variables_forward, "fecha_corte": fecha_corte, "lags": 4}
)
y_pred_f = results["y_pred"]
# Forward-model test predictions with the first element dropped. This has to
# slice y_pred_f: the bare `y_pred` holds the predictions of the earlier full
# 4-lag model, not of this refit.
y_pred_1_f = y_pred_f[1:]
y_test_f = results["y_test"]
X_test_dates_f = results["X_test_dates"]
X_train_f = results["X_train"]
X_train_dates_f = results["X_train_dates"]
y_train_f = results["y_train"]
y_pred_train_f = results["y_pred_train"]
# Test-set error metrics of the forward-selection refit.
mse_f = results["mse_test"]
mape_f = results["mape_test"]

Mean Squared Error on Test Data: 4743.824519541103
Mean Absolute Percentage Error: 0.09969758057484984


## Backward selection


In [36]:
# Run backward selection on the X_train / y_train returned by the 4-lag fit.
selected_features_backward = backward_selection(X_train, y_train)

In [37]:
# Report how many entries backward selection returned, then list them.
n_selected_backward = len(selected_features_backward)
print(n_selected_backward)
print("Backward Selection Result:", selected_features_backward)

20
Backward Selection Result: ['precipitacion_vaupes', 'precipitacion_bolivar', 'precipitacion_choco', 'precipitacion_la guajira', 'precipitacion_caqueta', 'precipitacion_san andres providencia', 'temp_CHOCO', 'precipitacion_cesar', 'temp_CAQUETA', 'precipitacion_vichada', 'brent_value', 'precipitacion_guaviare', 'temp_ARAUCA', 'precipitacion_guainia', 'precipitacion_arauca', 'energy_price_lag_1', 'energy_price_lag_2', 'energy_price_lag_4', 'Date', 'energy_price']


In [38]:
# Refit using only the columns returned by backward selection and record its
# test-set metrics.
df_selected_variables_backward = df_with_lags[selected_features_backward]

fecha_corte = "2023-07-01"
backward_config = {
    "df": df_selected_variables_backward,
    "fecha_corte": fecha_corte,
    "lags": 4,
}
results = perform_mlr(backward_config)
mse_b, mape_b = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4800.617874812007
Mean Absolute Percentage Error: 0.10014680423044595


In [39]:
# Side-by-side comparison of the four fits: one (name, MSE, MAPE) row per model.
model_metric_rows = [
    ("MLR", mse, mape),
    ("MLR_selection", mse_MLR_selection, mape_MLR_selection),
    ("MLR_forward", mse_f, mape_f),
    ("MLR_backward", mse_b, mape_b),
]
results_MRL_with_lags = pd.DataFrame(
    model_metric_rows, columns=["Modelo", "MSE", "MAPE"]
)

results_MRL_with_lags

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,4477.461769,0.096829
1,MLR_selection,4462.06424,0.095611
2,MLR_forward,4743.82452,0.099698
3,MLR_backward,4800.617875,0.100147


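Se identifica que la selección de variables que obtuvo el mejor desempeño según la métrica elegida (MAPE) es la basada en variables significativas (`MLR_selection`, MAPE ≈ 0.0956), por delante del modelo completo (`MLR`, 0.0968), de Forward selection (0.0997) y de Backward selection (0.1001).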


In [40]:
# Plot the training series of the MLR_selection fit, then overlay the train
# predictions, the test series and the test predictions.
fig_lag4 = px.line(
    x=X_train_dates_selection,
    y=y_train_selection,
    title="Forecasting of Energy Price with MLR selection with 4 lags",
)
fig_lag4.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")

# The shifted prediction series was built by dropping the first prediction, so
# it is shorter than the test dates. Trim the dates to the same length so the
# trace is given x and y arrays of equal size instead of relying on the
# plotting library to cope with the mismatch.
shifted_test_dates = X_test_dates_selection[: len(y_pred_test_selection_with_shift)]

# (x, y, legend name) for each trace drawn on top of the training series,
# in drawing order.
overlay_traces = [
    (X_train_dates_selection, y_pred_train_selection, "Valores predichos de train"),
    (X_test_dates_selection, y_test_selection, "Valores reales de test"),
    (X_test_dates_selection, y_pred_test_selection, "Valores predichos de test"),
    (
        shifted_test_dates,
        y_pred_test_selection_with_shift,
        "Valores predichos de test -1",
    ),
]
for trace_x, trace_y, trace_name in overlay_traces:
    fig_lag4.add_trace(
        go.Scatter(x=trace_x, y=trace_y, mode="lines", name=trace_name)
    )
fig_lag4.show()

Matriz de correlación de las variables del modelo `MLR_selection` (selección por variables significativas), que es el de menor MAPE en la tabla comparativa, junto con `energy_price`.


In [41]:
# Pairwise correlations between the float64 / int64 columns of the
# selected-variable frame.
numeric_dtypes = ["float64", "int64"]
numeric_columns = df_selected_variables.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,precipitacion_vaupes,precipitacion_guaviare,energy_price_lag_1,energy_price_lag_3,energy_price
precipitacion_vaupes,1.0,0.130231,-0.011952,-0.000848,-0.032476
precipitacion_guaviare,0.130231,1.0,-0.112851,-0.096265,-0.11075
energy_price_lag_1,-0.011952,-0.112851,1.0,0.955585,0.981022
energy_price_lag_3,-0.000848,-0.096265,0.955585,1.0,0.924601
energy_price,-0.032476,-0.11075,0.981022,0.924601,1.0


## Exportación del mejor modelo

In [42]:
# Hand the train/test dates, observed values and predictions of the
# MLR_selection fit to create_best_model_df, together with the two output paths.
best_model_payload = {
    "dates_train": X_train_dates_selection,
    "y_train": y_train_selection,
    "y_pred_train": y_pred_train_selection,
    "y_test": y_test_selection,
    "dates_test": X_test_dates_selection,
    "y_pred_test": y_pred_test_selection,
    "output_path_train": "MLR_with_lags_no_criteria_best_model_train.pkl",
    "output_path_test": "MLR_with_lags_no_criteria_best_model_test.pkl",
}
create_best_model_df(best_model_payload)