In [1]:
import sys

# Prepend the parent directory to the module search path.
# NOTE(review): presumably this is what makes the project-local `mlr` and
# `forward_backward` modules importable in the next cell -- confirm.
sys.path.insert(0, "..")

In [2]:
from mlr import perform_mlr
import pandas as pd
import plotly.express as px
from forward_backward import forward_selection, backward_selection
import plotly.graph_objects as go

In [3]:
# Read the merged table from the processed-tables folder (path is relative to
# the notebook's working directory).
merged_csv_path = "../../processed_tables/merged_standardized.csv"
df = pd.read_csv(merged_csv_path)

## Sacamos las variables exógenas


In [4]:
# Keep only the date, the target price and every column whose name contains
# "lag_"; all remaining columns are dropped from `df`.
df = pd.concat(
    [df[["Date", "energy_price"]], df.filter(like="lag_")],
    axis=1,
)
df.head()

Unnamed: 0,Date,energy_price
0,2021-10-01,216.929726
1,2021-10-02,256.940174
2,2021-10-03,282.065065
3,2021-10-04,286.526116
4,2021-10-05,278.157325


## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [5]:
# Fit one MLR per lag count (1 through 11) and collect the test-set metrics
# returned by `perform_mlr` under the "mape_test" and "mse_test" keys.
mape_test, mse_test = [], []

for n_lags in range(1, 12):
    print(f"Lags: {n_lags}")
    fit_metrics = perform_mlr({"df": df, "lags": n_lags})
    mape_test.append(fit_metrics["mape_test"])
    mse_test.append(fit_metrics["mse_test"])
    print("-----------------------")

Lags: 1
Mean Squared Error on Test Data: 4699.708443116314
Mean Absolute Percentage Error: 0.09828100720305545
-----------------------
Lags: 2
Mean Squared Error on Test Data: 4536.701919616192
Mean Absolute Percentage Error: 0.09886013116486614
-----------------------
Lags: 3
Mean Squared Error on Test Data: 4537.096764793275
Mean Absolute Percentage Error: 0.09678106898815365
-----------------------
Lags: 4
Mean Squared Error on Test Data: 4543.017051091168
Mean Absolute Percentage Error: 0.09687890352808762
-----------------------
Lags: 5
Mean Squared Error on Test Data: 4617.394467324015
Mean Absolute Percentage Error: 0.09797100504809433
-----------------------
Lags: 6
Mean Squared Error on Test Data: 4663.594649947829
Mean Absolute Percentage Error: 0.0985183035930273
-----------------------
Lags: 7
Mean Squared Error on Test Data: 4660.80203342627
Mean Absolute Percentage Error: 0.09851381568493117
-----------------------
Lags: 8
Mean Squared Error on Test Data: 4727.57033093494

In [6]:
# Tabulate the sweep results: one row per lag count, in the same 1..11 order
# used by the loop that filled `mse_test` / `mape_test`.
lag_values = list(range(1, 12))
df_pruebas = pd.DataFrame(
    dict(lag=lag_values, mse_test=mse_test, mape_test=mape_test)
)

# Test-set MSE as a function of the number of lags.
fig_mse = px.line(
    df_pruebas,
    x="lag",
    y="mse_test",
    title="MSE de la data de test vs Lags",
)
fig_mse.show()

In [7]:
# Test-set MAPE as a function of the number of lags (same table as the MSE plot).
mape_plot_options = dict(
    x="lag",
    y="mape_test",
    title="MAPE de la data de test vs Lags",
)
fig_mape = px.line(df_pruebas, **mape_plot_options)
fig_mape.show()

Usando como métrica de elección el MAPE, vemos que el mejor modelo es con lag = 3


In [8]:
# Reference fit: MLR with 3 lags and an explicit cut-off date passed to
# `perform_mlr` as "fecha_corte".
fecha_corte = "2023-07-01"
results = perform_mlr({"df": df, "fecha_corte": fecha_corte, "lags": 3})

# Training-split artefacts.
X_train, y_train = results["X_train"], results["y_train"]
X_train_dates, y_pred_train = results["X_train_dates"], results["y_pred_train"]

# Test-split artefacts.
X_test, y_test = results["X_test"], results["y_test"]
X_test_dates, y_pred = results["X_test_dates"], results["y_pred"]

# Test-set metrics, plus the extras reused by the variable-selection cells.
mse, mape = results["mse_test"], results["mape_test"]
significant_variables = results["significant_variables"]
df_with_lags = results["df_with_lags"]

Mean Squared Error on Test Data: 4537.096764793275
Mean Absolute Percentage Error: 0.09678106898815365


In [9]:
# Build the column list for the reduced model: drop the intercept term
# ("const") and make sure "Date" and "energy_price" are present at the end.
#
# A new list is built instead of mutating in place so the cell is idempotent:
# running it a second time neither raises ValueError on the missing "const"
# nor appends "Date" / "energy_price" twice.
_always_kept_columns = ["Date", "energy_price"]
significant_variables = [
    name
    for name in significant_variables
    if name != "const" and name not in _always_kept_columns
] + _always_kept_columns

## MLR with selected variables


In [10]:
# Selecciona las columnas significativas del DataFrame original
# Refit the MLR on only the columns listed in `significant_variables`
# (copied so the helper works on an independent frame).
df_selected_variables = df_with_lags[significant_variables].copy()

fecha_corte = "2023-07-01"

results = perform_mlr(
    {"df": df_selected_variables, "fecha_corte": fecha_corte, "lags": 3}
)
y_pred_MRL_selection = results["y_pred"]
# Test predictions with the first element dropped (plotted later as the
# "-1" shifted series). This must be sliced from THIS refit's predictions,
# not from `y_pred`, which belongs to the full model fitted earlier.
y_pred_1_MRL_selection = y_pred_MRL_selection[1:]
y_test_MRL_selection = results["y_test"]
X_test_dates_MRL_selection = results["X_test_dates"]
X_train_MRL_selection = results["X_train"]
X_train_dates_MRL_selection = results["X_train_dates"]
y_train_MRL_selection = results["y_train"]
y_pred_train_MRL_selection = results["y_pred_train"]
mse_MRL_selection = results["mse_test"]
mape_MRL_selection = results["mape_test"]

Mean Squared Error on Test Data: 4534.432108792191
Mean Absolute Percentage Error: 0.09674438896074171


## Forward selection


In [11]:
# all_features = list(df.columns[:])
selected_features_forward = forward_selection(X_train, y_train)
print("Forward Selection Result:", selected_features_forward)

Forward Selection Result: ['energy_price_lag_1', 'Date', 'energy_price']


In [12]:
# Report how many features forward selection kept, then list them.
n_forward_features = len(selected_features_forward)
print(n_forward_features)
print(f"Forward Selection Result: {selected_features_forward}")

3
Forward Selection Result: ['energy_price_lag_1', 'Date', 'energy_price']


In [13]:
# Refit the MLR on the columns chosen by forward selection and keep only the
# test-set metrics for the final comparison table.
fecha_corte = "2023-07-01"
df_selected_variables_forward = df_with_lags[selected_features_forward]

results = perform_mlr(
    dict(df=df_selected_variables_forward, fecha_corte=fecha_corte, lags=3)
)
mse_f, mape_f = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4534.432108792162
Mean Absolute Percentage Error: 0.09674438896074138


## Backward selection


In [14]:
# Run the project's backward-selection helper on the same training split
# (`X_train`, `y_train`) that was used for forward selection.
selected_features_backward = backward_selection(X_train, y_train)

In [15]:
# Report how many features backward selection kept, then list them.
n_backward_features = len(selected_features_backward)
print(n_backward_features)
print(f"Backward Selection Result: {selected_features_backward}")

4
Backward Selection Result: ['energy_price_lag_1', 'energy_price_lag_2', 'Date', 'energy_price']


In [16]:
# Refit the MLR on the columns chosen by backward selection and keep only the
# test-set metrics for the final comparison table.
fecha_corte = "2023-07-01"
df_selected_variables_backward = df_with_lags[selected_features_backward]

results = perform_mlr(
    dict(df=df_selected_variables_backward, fecha_corte=fecha_corte, lags=3)
)
mse_b, mape_b = results["mse_test"], results["mape_test"]

Mean Squared Error on Test Data: 4534.432108792162
Mean Absolute Percentage Error: 0.09674438896074138


In [17]:
# Side-by-side test metrics of the four fits: the full 3-lag model and the
# three reduced models (significance-based, forward, backward).
model_metrics = [
    ("MLR", mse, mape),
    ("MLR_selection", mse_MRL_selection, mape_MRL_selection),
    ("MLR_forward", mse_f, mape_f),
    ("MLR_backward", mse_b, mape_b),
]
results_MRL_with_lags = pd.DataFrame(
    model_metrics, columns=["Modelo", "MSE", "MAPE"]
)

results_MRL_with_lags

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,4537.096765,0.096781
1,MLR_selection,4534.432109,0.096744
2,MLR_forward,4534.432109,0.096744
3,MLR_backward,4534.432109,0.096744


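Se identifica que la selección de variables que obtuvo mejor desempeño en cuanto a la métrica seleccionada (MAPE) es Backward selection, aunque empatada con Forward selection y prácticamente igual a la selección por significancia: las tres coinciden en MAPE ≈ 0.096744 y solo difieren a partir del decimotercer decimal.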


In [18]:
# Plot actual vs. predicted energy price for the `*_MRL_selection` fit (the
# MLR refitted on the columns in `significant_variables`). The base line is
# the actual training series; the overlays are added below.
# The title names that model: every series drawn here comes from the
# significance-based refit, not from the forward-selection refit.
fig = px.line(
    x=X_train_dates_MRL_selection,
    y=y_train_MRL_selection,
    title="Forecasting of Energy Price with MLR on Significant Variables",
)
fig.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")

# (x, y, legend name) for each overlay, drawn in this order.
overlay_traces = [
    (
        X_train_dates_MRL_selection,
        y_pred_train_MRL_selection,
        "Valores predichos de train",
    ),
    (
        X_test_dates_MRL_selection,
        y_test_MRL_selection,
        "Valores reales de test",
    ),
    (
        X_test_dates_MRL_selection,
        y_pred_MRL_selection,
        "Valores predichos de test",
    ),
    (
        # The shifted prediction series is shorter than the test dates, so
        # the dates are cut to the same length to pair x and y explicitly.
        X_test_dates_MRL_selection[: len(y_pred_1_MRL_selection)],
        y_pred_1_MRL_selection,
        "Valores predichos de test -1",
    ),
]
for trace_x, trace_y, trace_name in overlay_traces:
    fig.add_trace(
        go.Scatter(x=trace_x, y=trace_y, mode="lines", name=trace_name)
    )
fig.show()

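Matriz de correlación de las variables del modelo con selección por significancia (`df_selected_variables`), cuyo desempeño es prácticamente idéntico al de las selecciones Forward y Backward.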


In [19]:
# Pairwise correlation between the float64/int64 columns of the reduced
# frame (non-numeric columns such as the date are excluded by the dtype filter).
numeric_dtypes = ["float64", "int64"]
numeric_columns = df_selected_variables.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,energy_price_lag_1,energy_price_lag_3,energy_price
energy_price_lag_1,1.0,0.955537,0.981022
energy_price_lag_3,0.955537,1.0,0.924551
energy_price,0.981022,0.924551,1.0
