In [1]:
import sys

# Put the parent directory first on the module search path. The entry is a
# relative path, so it is resolved against the kernel's working directory.
# NOTE(review): presumably this is what makes `lasso_mlr` importable — confirm
# where that module lives.
sys.path.insert(0, "..")

In [2]:
from lasso_mlr import perform_lasso_mlr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [3]:
# Load the merged table from the processed_tables directory.
df = pd.read_csv(
    filepath_or_buffer="../../processed_tables/merged_without_lags_represas_criterio.csv"
)

## Quitamos las variables exógenas (conservamos solo la fecha, el precio y las columnas de lags)


In [4]:
# Reduce the frame to the date, the price, and every column whose name
# contains "lag_"; all other columns are dropped.
price = df.loc[:, ["Date", "energy_price"]]
lag_columns = df.filter(like="lag_")
df = pd.concat(objs=[price, lag_columns], axis=1)
df.head()

Unnamed: 0,Date,energy_price
0,2021-10-01,216.929726
1,2021-10-02,256.940174
2,2021-10-03,282.065065
3,2021-10-04,286.526116
4,2021-10-05,278.157325


## Regresión lineal con Lasso

Esta vez utilizaremos `sklearn` para hacer la regresión Lasso y poder ver qué coeficientes son cero.


In [5]:
# Test-set metrics collected per run, in the same order as the lag counts tried.
mape_test, mse_test = [], []

# Run the Lasso helper once for each lag count from 1 to 11 and record the
# MAPE and MSE it reports under "mape_test" / "mse_test".
for i in range(1, 12):
    print(f"Lags: {i}")
    results_i = perform_lasso_mlr(dict(df=df, lags=i))
    mape_i, mse_i = results_i["mape_test"], results_i["mse_test"]
    mape_test.append(mape_i)
    mse_test.append(mse_i)
    print("-----------------------")

Lags: 1
Best parameters found: {'alpha': 206.913808111479}
Best cross-validation score: 0.9355698036817355
Mean Squared Error on Test Data: 4803.445749029225
Mean Absolute Percentage Error on Test Data: 0.09967244325774398
energy_price_lag_1: 0.9656740502995749
-----------------------
Lags: 2
Best parameters found: {'alpha': 0.23357214690901212}
Best cross-validation score: 0.9298106586401882
Mean Squared Error on Test Data: 4536.9713407624695
Mean Absolute Percentage Error on Test Data: 0.09886036782820969
energy_price_lag_1: 1.1056825431610269
energy_price_lag_2: -0.13744556222094514
-----------------------
Lags: 3
Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9402050642672373
Mean Squared Error on Test Data: 4534.664925587474
Mean Absolute Percentage Error on Test Data: 0.09684692758694802
energy_price_lag_1: 1.0793042203114995
energy_price_lag_2: 0.06345804013179508
energy_price_lag_3: -0.18126666165684607
-----------------------
Lags: 4
Best pa

In [6]:
# One row per lag count tried in the sweep, with its test MSE and MAPE.
# The lag range here must stay in sync with the range used by the sweep loop.
df_pruebas = pd.DataFrame(
    data={
        "lag": [*range(1, 12)],
        "mse_test": mse_test,
        "mape_test": mape_test,
    }
)
df_pruebas

Unnamed: 0,lag,mse_test,mape_test
0,1,4803.445749,0.099672
1,2,4536.971341,0.09886
2,3,4534.664926,0.096847
3,4,4536.290929,0.096871
4,5,4605.183194,0.097784
5,6,4727.374376,0.099436
6,7,4651.880501,0.098348
7,8,4716.845407,0.099549
8,9,4627.211793,0.098443
9,10,4542.209592,0.097983


In [7]:
# Test MSE as a function of the number of lags.
fig_mse = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mse_test",
    title="MSE de la data de test vs Lags",
)
fig_mse.show()

In [8]:
# Test MAPE as a function of the number of lags.
fig_mape = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mape_test",
    title="MAPE de la data de test vs Lags",
)
fig_mape.show()

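Usando el MAPE como métrica de selección, vemos que el mejor modelo es el de 3 lags (MAPE ≈ 0.0968, que además tiene el menor MSE de test). El modelo con 4 lags queda prácticamente empatado (MAPE ≈ 0.0969), por lo que a continuación graficamos ambos.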


In [9]:
# Refit with 3 lags and pull out the series the forecast figure needs.
results = perform_lasso_mlr(dict(df=df, lags=3))
y_pred = results["y_pred"]
# Same predictions without their first element. Paired position-by-position
# with the test dates, each prediction is drawn one date earlier.
y_pred_1 = y_pred[1:]
y_test, X_test_dates = results["y_test"], results["X_test_dates"]
X_train_dates, y_train = results["X_train_dates"], results["y_train"]

Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9402050642672373
Mean Squared Error on Test Data: 4534.664925587474
Mean Absolute Percentage Error on Test Data: 0.09684692758694802
energy_price_lag_1: 1.0793042203114995
energy_price_lag_2: 0.06345804013179508
energy_price_lag_3: -0.18126666165684607


In [10]:
# Training series as the base line, then three overlays on the test dates:
# actual values, predictions, and the predictions with the first one dropped.
fig_lag3 = (
    px.line(
        x=X_train_dates,
        y=y_train,
        title="Forecasting of Energy Price with Regularizated MLR with 3 lags",
    )
    .update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")
    .add_trace(
        go.Scatter(x=X_test_dates, y=y_test, mode="lines", name="Valores reales")
    )
    .add_trace(
        go.Scatter(x=X_test_dates, y=y_pred, mode="lines", name="Valores predichos")
    )
    .add_trace(
        go.Scatter(
            x=X_test_dates, y=y_pred_1, mode="lines", name="Valores predichos -1"
        )
    )
)
fig_lag3.show()

In [11]:
# Refit with 4 lags and pull out every series the 4-lag figure needs.
results = perform_lasso_mlr({"df": df, "lags": 4})
y_pred = results["y_pred"]
# Same predictions without their first element. Paired position-by-position
# with the test dates, each prediction is drawn one date earlier.
y_pred_1 = y_pred[1:]
y_test = results["y_test"]
X_test_dates = results["X_test_dates"]
# Take the training series from THIS fit too. Without these two lines the
# 4-lag figure silently reuses X_train_dates / y_train left in the kernel by
# the 3-lag run, so the plotted training data would not belong to this model.
X_train_dates = results["X_train_dates"]
y_train = results["y_train"]

Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9428893493581361
Mean Squared Error on Test Data: 4536.290928974545
Mean Absolute Percentage Error on Test Data: 0.0968714041645754
energy_price_lag_1: 1.0801614200776162
energy_price_lag_2: 0.06334192573406304
energy_price_lag_3: -0.18492451020942136
energy_price_lag_4: 0.0030420675747583675


In [12]:
# Training series as the base line, then three overlays on the test dates:
# actual values, predictions, and the predictions with the first one dropped.
fig_lag4 = (
    px.line(
        x=X_train_dates,
        y=y_train,
        title="Forecasting of Energy Price with Regularizated MLR with 4 lags",
    )
    .update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")
    .add_trace(
        go.Scatter(x=X_test_dates, y=y_test, mode="lines", name="Valores reales")
    )
    .add_trace(
        go.Scatter(x=X_test_dates, y=y_pred, mode="lines", name="Valores predichos")
    )
    .add_trace(
        go.Scatter(
            x=X_test_dates, y=y_pred_1, mode="lines", name="Valores predichos -1"
        )
    )
)
fig_lag4.show()