In [9]:
import sys

# Prepend the parent directory to the module search path. The entry is a
# relative path, so it is resolved against the kernel's working directory.
# NOTE(review): presumably this is what makes the project-local `lasso_mlr`
# module importable in the next cell — confirm.
sys.path.insert(0, "..")

In [10]:
from lasso_mlr import perform_lasso_mlr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [11]:
# Read the processed CSV into `df` and preview its first rows.
csv_path = "../../processed_tables/merged_no_damns_standardized.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,precipitacion_cauca,precipitacion_guainia,precipitacion_la guajira,precipitacion_norte de santander,temp_ARAUCA,precipitacion_cesar,brent_value,temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA,temp_CAQUETA,precipitacion_cordoba,...,precipitacion_caqueta,temp_NARINO,precipitacion_putumayo,precipitacion_vaupes,precipitacion_arauca,precipitacion_choco,temp_AMAZONAS,temp_CUNDINAMARCA,Date,energy_price
0,116.3,1.2,18.62,24.1,19.2,21.6,78.48,31.8,30.404625,148.2,...,0.1,22.810108,0.2,13.92,22.2,1076.0,31.05,18.102143,2021-10-01,216.929726
1,278.5,5.1,21.83,14.4,20.0,39.4,78.48,31.7,30.67792,40.8,...,3.2,22.332781,0.0,8.5,35.8,1309.8,31.5,20.685129,2021-10-02,256.940174
2,88.1,2.7,5.36,39.1,20.5,4.4,78.48,31.75,30.909783,52.3,...,0.2,24.214059,0.7,12.76,34.8,1147.7,30.62,21.016738,2021-10-03,282.065065
3,155.1,1.8,5.18,36.4,20.55,3.7,80.375,30.65,30.304427,55.1,...,7.7,23.544138,2.9,10.08,37.3,1116.8,28.44,20.604678,2021-10-04,286.526116
4,98.4,0.6,3.89,84.8,19.2,33.9,82.16,31.1,28.8254,178.2,...,22.9,21.636563,36.5,18.92,57.2,1071.8,28.85,18.724138,2021-10-05,278.157325


## Regresión lineal con Lasso

Esta vez utilizaremos `sklearn` para hacer la regresión Lasso y poder ver qué coeficientes son cero. Adicionalmente, se va a revisar cuál es el valor óptimo para el número de lags del modelo.


In [12]:
# Test-set metrics collected per lag count; entry k corresponds to lags = k + 1.
mape_test = []
mse_test = []

# Fit one Lasso model for every lag count from 1 to 11 and keep its test metrics.
for n_lags in range(1, 12):
    print(f"Lags: {n_lags}")
    lasso_run = perform_lasso_mlr({"df": df, "lags": n_lags})
    run_mape, run_mse = lasso_run["mape_test"], lasso_run["mse_test"]
    mape_test.append(run_mape)
    mse_test.append(run_mse)
    print("-----------------------")

Lags: 1
Best parameters found: {'alpha': 206.913808111479}
Best cross-validation score: 0.9361763812793633
Mean Squared Error on Test Data: 4808.195731620444
Mean Absolute Percentage Error on Test Data: 0.10016996764359269
precipitacion_norte de santander: -0.00015044480930536457
TRM: 0.001112265012600885
precipitacion_meta: -0.028874691603105414
precipitacion_antioquia: -0.03006544404338674
energy_price_lag_1: 0.9608361641002028
-----------------------
Lags: 2
Best parameters found: {'alpha': 206.913808111479}
Best cross-validation score: 0.9286121998112422
Mean Squared Error on Test Data: 4815.581771403665
Mean Absolute Percentage Error on Test Data: 0.10024673142420139
TRM: 0.001392507192756858
precipitacion_meta: -0.028403733851189158
precipitacion_antioquia: -0.03065846729328456
energy_price_lag_1: 0.9607539667439691
-----------------------
Lags: 3
Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9398325149651473
Mean Squared Error on Test Data: 4

In [13]:
# Gather the per-lag test metrics into one table (one row per lag, 1 through 11).
lag_values = list(range(1, 12))
df_pruebas = pd.DataFrame(
    {"lag": lag_values, "mse_test": mse_test, "mape_test": mape_test}
)

# Test MSE as a function of the number of lags.
fig_mse = px.line(
    df_pruebas,
    x="lag",
    y="mse_test",
    title="MSE de la data de tests vs Lags",
)
fig_mse.show()

In [14]:
# Test MAPE as a function of the number of lags.
fig_mape = px.line(
    df_pruebas,
    x="lag",
    y="mape_test",
    title="MAPE de la data de test vs Lags",
)
fig_mape.show()

Usando el MAPE como métrica de selección, vemos que el mejor modelo se obtiene con lag = 4.


In [15]:
# Refit with the selected number of lags (4) and unpack what the plot needs.
results = perform_lasso_mlr({"df": df, "lags": 4})

# Train split: dates, actual values and fitted values.
X_train_dates = results["X_train_dates"]
y_train = results["y_train"]
y_pred_train = results["y_pred_train"]

# Test split: dates, actual values and predictions.
X_test_dates = results["X_test_dates"]
y_test = results["y_test"]
y_pred = results["y_pred"]

# Test predictions without their first element; plotted later as the
# "test -1" series, i.e. the predictions shifted one position earlier.
y_pred_1 = y_pred[1:]

Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9416690021133818
Mean Squared Error on Test Data: 4529.46420817802
Mean Absolute Percentage Error on Test Data: 0.09632654416775278
precipitacion_cauca: 0.008682905103778768
precipitacion_guainia: -0.007519277265589178
precipitacion_la guajira: -0.007550885720436954
precipitacion_norte de santander: -0.03437908536332821
precipitacion_cesar: 0.003324451311109029
brent_value: -0.26838552259426857
TRM: -0.005155950340156032
precipitacion_bogota: 0.13802431541649018
precipitacion_san andres providencia: 0.0281096434352003
precipitacion_casanare: 0.0016056620486618607
precipitacion_guaviare: 0.37934600426373377
precipitacion_magdalena: 0.039713666507249765
precipitacion_vichada: -0.03137348429109366
precipitacion_atlantico: -0.010522254190030054
precipitacion_meta: -0.05695973845252268
precipitacion_antioquia: -0.04829615799737976
precipitacion_bolivar: -0.01190068253134589
temp_BOYACA: 0.6195365181310755
pre

In [16]:
# Base trace: actual prices over the training dates.
fig_lag4 = px.line(
    x=X_train_dates,
    y=y_train,
    title="Forecasting of Energy Price with Regularizated MLR with 4 lags",
)
# px.line draws a single unnamed series with no legend entry. Name it here,
# before any other trace exists, so every line in the figure is identified.
fig_lag4.update_traces(name="Valores reales de train", showlegend=True)
fig_lag4.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")

# Remaining series as (x, y, legend name).
# The last one is shorter than the test dates because `y_pred_1` drops the
# first prediction; its x values are cut to the same length explicitly rather
# than relying on the renderer to truncate mismatched arrays.
extra_series = [
    (X_train_dates, y_pred_train, "Valores predichos de train"),
    (X_test_dates, y_test, "Valores reales de test"),
    (X_test_dates, y_pred, "Valores predichos de test"),
    (X_test_dates[: len(y_pred_1)], y_pred_1, "Valores predichos de test -1"),
]
for series_x, series_y, series_name in extra_series:
    fig_lag4.add_trace(
        go.Scatter(x=series_x, y=series_y, mode="lines", name=series_name)
    )

fig_lag4.show()