In [9]:
import sys

# Prepend the parent directory ("..", resolved against the current working
# directory) to the module search path, ahead of every existing entry.
# NOTE(review): presumably this is what lets the notebook import project-local
# modules such as `lasso_mlr` from one level up — confirm against the repo layout.
sys.path.insert(0, "..")

In [10]:
from lasso_mlr import perform_lasso_mlr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [11]:
# Read the merged table (no lag columns yet) and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index — confirm whether downstream modelling expects it to stay.
csv_path = "../../processed_tables/merged_without_lags_represas_criterio.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,precipitacion_amazonas,precipitacion_arauca,precipitacion_atlantico,precipitacion_bogota,precipitacion_bolivar,precipitacion_caqueta,precipitacion_casanare,precipitacion_cesar,precipitacion_choco,precipitacion_guainia,...,temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA,temp_CAQUETA,temp_CHOCO,temp_NARINO,temp_VICHADA,temp_avg_represas,brent_value,TRM,Date,energy_price
0,-0.509005,0.759339,0.13939,-0.557688,4.888002,-0.530775,-0.284457,-0.030483,0.599807,-0.518613,...,0.940739,0.503965,0.994447,1.267949,0.323702,-0.114337,-0.917312,-1.339552,2021-10-01,216.929726
1,-0.593583,1.622908,-0.161734,-0.569948,5.703726,-0.437031,-0.355927,0.273823,1.312841,-0.217009,...,0.864493,0.594068,0.773896,1.044002,1.159376,0.619803,-0.917312,-1.423606,2021-10-02,256.940174
2,-0.538867,1.55941,-0.248126,-0.567816,3.445073,-0.527751,-0.198452,-0.32453,0.818475,-0.402611,...,0.902616,0.67051,1.083718,1.92664,1.159376,0.950203,-0.917312,-1.423606,2021-10-03,282.065065
3,-0.5806,1.718155,-0.183251,-0.287443,3.802742,-0.300951,0.339387,-0.336497,0.724237,-0.472212,...,0.063912,0.470931,1.293766,1.612333,0.057805,0.723472,-0.761153,-1.423606,2021-10-04,286.526116
4,0.016831,2.98176,0.52169,-0.560886,0.362265,0.158697,-0.18016,0.179796,0.586998,-0.565013,...,0.407018,-0.016688,-0.418129,0.717358,-0.701899,-0.197147,-0.614059,-1.411032,2021-10-05,278.157325


## Regresión lineal con Lasso

Esta vez utilizaremos `sklearn` para hacer la regresión Lasso y poder ver qué coeficientes son cero. Adicionalmente, se va a revisar cuál es el valor óptimo para el número de lags del modelo.


In [12]:
# Run the Lasso regression once per lag count (1 through 11) and keep the
# test-set MAPE and MSE of every run, in lag order, for the comparison plots.
# `perform_lasso_mlr` prints its own fit summary between the two markers below.
mape_test, mse_test = [], []

for n_lags in range(1, 12):
    print(f"Lags: {n_lags}")
    run_summary = perform_lasso_mlr({"df": df, "lags": n_lags})
    mape_test.append(run_summary["mape_test"])
    mse_test.append(run_summary["mse_test"])
    print("-----------------------")

Lags: 1
Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9361042064920715
Mean Squared Error on Test Data: 4630.277545452998
Mean Absolute Percentage Error on Test Data: 0.09776550217291956
precipitacion_meta: -1.0985842759564806
temp_avg_represas: 1.8171824485455155
energy_price_lag_1: 0.9693162391920747
-----------------------
Lags: 2
Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.93032768231703
Mean Squared Error on Test Data: 4478.742557931561
Mean Absolute Percentage Error on Test Data: 0.09813590906094909
precipitacion_meta: -1.1302965935109013
temp_avg_represas: 1.5823170542938891
energy_price_lag_1: 1.0975861482566
energy_price_lag_2: -0.1319605219342509
-----------------------
Lags: 3
Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9406776374291278
Mean Squared Error on Test Data: 4455.019598639417
Mean Absolute Percentage Error on Test Data: 0.09599010577558753
precipitaci

In [13]:
# Tabulate the sweep results: one row per lag count with its test MSE and MAPE.
# The lag values here must mirror the range used by the sweep loop (1 through 11).
df_pruebas = pd.DataFrame(
    {
        "lag": [*range(1, 12)],
        "mse_test": mse_test,
        "mape_test": mape_test,
    }
)

# Line chart of test MSE as a function of the number of lags.
fig_mse = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mse_test",
    title='MSE de la data de test vs Lags',
)
fig_mse.show()

In [14]:
# Line chart of test MAPE as a function of the number of lags (same table as
# the MSE chart).
fig_mape = px.line(
    data_frame=df_pruebas,
    x="lag",
    y="mape_test",
    title='MAPE de la data de test vs Lags',
)
fig_mape.show()

Usando el MAPE como métrica de selección, el mejor modelo es el de 4 lags (MAPE de test ≈ 0.09599). La mejora respecto al modelo de 3 lags es marginal (0.095989 frente a 0.095990).

In [15]:
# Refit with the selected number of lags (4) and unpack what the plot needs.
results = perform_lasso_mlr({"df": df, "lags": 4})

# Train split: dates, actual values and in-sample predictions.
X_train_dates = results["X_train_dates"]
y_train = results["y_train"]
y_pred_train = results["y_pred_train"]

# Test split: dates, actual values and out-of-sample predictions.
X_test_dates = results["X_test_dates"]
y_test = results["y_test"]
y_pred = results["y_pred"]

# Copies of both prediction series with the first element dropped; the
# plotting cell draws them as the "-1" series.
y_pred_1 = y_pred[1:]
y_pred_train_1 = y_pred_train[1:]

Best parameters found: {'alpha': 4.281332398719396}
Best cross-validation score: 0.9430055760185592
Mean Squared Error on Test Data: 4454.757889488515
Mean Absolute Percentage Error on Test Data: 0.09598914073607966
precipitacion_meta: -0.6492039475952551
temp_avg_represas: 2.0752349022665033
energy_price_lag_1: 1.0756034678724657
energy_price_lag_2: 0.06596607491660796
energy_price_lag_3: -0.1843891941358507
energy_price_lag_4: 0.0013896635773296456


In [16]:
# Overlay actual and predicted energy prices (train and test) for the 4-lag
# model on a single figure, plus the one-step-shifted prediction series.
fig_lag4 = px.line(x=X_train_dates, y=y_train,title="Forecasting of Energy Price with Regularizated MLR with 4 lags")
fig_lag4.update_layout(xaxis_title='Date', yaxis_title='Average Energy Price')
# px.line leaves a single ungrouped trace unnamed and hidden from the legend.
# Name it so the actual train series is labelled like every other line
# (only the base trace exists at this point, so nothing else is affected).
fig_lag4.update_traces(name='Valores reales de train', showlegend=True)
fig_lag4.add_trace(go.Scatter(x=X_train_dates, y=y_pred_train, mode='lines', name='Valores predichos de train'))
fig_lag4.add_trace(go.Scatter(x=X_test_dates, y=y_test, mode='lines', name='Valores reales de test'))
fig_lag4.add_trace(go.Scatter(x=X_test_dates, y=y_pred, mode='lines', name='Valores predichos de test'))
# "-1" series: the predictions with their first element dropped, so each date
# is paired with the prediction made for the following step. The last date is
# trimmed so x and y have the same length explicitly, instead of relying on
# the renderer to reconcile arrays of different sizes.
fig_lag4.add_trace(go.Scatter(x=X_test_dates[:-1], y=y_pred_1, mode='lines', name='Valores predichos de test -1'))
fig_lag4.add_trace(go.Scatter(x=X_train_dates[:-1], y=y_pred_train_1, mode='lines', name='Valores predichos de train -1'))
fig_lag4.show()