In [1]:
import sys

# Prepend the parent directory to the module search path.
# NOTE(review): presumably this is what lets the project-local modules imported
# in the next cell be resolved — confirm against the repository layout.
sys.path.insert(0, "..")

In [2]:
import pandas as pd
from mlr import perform_mlr
import plotly.graph_objects as go
from forward_backward import forward_selection, backward_selection
import plotly.express as px

In [3]:
# Read the merged table from disk and show its (rows, columns) shape.
merged_csv_path = "../../processed_tables/merged_without_lags_represas_criterio.csv"
df = pd.read_csv(merged_csv_path)
df.shape

(644, 34)

## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [4]:
# Fit the multiple linear regression on every column of `df`, using
# `fecha_corte` as the cut-off date handed to `perform_mlr`.
fecha_corte = "2023-07-01"
results = perform_mlr({"df": df, "fecha_corte": fecha_corte})

# Training-side outputs.
X_train = results["X_train"]
X_train_dates = results["X_train_dates"]
y_train = results["y_train"]
y_pred_train = results["y_pred_train"]

# Test-side outputs.
# Fix: `X_test` used to be bound to the literal list ["X_test"] instead of
# being read from `results`.
# NOTE(review): assumes `perform_mlr` returns an "X_test" entry alongside
# "X_train" — confirm.
X_test = results["X_test"]
X_test_dates = results["X_test_dates"]
y_test = results["y_test"]
y_pred = results["y_pred"]

# Error metrics on the test data and the variables flagged as significant.
mse = results["mse_test"]
mape = results["mape_test"]
significant_variables = results["significant_variables"]

Mean Squared Error on Test Data: 200381.52433356992
Mean Absolute Percentage Error: 0.46979923948478586


Con esto verificamos que el valor de $R^2$ es de $0.514$, lo cual quiere decir que el modelo explica solo alrededor del 51 % de la varianza de la variable endógena, es decir, la relación lineal con las variables exógenas es débil. Las variables significativas son 15 y están asociadas a las temperaturas y precipitaciones de ciertos departamentos, así como a la temperatura y precipitación de los departamentos que tienen represas.


## MLR with selected variables


In [5]:
# Build the column list for the reduced model: the significant regressors
# without the intercept term ("const"), followed by "Date" and "energy_price".
#
# The list is rebuilt instead of mutated in place so the cell is idempotent:
# re-running it no longer appends duplicate columns, and it no longer raises
# ValueError when "const" has already been removed.
_extra_columns = ["Date", "energy_price"]
significant_variables = [
    name
    for name in significant_variables
    if name != "const" and name not in _extra_columns
] + _extra_columns

In [6]:
# Take an independent copy of the significant columns from the original frame.
df_selected_variables = df.loc[:, significant_variables].copy()

In [7]:
# Re-fit the regression using only the columns kept in `df_selected_variables`.
fecha_corte = "2023-07-01"

results = perform_mlr({"df": df_selected_variables, "fecha_corte": fecha_corte})

# Training-side outputs of the reduced model.
X_train_MRL_selection = results["X_train"]
X_train_dates_MRL_selection = results["X_train_dates"]
y_train_MRL_selection = results["y_train"]
y_pred_train_MRL_selection = results["y_pred_train"]

# Test-side outputs of the reduced model.
X_test_dates_MRL_selection = results["X_test_dates"]
y_test_MRL_selection = results["y_test"]
y_pred_MRL_selection = results["y_pred"]
# Fix: drop the first prediction of THIS model; the original sliced `y_pred`,
# which holds the predictions of the full model fitted earlier.
y_pred_1_MRL_selection = y_pred_MRL_selection[1:]

# Error metrics on the test data.
mse_MRL_selection = results["mse_test"]
mape_MRL_selection = results["mape_test"]

Mean Squared Error on Test Data: 225473.8869772285
Mean Absolute Percentage Error: 0.4844179192016038


## Aplicamos forward para selección de variables


In [8]:
# Names of every column in the original frame.
all_features = list(df.columns)
# Run forward selection on the training data of the full model.
selected_features_forward = forward_selection(X_train, y_train)

In [9]:
# Report how many features forward selection returned, then list them.
n_selected_forward = len(selected_features_forward)
print(n_selected_forward)
print("Forward Selection Result:", selected_features_forward)

18
Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_arauca', 'precipitacion_bogota', 'precipitacion_caqueta', 'precipitacion_cesar', 'precipitacion_magdalena', 'precipitacion_meta', 'precipitacion_putumayo', 'precipitacion_risaralda', 'precipitacion_vichada', 'temp_AMAZONAS', 'temp_CAQUETA', 'temp_CHOCO', 'temp_NARINO', 'temp_VICHADA', 'brent_value', 'Date', 'energy_price']


In [10]:
# Re-fit the regression using only the columns returned by forward selection.
df_selected_variables_forward = df[selected_features_forward]

fecha_corte = "2023-07-01"

results = perform_mlr({"df": df_selected_variables_forward, "fecha_corte": fecha_corte})

# Training-side outputs of the forward-selection model.
X_train_f = results["X_train"]
X_train_dates_f = results["X_train_dates"]
y_train_f = results["y_train"]
y_pred_train_f = results["y_pred_train"]

# Test-side outputs of the forward-selection model.
X_test_dates_f = results["X_test_dates"]
y_test_f = results["y_test"]
y_pred_f = results["y_pred"]
# Fix: drop the first prediction of THIS model; the original sliced `y_pred`,
# which holds the predictions of the full model fitted earlier.
y_pred_1_f = y_pred_f[1:]

# Error metrics on the test data.
mse_f = results["mse_test"]
mape_f = results["mape_test"]

Mean Squared Error on Test Data: 241570.09988941523
Mean Absolute Percentage Error: 0.527500074269657


## Aplicamos backward para selección de variables


In [11]:
# Apply backward selection to the training data of the full model
# (`X_train` / `y_train` come from the first `perform_mlr` call).
selected_features_backward = backward_selection(X_train, y_train)

In [12]:
# Report how many features backward selection returned, then list them.
# Fix: the label used to read "Forward Selection Result:", a copy-paste
# leftover that mislabelled the backward-selection output.
print(len(selected_features_backward))
print("Backward Selection Result:", selected_features_backward)

18
Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_arauca', 'precipitacion_bogota', 'precipitacion_bolivar', 'precipitacion_caqueta', 'precipitacion_cesar', 'precipitacion_choco', 'precipitacion_la guajira', 'precipitacion_meta', 'precipitacion_putumayo', 'precipitacion_risaralda', 'precipitacion_departamentos_represa', 'temp_CAQUETA', 'temp_CHOCO', 'temp_NARINO', 'brent_value', 'Date', 'energy_price']


In [13]:
# Re-fit the regression using only the columns returned by backward selection.
df_selected_variables_backward = df[selected_features_backward]

fecha_corte = "2023-07-01"

results = perform_mlr(
    {"df": df_selected_variables_backward, "fecha_corte": fecha_corte}
)

# Training-side outputs of the backward-selection model.
X_train_b = results["X_train"]
X_train_dates_b = results["X_train_dates"]
y_train_b = results["y_train"]
y_pred_train_b = results["y_pred_train"]

# Test-side outputs of the backward-selection model.
X_test_dates_b = results["X_test_dates"]
y_test_b = results["y_test"]
y_pred_b = results["y_pred"]
# Fix: drop the first prediction of THIS model; the original sliced `y_pred`,
# which holds the predictions of the full model fitted earlier.
y_pred_1_b = y_pred_b[1:]

# Error metrics on the test data.
mse_b = results["mse_test"]
mape_b = results["mape_test"]

Mean Squared Error on Test Data: 227712.29466129112
Mean Absolute Percentage Error: 0.5137550185658989


In [14]:
# Summary table: one row per fitted model with its test-set MSE and MAPE.
model_names = ["MLR", "MLR_selection", "MLR_forward", "MLR_backward"]
mse_by_model = [mse, mse_MRL_selection, mse_f, mse_b]
mape_by_model = [mape, mape_MRL_selection, mape_f, mape_b]

results_MRL = pd.DataFrame(
    {"Modelo": model_names, "MSE": mse_by_model, "MAPE": mape_by_model}
)

results_MRL

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,200381.524334,0.469799
1,MLR_selection,225473.886977,0.484418
2,MLR_forward,241570.099889,0.5275
3,MLR_backward,227712.294661,0.513755


Ahora graficamos los resultados de la regresión lineal múltiple con el mejor MAPE obtenido.


In [15]:
# Plot actual vs. predicted energy price for the model fitted on all variables
# (the series used here — y_train, y_pred_train, y_test, y_pred — all come from
# the first `perform_mlr` call).
# Fix: the title used to say "Forward MLR Selection", which does not match the
# plotted series.
fig = px.line(
    x=X_train_dates,
    y=y_train,
    title="Forecasting of Energy Price with MLR",
)
fig.update_layout(xaxis_title="Date", yaxis_title="Average Energy Price")
fig.add_trace(
    go.Scatter(
        x=X_train_dates, y=y_pred_train, mode="lines", name="Valores predichos de train"
    )
)
fig.add_trace(
    go.Scatter(x=X_test_dates, y=y_test, mode="lines", name="Valores reales de test")
)
fig.add_trace(
    go.Scatter(x=X_test_dates, y=y_pred, mode="lines", name="Valores predichos de test")
)
# Fix: this trace used to plot `y_pred` against `X_test_dates` again, so it was
# an exact duplicate of the previous trace. It now shows the predictions moved
# one step earlier (first prediction dropped, last date dropped), matching how
# the `y_pred_1_*` variables are built elsewhere in the notebook.
# NOTE(review): confirm that a one-step shift is what "-1" is meant to show.
fig.add_trace(
    go.Scatter(
        x=list(X_test_dates)[:-1],
        y=y_pred[1:],
        mode="lines",
        name="Valores predichos de test -1",
    )
)
fig.show()

Matriz de correlación de las variables seleccionadas por el mejor modelo de selección (MLR_selection)


In [16]:
# Keep only the float64/int64 columns of the reduced frame, then compute and
# display their pairwise correlations.
numeric_dtypes = ["float64", "int64"]
numeric_columns = df_selected_variables.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,precipitacion_casanare,precipitacion_norte de santander,precipitacion_sucre,precipitacion_vaupes,temp_ARAUCA,brent_value,energy_price
precipitacion_casanare,1.0,-0.126998,0.094089,0.109686,-0.103097,0.006283,0.136711
precipitacion_norte de santander,-0.126998,1.0,0.224334,-0.009287,0.131767,0.126432,-0.26975
precipitacion_sucre,0.094089,0.224334,1.0,0.133493,0.118262,0.233963,-0.140909
precipitacion_vaupes,0.109686,-0.009287,0.133493,1.0,0.076672,0.001619,-0.034262
temp_ARAUCA,-0.103097,0.131767,0.118262,0.076672,1.0,-0.140329,-0.160235
brent_value,0.006283,0.126432,0.233963,0.001619,-0.140329,1.0,-0.361191
energy_price,0.136711,-0.26975,-0.140909,-0.034262,-0.160235,-0.361191,1.0
