In [16]:
import sys
# Put the parent directory first on the module search path; presumably this is
# where the local `mlr` and `forward_backward` modules live — confirm.
sys.path.insert(0, '..')

In [17]:
import pandas as pd
from mlr import perform_mlr
import plotly.graph_objects as go
from forward_backward import forward_selection, backward_selection
import plotly.express as px

In [18]:
# Load the merged table used by every model below (the file name suggests the
# columns are already standardized — confirm against the preprocessing step).
df = pd.read_csv("../../processed_tables/merged_standardized.csv")

## Regresión lineal múltiple

Usando `statsmodels` se puede realizar una regresión lineal. Nuestro módulo se encarga de hacer todo este análisis.


In [19]:
# Fit the multiple linear regression on every column of `df` and unpack the
# entries of the result that later cells reuse.
# `fecha_corte` is forwarded to perform_mlr (presumably the train/test split
# date — confirm in the `mlr` module).
fecha_corte = "2023-07-01"
results = perform_mlr({"df": df, "fecha_corte": fecha_corte})

# Test-set predictions and ground truth.
y_pred = results["y_pred"]
y_test = results["y_test"]
X_test_dates = results["X_test_dates"]
# Fixed: this used to be the literal list ["X_test"] rather than the entry
# returned by perform_mlr.
X_test = results["X_test"]

# Training-set data and fitted values (also the inputs of the
# forward/backward selection cells).
X_train = results["X_train"]
X_train_dates = results["X_train_dates"]
y_train = results["y_train"]
y_pred_train = results["y_pred_train"]

# Test-set error metrics and the variables reported as significant.
mse = results["mse_test"]
mape = results["mape_test"]
significant_variables = results["significant_variables"]


Mean Squared Error on Test Data: 161201.53739861868
Mean Absolute Percentage Error: 0.47438162467900846


Con esto verificamos que el valor de $R^2$ es de $0.514$, lo cual quiere decir que el modelo explica solo alrededor del 51 % de la varianza de la variable endógena; es decir, la relación lineal entre las variables exógenas y la variable endógena es débil. Las variables significativas son 15 y están asociadas a las temperaturas y precipitaciones de ciertos departamentos, y también a la temperatura y precipitación de los departamentos que tienen represas.

## MLR with selected variables

In [20]:
# Column list for the reduced model: drop "const" (presumably the intercept
# term added by the regression, not a column of `df`) and keep the "Date" and
# "energy_price" columns at the end.
# A new list is built instead of append/remove in place so the cell is
# idempotent: re-running it neither duplicates "Date"/"energy_price" nor raises
# ValueError because "const" was already removed, and the list stored inside
# `results` is left untouched.
_extra_columns = ["Date", "energy_price"]
significant_variables = [
    name
    for name in significant_variables
    if name != "const" and name not in _extra_columns
] + _extra_columns

In [21]:
# Keep only the significant columns of the original DataFrame; .copy() makes the
# result independent of `df`.
df_selected_variables = df.loc[:, significant_variables].copy()

In [22]:
# Refit the regression using only the significant variables and unpack the
# result under the *_MLR_selection names used by the comparison table and plot.
fecha_corte = "2023-07-01"

results = perform_mlr({"df": df_selected_variables, "fecha_corte": fecha_corte})

# Test-set predictions and ground truth.
y_pred_MRL_selection = results["y_pred"]
# Fixed: the shifted series was sliced from `y_pred` (the full model fitted
# earlier) instead of this model's own predictions.
y_pred_1_MRL_selection = y_pred_MRL_selection[1:]
y_test_MLR_selection = results["y_test"]
X_test_dates_MLR_selection = results["X_test_dates"]

# Training-set data and fitted values.
X_train_MLR_selection = results["X_train"]
X_train_dates_MLR_selection = results["X_train_dates"]
y_train_MLR_selection = results["y_train"]
y_pred_train_MLR_selection = results["y_pred_train"]

# Test-set error metrics.
mse_MLR_selection = results["mse_test"]
mape_MLR_selection = results["mape_test"]


Mean Squared Error on Test Data: 163926.07092389063
Mean Absolute Percentage Error: 0.4716718868563289


## Aplicamos forward para selección de variables

In [23]:
# Names of every column in the full table.
all_features = list(df.columns)

# Forward selection is run on the training split of the full model.
selected_features_forward = forward_selection(X_train, y_train)

In [24]:
# Report how many features forward selection returned, then list them.
n_selected_forward = len(selected_features_forward)
print(n_selected_forward)
print("Forward Selection Result:", selected_features_forward)

37
Forward Selection Result: ['precipitacion_amazonas', 'precipitacion_antioquia', 'precipitacion_arauca', 'precipitacion_bogota', 'precipitacion_bolivar', 'precipitacion_boyaca', 'precipitacion_caldas', 'precipitacion_caqueta', 'precipitacion_cesar', 'precipitacion_cundinamarca', 'precipitacion_guaviare', 'precipitacion_huila', 'precipitacion_magdalena', 'precipitacion_meta', 'precipitacion_putumayo', 'precipitacion_quindio', 'precipitacion_risaralda', 'precipitacion_santander', 'precipitacion_tolima', 'precipitacion_valle del cauca', 'precipitacion_vichada', 'precipitacion_departamentos_represa', 'temp_AMAZONAS', 'temp_ARAUCA', 'temp_ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA', 'temp_BOYACA', 'temp_CAQUETA', 'temp_CASANARE', 'temp_CHOCO', 'temp_GUAINIA', 'temp_GUAVIARE', 'temp_MAGDALENA', 'temp_META', 'temp_NARINO', 'brent_value', 'Date', 'energy_price']


In [25]:
# Refit the regression on the columns returned by forward selection and unpack
# the result under the *_f names used by the comparison table.
df_selected_variables_forward = df[selected_features_forward]

fecha_corte = "2023-07-01"

results = perform_mlr({"df": df_selected_variables_forward, "fecha_corte": fecha_corte})

# Test-set predictions and ground truth.
y_pred_f = results["y_pred"]
# Fixed: the shifted series was sliced from `y_pred` (the full model fitted
# earlier) instead of this model's own predictions.
y_pred_1_f = y_pred_f[1:]
y_test_f = results["y_test"]
X_test_dates_f = results["X_test_dates"]

# Training-set data and fitted values.
X_train_f = results["X_train"]
X_train_dates_f = results["X_train_dates"]
y_train_f = results["y_train"]
y_pred_train_f = results["y_pred_train"]

# Test-set error metrics.
mse_f = results["mse_test"]
mape_f = results["mape_test"]

Mean Squared Error on Test Data: 228204.48552248106
Mean Absolute Percentage Error: 0.5003108320956435


## Aplicamos backward para selección de variables

In [26]:
# Apply backward selection on the training split of the full model.
selected_features_backward = backward_selection(X_train, y_train)


In [27]:
# Refit the regression on the columns returned by backward selection and unpack
# the result under the *_b names used by the comparison table.
df_selected_variables_backward = df[selected_features_backward]

fecha_corte = "2023-07-01"

results = perform_mlr({"df": df_selected_variables_backward, "fecha_corte": fecha_corte})

# Test-set predictions and ground truth.
y_pred_b = results["y_pred"]
# Fixed: the shifted series was sliced from `y_pred` (the full model fitted
# earlier) instead of this model's own predictions.
y_pred_1_b = y_pred_b[1:]
y_test_b = results["y_test"]
X_test_dates_b = results["X_test_dates"]

# Training-set data and fitted values.
X_train_b = results["X_train"]
X_train_dates_b = results["X_train_dates"]
y_train_b = results["y_train"]
y_pred_train_b = results["y_pred_train"]

# Test-set error metrics.
mse_b = results["mse_test"]
mape_b = results["mape_test"]


Mean Squared Error on Test Data: 157747.95887340303
Mean Absolute Percentage Error: 0.4592045222871403


In [28]:
# Side-by-side test metrics of the four fitted models, one row per model.
_model_rows = [
    ("MLR", mse, mape),
    ("MLR_selection", mse_MLR_selection, mape_MLR_selection),
    ("MLR_forward", mse_f, mape_f),
    ("MLR_backward", mse_b, mape_b),
]
results_MRL = pd.DataFrame(_model_rows, columns=["Modelo", "MSE", "MAPE"])

results_MRL

Unnamed: 0,Modelo,MSE,MAPE
0,MLR,161201.537399,0.474382
1,MLR_selection,163926.070924,0.471672
2,MLR_forward,228204.485522,0.500311
3,MLR_backward,157747.958873,0.459205


In [29]:
# Actual vs. fitted/predicted energy price for the regression refit on the
# significant variables (the *_MLR_selection results).
# Fixed: the title said "Forward MLR Selection", but none of the plotted series
# come from the forward-selection model.
fig = px.line(
    x=X_train_dates_MLR_selection,
    y=y_train_MLR_selection,
    title="Forecasting of Energy Price with MLR Selection (significant variables)",
)
fig.update_layout(xaxis_title='Date', yaxis_title='Average Energy Price')

# Fitted values on the training period.
fig.add_trace(go.Scatter(x=X_train_dates_MLR_selection, y=y_pred_train_MLR_selection, mode='lines', name='Valores predichos de train'))
# Ground truth and predictions on the test period.
fig.add_trace(go.Scatter(x=X_test_dates_MLR_selection, y=y_test_MLR_selection, mode='lines', name='Valores reales de test'))
fig.add_trace(go.Scatter(x=X_test_dates_MLR_selection, y=y_pred_MRL_selection, mode='lines', name='Valores predichos de test'))
# Predictions with the first element dropped, drawn against the unshifted dates.
# NOTE(review): this series is one element shorter than the x values — confirm
# the intended alignment.
fig.add_trace(go.Scatter(x=X_test_dates_MLR_selection, y=y_pred_1_MRL_selection, mode='lines', name='Valores predichos de test -1'))
fig.show()

Matriz de correlación de las variables seleccionadas por el mejor modelo, selección MLR

In [30]:
# Pairwise correlations between the float64/int64 columns of the
# significant-variable table; other dtypes are excluded.
_numeric_dtypes = ['float64', 'int64']
numeric_columns = df_selected_variables.select_dtypes(include=_numeric_dtypes)
correlation_matrix = numeric_columns.corr()
correlation_matrix

Unnamed: 0,precipitacion_caqueta,precipitacion_cauca,precipitacion_cundinamarca,precipitacion_vaupes,precipitacion_departamentos_represa,temp_ARAUCA,temp_CALDAS,temp_CAQUETA,temp_CASANARE,temp_CUNDINAMARCA,temp_META,temp_NORTE DE SANTANDER,temp_PUTUMAYO,temp_QUINDIO,temp_avg_represas,brent_value,energy_price
precipitacion_caqueta,1.0,0.054377,0.147628,0.07925,0.213683,0.012223,-0.048599,-0.298821,-0.139115,-0.13005,-0.182588,0.004882,-0.195364,-0.056612,-0.112341,0.051083,-0.041954
precipitacion_cauca,0.054377,1.0,0.290584,0.203663,0.57971,0.227512,-0.252923,0.080221,-0.020888,-0.182039,0.262717,-0.259193,0.124287,-0.304097,-0.181595,0.036725,-0.098708
precipitacion_cundinamarca,0.147628,0.290584,1.0,-0.057658,0.65689,0.019668,-0.124856,-0.054833,-0.066969,-0.145562,-0.047176,-0.133527,-0.025101,-0.131341,-0.153345,0.037665,-0.083629
precipitacion_vaupes,0.07925,0.203663,-0.057658,1.0,0.125409,0.076672,-0.028447,-0.029421,0.01467,-0.047324,0.007606,-0.005199,0.023336,-0.029591,-0.013673,0.001619,-0.034262
precipitacion_departamentos_represa,0.213683,0.57971,0.65689,0.125409,1.0,0.187038,-0.386027,-0.128958,-0.13987,-0.325806,0.019337,-0.330705,0.022894,-0.427838,-0.373307,0.183865,-0.27237
temp_ARAUCA,0.012223,0.227512,0.019668,0.076672,0.187038,1.0,-0.048993,0.187118,0.167828,-0.050773,0.297044,-0.0617,0.236529,-0.071417,0.104438,-0.140329,-0.160235
temp_CALDAS,-0.048599,-0.252923,-0.124856,-0.028447,-0.386027,-0.048993,1.0,0.316675,0.386764,0.647052,0.032734,0.607893,0.145674,0.815937,0.821895,-0.519461,0.348053
temp_CAQUETA,-0.298821,0.080221,-0.054833,-0.029421,-0.128958,0.187118,0.316675,1.0,0.53834,0.424857,0.52846,0.197,0.611406,0.266281,0.419258,-0.265061,0.232082
temp_CASANARE,-0.139115,-0.020888,-0.066969,0.01467,-0.13987,0.167828,0.386764,0.53834,1.0,0.557028,0.508185,0.436877,0.437887,0.455749,0.605086,-0.239194,0.254354
temp_CUNDINAMARCA,-0.13005,-0.182039,-0.145562,-0.047324,-0.325806,-0.050773,0.647052,0.424857,0.557028,1.0,0.184621,0.727086,0.30099,0.701081,0.837538,-0.235658,0.55185
