In [1]:
from sklearn.linear_model import Lasso
import pandas as pd
from sklearn.model_selection import KFold, RandomizedSearchCV
import numpy as np
from sklearn.metrics import mean_squared_error
from lags import create_df_with_lags

In [2]:
df = pd.read_csv("../processed_tables/merged_standardized.csv")

## Creación de lags para el precio de la energía

In [3]:
df = create_df_with_lags(df, 3)
lag_columns = df.filter(like="lag_")
price = df[["Date", "energy_price"]]
df = pd.concat([price, lag_columns], axis=1)
df.head()

Unnamed: 0,Date,energy_price,energy_price_lag_1,energy_price_lag_2,energy_price_lag_3
3,2021-10-04,286.526116,282.065065,256.940174,216.929726
4,2021-10-05,278.157325,286.526116,282.065065,256.940174
5,2021-10-06,261.474788,278.157325,286.526116,282.065065
6,2021-10-07,235.857848,261.474788,278.157325,286.526116
7,2021-10-08,229.313699,235.857848,261.474788,278.157325


## Partición de la data

Debido a que la naturaleza de la data es de carácter temporal, se realizará una partición de la data en 3 partes: entrenamiento y prueba. La data de test será los últimos 3 meses de la data, es decir desde julio 2023 hasta septiembre 2023.


In [4]:
fecha_corte = "2023-07-01"
X_train = df[df["Date"] < fecha_corte].drop(["Date", "energy_price"], axis=1)
X_test = df[df["Date"] >= fecha_corte].drop(["Date", "energy_price"], axis=1)
y_train = df[df["Date"] < fecha_corte]["energy_price"]
y_test = df[df["Date"] >= fecha_corte]["energy_price"]

## Regresión lineal

Esta vez utilizaremos `sklearn` para hacer la regresión Lasso y poder ver qué coeficientes son cero.


In [5]:
lasso = Lasso()

# Define the parameter grid to search
param_grid = {'alpha': np.logspace(-4, 4, 20)}

# Setup Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Setup Randomized Grid Search
lasso_cv = RandomizedSearchCV(lasso, param_grid, cv=kfold, random_state=42)

# Fit the model
lasso_cv.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", lasso_cv.best_params_)
print("Best cross-validation score: ", lasso_cv.best_score_)

Best parameters found:  {'alpha': 4.281332398719396}
Best cross-validation score:  0.9402050642672375


In [6]:
y_pred = lasso_cv.predict(X_test)

In [7]:
mse_test = mean_squared_error(y_test, y_pred)

print("Mean Squared Error on Test Data:", mse_test)

Mean Squared Error on Test Data: 4534.664925587474


In [8]:
coefficients = lasso_cv.best_estimator_.coef_

In [9]:
feature_names = X_train.columns

# Filter the coefficients and corresponding feature names
non_zero_coefficients = coefficients[coefficients != 0]
non_zero_features = feature_names[coefficients != 0]

# Print the non-zero coefficients and their corresponding feature names
for feature, coef in zip(non_zero_features, non_zero_coefficients):
    print(f"{feature}: {coef}")

energy_price_lag_1: 1.0793042203114995
energy_price_lag_2: 0.0634580401317947
energy_price_lag_3: -0.1812666616568457
