In [1]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from lags import create_df_with_lags

In [2]:
df = pd.read_csv("../processed_tables/merged_standardized.csv")

## Creación de lags para el precio de la energía

In [3]:
df = create_df_with_lags(df, 3)
df.head()

Unnamed: 0,precipitacion_amazonas,precipitacion_antioquia,precipitacion_arauca,precipitacion_atlantico,precipitacion_bogota,precipitacion_bolivar,precipitacion_boyaca,precipitacion_caldas,precipitacion_caqueta,precipitacion_casanare,...,temp_VICHADA,temp_avg_pais,temp_avg_represas,brent_value,TRM,Date,energy_price,energy_price_lag_1,energy_price_lag_2,energy_price_lag_3
3,-0.5806,-0.100687,1.718155,-0.183251,-0.287443,3.802742,0.274778,-0.533335,-0.300951,0.339387,...,0.057805,1.150745,0.723472,-0.761153,-1.423606,2021-10-04,286.526116,282.065065,256.940174,216.929726
4,0.016831,1.13681,2.98176,0.52169,-0.560886,0.362265,1.586688,0.365171,0.158697,-0.18016,...,-0.701899,0.112145,-0.197147,-0.614059,-1.411032,2021-10-05,278.157325,286.526116,282.065065,256.940174
5,-0.142496,3.004638,4.689849,-0.306373,-0.567283,0.898967,2.67488,0.219436,2.305734,0.758029,...,1.235347,-0.157838,-0.525688,-0.62436,-1.383612,2021-10-06,261.474788,278.157325,286.526116,282.065065
6,0.132014,0.622153,1.953096,-0.256603,-0.57208,-0.005022,0.895201,-0.454157,-0.524727,-0.042188,...,0.171761,0.802658,0.438749,-0.726955,-1.405736,2021-10-07,235.857848,261.474788,278.157325,286.526116
7,-0.556487,4.243553,2.308684,-0.184229,0.316477,0.581741,0.00427,-0.263669,-0.170919,0.066591,...,0.513628,0.57498,-0.071897,-0.569972,-1.447442,2021-10-08,229.313699,235.857848,261.474788,278.157325


## Partición de la data

Debido a que la naturaleza de la data es de carácter temporal, se realizará una partición de la data en 3 partes: entrenamiento y prueba. La data de test será los últimos 3 meses de la data, es decir desde julio 2023 hasta septiembre 2023.


In [4]:
fecha_corte = "2023-07-01"
X_train = df[df["Date"] < fecha_corte].drop(["Date", "energy_price"], axis=1)
X_test = df[df["Date"] >= fecha_corte].drop(["Date", "energy_price"], axis=1)
y_train = df[df["Date"] < fecha_corte]["energy_price"]
y_test = df[df["Date"] >= fecha_corte]["energy_price"]

## KNN

Esta vez utilizaremos `sklearn` para hacer KNN con regresión, variando parámetros como el número de vecinos, la métrica de distancia y la técnica de pesado.

In [5]:
param_dist = {
    'n_neighbors': range(1, 30),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Creating the RandomizedSearchCV object
knn_regressor = KNeighborsRegressor()
random_search = RandomizedSearchCV(knn_regressor, param_distributions=param_dist, 
                                   n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fitting the model
random_search.fit(X_train, y_train)

In [6]:
print(f"Best parameters: {random_search.best_params_}")

Best parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 7}


In [7]:
best_knn_regressor = random_search.best_estimator_
y_pred = best_knn_regressor.predict(X_test)

In [8]:
mse_test = mean_squared_error(y_test, y_pred)

print("Mean Squared Error on Test Data:", mse_test)

Mean Squared Error on Test Data: 25442.38492607358
