In [325]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

Cargamos el dataset

In [326]:
# Cargar el conjunto de datos
data = pd.read_csv('dataset_generation/dataset.csv')

data.head(5)

Unnamed: 0,hour,day,month,year,week_day,working_day,class_day,exits,temperature,a_temperature,humidity,precipitation,rain,wind_speed,demand_satisfied
0,0,1,1,2021,4,0,3,0,6.4,3.6,78.9,0.0,0.0,8.2,1.0
1,1,1,1,2021,4,0,3,1,4.9,2.0,83.0,0.0,0.0,7.8,1.0
2,2,1,1,2021,4,0,3,0,4.6,1.7,84.4,0.0,0.0,7.9,1.0
3,3,1,1,2021,4,0,3,0,4.5,1.7,85.6,0.0,0.0,7.2,1.0
4,4,1,1,2021,4,0,3,0,5.4,2.9,81.0,0.0,0.0,5.8,1.0


In [327]:
data.shape

(25536, 15)

Filtramos los registros para horas entre las 7 y las 23 y quitamos la columna demand satisfied

In [328]:
data = data[(data['hour'] >= 7) & (data['hour'] <= 23)].drop('demand_satisfied', axis=1)

data.head(5)

Unnamed: 0,hour,day,month,year,week_day,working_day,class_day,exits,temperature,a_temperature,humidity,precipitation,rain,wind_speed
7,7,1,1,2021,4,0,3,0,4.3,1.2,87.1,0.0,0.0,9.0
8,8,1,1,2021,4,0,3,0,4.9,2.1,84.8,0.2,0.2,7.4
9,9,1,1,2021,4,0,3,0,6.8,4.7,82.3,0.0,0.0,4.7
10,10,1,1,2021,4,0,3,3,7.8,5.9,83.9,0.9,0.9,5.1
11,11,1,1,2021,4,0,3,0,7.9,5.7,85.7,0.2,0.2,7.9


In [329]:
data.shape

(18088, 14)

Binarizar variables categóricas

In [330]:
categorical_vars_selected = ["hour", "day", "month", "year", "week_day", "working_day", "class_day"]

# Binarizar las variables categóricas y asignar el resultado al DataFrame
data_with_dummies = pd.get_dummies(data, columns=categorical_vars_selected, drop_first=True)

# Identificar las nuevas columnas categóricas creadas
new_categorical_cols = [col for col in data_with_dummies.columns if col not in data.columns]

# Convertir solo las nuevas columnas categóricas a enteros
data[new_categorical_cols] = data_with_dummies[new_categorical_cols].astype(int)

data = data.drop(categorical_vars_selected, axis=1)

pd.set_option('display.max_columns', None)
data.head(5)

Unnamed: 0,exits,temperature,a_temperature,humidity,precipitation,rain,wind_speed,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,year_2022,year_2023,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,working_day_1,class_day_2,class_day_3
7,0,4.3,1.2,87.1,0.0,0.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
8,0,4.9,2.1,84.8,0.2,0.2,7.4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
9,0,6.8,4.7,82.3,0.0,0.0,4.7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
10,3,7.8,5.9,83.9,0.9,0.9,5.1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
11,0,7.9,5.7,85.7,0.2,0.2,7.9,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [331]:
data.shape

(18088, 75)

Partir entre train y test

In [332]:
X = data.drop(['exits'], axis=1)
y = data['exits']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape

((12661, 74), (5427, 74))

In [333]:
X_train.head(5)

Unnamed: 0,temperature,a_temperature,humidity,precipitation,rain,wind_speed,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,year_2022,year_2023,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,working_day_1,class_day_2,class_day_3
4786,32.6,33.1,34.8,0.0,0.0,15.7,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
18929,10.4,7.8,59.1,0.0,0.0,6.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0
17534,16.2,16.2,77.7,0.0,0.0,5.1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1
14973,21.0,23.7,95.5,0.0,0.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0
6559,17.1,17.4,88.9,0.0,0.0,10.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0


In [334]:
# Identificar las variables numéricas no binarias
non_binary_numerical_features = [col for col in data if col not in new_categorical_cols]

non_binary_numerical_features

['exits',
 'temperature',
 'a_temperature',
 'humidity',
 'precipitation',
 'rain',
 'wind_speed']

Normalizar variables numéricas

In [335]:
from sklearn.preprocessing import MinMaxScaler

non_binary_numerical_features = [col for col in non_binary_numerical_features if col != 'exits']

# Inicializar MinMaxScaler para las características no binarias
scaler_X_non_binary = MinMaxScaler()

# Ajustar el scaler solo con los datos de entrenamiento para las características no binarias y transformarlos
X_train_scaled_non_binary = scaler_X_non_binary.fit_transform(X_train[non_binary_numerical_features])
X_test_scaled_non_binary = scaler_X_non_binary.transform(X_test[non_binary_numerical_features])

# Crear DataFrames de las características escaladas para combinarlas con las binarias
X_train_scaled_non_binary_df = pd.DataFrame(X_train_scaled_non_binary, columns=non_binary_numerical_features, index=X_train.index)
X_test_scaled_non_binary_df = pd.DataFrame(X_test_scaled_non_binary, columns=non_binary_numerical_features, index=X_test.index)

# Extraer las variables binarias del conjunto de entrenamiento y prueba original
X_train_binary = X_train.drop(non_binary_numerical_features, axis=1)
X_test_binary = X_test.drop(non_binary_numerical_features, axis=1)

# Combinar las variables binarias con las numéricas escaladas
X_train_combined = pd.concat([X_train_binary, X_train_scaled_non_binary_df], axis=1)
X_test_combined = pd.concat([X_test_binary, X_test_scaled_non_binary_df], axis=1)

# Mostrar las primeras 5 filas del conjunto de entrenamiento combinado para verificar
X_train_combined.head()

Unnamed: 0,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,year_2022,year_2023,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,working_day_1,class_day_2,class_day_3,temperature,a_temperature,humidity,precipitation,rain,wind_speed
4786,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0.887701,0.842105,0.238318,0.0,0.0,0.414248
18929,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0.294118,0.287281,0.522196,0.0,0.0,0.158311
17534,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0.449198,0.471491,0.739486,0.0,0.0,0.134565
14973,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0.57754,0.635965,0.94743,0.0,0.0,0.237467
6559,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0.473262,0.497807,0.870327,0.0,0.0,0.266491


In [336]:
X_train_combined.shape

(12661, 74)

In [337]:
# Inicializar MinMaxScaler para la variable objetivo
scaler_y = MinMaxScaler()

# Ajustar y transformar la variable objetivo en el conjunto de entrenamiento
y_train_scaled = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1))

# Transformar la variable objetivo en el conjunto de prueba
y_test_scaled = scaler_y.transform(y_test.to_numpy().reshape(-1, 1))

Aplicamos KNN

In [339]:
from sklearn.neighbors import KNeighborsRegressor
from skopt import BayesSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

# Configuración inicial
pd.set_option('display.precision', 5)
knn = KNeighborsRegressor()

# Definir el espacio de parámetros para la búsqueda
param = {
    'n_neighbors': [1, 3, 5, 7, 11, 15, 20, 25],
    'weights': ['distance', 'uniform'],
    'leaf_size': [1, 5, 10, 15, 20, 25, 30],
    'metric': ['euclidean', 'manhattan', 'cosine']  # Ajustado a los nombres correctos de las métricas
}

# Ajustes para BayesSearchCV
niter = 30  # Ajusta según la cantidad de iteraciones que desees
cv = 5      # Número de pliegues en TimeSeriesSplit
w = 4       # Ventana

# Inicializar BayesSearchCV
knn_bs = BayesSearchCV(
    knn, param, n_iter=niter,
    cv=TimeSeriesSplit(n_splits=cv, gap=w+1),
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    n_jobs=-1, refit=True, random_state=0
)

# Ajustar el modelo
knn_bs.fit(X_train_combined, y_train_scaled)

# Realizar predicciones y evaluar
y_pred_test = knn_bs.predict(X_test_combined)
mse = mean_squared_error(y_test_scaled, y_pred_test)
mae = mean_absolute_error(y_test_scaled, y_pred_test)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


r2 = r2_score(y_test_scaled, y_pred_test)

r2

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations