In [55]:
## importing libraries ##

# Essentials
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Setting pandas print options (optional but useful for large dataframes)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


## importing data ##

file_path = './datasets/chicago_training_data.xlsx'

# Reading training data into Python
modeling_data = './datasets/train.xlsx'
df_train = pd.read_excel(io=modeling_data, sheet_name='data', header=0, index_col='ID')

# Reading testing data into Python
testing_data = './datasets/test.xlsx'
df_test = pd.read_excel(io=testing_data, sheet_name='data', header=0, index_col='ID')

# Concatenating datasets together for missing value analysis and feature engineering
df_train['set'] = 'Not Kaggle'
df_test['set'] = 'Kaggle'

# Concatenating both datasets together for MV analysis and feature engineering
df_full = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=False)

# Checking the concatenated data
print(df_full.head(n=5))

                           DateHour  Temperature(F)  Humidity(%)  Wind speed (mph)  Visibility(miles)  DewPointTemperature(F)  Rainfall(in)  Snowfall(in)  SolarRadiation(MJ/m2) Holiday FunctioningDay  RENTALS         set
ID                                                                                                                                                                                                                          
mb_1039  2023-10-14 05:59:54.810000              52           81               0.4                2.9                    46.4           0.0           0.0                   0.00      No            Yes    519.0  Not Kaggle
mb_1330  2023-10-26 08:59:53.355000              51           53               2.2                NaN                    35.2           0.0           0.0                   1.01      No            Yes   1251.0  Not Kaggle
mb_551   2023-09-23 21:59:57.250000              56           49               2.5                3.4               

In [56]:
# Añadiendo características de interacción iniciales
X_train['Temp_Humidity'] = X_train['Temperature(F)'] * X_train['Humidity(%)']
X_train['Temp_Hour'] = X_train['Temperature(F)'] * X_train['hour']
X_test['Temp_Hour'] = X_test['Temperature(F)'] * X_test['hour']

X_train['Visibility_Solar'] = X_train['Visibility(miles)'] * X_train['SolarRadiation(MJ/m2)']
X_test['Visibility_Solar'] = X_test['Visibility(miles)'] * X_test['SolarRadiation(MJ/m2)']

X_train['Weekday'] = (X_train['dayofweek'] >= 5).astype(int)
X_test['Weekday'] = (X_test['dayofweek'] >= 5).astype(int)
X_train['Solar_Weekday'] = X_train['SolarRadiation(MJ/m2)'] * X_train['Weekday']
X_test['Solar_Weekday'] = X_test['SolarRadiation(MJ/m2)'] * X_test['Weekday']


In [57]:
# Añadiendo nuevas características de interacción
X_train['Wind_Humidity'] = X_train['Wind speed (mph)'] * X_train['Humidity(%)']
X_test['Wind_Humidity'] = X_test['Wind speed (mph)'] * X_test['Humidity(%)']

X_train['Temp_Visibility'] = X_train['Temperature(F)'] * X_train['Visibility(miles)']
X_test['Temp_Visibility'] = X_test['Temperature(F)'] * X_test['Visibility(miles)']


In [58]:
# Recuerda que X_train y X_test deben estar ya definidos y contener las columnas originales

# 1. Interacción entre temperatura y humedad
X_train['Temp_Humidity'] = X_train['Temperature(F)'] * X_train['Humidity(%)']
X_test['Temp_Humidity'] = X_test['Temperature(F)'] * X_test['Humidity(%)']

# 2. Interacción entre temperatura y hora del día
X_train['Temp_Hour'] = X_train['Temperature(F)'] * X_train['hour']
X_test['Temp_Hour'] = X_test['Temperature(F)'] * X_test['hour']

# 3. Interacción entre la visibilidad y la luz solar
X_train['Visibility_Solar'] = X_train['Visibility(miles)'] * X_train['SolarRadiation(MJ/m2)']
X_test['Visibility_Solar'] = X_test['Visibility(miles)'] * X_test['SolarRadiation(MJ/m2)']

# 4. Efecto del día de la semana en la radiación solar
# Creando una característica categórica para día laborable (0) vs. fin de semana (1)
X_train['Weekday'] = (X_train['dayofweek'] >= 5).astype(int)
X_test['Weekday'] = (X_test['dayofweek'] >= 5).astype(int)
X_train['Solar_Weekday'] = X_train['SolarRadiation(MJ/m2)'] * X_train['Weekday']
X_test['Solar_Weekday'] = X_test['SolarRadiation(MJ/m2)'] * X_test['Weekday']


# Estadísticas descriptivas de las nuevas características
new_features = ['Temp_Humidity', 'Temp_Hour', 'Visibility_Solar', 'Solar_Weekday']
print(X_train[new_features].describe())

# Asumiendo que y_train es una Serie de pandas con el mismo índice que X_train
# Primero, agregamos 'RENTALS' temporalmente al DataFrame para calcular correlaciones
X_train_with_target = X_train.copy()
X_train_with_target['RENTALS'] = y_train

# Calculamos las correlaciones
correlations = X_train_with_target[new_features + ['RENTALS']].corr()

# Imprimimos las correlaciones de las nuevas características con 'RENTALS'
print(correlations['RENTALS'].drop('RENTALS'))

# Identificación de valores atípicos en las nuevas características
for feature in new_features:
    print(f"{feature} - Max value: {X_train[feature].max()}, Min value: {X_train[feature].min()}")




       Temp_Humidity    Temp_Hour  Visibility_Solar  Solar_Weekday
count    1638.000000  1090.000000       1499.000000    1558.000000
mean     3358.623321   706.673394          5.284776       0.098273
std      1230.587843   457.736911          8.491600       0.399466
min       600.000000     0.000000          0.000000       0.000000
25%      2537.000000   276.000000          0.000000       0.000000
50%      3240.000000   741.000000          0.000000       0.000000
75%      4070.750000  1061.500000          7.757000       0.000000
max      7178.000000  1725.000000         37.448000       3.020000
Temp_Humidity      -0.168299
Temp_Hour           0.468755
Visibility_Solar    0.177665
Solar_Weekday       0.083297
Name: RENTALS, dtype: float64
Temp_Humidity - Max value: 7178, Min value: 600
Temp_Hour - Max value: 1725.0, Min value: 0.0
Visibility_Solar - Max value: 37.448, Min value: 0.0
Solar_Weekday - Max value: 3.02, Min value: 0.0


In [59]:
## Ingeniería de características ##

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


## Ingeniería de características y Preparación de Datos ##

# Asumiendo que df_train y df_test ya están definidos correctamente
df_full = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=False)

# Conversión de 'DateHour' a datetime y extracción de características temporales
df_full['DateHour'] = pd.to_datetime(df_full['DateHour'], errors='coerce')
df_full['year'] = df_full['DateHour'].dt.year
df_full['month'] = df_full['DateHour'].dt.month
df_full['day'] = df_full['DateHour'].dt.day
df_full['hour'] = df_full['DateHour'].dt.hour
df_full['dayofweek'] = df_full['DateHour'].dt.dayofweek

# Codificación One-Hot para 'Holiday' y 'FunctioningDay'
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_features = encoder.fit_transform(df_full[['Holiday', 'FunctioningDay']])
encoded_features_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Holiday', 'FunctioningDay']), index=df_full.index)

# Combinación de las nuevas columnas codificadas y eliminación de las originales
df_full = pd.concat([df_full.drop(['Holiday', 'FunctioningDay'], axis=1), encoded_features_df], axis=1)

# Separación de los datos en conjuntos de entrenamiento y prueba
df_train_processed = df_full[df_full['set'] == 'Not Kaggle'].drop(['set', 'DateHour'], axis=1)
df_test_processed = df_full[df_full['set'] == 'Kaggle'].drop(['set', 'DateHour', 'RENTALS'], axis=1)

# Definición de X_train, X_test, y y_train
X_train = df_train_processed.drop('RENTALS', axis=1)
y_train = df_train_processed['RENTALS']
X_test = df_test_processed

## Definición y Ajuste de Modelos ##

# K-Nearest Neighbors Regressor
knn_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),  # Imputación de valores faltantes
    StandardScaler(),  # Estandarización de características
    KNeighborsRegressor(n_neighbors=7, weights='distance', metric='manhattan')
)
knn_pipeline.fit(X_train, y_train)  # Ajuste del pipeline con datos de entrenamiento

# Decision Tree Regressor
dt_model = DecisionTreeRegressor(ccp_alpha=0.1, max_depth=8, min_samples_leaf=5, min_samples_split=3, random_state=42)
dt_model.fit(X_train, y_train)  # Ajuste del modelo con datos de entrenamiento

## Evaluación con Validación Cruzada ##
knn_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring='r2')
dt_scores = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='r2')

print("KNN R² score:", np.mean(knn_scores))
print("Decision Tree R² score:", np.mean(dt_scores))

# Linear Regression
lr_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),  # Imputación de valores faltantes
    StandardScaler(),  # Estandarización de características
    LinearRegression()  # Modelo de regresión lineal
)

lr_model = LinearRegression()
lr_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='r2')
print("Linear Regression R² score:", np.mean(lr_scores))

# Lasso Regression
lasso_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('lasso', Lasso(random_state=42))
])

lasso_model = Lasso(random_state=42)
param_grid_lasso = {'lasso__alpha': [0.001, 0.01, 0.1, 1, 10]}
grid_search_lasso = GridSearchCV(lasso_pipeline, param_grid_lasso, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)
print("Lasso Regression best R² score:", grid_search_lasso.best_score_)
print("Best parameters for Lasso:", grid_search_lasso.best_params_)

# Ridge Regression
ridge_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(random_state=42))
])

ridge_model = Ridge(random_state=42)
param_grid_ridge = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10]}
grid_search_ridge = GridSearchCV(ridge_pipeline, param_grid_ridge, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)
print("Ridge Regression best R² score:", grid_search_ridge.best_score_)
print("Best parameters for Ridge:", grid_search_ridge.best_params_)

# Elastic Net Regression
elastic_net_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('elastic_net', SGDRegressor(loss='huber', penalty='elasticnet', random_state=42, max_iter=1000))
])

elastic_net_model = SGDRegressor(loss='huber', penalty='elasticnet', random_state=42)
param_grid_elastic_net = {
    'elastic_net__alpha': [0.001, 0.01, 0.1, 1],
    'elastic_net__l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
}
grid_search_elastic_net = GridSearchCV(elastic_net_pipeline, param_grid_elastic_net, cv=5, scoring='r2')
grid_search_elastic_net.fit(X_train, y_train)
print("Elastic Net Regression best R² score:", grid_search_elastic_net.best_score_)
print("Best parameters for Elastic Net:", grid_search_elastic_net.best_params_)

## Predicciones para el Conjunto de Prueba ##
predictions_knn = knn_pipeline.predict(X_test)
predictions_dt = dt_model.predict(X_test)

## Generación de Archivos de Sumisión ##
submission_knn = pd.DataFrame({'ID': df_test_processed.index, 'RENTALS': predictions_knn})
submission_dt = pd.DataFrame({'ID': df_test_processed.index, 'RENTALS': predictions_dt})




KNN R² score: 0.6741348186011191
Decision Tree R² score: 0.605845117113854
Linear Regression R² score: 0.514432049992054
Lasso Regression best R² score: 0.5151522862869492
Best parameters for Lasso: {'lasso__alpha': 10}
Ridge Regression best R² score: 0.5146916274090794
Best parameters for Ridge: {'ridge__alpha': 10}








Elastic Net Regression best R² score: -1.409217925727069
Best parameters for Elastic Net: {'elastic_net__alpha': 0.001, 'elastic_net__l1_ratio': 1.0}




In [41]:
from sklearn.model_selection import cross_val_score

# Asume que knn_pipeline y dt_model ya están definidos con los mejores hiperparámetros encontrados

# Reevaluación de KNN
knn_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring='r2')
print("Reevaluado KNN R² score:", np.mean(knn_scores))

# Reevaluación de Decision Tree
dt_scores = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='r2')
print("Reevaluado Decision Tree R² score:", np.mean(dt_scores))


Reevaluado KNN R² score: 0.6741348186011191
Reevaluado Decision Tree R² score: 0.605845117113854


In [44]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Continúa con la definición del pipeline y el proceso de validación cruzada anidada
knn_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

# Validación cruzada externa
external_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Lista para almacenar los scores de cada iteración de validación cruzada externa
nested_scores_knn = []

for train_index, test_index in external_cv.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    

# Definición del espacio de hiperparámetros para GridSearchCV con KNN
param_grid_knn = {
    'knn__n_neighbors': [3, 5, 8],  # Prueba con 3, 5, y 8 vecinos
    'knn__weights': ['uniform', 'distance'],  # Prueba ambos tipos de ponderación
    'knn__metric': ['euclidean', 'manhattan']  # Prueba con métricas euclidiana y de manhattan
}

# Ahora puedes continuar con la configuración previa de GridSearchCV y el ciclo de validación cruzada anidada

    
# Búsqueda de hiperparámetros dentro de la validación cruzada interna
grid_search = GridSearchCV(knn_pipeline, param_grid_knn, cv=5, scoring='r2')
grid_search.fit(X_train_fold, y_train_fold)
    
 # Evaluación en el conjunto de prueba de la iteración externa
best_model = grid_search.best_estimator_
nested_score = best_model.score(X_test_fold, y_test_fold)
nested_scores_knn.append(nested_score)

# Resultado final de la validación cruzada anidada para KNN
print("Validación Cruzada Anidada - Score R² para KNN:", np.mean(nested_scores_knn))




Validación Cruzada Anidada - Score R² para KNN: 0.7278227141774567


In [45]:
from sklearn.tree import DecisionTreeRegressor

# Definición del modelo de Decision Tree
dt_model = DecisionTreeRegressor()

# Espacio de hiperparámetros para GridSearchCV con Decision Tree
param_grid_dt = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Lista para almacenar los scores de cada iteración de validación cruzada externa para Decision Tree
nested_scores_dt = []

for train_index, test_index in external_cv.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Búsqueda de hiperparámetros dentro de la validación cruzada interna para Decision Tree
    grid_search_dt = GridSearchCV(dt_model, param_grid_dt, cv=5, scoring='r2')
    grid_search_dt.fit(X_train_fold, y_train_fold)
    
    # Evaluación en el conjunto de prueba de la iteración externa para Decision Tree
    best_model_dt = grid_search_dt.best_estimator_
    nested_score_dt = best_model_dt.score(X_test_fold, y_test_fold)
    nested_scores_dt.append(nested_score_dt)

# Resultado final de la validación cruzada anidada para Decision Tree
print("Validación Cruzada Anidada - Score R² para Decision Tree:", np.mean(nested_scores_dt))


Validación Cruzada Anidada - Score R² para Decision Tree: 0.5393699262686694


In [46]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from scipy.stats import randint

# Definición del pipeline de KNN
knn_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

# Espacio de hiperparámetros para RandomizedSearchCV
param_distributions_knn = {
    'knn__n_neighbors': randint(1, 30),  # Explora un rango más amplio para el número de vecinos
    'knn__metric': ['euclidean', 'manhattan'],  # Métricas de distancia
    'knn__weights': ['uniform', 'distance']  # Tipo de pesos
}

# Configuración de RandomizedSearchCV
random_search_knn = RandomizedSearchCV(knn_pipeline, param_distributions=param_distributions_knn, n_iter=100, cv=5, scoring='r2', random_state=42, n_jobs=-1)

# Ejecución de la búsqueda
random_search_knn.fit(X_train, y_train)

# Mejores parámetros y score R²
print("Mejores parámetros para KNN:", random_search_knn.best_params_)
print("Mejor score R² para KNN:", random_search_knn.best_score_)


Mejores parámetros para KNN: {'knn__metric': 'manhattan', 'knn__n_neighbors': 8, 'knn__weights': 'distance'}
Mejor score R² para KNN: 0.6759888564703906


In [47]:
from sklearn.tree import DecisionTreeRegressor

# Espacio de hiperparámetros para RandomizedSearchCV con Decision Tree
param_distributions_dt = {
    'max_depth': randint(3, 20),  # Profundidad máxima del árbol
    'min_samples_split': randint(2, 20),  # Número mínimo de muestras requeridas para dividir un nodo
    'min_samples_leaf': randint(1, 20),  # Número mínimo de muestras requeridas en un nodo hoja
    'ccp_alpha': [0.0, 0.01, 0.1, 1.0]  # Valores de ccp_alpha para la poda de costo-complejidad
}

# Configuración de RandomizedSearchCV para Decision Tree
random_search_dt = RandomizedSearchCV(DecisionTreeRegressor(random_state=42), param_distributions=param_distributions_dt, n_iter=100, cv=5, scoring='r2', random_state=42, n_jobs=-1)

# Ejecución de la búsqueda
random_search_dt.fit(X_train, y_train)

# Mejores parámetros y score R²
print("Mejores parámetros para Decision Tree:", random_search_dt.best_params_)
print("Mejor score R² para Decision Tree:", random_search_dt.best_score_)


Mejores parámetros para Decision Tree: {'ccp_alpha': 0.1, 'max_depth': 8, 'min_samples_leaf': 5, 'min_samples_split': 3}
Mejor score R² para Decision Tree: 0.605845117113854
