In [14]:
## importing libraries ##

# Essentials
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Raise pandas' display limits so long/wide DataFrames print without truncation (optional)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000


## importing data ##

# NOTE(review): file_path is not read by any of the visible cells - confirm whether it is still needed
file_path = './datasets/chicago_training_data.xlsx'

# Training (modeling) data: 'data' sheet, first row as header, 'ID' column as index.
# Every row is tagged 'Not Kaggle' so the two sets can be told apart after concatenation.
modeling_data = './datasets/train.xlsx'
df_train = pd.read_excel(modeling_data, sheet_name='data', header=0, index_col='ID').assign(set='Not Kaggle')

# Testing (Kaggle) data: same layout, tagged 'Kaggle'
testing_data = './datasets/test.xlsx'
df_test = pd.read_excel(testing_data, sheet_name='data', header=0, index_col='ID').assign(set='Kaggle')

# Stack both datasets row-wise (original 'ID' labels kept) for missing value
# analysis and feature engineering
df_full = pd.concat([df_train, df_test])

# Checking the first five rows of the concatenated data
print(df_full.head())

                           DateHour  Temperature(F)  Humidity(%)  Wind speed (mph)  Visibility(miles)  DewPointTemperature(F)  Rainfall(in)  Snowfall(in)  SolarRadiation(MJ/m2) Holiday FunctioningDay  RENTALS         set
ID                                                                                                                                                                                                                          
mb_1039  2023-10-14 05:59:54.810000              52           81               0.4                2.9                    46.4           0.0           0.0                   0.00      No            Yes    519.0  Not Kaggle
mb_1330  2023-10-26 08:59:53.355000              51           53               2.2                NaN                    35.2           0.0           0.0                   1.01      No            Yes   1251.0  Not Kaggle
mb_551   2023-09-23 21:59:57.250000              56           49               2.5                3.4               

In [15]:
# Assumes df_train and df_test (the training and testing DataFrames) already exist

# Show the columns of each frame so 'Holiday' and 'FunctioningDay' can be checked
# before concatenating
print("Before concatenation:")
print("Train columns:", df_train.columns)
print("Test columns:", df_test.columns)

# Stack the training rows on top of the testing rows, keeping the original index labels
df_full = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=False)

# Show the columns again after the concatenation
print("After concatenation:")
print("Full columns:", df_full.columns)

# Both categorical columns must exist before One-Hot encoding or other operations on them
if not {'Holiday', 'FunctioningDay'}.issubset(df_full.columns):
    print("Columns are missing, check earlier steps.")
else:
    # Safe to continue with One-Hot encoding or the desired operation
    print("Columns are present, proceeding with further processing.")


Before concatenation:
Train columns: Index(['DateHour', 'Temperature(F)', 'Humidity(%)', 'Wind speed (mph)', 'Visibility(miles)', 'DewPointTemperature(F)', 'Rainfall(in)', 'Snowfall(in)', 'SolarRadiation(MJ/m2)', 'Holiday', 'FunctioningDay', 'RENTALS', 'set'], dtype='object')
Test columns: Index(['DateHour', 'Temperature(F)', 'Humidity(%)', 'Wind speed (mph)', 'Visibility(miles)', 'DewPointTemperature(F)', 'Rainfall(in)', 'Snowfall(in)', 'SolarRadiation(MJ/m2)', 'Holiday', 'FunctioningDay', 'set'], dtype='object')
After concatenation:
Full columns: Index(['DateHour', 'Temperature(F)', 'Humidity(%)', 'Wind speed (mph)', 'Visibility(miles)', 'DewPointTemperature(F)', 'Rainfall(in)', 'Snowfall(in)', 'SolarRadiation(MJ/m2)', 'Holiday', 'FunctioningDay', 'RENTALS', 'set'], dtype='object')
Columns are present, proceeding with further processing.


In [9]:
## Feature engineering ##

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.linear_model import SGDRegressor  # Para Elastic Net
from sklearn.model_selection import GridSearchCV

## Concatenating the datasets ##
# df_train and df_test must already be defined and carry their 'set' label.
# df_full is rebuilt here, so the steps below do not depend on an earlier df_full.

df_full = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=False)

## Feature engineering ##

# Convert 'DateHour' to datetime; values that cannot be parsed become NaT instead of raising
df_full['DateHour'] = pd.to_datetime(df_full['DateHour'], errors='coerce')

# Extract calendar/time features from the timestamp
df_full['year'] = df_full['DateHour'].dt.year
df_full['month'] = df_full['DateHour'].dt.month
df_full['day'] = df_full['DateHour'].dt.day
df_full['hour'] = df_full['DateHour'].dt.hour
df_full['dayofweek'] = df_full['DateHour'].dt.dayofweek

# One-Hot encoding for 'Holiday' and 'FunctioningDay' (first level of each is dropped).
# `sparse_output` replaces the `sparse` keyword, which scikit-learn deprecated in 1.2
# and removed in 1.4; a dense array is needed to build the DataFrame below.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(df_full[['Holiday', 'FunctioningDay']])
encoded_features_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Holiday', 'FunctioningDay']), index=df_full.index)

# Append the encoded columns and drop the original categorical ones
df_full = pd.concat([df_full.drop(['Holiday', 'FunctioningDay'], axis=1), encoded_features_df], axis=1)

# Split the processed data back into training and testing sets using the 'set' label;
# the raw timestamp is dropped, and so is 'RENTALS' on the Kaggle rows
df_train_processed = df_full[df_full['set'] == 'Not Kaggle'].drop(['set', 'DateHour'], axis=1)
df_test_processed = df_full[df_full['set'] == 'Kaggle'].drop(['set', 'DateHour', 'RENTALS'], axis=1)

# Features / target for training, and features for the test set
X_train = df_train_processed.drop('RENTALS', axis=1)
y_train = df_train_processed['RENTALS']
X_test = df_test_processed

## Model training and evaluation ##

# KNN regressor: mean-impute missing values, standardize the features, then predict
# from the 5 nearest neighbours (Manhattan distance, distance-weighted)
knn_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=5, weights='distance', metric='manhattan'),
)
knn_pipeline.fit(X_train, y_train)  # fitted on the full training data

# Decision Tree Regressor with depth capped at 10 and a fixed random_state
dt_model = DecisionTreeRegressor(random_state=42, max_depth=10)
dt_model.fit(X_train, y_train)  # fitted on the full training data

# 5-fold cross-validated R² of each model on the training data
knn_scores = cross_val_score(estimator=knn_pipeline, X=X_train, y=y_train, scoring='r2', cv=5)
dt_scores = cross_val_score(estimator=dt_model, X=X_train, y=y_train, scoring='r2', cv=5)

# Report the mean R² across the folds
print("KNN R² score:", knn_scores.mean())
print("Decision Tree R² score:", dt_scores.mean())





## Predictions for the test set ##
predictions_dt = dt_model.predict(X_test)
predictions_knn = knn_pipeline.predict(X_test)

# Build the submission tables: the test-set index goes in the 'ID' column and the
# model's predictions in the 'RENTALS' column
submission_knn = pd.DataFrame({'ID': df_test_processed.index}).assign(RENTALS=predictions_knn)
submission_dt = pd.DataFrame({'ID': df_test_processed.index}).assign(RENTALS=predictions_dt)

KNN R² score: 0.6705283065215774
Decision Tree R² score: 0.5499507598395843




In [10]:
from sklearn.model_selection import GridSearchCV

# KNN pipeline with imputation and feature scaling; the regressor's hyper-parameters
# are left at their defaults because the grid search below sets them
knn_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    KNeighborsRegressor(),
)

# Search space for GridSearchCV, keyed by '<pipeline step>__<parameter>'
knn_params = {
    'kneighborsregressor__n_neighbors': range(3, 10),
    'kneighborsregressor__weights': ['uniform', 'distance'],
    'kneighborsregressor__metric': ['euclidean', 'manhattan'],
}

# Grid search for KNN: 5-fold cross-validation scored by R², n_jobs=-1
grid_search_knn = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search_knn.fit(X_train, y_train)

# Best parameters and best cross-validated score for KNN
print("Mejores parámetros para KNN:", grid_search_knn.best_params_)
print("Mejor score R² para KNN:", grid_search_knn.best_score_)


Mejores parámetros para KNN: {'kneighborsregressor__metric': 'manhattan', 'kneighborsregressor__n_neighbors': 8, 'kneighborsregressor__weights': 'distance'}
Mejor score R² para KNN: 0.6759888564703906


In [13]:
# Decision Tree model with a fixed random_state; the remaining hyper-parameters are
# set by the grid search below
dt_model = DecisionTreeRegressor(random_state=42)

# Search space for GridSearchCV
dt_params = {
    'max_depth': range(3, 20),
    'min_samples_split': range(2, 10),
    'min_samples_leaf': range(1, 10),
}

# Grid search for the Decision Tree: 5-fold cross-validation scored by R², n_jobs=-1
grid_search_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

# Best parameters and best cross-validated score for the Decision Tree
print("Mejores parámetros para Decision Tree:", grid_search_dt.best_params_)
print("Mejor score R² para Decision Tree:", grid_search_dt.best_score_)


Mejores parámetros para Decision Tree: {'max_depth': 8, 'min_samples_leaf': 5, 'min_samples_split': 2}
Mejor score R² para Decision Tree: 0.606181843317333


In [None]:
# Assumes df_full is the complete DataFrame holding both the Kaggle rows and the modeling rows

# Kaggle rows, returned as a new frame without the 'set' identifier
kaggle_data = df_full.loc[df_full['set'].eq('Kaggle')].drop(columns='set')

# Model-building rows, returned as a new frame without the 'set' identifier
df = df_full.loc[df_full['set'].eq('Not Kaggle')].drop(columns='set')