In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# setting pandas print options (optional)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## importing data ##

file_path = './datasets/chicago_training_data.xlsx'

# reading modeling data into Python
modeling_data = './datasets/train.xlsx'


# calling this df_train
df_train = pd.read_excel(io=modeling_data, sheet_name='data', header=0, index_col='ID')

# reading testing data into Python
testing_data = './datasets/test.xlsx'

# calling this df_test
df_test = pd.read_excel(io=testing_data, sheet_name='data', header=0, index_col='ID')

# Correctly convert 'DateHour' to datetime format
df_train['DateHour'] = pd.to_datetime(df_train['DateHour'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_test['DateHour'] = pd.to_datetime(df_test['DateHour'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Define feature engineering function
def extract_time_features(df):
    df['Hour'] = df['DateHour'].dt.hour
    df['DayOfWeek'] = df['DateHour'].dt.dayofweek
    df['Month'] = df['DateHour'].dt.month
    return df


# concatenating datasets together for mv analysis and feature engineering
df_train['set'] = 'Not Kaggle'
df_test['set'] = 'Kaggle'

# concatenating both datasets together for mv and feature engineering
df_full = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=False)

# checking data
print(df_full.head(n=5))

# checking available features
print(df_full.columns)


                   DateHour  Temperature(F)  Humidity(%)  Wind speed (mph)  Visibility(miles)  DewPointTemperature(F)  Rainfall(in)  Snowfall(in)  SolarRadiation(MJ/m2) Holiday FunctioningDay  RENTALS         set
ID                                                                                                                                                                                                                  
mb_1039                 NaT              52           81               0.4                2.9                    46.4           0.0           0.0                   0.00      No            Yes    519.0  Not Kaggle
mb_1330                 NaT              51           53               2.2                NaN                    35.2           0.0           0.0                   1.01      No            Yes   1251.0  Not Kaggle
mb_551                  NaT              56           49               2.5                3.4                    38.8           0.0           0.0   

In [12]:
## Feature Engineering ##

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

# Eliminar infer_datetime_format=True
df_full['DateHour'] = pd.to_datetime(df_full['DateHour'], errors='coerce')

# Función para extraer características temporales
def extract_time_features(df):
    df['Hour'] = df['DateHour'].dt.hour
    df['DayOfWeek'] = df['DateHour'].dt.dayofweek
    df['Month'] = df['DateHour'].dt.month
    df['IsWeekend'] = (df['DateHour'].dt.dayofweek >= 5).astype(int)
    return df

df_full = extract_time_features(df_full)

# Asegurarse de que no hay valores NaN en la variable objetivo antes de dividir los datos
df_full = df_full.dropna(subset=['RENTALS'])

def apply_feature_engineering(df):
    df['DateHour'] = pd.to_datetime(df['DateHour'], errors='coerce')
    df = extract_time_features(df)
    df['LogRainfall'] = np.log1p(df['Rainfall(in)'].fillna(0))  # Using fillna(0) as a placeholder
    df['LogSnowfall'] = np.log1p(df['Snowfall(in)'].fillna(0))
    return df

# Apply feature engineering to the full dataset
df_full = apply_feature_engineering(df_full)

# Transformación logarítmica para Rainfall(in) y Snowfall(in) para normalizar la distribución
df_full['LogRainfall'] = np.log1p(df_full['Rainfall(in)'])
df_full['LogSnowfall'] = np.log1p(df_full['Snowfall(in)'])


# Definir transformaciones específicas para columnas numéricas y categóricas
numeric_features = ['Temperature(F)', 'Humidity(%)', 'Wind speed (mph)', 
                    'Visibility(miles)', 'DewPointTemperature(F)', 'LogRainfall', 
                    'LogSnowfall', 'SolarRadiation(MJ/m2)', 'Hour', 'DayOfWeek', 'Month']
categorical_features = ['Holiday', 'FunctioningDay', 'IsWeekend']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Integrar todas las transformaciones en un preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

df_full = df_full.copy()
df_full['DateHour'] = pd.to_datetime(df_full['DateHour'], errors='coerce')
df_full = extract_time_features(df_full)

# Dividir el conjunto de datos en entrenamiento y prueba (opcionalmente, considera estratificar)
from sklearn.model_selection import train_test_split

# Aplicar el preprocesamiento al conjunto de datos
X = df_full.drop(['RENTALS', 'set'], axis=1)
y = df_full['RENTALS'].dropna()  # Asegurarse de no tener NaNs en y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ajustar y transformar los datos de entrenamiento y prueba con el preprocesador
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [13]:
kaggle_data = df_full[df_full['set'] == 'Kaggle'].copy()
df = df_full[df_full['set'] == 'Not Kaggle'].copy()

# Eliminando la columna 'set'
kaggle_data.drop(['set'], axis=1, inplace=True)
df.drop(['set'], axis=1, inplace=True)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, Ridge, SGDRegressor

# Supongamos que df_train y df_test son tus DataFrames de entrenamiento y prueba ya cargados
# Aquí asumiremos que 'kaggle_data' es df_test para este ejemplo

# Preparar el preprocesador con las transformaciones para cada tipo de columna
numeric_features = ['Temperature(F)', 'Humidity(%)', 'Wind speed (mph)', 'Visibility(miles)', 'DewPointTemperature(F)', 'Rainfall(in)', 'Snowfall(in)', 'SolarRadiation(MJ/m2)', 'Hour', 'DayOfWeek', 'Month']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Holiday', 'FunctioningDay']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

def extract_time_features(df):
    df['Hour'] = df['DateHour'].dt.hour
    df['DayOfWeek'] = df['DateHour'].dt.dayofweek
    df['Month'] = df['DateHour'].dt.month
    return df

# Asegúrate de aplicar esta función a tus datos de entrenamiento y prueba antes del preprocesamiento
df_train = extract_time_features(df_train)
df_test = extract_time_features(df_test)

# Ajustar el preprocesador solo a los datos de entrenamiento
X_train = df_train.drop('RENTALS', axis=1)
y_train = df_train['RENTALS']
X_test = df_test.drop('RENTALS', axis=1, errors='ignore')  # Asumiendo que df_test puede no tener 'RENTALS'

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Definición del directorio de salida para los archivos de envío
output_dir = "./model_output"
os.makedirs(output_dir, exist_ok=True)

models = {
    "Lasso": Lasso(alpha=2.782559402207126, random_state=42),
    "Ridge": Ridge(alpha=21.54434690031882, random_state=42),
    "ElasticNet": SGDRegressor(alpha=0.01, l1_ratio=0.25, penalty='elasticnet', max_iter=1000, tol=1e-3, random_state=42),
    "DecisionTree": DecisionTreeRegressor(max_depth=10, min_samples_leaf=5, min_samples_split=2, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5, weights='distance')
}

for name, model in models.items():
    # Entrenar el modelo
    model.fit(X_train_preprocessed, y_train)
    
    # Predecir en el conjunto de prueba de Kaggle
    y_pred_kaggle = model.predict(X_test_preprocessed)
    
    # Crear DataFrame de envío
    submission_df = pd.DataFrame({'ID': df_test.index, 'RENTALS': y_pred_kaggle})
    submission_file_path = os.path.join(output_dir, f"Kaggle_Submission_{name}.csv")
    submission_df.to_csv(submission_file_path, index=False)
    print(f"Submission file for {name} created at {submission_file_path}")


Submission file for Lasso created at ./model_output\Kaggle_Submission_Lasso.csv
Submission file for Ridge created at ./model_output\Kaggle_Submission_Ridge.csv
Submission file for ElasticNet created at ./model_output\Kaggle_Submission_ElasticNet.csv
Submission file for DecisionTree created at ./model_output\Kaggle_Submission_DecisionTree.csv
Submission file for KNN created at ./model_output\Kaggle_Submission_KNN.csv
