In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Widen pandas' console display limits so large frames are not truncated when printed
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)


## importing data ##

# NOTE(review): `file_path` is not read anywhere in this cell and points at a
# different workbook than the two loaded below; kept in case a later cell uses it.
file_path = './datasets/chicago_training_data.xlsx'

# Reading training data into Python (sheet 'data', rows indexed by the 'ID' column)
modeling_data = './datasets/train.xlsx'
df_train = pd.read_excel(io=modeling_data, sheet_name='data', header=0, index_col='ID')

# Reading testing data into Python (same sheet name and index column as the training file)
testing_data = './datasets/test.xlsx'
df_test = pd.read_excel(io=testing_data, sheet_name='data', header=0, index_col='ID')

# Tag every row with its origin so the two sets can be told apart after concatenation
df_train['set'] = 'Not Kaggle'
df_test['set'] = 'Kaggle'

# Concatenating both datasets together for missing value analysis and feature engineering
df_full = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=False)

# Build the prepared training frame from the freshly loaded training data.
# `df_train_prepared` must be assigned here before it is used: reading it without a prior
# assignment raises NameError on a fresh kernel (Restart & Run All).
# NOTE(review): confirm that no further preparation of this frame was intended at this point.
# 'DateHour' is dropped if present; errors='ignore' makes the drop a no-op otherwise.
df_train_prepared = df_train.drop(columns=['DateHour'], errors='ignore')

# Feature matrix: everything except the target 'RENTALS' and the 'set' origin tag
# (errors='ignore' so a missing column does not raise)
X = df_train_prepared.drop(['RENTALS', 'set'], axis=1, errors='ignore')

# Checking the concatenated data
print(df_full.head(n=5))

In [None]:
# Descriptive statistics over every column (numeric and non-numeric) of the combined data
summary_statistics = df_full.describe(include='all')
print(summary_statistics)

# Number of missing entries per column in the combined data
missing_values = df_full.isna().sum()
print(missing_values)

# Correlations are computed on the numeric columns of the training data only
numeric_df_train = df_train.select_dtypes(include=[np.number])
correlation_matrix_numeric = numeric_df_train.corr()

# Correlation of each numeric column with the target 'RENTALS', strongest positive first
rentals_correlation = correlation_matrix_numeric['RENTALS']
print(rentals_correlation.sort_values(ascending=False))


# Level counts for the two categorical columns
for flag_column in ('Holiday', 'FunctioningDay'):
    print(df_full[flag_column].value_counts())

# Mean target per level of each categorical column (training data only, where the target exists)
if 'RENTALS' in df_train.columns:
    for flag_column in ('Holiday', 'FunctioningDay'):
        print(df_train.groupby(flag_column)['RENTALS'].mean())


In [None]:
#  Data Imputation

# Columns whose missing values are replaced by a column mean
columns_to_impute = ['Visibility(miles)', 'DewPointTemperature(F)', 'SolarRadiation(MJ/m2)']

# Learn the means from the training rows only ('set' == 'Not Kaggle') so that statistics of the
# Kaggle/test rows do not leak into preprocessing, then fill the gaps in every row of df_full.
# The mask is passed as a plain boolean array so row selection is positional and does not
# depend on index alignment.
imputer = SimpleImputer(strategy='mean')
is_training_row = (df_full['set'] == 'Not Kaggle').to_numpy()
imputer.fit(df_full.loc[is_training_row, columns_to_impute])
df_full[columns_to_impute] = imputer.transform(df_full[columns_to_impute])

In [None]:
# Categorical Feature Encoding

# One-hot encode 'Holiday' and 'FunctioningDay', dropping the first level of each column
categorical_features = ['Holiday', 'FunctioningDay']
encoder = OneHotEncoder(drop='first', sparse_output=False)  # `sparse_output` requires scikit-learn >= 1.2
encoded_features = encoder.fit_transform(df_full[categorical_features])

# Wrap the dense encoder output in a DataFrame aligned on df_full's index;
# column names come from get_feature_names_out() (available from scikit-learn 1.0)
encoded_features_df = pd.DataFrame(
    data=encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
    index=df_full.index,
)

# Swap the original categorical columns for their encoded counterparts
df_full_encoded = pd.concat(
    [df_full.drop(columns=categorical_features), encoded_features_df],
    axis=1,
)

In [None]:
#  Creating Interaction Terms

# Product of temperature and humidity (intended as a rough "feels like" signal)
df_full_encoded['Temp_Humidity_Interaction'] = (
    df_full_encoded['Temperature(F)'].mul(df_full_encoded['Humidity(%)'])
)

# Solar radiation multiplied by a 0/1 flag that is 1 only where 'Rainfall(in)' equals 0
no_rain_flag = df_full_encoded['Rainfall(in)'].eq(0).astype(int)
df_full_encoded['SolarRadiation_NonRainy'] = (
    df_full_encoded['SolarRadiation(MJ/m2)'].mul(no_rain_flag)
)


In [None]:
# Report which columns of X are not numeric
non_numeric_columns = X.select_dtypes(exclude=[np.number]).columns
print(f"Non-numeric columns: {list(non_numeric_columns)}")

# Remove 'DateHour' from X when it is still there
if 'DateHour' in X.columns:
    X = X.drop(columns=['DateHour'])

# Report whether the column is gone
print(
    "DateHour column still present."
    if 'DateHour' in X.columns
    else "DateHour column successfully removed."
)

# Only the numeric columns of X are scaled
numeric_features = list(X.select_dtypes(include=[np.number]).columns)

# Standardize the numeric features (the scaler is fitted on X itself)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.loc[:, numeric_features])

# Put the scaled array back into a DataFrame carrying X's index and the numeric column names
X_scaled_df = pd.DataFrame(data=X_scaled, columns=numeric_features, index=X.index)