In [54]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
import random
import matplotlib.pyplot as plt

In [47]:
#load cleaned data

file_path = "C:/Users/Janis/aaa-2024-group-5/data/"

# loading the different spatial resolution for 1H aggregation
df_census_1H = pd.read_csv(f"{file_path}features_census_1H.csv")
df_hex6_1H = pd.read_csv(f"{file_path}features_hex6_1H.csv")
df_hex7_1H = pd.read_csv(f"{file_path}features_hex7_1H.csv")

In [55]:
## drawing random samples from each data frame to account for limited computing power

# Set a random seed for reproducibility
random_seed = 42

# Select a subset of 5000 rows from each data frame
df_census_subset = df_census_1H.sample(n=10000, random_state=random_seed)
df_hex6_subset = df_hex6_1H.sample(n=10000, random_state=random_seed)
df_hex7_subset = df_hex7_1H.sample(n=10000, random_state=random_seed)

In [49]:
def preprocess_data(df, train_ratio=0.75, validation_ratio=0.15):
    features = ['trip_seconds', 'trip_miles', 'fare', 'temp', 'precip', 'preciprob', 'snow', 'snowdepth', 'windspeed',
                'snow_binary', 'rain_binary', 'min_dist_airport', 'dist_centre', 'num_stadiums', 'num_hotels',
                'num_bars', 'perc_transport', 'perc_resid', 'perc_commerc', 'perc_open', 'hour', 'day_of_week',
                'month', 'weekend_binary', 'bar_hours', 'morning_commuting', 'evening_commuting', 'bar_hours_weekend',
                'morning_commuting_week', 'evening_commuting_week']
    target = 'rides'
    
    X = df[features]
    y = df[target]
    
    # Splitting data
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_ratio, random_state=random_seed)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=validation_ratio / (1 - train_ratio), random_state=random_seed)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [50]:
# Neural Network model
def create_model(learning_rate=0.01, dropout_rate=0.2):
    model = models.Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(64, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ])
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [51]:
class KerasRegressorWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, learning_rate=0.01, dropout_rate=0.2):
        self.learning_rate = learning_rate
        self.dropout_rate = dropout_rate
        self.model = create_model(learning_rate=self.learning_rate, dropout_rate=self.dropout_rate)
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=10, batch_size=32, verbose=0)
        return self
    
    def predict(self, X):
        return self.model.predict(X).flatten()
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return -mean_squared_error(y, y_pred)  # Scikit-learn expects higher scores for better models

In [52]:
# Prepare datasets for training
datasets = {
    'census': preprocess_data(df_census_subset),
    'hex6': preprocess_data(df_hex6_subset),
    'hex7': preprocess_data(df_hex7_subset)
}

# Grid search parameters
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'dropout_rate': [0.1, 0.2, 0.3]
}

In [53]:
# Train models and perform grid search
for name, (X_train, X_val, X_test, y_train, y_val, y_test) in datasets.items():
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    model = KerasRegressorWrapper()
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(X_test_scaled)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Results for {name}:")
    print(f"Best parameters: {best_params}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R^2 Score: {r2}")
    print()

Results for census:
Best parameters: {'dropout_rate': 0.3, 'learning_rate': 0.01}
Mean Squared Error: 758.0491409077368
Root Mean Squared Error: 27.532692220481035
R^2 Score: 0.5565353396950109

Results for hex6:
Best parameters: {'dropout_rate': 0.3, 'learning_rate': 0.001}
Mean Squared Error: 10743.503940384528
Root Mean Squared Error: 103.65087525141564
R^2 Score: 0.8911294611983469

Results for hex7:
Best parameters: {'dropout_rate': 0.3, 'learning_rate': 0.01}
Mean Squared Error: 2256.9109913702955
Root Mean Squared Error: 47.50695729438264
R^2 Score: 0.8203198169833212

