In [12]:
pip install h3

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import h3
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
#from tensorflow.keras.optimizers import Adam
#import papermill as pm
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
import concurrent.futures
import random
import os 
#from tensorflow import keras
#from tensorflow.keras import layers
import copy
#import tensorflow as tf

In [3]:
# Load the cleaned taxi trip data from the local data directory.
file_path = "./data/"
clean_trips_csv = f"{file_path}clean_taxi_data.csv"

trips_df = pd.read_csv(clean_trips_csv)



In [8]:
# Preview the first five rows of the loaded trips.
trips_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,taxi_id,trip_start,trip_end,trip_seconds,trip_miles,pickup_census,dropoff_census,fare,pickup_location,dropoff_location,start_day,end_day,start_time,end_time
0,16,13,2019-01-01 00:00:00,2019-01-01 00:15:00,600.0,0.0,17031081402,17031839100,9.0,POINT (-87.6129454143 41.8919715078),POINT (-87.6327464887 41.8809944707),2019-01-01,2019-01-01,00:00:00,00:15:00
1,18,15,2019-01-01 00:00:00,2019-01-01 00:30:00,1260.0,0.6,17031030800,17031841900,29.5,POINT (-87.6641882421 41.9799124453),POINT (-87.6429586652 41.8679024175),2019-01-01,2019-01-01,00:00:00,00:30:00
2,19,16,2019-01-01 00:00:00,2019-01-01 00:00:00,120.0,0.3,17031839100,17031320400,4.0,POINT (-87.6327464887 41.8809944707),POINT (-87.6219716519 41.8774061234),2019-01-01,2019-01-01,00:00:00,00:00:00
3,20,17,2019-01-01 00:00:00,2019-01-01 00:15:00,360.0,0.8,17031081300,17031081500,5.75,POINT (-87.6207628651 41.8983317935),POINT (-87.6262149064 41.8925077809),2019-01-01,2019-01-01,00:00:00,00:15:00
4,22,19,2019-01-01 00:00:00,2019-01-01 00:15:00,360.0,1.0,17031081403,17031081700,6.25,POINT (-87.6188683546 41.8909220259),POINT (-87.6318639497 41.8920421365),2019-01-01,2019-01-01,00:00:00,00:15:00


In [None]:
def preprocess_data(df, train_ratio=0.75, validation_ratio=0.15,
                    features=None, features_to_scale=None, target='demand',
                    random_state=None):
    """Split a DataFrame into train/validation/test sets and standardize selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    train_ratio : float
        Fraction of rows used for training.
    validation_ratio : float
        Fraction of rows used for validation. The remaining
        ``1 - train_ratio - validation_ratio`` rows form the test set.
    features : list of str, optional
        Feature columns. Defaults to
        ``['pickup_census', 'trip_start', 'start_day', 'pickup_location']``.
    features_to_scale : list of str, optional
        Subset of ``features`` to standardize with a ``StandardScaler`` fitted
        on the training split only. ``None`` or empty means no scaling.
    target : str
        Name of the target column.
    random_state : int, optional
        Seed for both splits. When ``None``, a notebook-level ``RANDOM_STATE``
        constant is used if one exists, otherwise 42.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If the ratios do not leave a non-empty train, validation and test
        share, or if ``features_to_scale`` is not a subset of ``features``.
    KeyError
        If a feature or the target column is missing from ``df``.
    """
    if features is None:
        features = ['pickup_census', 'trip_start', 'start_day', 'pickup_location']
    else:
        features = list(features)
    features_to_scale = [] if features_to_scale is None else list(features_to_scale)
    if random_state is None:
        # Honour a config-cell constant when the notebook defines one.
        random_state = globals().get('RANDOM_STATE', 42)

    # Validate the split before touching the data; the small epsilon guards
    # against float round-off producing a "positive" but empty test share.
    test_ratio = (1 - train_ratio) - validation_ratio
    if not (0 < train_ratio < 1) or validation_ratio <= 0 or test_ratio <= 1e-9:
        raise ValueError(
            "train_ratio and validation_ratio must be positive and sum to less than 1 "
            f"(got train_ratio={train_ratio}, validation_ratio={validation_ratio})"
        )

    missing_columns = [col for col in features + [target] if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Columns missing from DataFrame: {missing_columns}")

    unknown_scaled = [col for col in features_to_scale if col not in features]
    if unknown_scaled:
        raise ValueError(f"features_to_scale must be a subset of features: {unknown_scaled}")

    # Select features and target (copies, so the caller's frame is untouched)
    X = df[features].copy()
    y = df[target].copy()

    # First split off the training set, then divide the remainder into
    # validation and test according to their relative shares.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=(1 - train_ratio), random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout,
        test_size=test_ratio / (validation_ratio + test_ratio),
        random_state=random_state,
    )

    # Copy so that assigning scaled columns never writes into a view.
    X_train = X_train.copy()
    X_val = X_val.copy()
    X_test = X_test.copy()

    if features_to_scale:
        # Fit on the training split only to avoid leaking validation/test statistics.
        scaler = StandardScaler()
        scaler.fit(X_train[features_to_scale])
        for split in (X_train, X_val, X_test):
            split[features_to_scale] = scaler.transform(split[features_to_scale])

    return (X_train, X_val, X_test, y_train, y_val, y_test)

In [None]:
def optimize_hyperparameters(param_grid, model, X, y, randomized=False):
    """Run a cross-validated hyperparameter search and return the fitted search object.

    Parameters
    ----------
    param_grid : dict
        Parameter grid (or distributions) handed to the search.
    model : estimator
        Estimator whose hyperparameters are tuned.
    X, y :
        Data the search is fitted on.
    randomized : bool
        If True use ``RandomizedSearchCV``; otherwise an exhaustive
        ``GridSearchCV`` with ``verbose=3``.

    Returns
    -------
    The fitted search object; its best parameters and best score are printed.
    """
    search = (
        RandomizedSearchCV(model, param_grid)
        if randomized
        else GridSearchCV(model, param_grid, verbose=3)
    )

    search.fit(X, y)
    print(f"Best params: {search.best_params_}")
    print(f"Scoring: {search.best_score_}")
    return search

In [None]:
# Model evaluation helper
def evaluate_model(y, y_pred):
    """Compute regression metrics for predictions against ground truth.

    Parameters
    ----------
    y :
        True target values.
    y_pred :
        Predicted target values.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)`` where rmse is the square root of the mean squared error.
    """
    root_mean_sq_err = np.sqrt(mean_squared_error(y, y_pred))
    return (
        root_mean_sq_err,
        mean_absolute_error(y, y_pred),
        r2_score(y, y_pred),
    )

In [None]:
# Hyperparameters for the neural network training
EPOCHS = 10
BATCH_SIZE = 128
LEARNING_RATE = 0.0025  # higher learning rate as we have a bad GPU
LOSS = 'mean_squared_error'
# NOTE(review): `keras` is used here, but the `from tensorflow import keras`
# import in the imports cell is commented out, so this line raises NameError on
# a fresh kernel. Also note this is a single optimizer instance shared by
# everything that is compiled with it.
OPTIMIZER = keras.optimizers.Adam(learning_rate=LEARNING_RATE)


# Training multiprocessing
# It seems that on a 16-core CPU this can be increased to more than 8.
MAX_WORKER_THREADS = 6

In [None]:
# Model Architecture

# All datasets are expected to share the same feature layout, so any entry can
# be used to read the input width. Take the first one deterministically rather
# than a random one, so the notebook-level X_train/... names are reproducible
# across "Restart & Run All".
# NOTE(review): the width is read from `datasets`, while training below iterates
# `datasets_reduced_poly`; confirm both have the same number of feature columns.
# NOTE(review): `keras` and `layers` are used here but their imports are
# commented out in the imports cell.
X_train, X_val, X_test, y_train, y_val, y_test = next(iter(datasets.values()))

# Template network: it is cloned for every dataset that gets trained.
model_abstract = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Input layer, one unit per feature column
    layers.Dense(128, activation='relu'),     # Hidden layer 1
    layers.Dropout(0.3),                      # Dropout layer for regularization
    layers.Dense(64, activation='relu'),      # Hidden layer 2
    layers.Dropout(0.2),                      # Dropout layer for regularization
    layers.Dense(32, activation='relu'),      # Hidden layer 3
    layers.Dropout(0.1),                      # Dropout layer for regularization
    layers.Dense(1)                           # Output layer (single regression value)
])
model_abstract.compile(optimizer=OPTIMIZER, loss=LOSS)

# Training Loop

def train_with_dataset_and_parameters(dataset_name, dataset_values):
    """Train a fresh clone of ``model_abstract`` on one dataset and record its results.

    Parameters
    ----------
    dataset_name : hashable
        Key under which the results are stored.
    dataset_values : tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``; the validation
        split is not used by this function.

    Side effects
    ------------
    Stores the test metrics, the trained model and its permutation importance
    in the notebook-level dicts ``nn_metrics``, ``nn_models`` and
    ``nn_importance`` under ``dataset_name``.
    """
    X_train, _, X_test, y_train, _, y_test = dataset_values

    model = keras.models.clone_model(model_abstract)

    # Each model gets its own optimizer built from the shared configuration.
    # Re-using the single OPTIMIZER instance would make all models (trained
    # concurrently in worker threads) share and overwrite one optimizer state.
    optimizer = type(OPTIMIZER).from_config(OPTIMIZER.get_config())
    model.compile(optimizer=optimizer, loss=LOSS)

    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

    y_preds = model.predict(X_test)

    def _neg_mse(estimator, X, y):
        # Keras models have no sklearn-style `score` method, so
        # permutation_importance needs an explicit scorer (higher is better).
        return -mean_squared_error(y, estimator.predict(X, verbose=0))

    nn_metrics[dataset_name] = evaluate_model(y_test, y_preds)
    nn_models[dataset_name] = model
    nn_importance[dataset_name] = permutation_importance(
        model, X_test, y_test, scoring=_neg_mse, n_repeats=10, random_state=42
    )


# Result containers filled by train_with_dataset_and_parameters, keyed by dataset name
nn_metrics = {}
nn_models = {}
nn_importance = {}

# Parallel execution using concurrent.futures.
# The names are captured once so every future can be matched to its dataset.
dataset_names = list(datasets_reduced_poly)
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKER_THREADS) as executor:
    futures = [executor.submit(train_with_dataset_and_parameters, dataset_name, datasets_reduced_poly[dataset_name])
               for dataset_name in dataset_names]

# Leaving the `with` block already waits for all futures to complete, so no
# extra wait() is needed. Report every failure together with its dataset name;
# exceptions raised in worker threads would otherwise go unnoticed.
for dataset_name, future in zip(dataset_names, futures):
    exception = future.exception()
    if exception:
        print(f"Exception in future ({dataset_name}): {exception}")