This notebook is my attempt to try to create a baseline for the main purpose of my project: predicting the trip time duration (minimizing the RMSE).  I pulled the algorithm from: https://cseweb.ucsd.edu/classes/wi17/cse258-a/reports/a077.pdf

In essence, they created clusters of the geopoints and used the clusters as a feature instead of the geopoints.  They one-hot encoded most of the features since they were categorical.  They then trained a Gradient Boosting Regressor over the data and achieved a RMSE of 4.87 (minutes) for the trip duration.

I tried to replicate their efforts to the best of my understanding.  I continued to use some of the extra features that I had generated, such as weather data and time to sun rise/set.  Unfortunately, I ran into a memory issue when trying to train on all 1 million rows of the training data, so I limited myself to using a random sample of 50000 records from the training data.  Using their method, I achieved a RMSE of 5.66 for the test data.

In [1]:
import pandas as pd
import math
from random import shuffle
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import random
from random import shuffle

random.seed(6789)
pd.set_option('display.max_columns', None)

In [2]:
train_data_load = pd.read_pickle("data/earlyAprilDataEnriched")
test_data_load = pd.read_pickle("data/lateAprilDataEnriched")

In [3]:
train_data_load[0:10]

Unnamed: 0,VendorID,lpep_pickup_datetime,Lpep_dropoff_datetime,Store_and_fwd_flag,RateCodeID,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,Passenger_count,Trip_type,VincentyMiles,AvgTemp,Precipitation,NewSnow,trip_length_seconds,trip_length_minutes_rounded,pickup_minute,pickup_hour,pickup_day,pickup_weekday,dropoff_minute,dropoff_hour,dropoff_day,dropoff_weekday,pickup_sunrise,pickup_sunset,dropoff_sunrise,dropoff_sunset
4532400,2,2016-04-01 00:02:03,2016-04-01 00:05:53,0,1,-73.99118,40.685608,-73.984116,40.69598,1,1.0,0.806173,59.0,0.02,0.0,230.0,4.0,2,0,1,4,5,0,1,4,0.0,0.0,0.0,0.0
4532401,2,2016-04-01 00:01:31,2016-04-01 00:05:55,0,1,-73.844292,40.721432,-73.850441,40.724144,1,1.0,0.373149,59.0,0.02,0.0,264.0,4.0,1,0,1,4,5,0,1,4,0.0,0.0,0.0,0.0
4532402,2,2016-04-01 00:00:57,2016-04-01 00:07:36,0,1,-73.944008,40.714539,-73.938705,40.724926,1,1.0,0.768917,59.0,0.02,0.0,399.0,7.0,0,0,1,4,7,0,1,4,0.0,0.0,0.0,0.0
4532403,2,2016-04-01 00:01:22,2016-04-01 00:06:12,0,1,-73.952789,40.810749,-73.963509,40.796486,1,1.0,1.133387,59.0,0.02,0.0,290.0,5.0,1,0,1,4,6,0,1,4,0.0,0.0,0.0,0.0
4532404,2,2016-04-01 00:00:56,2016-04-01 00:05:25,0,1,-73.991249,40.691433,-73.988762,40.683598,3,1.0,0.556217,59.0,0.02,0.0,269.0,4.0,0,0,1,4,5,0,1,4,0.0,0.0,0.0,0.0
4532405,2,2016-04-01 00:00:47,2016-04-01 00:14:49,0,1,-73.968704,40.677856,-73.935036,40.651569,1,1.0,2.533694,59.0,0.02,0.0,842.0,14.0,0,0,1,4,14,0,1,4,0.0,0.0,0.0,0.0
4532406,2,2016-04-01 00:00:07,2016-04-01 00:03:41,0,1,-73.957878,40.71104,-73.955887,40.707653,1,1.0,0.256062,59.0,0.02,0.0,214.0,4.0,0,0,1,4,3,0,1,4,0.0,0.0,0.0,0.0
4532407,2,2016-04-01 00:00:13,2016-04-01 00:18:43,0,1,-73.960648,40.719345,-73.917854,40.781212,1,1.0,4.823582,59.0,0.02,0.0,1110.0,18.0,0,0,1,4,18,0,1,4,0.0,0.0,0.0,0.0
4532408,2,2016-04-01 00:00:34,2016-04-01 00:12:19,0,1,-73.984062,40.676144,-74.027718,40.632233,1,1.0,3.800473,59.0,0.02,0.0,705.0,12.0,0,0,1,4,12,0,1,4,0.0,0.0,0.0,0.0
4532409,2,2016-04-01 00:47:13,2016-04-01 00:59:30,0,1,-73.884544,40.755604,-73.856026,40.745335,1,1.0,1.655823,59.0,0.02,0.0,737.0,12.0,47,0,1,4,59,0,1,4,0.0,0.0,0.0,0.0


In [4]:
geoPoints_raw = np.concatenate((train_data_load[["Pickup_latitude", "Pickup_longitude"]].values, train_data_load[["Dropoff_latitude", "Dropoff_longitude"]].values))
geoPoints = np.unique(geoPoints_raw, axis = 0)

kmeans = MiniBatchKMeans(n_clusters=40, random_state=23456).fit(geoPoints)

In [6]:
def getInverseForSundata(x):
    if x == 0:
        return x
    else:
        return 1/x
    
def oneHotEncodedData(orig_data, get_minutes = False):
    label_column = "trip_length_seconds"
    if get_minutes:
        label_column = "trip_length_minutes_rounded"
    labels_array = orig_data[label_column].copy().values
    labels = np.reshape(labels_array, (len(labels_array), 1))
    clean_data = orig_data.drop(labels = ["lpep_pickup_datetime", "Lpep_dropoff_datetime", "trip_length_seconds", "trip_length_minutes_rounded"], axis = 1)
    clean_data = pd.get_dummies(clean_data, columns = ["pickup_weekday", "dropoff_weekday", "RateCodeID", "VendorID", "Trip_type "])
    clean_data["pickup_minute_ofday"] = clean_data["pickup_minute"] + clean_data["pickup_hour"]*60
    clean_data["dropoff_minute_ofday"] = clean_data["dropoff_minute"] + clean_data["dropoff_hour"]*60
    clean_data["pickup_sunset_inverse"] = clean_data["pickup_sunset"].map(lambda x: getInverseForSundata(x))
    clean_data["pickup_sunrise_inverse"] = clean_data["pickup_sunrise"].map(lambda x: getInverseForSundata(x))
    clean_data["dropoff_sunset_inverse"] = clean_data["dropoff_sunset"].map(lambda x: getInverseForSundata(x))
    clean_data["dropoff_sunrise_inverse"] = clean_data["dropoff_sunrise"].map(lambda x: getInverseForSundata(x))
    clean_data = clean_data.drop(labels = ["pickup_minute", "pickup_hour", "dropoff_minute", "dropoff_hour",
                                           "pickup_sunset", "pickup_sunrise", "dropoff_sunset", "dropoff_sunrise"], axis = 1)
    clean_data["pickup_cluster"] = pd.Series(kmeans.predict(clean_data[["Pickup_latitude", "Pickup_longitude"]].values), index=clean_data.index)
    clean_data["dropoff_cluster"] = pd.Series(kmeans.predict(clean_data[["Dropoff_latitude", "Dropoff_longitude"]].values), index=clean_data.index)
    clean_data["both_cluster"] = clean_data[["pickup_cluster", "dropoff_cluster"]].apply(lambda x: "{}-{}".format(x[0], x[1]), axis = 1)
    clean_data = pd.get_dummies(clean_data, columns = ["both_cluster"])
    clean_data = clean_data.drop(labels = ["pickup_cluster", "dropoff_cluster", "Pickup_latitude", "Pickup_longitude", "Dropoff_latitude", "Dropoff_longitude"], axis = 1)

    return (clean_data.values, labels)

I have to do the one hot encoding on the pickup/dropoff cluster pairs with the data all at once so that the features are the same for the train_data and test_data.

In [16]:
try_with_minutes = True
train_indices = train_data_load.index.values
test_indices = test_data_load.index.values
all_data = train_data_load.append(test_data_load)
all_data_oh, all_labels_oh = oneHotEncodedData(all_data, try_with_minutes)

In [40]:
train_data = all_data_oh[0:len(train_indices)]
test_data = all_data_oh[len(train_indices):]
train_labels = all_labels_oh[0:len(train_indices)]
test_labels = all_labels_oh[len(train_indices):]

In [21]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators = 30, max_depth = 10)

In [27]:
indicies = list(range(0, len(train_data)))
shuffle(indicies)

indicies_to_use = indicies[0:50000]
gbr.fit(train_data[indicies_to_use], train_labels[indicies_to_use].flatten())

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=30, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [35]:
test_predictions = gbr.predict(test_data)

In [42]:
np.sqrt(((test_predictions - test_labels.flatten()) ** 2).mean())

5.6566943354068551