# Predict the journey time of taxis in NYC

This notebook uses our pre-trained neural-network model to predict the duration of individual taxi trips.

In [69]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import geopandas
from random import seed, randint
from datetime import datetime

from keras.utils import to_categorical
from model.model import nn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Input locations, all relative to the notebook's directory.
# Weights file loaded into the model below
model_weights = '../logs/weights-2018-11-21-14-06-29.hdf5'
# Taxi trip records (CSV)
trip_data_file = '../data/taxi_data/cleansed_yellow_tripdata_2018-06.csv'
# Daily weather records (CSV); the PRCP column is used as a model feature
weather_data_file = '../data/weather_data/ny_jfk_weather_2018-06.csv'
# Taxi zone geometry, read with geopandas
taxi_zones = '../data/taxi_zones'

* Load taxi trip data

In [48]:
# Read the comma-separated trip records into a DataFrame
tripdata = pd.read_csv(trip_data_file, sep=',')

In [49]:
# Preview the first 10 trip records
tripdata.head(10)

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,tip_amount,total_amount,duration
0,1,1,2018-06-01 00:04:18,2018-06-01 00:09:18,1,1.0,230,161,1.35,8.15,5.0
1,5,1,2018-06-01 00:09:00,2018-06-01 00:24:01,1,2.0,161,234,2.55,15.35,15.016667
2,6,1,2018-06-01 00:02:33,2018-06-01 00:13:01,2,1.5,163,233,1.95,11.75,10.466667
3,7,1,2018-06-01 00:13:23,2018-06-01 00:16:52,1,0.7,186,246,1.85,8.15,3.483333
4,11,1,2018-06-01 00:39:42,2018-06-01 00:57:23,1,3.5,141,179,2.95,17.75,17.683333
5,12,1,2018-06-01 00:25:43,2018-06-01 00:40:07,1,3.0,138,7,3.45,17.25,14.4
6,16,2,2018-06-01 00:02:18,2018-06-01 00:08:05,1,0.75,229,141,4.0,10.8,5.783333
7,17,2,2018-06-01 00:40:22,2018-06-01 00:52:20,1,3.94,148,233,3.7,18.5,11.966667
8,18,1,2018-06-01 00:22:51,2018-06-01 00:30:01,1,1.4,142,238,1.75,10.55,7.166667
9,23,1,2018-06-01 00:09:50,2018-06-01 00:20:33,1,2.8,231,52,2.15,13.95,10.716667


* Load weather data

In [36]:
# Read the comma-separated daily weather records into a DataFrame
weather_data = pd.read_csv(weather_data_file, sep=',')

In [37]:
# Preview the first 10 weather records
weather_data.head(10)

Unnamed: 0.1,Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,month
0,335,2018-06-01,2.1,30.0,0.0,0.0,19.2,6
1,336,2018-06-02,4.5,0.0,0.0,0.0,23.8,6
2,337,2018-06-03,7.1,2.8,0.0,0.0,17.1,6
3,338,2018-06-04,3.8,15.5,0.0,0.0,13.9,6
4,339,2018-06-05,4.4,0.0,0.0,0.0,17.9,6
5,340,2018-06-06,3.9,0.0,0.0,0.0,17.2,6
6,341,2018-06-07,4.0,0.0,0.0,0.0,15.8,6
7,342,2018-06-08,4.2,0.0,0.0,0.0,18.3,6
8,343,2018-06-09,2.9,0.0,0.0,0.0,21.6,6
9,344,2018-06-10,2.7,0.5,0.0,0.0,20.0,6


* Load taxi-zone geo data and convert it to degree-based lat/long coordinates

In [38]:
# Index the zones by OBJECTID and reproject the polygons to EPSG:4326 so that
# coordinates are expressed in degrees. The `epsg=` keyword is used instead of
# the `{'init': 'epsg:4326'}` dict, which newer pyproj/geopandas releases deprecate.
taxizone_data = geopandas.read_file(taxi_zones).set_index('OBJECTID').to_crs(epsg=4326)
zone_ids = taxizone_data.index.tolist()
# The centroids are derived from the already-reprojected geometry, so they are
# in EPSG:4326 as well and need no second to_crs() call.
taxizone_data['centroids'] = taxizone_data.geometry.centroid

In [39]:
# Preview the first 10 zones, including the derived centroids column
taxizone_data.head(10)

Unnamed: 0_level_0,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry,centroids
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445299999996 40.6949959999999,...",POINT (-74.17400027276298 40.69183120640134)
2,0.43347,0.004866,Jamaica Bay,2,Queens,(POLYGON ((-73.82337597260663 40.6389870471767...,POINT (-73.83129854302199 40.61674529165965)
3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,POLYGON ((-73.84792614099985 40.87134223399993...,POINT (-73.84742223236724 40.86447368477531)
4,0.043567,0.000112,Alphabet City,4,Manhattan,POLYGON ((-73.97177410965318 40.72582128133706...,POINT (-73.97696825691764 40.72375214158458)
5,0.092146,0.000498,Arden Heights,5,Staten Island,POLYGON ((-74.17421738099989 40.56256808599989...,POINT (-74.18848410184934 40.55265928694552)
6,0.150491,0.000606,Arrochar/Fort Wadsworth,6,Staten Island,POLYGON ((-74.06367318899999 40.60219816599994...,POINT (-74.0717705589514 40.60032414603445)
7,0.107417,0.00039,Astoria,7,Queens,POLYGON ((-73.90413637799996 40.76752031699986...,POINT (-73.91969431946065 40.76149256216355)
8,0.027591,2.7e-05,Astoria Park,8,Queens,POLYGON ((-73.92334041500001 40.77512891199993...,POINT (-73.92308615750756 40.77855865375055)
9,0.099784,0.000338,Auburndale,9,Queens,POLYGON ((-73.78502434699996 40.76103651599986...,POINT (-73.78794887777896 40.75103502557216)
10,0.099839,0.000436,Baisley Park,10,Queens,"POLYGON ((-73.7832662499999 40.68999429299992,...",POINT (-73.79098635986959 40.6789533101151)


In [77]:
# Sanity check: x coordinate (longitude) of the centroid for the zone with OBJECTID 1
taxizone_data.loc[1].centroids.x

-74.17400027276298

* Load the pre-trained model

In [3]:
# Build the network with a 68-value input, matching the vector assembled by
# get_features(): 6 numeric values + 7 (day of week) + 31 (day of month) + 24 (hour).
model = nn(input_shape=68)
# Restore the pre-trained weights from the checkpoint file
model.load_weights(model_weights)

nn(): Creating NN with parameters:

image_shape=68
output_shape=1
dropout=0.25
activation=elu
optimizer=<keras.optimizers.Adam object at 0x11c2e7e10>
loss=mean_squared_error


* Create test data

In [50]:
# Hold out the final 20% of rows (by position) as the test set
test_start_idx = int(0.8 * len(tripdata))
test_tripdata = tripdata.iloc[test_start_idx:]

In [51]:
# Preview the first 10 rows of the held-out test set
test_tripdata.head(10)

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,tip_amount,total_amount,duration
4464536,6963159,2,2018-06-24 20:11:06,2018-06-24 20:22:44,1,1.73,263,239,2.16,12.96,11.633333
4464537,6963160,2,2018-06-24 20:31:46,2018-06-24 20:44:46,1,1.72,48,68,1.0,12.3,13.0
4464538,6963161,2,2018-06-24 20:48:48,2018-06-24 20:58:53,1,1.76,90,163,1.0,11.3,10.083333
4464539,6963163,2,2018-06-24 20:26:48,2018-06-24 20:38:54,2,1.94,161,79,2.7,13.5,12.1
4464540,6963164,2,2018-06-24 20:40:36,2018-06-24 20:46:11,1,0.95,79,137,1.46,8.76,5.583333
4464541,6963165,2,2018-06-24 20:10:01,2018-06-24 20:14:57,1,1.08,237,236,1.46,8.76,4.933333
4464542,6963167,2,2018-06-24 20:43:24,2018-06-24 21:30:20,1,8.98,142,231,1.0,39.3,46.933333
4464543,6963169,2,2018-06-24 20:34:55,2018-06-24 20:50:24,2,1.54,100,137,2.46,14.76,15.483333
4464544,6963170,2,2018-06-24 21:00:27,2018-06-24 21:06:16,1,0.95,107,137,1.46,8.76,5.816667
4464545,6963174,2,2018-06-24 20:55:41,2018-06-24 21:09:47,1,2.84,161,238,3.32,16.62,14.1


* Function to map a row of trip data to the input features expected by our pre-trained model

In [95]:
def get_features(sample, weather_data, taxizone_data, year=2018, month=6):
    """Build the model input vector and target value for a single taxi trip.

    Parameters
    ----------
    sample
        One row of trip data. Must provide ``PULocationID``, ``DOLocationID``,
        ``tpep_pickup_datetime`` (a ``'%Y-%m-%d %H:%M:%S'`` string) and
        ``duration``.
    weather_data
        Frame with a ``DATE`` column of ``'%Y-%m-%d'`` strings and a ``PRCP``
        column.
    taxizone_data
        Frame indexed by zone id with a ``centroids`` column of point objects
        exposing ``.x`` and ``.y``.
    year, month : int, optional
        Only trips picked up in this calendar month are converted. The
        defaults (2018, 6) reproduce the original hard-coded filter.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a 1-D array of 6 + 7 + 31 + 24 = 68 values
        (pickup lat/long, drop-off lat/long, straight-line centroid distance in
        degrees, precipitation, then one-hot day of week, day of month and
        hour) and ``y`` is ``[duration]``. For a trip outside the requested
        month both are empty lists.

    Raises
    ------
    IndexError
        If ``weather_data`` has no row for the pickup date.
    """
    PULocation = taxizone_data.loc[sample['PULocationID']].centroids
    PULocationLong, PULocationLat = PULocation.x, PULocation.y
    DOLocation = taxizone_data.loc[sample['DOLocationID']].centroids
    DOLocationLong, DOLocationLat = DOLocation.x, DOLocation.y

    # Parse the pickup timestamp once and read the parts off the datetime object
    PUDateTime = datetime.strptime(sample.tpep_pickup_datetime, '%Y-%m-%d %H:%M:%S')

    # TODO - Add this to pre-processing of trip data! Some random months in the data!!
    if PUDateTime.year != year or PUDateTime.month != month:
        return [], []

    # Get precipitation for that day
    PUDate = PUDateTime.strftime('%Y-%m-%d')
    prcp_matches = weather_data.loc[weather_data['DATE'] == PUDate, 'PRCP'].values
    if len(prcp_matches) == 0:
        raise IndexError('No weather record for pickup date {}'.format(PUDate))
    Precipitation = prcp_matches[0]

    # Straight-line distance between the two centroids, in degrees
    CentroidDistance = (
        (PULocationLat - DOLocationLat) ** 2 + (PULocationLong - DOLocationLong) ** 2
    ) ** 0.5

    # NOTE(review): the day of month is passed unshifted (1..31) with 31 classes,
    # so a pickup on the 31st would presumably be out of range. Left unchanged so
    # the encoding stays consistent with the trained weights - confirm against the
    # training pipeline.
    X = np.concatenate((
        np.array([
            PULocationLat,
            PULocationLong,
            DOLocationLat,
            DOLocationLong,
            CentroidDistance,
            Precipitation
        ]),
        to_categorical(PUDateTime.weekday(), 7),
        to_categorical(PUDateTime.day, 31),
        to_categorical(PUDateTime.hour, 24)
    ))

    y = [sample['duration']]

    return X, y

* Find a random sample for illustrative purposes

There are over 1 million rows in the test set, so let's pick a small random selection for illustration.

In [97]:
# seed(42)  # uncomment for a repeatable selection of samples
num_test_examples = 10
test_index = list(test_tripdata.index)
for i in range(num_test_examples):
    # randint() includes both endpoints, so the upper bound must be len - 1
    # to stay inside test_index.
    test_example_idx = test_index[randint(0, len(test_index) - 1)]
    X, y = get_features(test_tripdata.loc[test_example_idx], weather_data, taxizone_data)
    if len(X) == 0:
        # get_features() returns empty lists for trips outside the expected month
        print('Sample {} / index {} skipped: no features available'.format(i, test_example_idx))
        print('')
        continue
    # Reshape the 1-D feature vector into a batch containing one row
    predicted_duration = model.predict(X.reshape(1, -1))
    print('Sample {} / index {}'.format(i, test_example_idx))
    print('Predicted duration = {}, actual duration = {} '.format(predicted_duration, y))
    print('')
    
    
    
    

Sample 0 / index 4527025
Predicted duration = [[22.499388]], actual duration = [30.43333333333333] 

Sample 1 / index 4661029
Predicted duration = [[25.101683]], actual duration = [18.9] 

Sample 2 / index 4923053
Predicted duration = [[14.354203]], actual duration = [11.45] 

Sample 3 / index 4952461
Predicted duration = [[23.44576]], actual duration = [20.983333333333334] 

Sample 4 / index 5524342
Predicted duration = [[11.64035]], actual duration = [7.166666666666668] 

Sample 5 / index 4520185
Predicted duration = [[16.21321]], actual duration = [32.98333333333333] 

Sample 6 / index 4881528
Predicted duration = [[13.119324]], actual duration = [8.783333333333333] 

Sample 7 / index 5344332
Predicted duration = [[19.130287]], actual duration = [17.0] 

Sample 8 / index 4926833
Predicted duration = [[14.344682]], actual duration = [8.433333333333334] 

Sample 9 / index 5406595
Predicted duration = [[13.058309]], actual duration = [14.733333333333333] 

