In [163]:
import numpy as np
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
from uszipcode import ZipcodeSearchEngine
from geopy.geocoders import Nominatim

import pandas as pd
import seaborn.apionly as sns
from datetime import date, datetime
from haversine import haversine

# statistics package
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

# packages for mapping
from mpl_toolkits.basemap import Basemap

# packages for interactive graphs
from ipywidgets import widgets, interact
from IPython.display import display
from copy import deepcopy as copy
import time
from geopy.geocoders import Nominatim
from time import sleep
import requests
%matplotlib inline

In [31]:
geolocator = Nominatim()

In [6]:
historical_data = pd.read_csv('train.csv')

In [126]:
historical_data.shape

(14776615, 22)

In [7]:
historical_data.head()

Unnamed: 0.1,Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,...,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,payment_type,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,...,40.757977,-73.989838,40.751171,6.5,CSH,0.0,0.5,0.0,0.0,7.0
1,1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,...,40.731781,-73.994499,40.75066,6.0,CSH,0.5,0.5,0.0,0.0,7.0
2,2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,...,40.73777,-74.009834,40.726002,5.5,CSH,1.0,0.5,0.0,0.0,7.0
3,3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,...,40.759945,-73.984734,40.759388,5.0,CSH,0.5,0.5,0.0,0.0,6.0
4,4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,...,40.748528,-74.002586,40.747868,9.5,CSH,0.5,0.5,0.0,0.0,10.5


## Data Preprocessing

In [8]:
process_train_data = copy(historical_data)

## Train Data

In [9]:
train_data = copy(process_train_data.loc[0:100000,])
del train_data['Unnamed: 0']
train_data.head()

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,payment_type,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,1.0,...,40.757977,-73.989838,40.751171,6.5,CSH,0.0,0.5,0.0,0.0,7.0
1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,1.5,...,40.731781,-73.994499,40.75066,6.0,CSH,0.5,0.5,0.0,0.0,7.0
2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,1.1,...,40.73777,-74.009834,40.726002,5.5,CSH,1.0,0.5,0.0,0.0,7.0
3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,0.7,...,40.759945,-73.984734,40.759388,5.0,CSH,0.5,0.5,0.0,0.0,6.0
4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,2.1,...,40.748528,-74.002586,40.747868,9.5,CSH,0.5,0.5,0.0,0.0,10.5


In [10]:
train_data["pickup_datetime"] = pd.to_datetime(train_data["pickup_datetime"])
train_data["dropoff_datetime"] = pd.to_datetime(train_data["dropoff_datetime"])

In [12]:
train_data["pickup_day"] = train_data["pickup_datetime"].apply(lambda x: x.day)
train_data["pickup_weekday"] = train_data["pickup_datetime"].apply(lambda x: x.weekday())
train_data["pickup_hour"] = train_data["pickup_datetime"].apply(lambda x: x.hour)
train_data["pickup_minute"] = train_data["pickup_datetime"].apply(lambda x: x.minute)
train_data["pickup_time"] = train_data["pickup_hour"] + (train_data["pickup_minute"] / 60)
train_data["dropoff_hour"] = train_data["dropoff_datetime"].apply(lambda x: x.hour)

In [13]:
train_data['pickup_date'] = [date.date() for date in train_data['pickup_datetime']]
train_data['dropoff_date'] = [date.date() for date in train_data['dropoff_datetime']]

### The distance is calculated in kilometers

In [14]:
def distance(lat1, lon1, lat2, lon2):
    """calculates the Manhattan distance between 2 points
        using their coordinates
    
    Parameters
    ----------
    lat1: float
        latitude of first point
        
    lon1: float
        longitude of first point
        
    lat2: float
        latitude of second point
    
    lon2: float
        longitude of second point
        
    Returns
    -------
    d: float
        The Manhattan distance between the two points in kilometers
        
    """
    
    d = haversine((lat1, lon1), (lat2, lon1)) + haversine((lat2, lon1), (lat2, lon2))
    return d

In [15]:
train_data["distance"] = train_data.apply(lambda row: distance(row["pickup_latitude"], 
                                               row["pickup_longitude"], 
                                               row["dropoff_latitude"], 
                                               row["dropoff_longitude"]), axis=1)

### The speed is calculated in km/h

In [16]:
train_data["speed"] = train_data["distance"] / (train_data["trip_time_in_secs"] / 3600)

In [17]:
pickup_datetime = train_data['pickup_datetime']
day_interval = []
for i in range(pickup_datetime.size):
    hour = pickup_datetime[i].hour
    if hour>=6 and hour<12:
        day_interval.append(0)
    elif hour>=12 and hour<18:
        day_interval.append(1)
    elif hour>=18 and hour<24:
        day_interval.append(2)
    else:
        day_interval.append(3)
train_data['day_interval'] = day_interval

In [18]:
def add_waiting_time_and_penalty(train_data):
    sorted_train_data = copy(train_data.sort_values(by=['hack_license', 'pickup_datetime'], ascending=True))
    sorted_train_data['waiting_time'] = 0.0
    sorted_train_data['waiting_penalty'] = 0.0
    previous_license = None
    previous_drpoff_datatime = 0
    previous_date = None
    previous_row = None
    wait_time_list = []
    wait_penalty_list = []
    average_speed = sorted_train_data['speed'].mean()
    for index,row in sorted_train_data.T.iteritems():
        if row['hack_license'] == previous_license and row['pickup_date'] == previous_date:
            diff =  row['pickup_datetime'] - previous_row['dropoff_datetime']
            _distance = distance(row["pickup_latitude"], row["pickup_longitude"], previous_row["dropoff_latitude"], previous_row["dropoff_longitude"])
            speed = row['speed']
            if(speed <= 0):
                speed = average_speed
            time_take_to_reach = (_distance/speed)*3600
            if (time_take_to_reach>diff.total_seconds()):
                wait_time_list.append(0.0)
                wait_penalty_list.append(0.0)
            else:
                wait_time_list.append(diff.total_seconds()-time_take_to_reach)
                wait_penalty_list.append((diff.total_seconds()-time_take_to_reach)/3600*5)
        else:
            wait_time_list.append(0.0)
            previous_license = row['hack_license'] 
            previous_date = row['pickup_date']
            previous_row = row
            wait_penalty_list.append(0.0)
    sorted_train_data['waiting_time'] = wait_time_list
    sorted_train_data['waiting_penalty'] = wait_penalty_list
    return sorted_train_data

In [19]:
sorted_train_data = add_waiting_time_and_penalty(train_data)
sorted_train_data.head()

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,pickup_minute,pickup_time,dropoff_hour,pickup_date,dropoff_date,distance,speed,day_interval,waiting_time,waiting_penalty
5708,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,2,,2013-01-03 05:54:00,2013-01-03 05:55:00,1,60,0.0,...,54,5.9,5,2013-01-03,2013-01-03,12723.128217,763387.692992,3,0.0,0.0
25276,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 02:22:00,2013-01-13 02:32:00,1,600,2.89,...,22,2.366667,2,2013-01-13,2013-01-13,5.979824,35.878941,3,0.0,0.0
4012,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 04:19:00,2013-01-13 04:41:00,1,1320,11.39,...,19,4.316667,4,2013-01-13,2013-01-13,15.443777,42.119392,3,5948.979058,8.262471
4956,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 06:20:00,2013-01-13 06:30:00,1,600,3.64,...,20,6.333333,6,2013-01-13,2013-01-13,6.54844,39.290641,0,13335.466406,18.521481
1531,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 06:40:00,2013-01-13 06:44:00,1,240,0.81,...,40,6.666667,6,2013-01-13,2013-01-13,0.767674,11.515105,0,14206.429068,19.731151


## Introduce Zip Code in the Historical Data

In [193]:
trip_data_with_zip_code = copy(sorted_train_data)

In [191]:
# documentation: https://pythonhosted.org/uszipcode/#by-coordinate
def add_zip_code(trip_data):
    pickup_zipcode_list = []
    dropoff_zipcode_list = []
    for index,row in trip_data.T.iteritems():
        pickup_res = findzip.by_coordinate(row['pickup_latitude'], row['pickup_longitude'], radius=10, returns=1)
        if(len(pickup_res) > 0):
            zipcode = pickup_res[0]["Zipcode"]
        else:
            zipcode = 'NaN'
        pickup_zipcode_list.append(zipcode)

        dropoff_res = findzip.by_coordinate(row['dropoff_latitude'], row['dropoff_longitude'], radius=10, returns=1)
        if(len(dropoff_res) > 0):
            zipcode = dropoff_res[0]["Zipcode"]
        else:
            zipcode = 'NaN'
        dropoff_zipcode_list.append(zipcode)

    trip_data['pickup_zipcode'] = pickup_zipcode_list
    trip_data['dropoff_zipcode'] = dropoff_zipcode_list

In [None]:
add_zip_code(trip_data_with_zip_code)
trip_data_with_zip_code = copy(trip_data_with_zip_code[(trip_data_with_zip_code['pickup_zipcode'] != 'NaN') & (trip_data_with_zip_code['dropoff_zipcode'] != 'NaN')])

## Build Weather Dictionary

In [None]:
trip_data_with_weather = copy(trip_data_with_zip_code)

In [189]:
def get_weather(lat, lon, pickup_datetime):
    base_url = 'http://api.openweathermap.org/data/2.5/forecast?'
    payload = {
                    'lat': lat,
                    'lon': lon,
                    'start': pickup_datetime,
                    'cnt': 1,
                    'APPID': 'e42277154fa7b3c31cb3b98fae71c220'
                    }
    r = requests.get(base_url, params=payload)
    if 'list' in r.json() and 'weather' in r.json()['list'][0] and 'main' in r.json()['list'][0]['weather'][0]:
        return r.json()['list'][0]['weather'][0]['main']
    else:
        return 'Sunny'

In [188]:
weather_dictionary = dict()
interval_to_hour = {0:6, 1:12, 2:18, 3:1}
def build_weather_dictionary(trip_data):
    count = 0
    for index,row in trip_data.T.iteritems():
        key = str(row['pickup_zipcode']) + '_' + str(row['pickup_date']) + '_' + str(row['day_interval'])
        if key not in weather_dictionary:
            _date = datetime.strptime(row['pickup_date'], '%Y-%m-%d')
            _date = _date.replace(hour=interval_to_hour[row['day_interval']])
            weather_dictionary[key] = get_weather(row['pickup_latitude'], row['pickup_longitude'], time.mktime(_date.timetuple()))
            count += 1
            if count >= 60:
                count = 0
                sleep(60)
        else:
            pass


In [None]:
build_weather_dictionary(trip_data_with_weather)

## Store Weather Dictionary

In [None]:
import json

with open('weather_data.json', 'w') as fp:
    json.dump(weather_dictionary, fp)


## Read Weather Data from File

In [None]:
with open('weather_data.json', 'r') as fp:
    _data = json.load(fp)

## Store Data

In [192]:
trip_data_with_zip_code.to_csv('clean_data.csv', index=False )

## Read Data

In [None]:
check_data = pd.read_csv('clean_data.csv')

In [29]:
check_data.head()

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,pickup_date,dropoff_date,distance,speed,day_interval,waiting_time,waiting_penalty,pickup_zip_code,pickup_latlng,dropoff_latlng
5708,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,2,,2013-01-03 05:54:00,2013-01-03 05:55:00,1,60,0.0,...,2013-01-03,2013-01-03,12723.128217,763387.692992,3,0.0,0.0,,"40.645481,-73.77636","0.0,0.0"
25276,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 02:22:00,2013-01-13 02:32:00,1,600,2.89,...,2013-01-13,2013-01-13,5.979824,35.878941,3,0.0,0.0,,"40.730011,-73.983566","40.76461,-73.958244"
4012,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 04:19:00,2013-01-13 04:41:00,1,1320,11.39,...,2013-01-13,2013-01-13,15.443777,42.119392,3,5948.979058,8.262471,,"40.74181,-73.993576","40.630337,-74.029701"
4956,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 06:20:00,2013-01-13 06:30:00,1,600,3.64,...,2013-01-13,2013-01-13,6.54844,39.290641,0,13335.466406,18.521481,,"40.786541,-73.942551","40.755627,-73.979485"
1531,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 06:40:00,2013-01-13 06:44:00,1,240,0.81,...,2013-01-13,2013-01-13,0.767674,11.515105,0,14206.429068,19.731151,,"40.761356,-73.97953000000003","40.758469,-73.984833"


In [None]:
trip_data_with_zip_code[trip_data_with_zip_code['pickup_zipcode'] == None].shape

In [314]:
trip_data_with_zip_code[:100]

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,dropoff_date,time_inteval,day_interval,waiting_time,waiting_penalty,pickup_zip_code,dropoff_zip_codes,latlng,pickup_latlng,dropoff_latlng
5708,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,2,,2013-01-03 05:54:00,2013-01-03 05:55:00,1,60,0.00,...,2013-01-03,Night,3,0.000000,0.000000,"40.645481,-73.77636",10119,"40.645481,-73.77636","40.645481,-73.77636","0.0,0.0"
25276,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 02:22:00,2013-01-13 02:32:00,1,600,2.89,...,2013-01-13,Night,3,0.000000,0.000000,"40.730011,-73.983566",10119,"40.730011,-73.983566","40.730011,-73.983566","40.76461,-73.958244"
4012,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 04:19:00,2013-01-13 04:41:00,1,1320,11.39,...,2013-01-13,Night,3,5948.979058,8.262471,"40.74181,-73.993576",10013,"40.74181,-73.993576","40.74181,-73.993576","40.630337,-74.029701"
4956,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 06:20:00,2013-01-13 06:30:00,1,600,3.64,...,2013-01-13,Day,0,13335.466406,18.521481,"40.786541,-73.942551",10020,"40.786541,-73.942551","40.786541,-73.942551","40.755627,-73.979485"
1531,F4A9B95166FF93094F5B6A98D2D41B45,001C8AAB90AEE49F36FCAA7B4136C81A,VTS,1,,2013-01-13 06:40:00,2013-01-13 06:44:00,1,240,0.81,...,2013-01-13,Day,0,14206.429068,19.731151,"40.761356,-73.97953000000003",10011,"40.761356,-73.97953000000003","40.761356,-73.97953000000003","40.758469,-73.984833"
10875,0A34A99FFA084D5791FF569897425D1C,0025133AD810DBE80D35FCA8BF0BCA1F,VTS,1,,2013-01-13 02:37:00,2013-01-13 02:46:00,1,540,2.74,...,2013-01-13,Night,3,0.000000,0.000000,"40.742538,-74.003914",10016,"40.742538,-74.003914","40.742538,-74.003914","40.769596,-73.984604"
18956,0A34A99FFA084D5791FF569897425D1C,0025133AD810DBE80D35FCA8BF0BCA1F,VTS,1,,2013-01-13 03:26:00,2013-01-13 03:41:00,1,900,3.07,...,2013-01-13,Night,3,1534.797878,2.131664,"40.744495,-74.006416",10011,"40.744495,-74.006416","40.744495,-74.006416","40.715981,-73.98691600000002"
21367,C014DC27E83E3074352DCEC486D05F0C,002C093A2CB9FD40C8C54AB5D158FC47,VTS,1,,2013-01-13 10:34:00,2013-01-13 10:41:00,1,420,0.99,...,2013-01-13,Day,0,0.000000,0.000000,"40.780106,-73.944664",11369,"40.780106,-73.944664","40.780106,-73.944664","40.783138,-73.957359"
17805,C014DC27E83E3074352DCEC486D05F0C,002C093A2CB9FD40C8C54AB5D158FC47,VTS,1,,2013-01-13 10:48:00,2013-01-13 11:06:00,1,1080,5.53,...,2013-01-13,Day,0,196.769079,0.273290,"40.773781,-73.962051",10010,"40.773781,-73.962051","40.773781,-73.962051","40.72270200000001,-73.947037"
42340,C014DC27E83E3074352DCEC486D05F0C,002C093A2CB9FD40C8C54AB5D158FC47,VTS,1,,2013-01-13 11:15:00,2013-01-13 11:17:00,1,120,0.65,...,2013-01-13,Day,0,766.258010,1.064247,"40.721611,-73.94641899999998",10026,"40.721611,-73.94641899999998","40.721611,-73.94641899999998","40.716312,-73.944733"
