### Import the libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sin, cos, sqrt, atan2, radians
# import pymc3 as pm

### Import the data

In [12]:
# Load the NYC taxi trip training set, then show its shape and first rows.
train_csv_path = "nyc-taxi-trip-duration/train.csv"
df = pd.read_csv(train_csv_path)
display(df.shape)
df.head()

(1458644, 11)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [136]:
# Every coordinate pair is [latitude, longitude].
wall_street_location = [40.70729, -74.01095]

# Labelled locations: "1".."6" are the sample points, "HQ" (last entry) has the
# same coordinates as wall_street_location.
dict_location = {
    "1": [40.73, -74.05],
    "2": [40.665, -73.795],
    "3": [40.77, -73.995],
    "4": [40.74, -73.99],
    "5": [40.695, -73.95],
    "6": [40.767, -73.875],
    "HQ": [40.70729, -74.01095],
}

# Parallel lists derived from the mapping, in the same order.
all_location_label = list(dict_location.keys())
all_location = list(dict_location.values())


In [137]:
def get_distance_between_points(lat1, lon1, lat2, lon2, radius_km=6378.0):
    '''
    Return the great-circle (haversine) distance between two points.

    parameter :
        - lat1, lon1 = latitude and longitude of the first point, in degrees
        - lat2, lon2 = latitude and longitude of the second point, in degrees
        - radius_km = radius of the sphere in km; the default 6378.0 is the
          approximate Earth radius used throughout this notebook
    return :
        - the distance, in the same unit as radius_km (km by default)
    '''
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise a math
    # domain error; clamp it back into the valid range.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius_km * c

In [138]:
# Distance (km) from the Wall Street location to the pickup point of every trip.
hq_lat, hq_lon = wall_street_location[0], wall_street_location[1]
distance_from_hq = np.vectorize(get_distance_between_points)
df['dist_pickup_to_wall_street'] = distance_from_hq(
    hq_lat, hq_lon, df['pickup_latitude'], df['pickup_longitude'])



In order to extract the most relevant information from the
dataset for our purpose, we create a filtered dataset, in
which either the pick-up or drop-off location lies within
500 m from Wall Street (indicated by the red oval in Fig.
3(b)). In addition, we filter out trips with unrealistically long
durations (>20 000 s).

Note: the cell below applies only the pick-up condition (pick-up within 0.5 km of Wall Street), together with the duration filter.

In [139]:
# Keep trips picked up within 0.5 km of Wall Street that last less than 20000 s.
MAX_PICKUP_DISTANCE_KM = 0.5
MAX_TRIP_DURATION_S = 20000

picked_up_near_hq = df['dist_pickup_to_wall_street'].le(MAX_PICKUP_DISTANCE_KM)
plausible_duration = df['trip_duration'].lt(MAX_TRIP_DURATION_S)
df_wall = df.loc[picked_up_near_hq & plausible_duration]
df_wall.shape



(20535, 13)

### Choosing the prediction model

We simply fit 5 different prediction algorithms and compare their performance.

From this data, we use only the drop-off location (`dropoff_latitude` and `dropoff_longitude`) and the distance from the HQ (`dist_pickup_to_dropoff`) to predict the trip duration.

In [140]:
 
# Build the modelling frame: drop-off coordinates, trip distance and the target.
# `dist_pickup_to_dropoff` is not created by any cell shown in this notebook, so
# on a fresh kernel the column selection below would raise a KeyError. When the
# column is missing, derive it as the haversine distance from pickup to drop-off.
# NOTE(review): assumes this is how the column was originally built - confirm.
model_source = df_wall
if 'dist_pickup_to_dropoff' not in model_source.columns:
    pickup_to_dropoff = np.vectorize(get_distance_between_points, otypes=[float])
    model_source = model_source.assign(
        dist_pickup_to_dropoff=pickup_to_dropoff(
            model_source['pickup_latitude'], model_source['pickup_longitude'],
            model_source['dropoff_latitude'], model_source['dropoff_longitude']))
temp_df = model_source[['dropoff_latitude','dropoff_longitude','dist_pickup_to_dropoff','trip_duration']].copy()

# Declare the model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn import model_selection
from sklearn.model_selection import train_test_split
# Candidate regressors. Every entry of "Objects" is a zero-argument factory, so
# each experiment starts from a fresh, unfitted estimator.
# "Random Forest" pins n_estimators=10 explicitly: scikit-learn >= 0.22 raised
# the default from 10 to 100 trees, which would otherwise make this entry
# identical to "Big Random Forest".
experiments = {"Algorithm":["Ordinary Least Squares", "Regression Tree", 
                            "Big Random Forest", "Random Forest", 
                            "Bagging"], 
               "Objects" : [LinearRegression, 
                            DecisionTreeRegressor, 
                            lambda : RandomForestRegressor(n_estimators=100), 
                            lambda : RandomForestRegressor(n_estimators=10), 
                            BaggingRegressor]}
# One (initially empty) list of predictions per algorithm.
experiments["Predictions"] = [[] for _ in experiments["Algorithm"]]
actuals = []

In [141]:
# Run the 5 different algorithms and compare the results.

N_SPLITS = 4
features = temp_df.drop("trip_duration", axis=1)
target = temp_df["trip_duration"]

# Reset the accumulators so this cell can be re-run: after a first run they hold
# pandas Series, and `+=` with a list would no longer append to them.
experiments["Predictions"] = [[] for _ in experiments["Objects"]]
actuals = []

for _ in range(N_SPLITS):
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=0.33)
    for i, obj_factory in enumerate(experiments["Objects"]):
        obj = obj_factory()
        obj.fit(X=train_X, y=train_y)
        experiments["Predictions"][i] += list(obj.predict(test_X))
    actuals += list(test_y)
actuals = pd.Series(actuals)
experiments["Predictions"] = list(map(pd.Series, experiments["Predictions"]))

# Get the performance of every model: mean score of a 5-fold cross-validation.
experiments["Results"] = [
    model_selection.cross_val_score(obj_factory(), X=features, y=target,
                                    cv=5).mean()
    for obj_factory in experiments["Objects"]
]

pd.DataFrame(experiments).drop(["Objects", "Predictions"], 
                            axis=1).set_index("Algorithm")



Unnamed: 0_level_0,Results
Algorithm,Unnamed: 1_level_1
Ordinary Least Squares,0.51472
Regression Tree,0.17877
Big Random Forest,0.529662
Random Forest,0.496302
Bagging,0.495507


The big random forest achieves the highest mean cross-validation score of the five algorithms, so we will use it to predict the trip duration. To make the model easy to reproduce, we wrap its creation in a function.

In [142]:
def create_model(temp_df, wall_street_location, locations=None):
    '''
    Train a random forest on temp_df and predict the trip duration from one
    origin to every location.

    parameter : 
        - temp_df = the training dataframe; every column except
          "trip_duration" is used as a feature
        - wall_street_location = the headquarter, i.e. the [latitude, longitude]
          origin from which the distance to every location is measured
        - locations = optional {label: [latitude, longitude]} mapping of the
          destinations to predict for; defaults to the notebook-level
          dict_location
    return :
        - a dataframe with one row per location (in mapping order) and the
          columns dropoff_latitude, dropoff_longitude, dist_pickup_to_dropoff
          and predicted_duration
    '''
    if locations is None:
        locations = dict_location

    # Only the training part of the split is used; the held-out 33% is discarded.
    train_X, _, train_y, _ = train_test_split(
        temp_df.drop("trip_duration", axis=1),
        temp_df["trip_duration"], test_size=0.33)

    obj = RandomForestRegressor(n_estimators=100)
    obj.fit(y=train_y, X=train_X)

    ## CREATE THE SAMPLE FOR THE MODEL
    origin_lat, origin_lng = wall_street_location[0], wall_street_location[1]
    destinations = list(locations.values())
    # NOTE(review): these columns are fed straight to obj.predict, so they are
    # presumably expected to match the feature columns of temp_df - confirm.
    temp_pred_df = pd.DataFrame({
        'dropoff_latitude': [dest[0] for dest in destinations],
        'dropoff_longitude': [dest[1] for dest in destinations],
        'dist_pickup_to_dropoff': [
            get_distance_between_points(origin_lat, origin_lng, dest[0], dest[1])
            for dest in destinations],
    })

    temp_pred_df['predicted_duration'] = obj.predict(temp_pred_df)
    return temp_pred_df

In [143]:
def get_location_names(lat, lng, locations=None):
    '''
    Return the label of the location whose coordinates equal (lat, lng) exactly.

    parameter :
        - lat, lng = latitude and longitude to look up
        - locations = optional {label: [latitude, longitude]} mapping to search;
          defaults to the notebook-level dict_location
    return :
        - the first matching label, or None when no location matches
    '''
    if locations is None:
        locations = dict_location
    for label, coords in locations.items():
        # Scalar comparisons: use the short-circuiting `and`, not bitwise `&`.
        if coords[0] == lat and coords[1] == lng:
            return label
    return None
        

In [144]:
# Predicted durations to every location when location "1" is the origin.
origin_1 = dict_location["1"]
df1 = create_model(temp_df, origin_1)
df1

Unnamed: 0,dropoff_latitude,dropoff_longitude,dist_pickup_to_dropoff,predicted_duration
0,40.73,-74.05,0.0,927.91
1,40.665,-73.795,22.704878,3522.95
2,40.77,-73.995,6.429525,1451.4
3,40.74,-73.99,5.181914,1555.01
4,40.695,-73.95,9.293823,1830.51
5,40.767,-73.875,15.322028,2131.94
6,40.70729,-74.01095,4.152766,992.27


In [145]:
# One row of predicted durations per origin, in dict_location order.
prediction_list = []
for origin in dict_location.values():
    predicted = list(create_model(temp_df, origin)['predicted_duration'])
    prediction_list.append(predicted)
    print(predicted)

[748.7916666666665, 2913.22, 1446.79, 1607.26, 1783.82, 2280.95, 815.87]
[2427.34, 1597.092, 2582.67, 2345.66, 2050.02, 2079.1, 2354.74]
[1362.69, 3070.45, 1388.255, 877.39, 2066.19, 1846.79, 1140.55]
[2475.44, 2807.82, 964.76, 832.47, 1198.09, 2057.61, 845.81]
[2258.58, 2722.01, 1915.74, 1402.79, 1323.86, 2125.69, 1354.34]
[2318.36, 3481.04, 2052.84, 1763.97, 1864.5, 1491.5366666666669, 1841.84]
[931.05, 2111.62, 1507.4, 1452.48, 1190.89, 2571.8, 467.82]


In [146]:
# Pairwise distances between all locations, one row per origin in dict_location
# order, rounded to 3 decimals. The distances are computed directly: going
# through create_model would train a new random forest for every origin only to
# read back its distance column.
distance_list = []
for origin in dict_location.values():
    distances = list(pd.Series([
        get_distance_between_points(origin[0], origin[1], dest[0], dest[1])
        for dest in dict_location.values()]).round(3))
    distance_list.append(distances)
    print(distances)

[0.0, 22.705, 6.43, 5.182, 9.294, 15.322, 4.153]
[22.705, 0.0, 20.527, 18.453, 13.504, 13.209, 18.827]
[6.43, 20.527, 0.0, 3.366, 9.171, 10.122, 7.109]
[5.182, 18.453, 3.366, 0.0, 6.04, 10.153, 4.047]
[9.294, 13.504, 9.171, 6.04, 0.0, 10.211, 5.323]
[15.322, 13.209, 10.122, 10.153, 10.211, 0.0, 13.254]
[4.153, 18.827, 7.109, 4.047, 5.323, 13.254, 0.0]


In [157]:
# Entry [k][k] is the prediction from a location to itself: set it to 0.
for idx, row in enumerate(prediction_list):
    row[idx] = 0
prediction_list

[[0, 2913.22, 1446.79, 1607.26, 1783.82, 2280.95, 815.87],
 [2427.34, 0, 2582.67, 2345.66, 2050.02, 2079.1, 2354.74],
 [1362.69, 3070.45, 0, 877.39, 2066.19, 1846.79, 1140.55],
 [2475.44, 2807.82, 964.76, 0, 1198.09, 2057.61, 845.81],
 [2258.58, 2722.01, 1915.74, 1402.79, 0, 2125.69, 1354.34],
 [2318.36, 3481.04, 2052.84, 1763.97, 1864.5, 0, 1841.84],
 [931.05, 2111.62, 1507.4, 1452.48, 1190.89, 2571.8, 0]]

In [164]:
import plotly.express as px
# Read the Mapbox token inside a context manager so the file handle is closed.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read())

# Plotly's built-in car-share sample dataset, drawn as a scatter map.
carshare = px.data.carshare()
fig = px.scatter_mapbox(carshare, lat="centroid_lat", lon="centroid_lon",
                        color="peak_hour", size="car_hours",
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        size_max=15, zoom=10)
fig.show()

ModuleNotFoundError: No module named 'plotly.express'

In [161]:
# Preview the first rows only instead of rendering the whole frame.
# NOTE(review): the recorded output of this cell shows airport columns, so `df`
# is apparently not the taxi frame loaded at the top - confirm which is meant.
df.head(10)

Unnamed: 0,iata,airport,city,state,country,lat,long,cnt,text
0,ORD,Chicago O'Hare International,Chicago,IL,USA,41.979595,-87.904464,25129,"Chicago O'Hare InternationalChicago, ILArrival..."
1,ATL,William B Hartsfield-Atlanta Intl,Atlanta,GA,USA,33.640444,-84.426944,21925,"William B Hartsfield-Atlanta IntlAtlanta, GAAr..."
2,DFW,Dallas-Fort Worth International,Dallas-Fort Worth,TX,USA,32.895951,-97.037200,20662,Dallas-Fort Worth InternationalDallas-Fort Wor...
3,PHX,Phoenix Sky Harbor International,Phoenix,AZ,USA,33.434167,-112.008056,17290,"Phoenix Sky Harbor InternationalPhoenix, AZArr..."
4,DEN,Denver Intl,Denver,CO,USA,39.858408,-104.667002,13781,"Denver IntlDenver, COArrivals: 13781"
5,IAH,George Bush Intercontinental,Houston,TX,USA,29.980472,-95.339722,13223,"George Bush IntercontinentalHouston, TXArrival..."
6,SFO,San Francisco International,San Francisco,CA,USA,37.619002,-122.374843,12016,"San Francisco InternationalSan Francisco, CAAr..."
7,LAX,Los Angeles International,Los Angeles,CA,USA,33.942536,-118.408074,11797,"Los Angeles InternationalLos Angeles, CAArriva..."
8,MCO,Orlando International,Orlando,FL,USA,28.428889,-81.316028,10536,"Orlando InternationalOrlando, FLArrivals: 10536"
9,CLT,Charlotte/Douglas International,Charlotte,NC,USA,35.214011,-80.943126,10490,"Charlotte/Douglas InternationalCharlotte, NCAr..."


In [153]:
# import libraries
import folium
import pandas as pd
 
lat_list = [coords[0] for coords in dict_location.values()]
lon_list = [coords[1] for coords in dict_location.values()]

# Make a data frame with dots to show on the map.
# The names come from dict_location itself so that every label stays paired
# with its own coordinates: a hand-written list starting with 'HQ' does not
# match the dictionary order, in which "HQ" is the last entry.
data = pd.DataFrame({
'lat': lat_list,
'lon': lon_list,
'name': list(dict_location.keys())
})

# Make an empty map centred on the Wall Street location.
# NOTE(review): the "Mapbox Control Room" tile set may not be available in
# recent folium releases - confirm against the installed version.
m = folium.Map(location=wall_street_location, tiles="Mapbox Control Room")

# Add one marker per location, with its label as the popup text.
for row in data.itertuples(index=False):
    folium.Marker([row.lat, row.lon], popup=row.name).add_to(m)

m