# NYC Taxi - Fare Prediction project

In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from geopy.distance import geodesic
from pyproj import Geod
import seaborn as sns
from math import sin, cos, sqrt, atan2, radians

In [2]:
train_data = pd.read_csv('train.csv')

In [3]:
# Visually inspecting the data
train_data.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
# Checking columns names
train_data.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [5]:
train_data.shape

(55423856, 8)

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55423856 entries, 0 to 55423855
Data columns (total 8 columns):
key                  object
fare_amount          float64
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(5), int64(1), object(2)
memory usage: 3.3+ GB


In [7]:
#Counting the values in passenger_count column
train_data.passenger_count.value_counts(dropna=False)

1      38337524
2       8175243
5       3929346
3       2432712
4       1178852
6       1174647
0        195416
208          64
9            23
7            15
8             9
129           2
51            1
49            1
34            1
Name: passenger_count, dtype: int64

In [11]:
#Counting the values in pickup_datetime column 
train_data.pickup_datetime.value_counts(dropna=False)

2009-11-01 01:17:00 UTC    62
2013-11-03 01:29:00 UTC    62
2010-11-07 01:01:00 UTC    60
2010-11-07 01:10:00 UTC    59
2009-11-01 01:05:00 UTC    59
2010-11-07 01:05:00 UTC    58
2010-11-07 01:00:00 UTC    57
2014-03-13 20:11:00 UTC    57
2009-11-01 01:45:00 UTC    56
2013-04-26 20:15:00 UTC    55
2013-11-03 01:11:00 UTC    55
2013-11-03 01:09:00 UTC    55
2010-04-16 20:37:00 UTC    55
2010-11-07 01:35:00 UTC    55
2009-10-21 19:01:00 UTC    54
2009-11-01 01:31:00 UTC    54
2010-02-23 18:38:00 UTC    54
2009-11-01 01:41:00 UTC    54
2013-04-19 18:34:00 UTC    54
2013-11-23 23:14:00 UTC    54
2013-04-27 23:58:00 UTC    53
2013-04-20 00:49:00 UTC    53
2009-11-01 01:38:00 UTC    53
2012-03-09 22:06:00 UTC    53
2009-11-01 01:02:00 UTC    53
2013-04-20 23:54:00 UTC    53
2013-04-19 19:17:00 UTC    53
2013-04-20 22:33:00 UTC    53
2012-06-06 19:47:00 UTC    53
2012-09-22 21:51:00 UTC    53
                           ..
2015-05-03 01:28:50 UTC     1
2015-03-14 12:36:14 UTC     1
2012-02-29

In [8]:
#Counting the values in fare_amount column 
train_data.fare_amount.value_counts(dropna=False)

 6.50      2691783
 4.50      2270305
 8.50      2080476
 5.30      1623420
 5.70      1620795
 6.10      1578275
 4.90      1571360
 6.90      1445389
 10.50     1408721
 7.30      1361582
 7.70      1268537
 4.10      1175806
 8.10      1175770
 7.00      1159454
 6.00      1155132
 5.50      1122703
 7.50      1113201
 8.00      1056206
 5.00      1028036
 8.90       975650
 9.00       946448
 12.50      938777
 9.30       889903
 9.50       875406
 9.70       809765
 3.70       808442
 10.00      803013
 10.10      736606
 11.00      681711
 14.50      635346
            ...   
 310.10          1
 71.74           1
 71.01           1
 64.74           1
 70.51           1
-67.50           1
-67.00           1
 279.65          1
 275.60          1
 262.60          1
 269.60          1
 269.35          1
 269.10          1
-64.00           1
 306.60          1
 65.01           1
 319.60          1
 293.60          1
 290.90          1
 479.35          1
 476.10          1
 459.90     

In [9]:
train_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,55423860.0,55423860.0,55423860.0,55423480.0,55423480.0,55423860.0
mean,11.34505,-72.50968,39.91979,-72.51121,39.92068,1.68538
std,20.71083,12.84888,9.642353,12.7822,9.633346,1.327664
min,-300.0,-3442.06,-3492.264,-3442.025,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73403,1.0
50%,8.5,-73.9818,40.75265,-73.98015,40.75316,1.0
75%,12.5,-73.96708,40.76713,-73.96367,40.7681,2.0
max,93963.36,3457.626,3408.79,3457.622,3537.133,208.0


In [23]:
#Fare amount has some negative values
train_data[train_data.fare_amount<0]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2039,2010-03-09 23:37:10.0000005,-2.90,2010-03-09 23:37:10 UTC,-73.789450,40.643498,-73.788665,40.641952,1
2486,2015-03-22 05:14:27.0000001,-2.50,2015-03-22 05:14:27 UTC,-74.000031,40.720631,-73.999809,40.720539,1
13032,2013-08-30 08:57:10.0000002,-3.00,2013-08-30 08:57:10 UTC,-73.995062,40.740755,-73.995885,40.741357,4
28839,2013-08-11 13:39:10.0000001,-2.50,2013-08-11 13:39:10 UTC,-73.785260,40.648442,0.000000,0.000000,1
36722,2015-04-30 15:19:45.0000003,-2.50,2015-04-30 15:19:45 UTC,-73.952187,40.790112,-73.950043,40.792839,1
42337,2015-03-09 10:29:46.0000004,-5.00,2015-03-09 10:29:46 UTC,-73.990974,40.755985,-73.980820,40.759869,1
56748,2015-06-26 01:13:18.0000002,-5.00,2015-06-26 01:13:18 UTC,-73.979797,40.743240,-73.981216,40.737240,6
58937,2010-02-19 23:47:10.0000002,-44.90,2010-02-19 23:47:10 UTC,-73.871120,40.773902,-73.676533,40.786890,1
97838,2015-06-07 02:54:14.0000004,-3.00,2015-06-07 02:54:14 UTC,-73.913246,40.766212,-73.914963,40.764065,5
102938,2010-02-10 12:33:10.0000004,-2.90,2010-02-10 12:33:10 UTC,-73.970775,40.783425,-73.973443,40.779775,1


In [10]:
# Removing negative fare_amount
train_data = train_data[train_data.fare_amount>=0]

In [11]:
#Remove missing data
print(train_data.isnull().sum())

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude    376
dropoff_latitude     376
passenger_count        0
dtype: int64


In [12]:
# Dropping Null Values
print('Old size: %d' % len(train_data))
train_data = train_data.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(train_data))

Old size: 55421402
New size: 55421026


In [13]:
# Removing passenger_count = 0 
print('Old size: %d' % len(train_data))
train_data = train_data[train_data.passenger_count>=1]
print('New size: %d' % len(train_data))

Old size: 55421026
New size: 55225987


In [14]:
# Removing passenger_count greater than 10
print('Old size: %d' % len(train_data))
train_data = train_data[train_data.passenger_count<=10]
print('New size: %d' % len(train_data))

Old size: 55225987
New size: 55225918


In [15]:
# Making sure the data has no null values before we export it
print(train_data.isnull().sum())

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


In [17]:
# Exporting cleaned data to a CSV
train_data.to_csv('train_cleaned.csv')

In [2]:
# Importing the cleaned training data
train_data = pd.read_csv('train_cleaned.csv')

In [3]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55225918 entries, 0 to 55225917
Data columns (total 9 columns):
Unnamed: 0           int64
key                  object
fare_amount          float64
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(5), int64(2), object(2)
memory usage: 3.7+ GB


In [8]:
lat1 = train_data.loc[:,['pickup_latitude']]
long1 = train_data.loc[:,['pickup_longitude']]
lat2 = train_data.loc[:,['dropoff_latitude']]
long2 = train_data.loc[:,['dropoff_longitude']]

In [6]:
# To get the distance between the two rides we need to convert Latitude and Longitude in Radians
train_data['pickup_latitude_radians'] = train_data['pickup_latitude'].apply(lambda x: radians(x))
train_data['pickup_longitude_radians'] = train_data['pickup_longitude'].apply(lambda x: radians(x))
train_data['dropoff_latitude_radians'] = train_data['dropoff_latitude'].apply(lambda x: radians(x))
train_data['dropoff_longitude_radians'] = train_data['dropoff_longitude'].apply(lambda x: radians(x))

In [10]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_latitude_radians,pickup_longitude_radians,dropoff_latitude_radians,dropoff_longitude_radians
0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,0.710721,-1.288826,0.710563,-1.288779
1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,0.710546,-1.291824,0.71178,-1.291182
2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,0.711418,-1.291242,0.711231,-1.291391
3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,0.710927,-1.291319,0.711363,-1.291396
4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,0.711536,-1.290987,0.711811,-1.290787


In [23]:
lat1 = train_data['pickup_latitude_radians']
long1 = train_data['pickup_longitude_radians']
lat2 = train_data['dropoff_latitude_radians']
long2 = train_data['dropoff_longitude_radians']

train_data['dlon'] = long2 - long1
train_data['dlat'] = lat2 - lat1

In [33]:
# checking difference between latitude and longitude in radians in training data
train_data.to_csv('train_cleaned.csv')

In [35]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_latitude_radians,pickup_longitude_radians,dropoff_latitude_radians,dropoff_longitude_radians,dlat,dlon
0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,0.710721,-1.288826,0.710563,-1.288779,-0.000158,4.7e-05
1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,0.710546,-1.291824,0.71178,-1.291182,0.001234,0.000642
2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,0.711418,-1.291242,0.711231,-1.291391,-0.000187,-0.000148
3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,0.710927,-1.291319,0.711363,-1.291396,0.000435,-7.7e-05
4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,0.711536,-1.290987,0.711811,-1.290787,0.000275,0.0002


In [36]:
dlon = train_data['dlon']
dlat = train_data['dlat']

R = 6373.0
def distance(dlat,dlon,lat1,lat2,R):
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance

#result = train_data.apply(lambda dist: distance(dlat,dlon,lat1,lat2,R), axis=1)

#print "Result", distance

In [None]:
# Plot the histogram with default number of bins; label your axes
_ = plt.plot(train_data['fare_amount'], train_data['passenger_count'], marker='.',linestyle='none')
_ = plt.xlabel('Fare Amount')
_ = plt.ylabel('Passenger_Count')
# Show the plot
plt.show()

In [None]:
#train_data = train_data.drop(['pickup_radians'], axis=1)