In [86]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math as mt

import warnings
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including deprecation warnings from pandas/numpy.
warnings.filterwarnings(action = "ignore")

In [87]:
# Load the trip records from the CSV file (path is relative to the notebook).
df = pd.read_csv('../Data/uber.csv')

In [88]:
# Show the first five rows of the raw frame.
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [89]:
# Show the last five rows of the raw frame.
df.tail()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.73962,1
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1
199999,11951496,2010-05-15 04:08:00.00000076,14.1,2010-05-15 04:08:00 UTC,-73.984395,40.720077,-73.985508,40.768793,1


In [90]:
# Drop the unlabeled 'Unnamed: 0' column, which is not one of the described features.
# `columns=` already selects the axis, so the extra `axis=1` was redundant.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run after the column has already been removed.
df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [91]:
# (rows, columns) after removing the 'Unnamed: 0' column.
df.shape

(200000, 8)

This dataset contains 200,000 rows and 8 columns.

In [92]:
# Collect the column names into a plain Python list and print them.
all_col = df.columns.tolist()

print(f"All features : {all_col}")

All features : ['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']


1. key - a unique identifier for each trip
2. fare_amount - the cost of each trip in USD
3. pickup_datetime - date and time when the meter was engaged
4. passenger_count - the number of passengers in the vehicle (driver entered value)
5. pickup_longitude - the longitude where the meter was engaged
6. pickup_latitude - the latitude where the meter was engaged
7. dropoff_longitude - the longitude where the meter was disengaged
8. dropoff_latitude - the latitude where the meter was disengaged

In [93]:
# Count rows that repeat an earlier row across all columns.
df.duplicated().sum()

0

There are no duplicated rows in this dataset.

In [94]:
# Number of missing values in each column.
df.isna().sum(axis = 0)

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [95]:
# Show every row that has at least one missing value.
df.loc[df.isna().any(axis = 'columns')]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
87946,2013-07-02 03:51:57.0000001,24.1,2013-07-02 03:51:57 UTC,-73.950581,40.779692,,,0


In [96]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis = 0, how = 'any', inplace = True)

In [97]:
# (rows, columns) after dropping the rows with missing values.
df.shape

(199999, 8)

Now it contains 199,999 rows and 8 columns.

In [98]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                199999 non-null  object 
 1   fare_amount        199999 non-null  float64
 2   pickup_datetime    199999 non-null  object 
 3   pickup_longitude   199999 non-null  float64
 4   pickup_latitude    199999 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    199999 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB


In [99]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fare_amount,199999.0,11.359892,9.90176,-52.0,6.0,8.5,12.5,499.0
pickup_longitude,199999.0,-72.527631,11.437815,-1340.64841,-73.992065,-73.981823,-73.967154,57.418457
pickup_latitude,199999.0,39.935881,7.720558,-74.015515,40.734796,40.752592,40.767158,1644.421482
dropoff_longitude,199999.0,-72.525292,13.117408,-3356.6663,-73.991407,-73.980093,-73.963658,1153.572603
dropoff_latitude,199999.0,39.92389,6.794829,-881.985513,40.733823,40.753042,40.768001,872.697628
passenger_count,199999.0,1.684543,1.385995,0.0,1.0,1.0,2.0,208.0


In [100]:
# Rows where any longitude is outside [-180, 180] or any latitude is outside
# [-90, 90]; |value| > limit covers both the lower and the upper bound.
df[
    (df[['pickup_longitude', 'dropoff_longitude']].abs() > 180).any(axis = 1)
    | (df[['pickup_latitude', 'dropoff_latitude']].abs() > 90).any(axis = 1)
]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
4949,2012-04-28 00:58:00.000000235,4.9,2012-04-28 00:58:00 UTC,-748.016667,40.739957,-74.00357,40.734192,1
32549,2012-06-16 10:04:00.00000061,15.7,2012-06-16 10:04:00 UTC,-74.016055,40.715155,-737.916665,40.697862,2
48506,2011-11-05 23:26:00.000000309,33.7,2011-11-05 23:26:00 UTC,-735.2,40.770092,-73.980187,40.76553,1
56617,2012-03-11 07:24:00.00000031,8.1,2012-03-11 07:24:00 UTC,-73.960828,404.433332,-73.988357,40.769037,1
61793,2012-06-13 05:45:00.0000006,8.5,2012-06-13 05:45:00 UTC,-73.951385,401.066667,-73.98211,40.754117,1
75851,2011-11-05 00:22:00.00000051,15.7,2011-11-05 00:22:00 UTC,-1340.64841,1644.421482,-3356.6663,872.697628,1
91422,2011-05-18 13:24:00.000000213,16.1,2011-05-18 13:24:00 UTC,57.418457,1292.016128,1153.572603,-881.985513,1
103745,2011-10-14 19:04:00.000000202,12.9,2011-10-14 19:04:00 UTC,-736.216667,40.767035,-73.982377,40.725562,1
139447,2012-01-20 11:50:00.00000088,13.7,2012-01-20 11:50:00 UTC,-74.011042,40.70978,-73.983163,493.533332,4
144253,2009-08-26 11:55:00.00000023,7.3,2009-08-26 11:55:00 UTC,-768.55,40.757812,-73.99704,40.740007,1


In [101]:
# Index labels of rows where any longitude is outside [-180, 180] or any
# latitude is outside [-90, 90]; |value| > limit covers both bounds at once.
drop_index = list(
    df.index[
        (df[['pickup_longitude', 'dropoff_longitude']].abs() > 180).any(axis = 1)
        | (df[['pickup_latitude', 'dropoff_latitude']].abs() > 90).any(axis = 1)
    ].to_numpy()
)

drop_index

[4949,
 32549,
 48506,
 56617,
 61793,
 75851,
 91422,
 103745,
 139447,
 144253,
 161652,
 199936]

In [102]:
# Remove, in place, the rows whose labels were collected in drop_index.
df.drop(labels = drop_index, axis = 'index', inplace = True)

In [103]:
# Re-run the range check: rows where any longitude is outside [-180, 180] or
# any latitude is outside [-90, 90]. An empty result means none remain.
df[
    (df[['pickup_longitude', 'dropoff_longitude']].abs() > 180).any(axis = 1)
    | (df[['pickup_latitude', 'dropoff_latitude']].abs() > 90).any(axis = 1)
]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count


In [104]:
def calculate_distance(long1, lati1, long2, lati2):
    """Return the haversine (great-circle) distance in km between paired points.

    Parameters
    ----------
    long1, lati1 : sequence of float
        Longitudes and latitudes of the start points, in decimal degrees.
    long2, lati2 : sequence of float
        Longitudes and latitudes of the end points, in decimal degrees.
        All four sequences must have the same length.

    Returns
    -------
    list of float
        One distance per input position, in kilometres, using a mean Earth
        radius of 6371 km.
    """
    earth_radius_km = 6371

    # Work on positional float arrays built from the arguments themselves.
    # The previous version sized its loop from the global `pickup_longitude`
    # and indexed with `x[i]`, which breaks for any other input and for a
    # pandas Series whose index is not 0..n-1.
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (long1, lati1, long2, lati2)
    )

    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2

    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would turn sqrt(1 - a) into NaN; clipping keeps the result finite.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return (earth_radius_km * c).tolist()

In [105]:
# Pull the four coordinate columns out as NumPy arrays so they can be
# indexed by position regardless of the frame's row labels.
pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude = (
    df[column].to_numpy()
    for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude')
)

In [106]:
# Add the pickup-to-dropoff distance of each trip as a new column.
df['Distance (km)'] = calculate_distance(
    long1 = pickup_longitude,
    lati1 = pickup_latitude,
    long2 = dropoff_longitude,
    lati2 = dropoff_latitude,
)

In [107]:
# (rows, columns) after adding the 'Distance (km)' column.
df.shape

(199987, 9)

Now it has 199,987 rows and 9 columns. 

In [108]:
# Drop, in place, the raw timestamp and the four coordinate columns.
df.drop(
    labels = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'],
    axis = 'columns',
    inplace = True,
)

In [109]:
# (rows, columns) after dropping the timestamp and coordinate columns.
df.shape

(199987, 4)

Now it has 199,987 rows and 4 columns.

In [110]:
# Show the first five rows of the reduced frame.
df.head()

Unnamed: 0,key,fare_amount,passenger_count,Distance (km)
0,2015-05-07 19:52:06.0000003,7.5,1,1.683323
1,2009-07-17 20:04:56.0000002,7.7,1,2.45759
2,2009-08-24 21:45:00.00000061,12.9,1,5.036377
3,2009-06-26 08:22:21.0000001,5.3,3,1.661683
4,2014-08-28 17:47:00.000000188,16.0,5,4.47545


In [111]:
# Trips recorded with a fare of exactly zero.
df.loc[df['fare_amount'].eq(0)]

Unnamed: 0,key,fare_amount,passenger_count,Distance (km)
20744,2015-04-22 23:25:07.0000008,0.0,1,0.0
22182,2010-03-20 02:59:51.0000002,0.0,2,11.065289
87467,2015-01-04 03:51:13.0000002,0.0,2,8665.983754
156738,2015-02-17 08:48:08.0000005,0.0,1,0.000643
197172,2015-02-13 07:35:32.0000002,0.0,5,0.0


In [112]:
# Trips whose computed distance is exactly zero.
df.loc[df['Distance (km)'].eq(0)]

Unnamed: 0,key,fare_amount,passenger_count,Distance (km)
5,2011-02-12 02:27:09.0000006,4.9,1,0.0
7,2012-12-11 13:52:00.00000029,2.5,1,0.0
11,2011-05-23 22:15:00.00000086,8.5,1,0.0
48,2013-01-03 22:24:41.0000002,56.8,1,0.0
65,2014-05-05 19:27:00.00000034,6.0,1,0.0
...,...,...,...,...
199880,2014-02-22 06:45:46.0000002,6.5,1,0.0
199883,2012-09-10 17:39:00.00000090,12.5,2,0.0
199917,2013-06-24 22:17:43.0000002,4.5,1,0.0
199932,2011-03-22 13:59:00.00000018,24.9,5,0.0
