# EDA
- This notebook conducts exploratory data analysis using SQL
- I document the following:
  - the dataset does feature 'negative' fare values
  - trip time and distance both have a minimum value of zero
  - a significant portion of rides feature no driver pay
  - A trivial portion of rides features wheelchairs
  - Pickup and dropoff locations feature no missing values
  - Less than 15% of rides feature tips

In [1]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/uber')
engine.connect()

%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

In [2]:
%%sql
-- test query
SELECT * FROM main LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,pickup_hour,pickup_dayofweek,platform,has_tips,driver_pay_per_mile,driver_pay_per_minute
12425669,HV0005,B02510,,2019-07-20 14:20:08,,2019-07-20 14:23:05,2019-07-20 14:28:21,75,41,0.91,316,6.84,0.0,0.17,0.61,0.0,,0.0,5.39,N,N,N,N,N,14,Saturday,Lyft,0,5.923077,1.0234177
15600885,HV0005,B02510,,2019-07-25 05:30:11,,2019-07-25 05:36:40,2019-07-25 05:50:48,21,14,3.185,848,14.72,0.0,0.27,0.95,0.0,,0.0,0.0,Y,N,N,N,N,5,Thursday,Lyft,0,0.0,0.0
13211568,HV0005,B02510,,2019-07-21 13:12:18,,2019-07-21 13:17:27,2019-07-21 14:20:37,132,265,31.671,3790,83.01,0.0,2.75,0.0,0.0,,0.0,0.0,Y,N,N,N,N,13,Sunday,Lyft,0,0.0,0.0
15328381,HV0005,B02510,,2019-07-24 17:58:11,,2019-07-24 18:11:21,2019-07-24 18:53:10,138,75,7.852,2509,24.84,6.38,0.78,2.77,0.0,,0.0,29.3,N,N,N,N,N,18,Wednesday,Lyft,0,3.7315333,0.7006776
3049118,HV0005,B02510,,2019-07-06 10:40:31,,2019-07-06 10:41:35,2019-07-06 10:54:15,98,130,2.751,760,10.94,0.0,0.13,0.46,0.0,,0.0,9.29,N,N,N,N,N,10,Saturday,Lyft,0,3.3769538,0.733421
14036928,HV0005,B02510,,2019-07-22 17:55:37,,2019-07-22 18:05:12,2019-07-22 18:34:06,13,142,5.351,1734,46.18,0.0,1.15,4.1,2.75,,0.0,38.46,N,N,N,N,N,18,Monday,Lyft,0,7.187442,1.3307959
8477582,HV0004,B02800,,2019-07-14 17:35:34,,2019-07-14 17:43:48,2019-07-14 18:05:32,225,255,1.01,1304,7.45,0.0,0.0,0.0,0.0,,0.0,0.0,Y,N,N,N,N,17,Sunday,Via,0,0.0,0.0
18436771,HV0005,B02510,,2019-07-29 00:41:00,,2019-07-29 00:43:17,2019-07-29 00:52:29,41,42,1.278,552,7.05,0.0,0.18,0.63,0.0,,0.0,5.96,N,N,N,N,N,0,Monday,Lyft,0,4.6635365,0.6478261
18833108,HV0004,B02800,,2019-07-29 17:44:57,,2019-07-29 17:54:18,2019-07-29 18:20:20,79,48,2.81,1562,6.5,0.0,0.0,0.58,0.75,,0.0,0.0,Y,Y,N,N,N,17,Monday,Via,0,0.0,0.0
11139046,HV0005,B02510,,2019-07-18 21:06:18,,2019-07-18 21:11:41,2019-07-18 21:36:50,49,260,7.806,1509,22.79,0.0,0.42,1.49,0.0,,0.0,0.0,Y,Y,N,N,N,21,Thursday,Lyft,0,0.0,0.0


In [3]:
%%sql
-- test query
SELECT * FROM zones LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
5,6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
6,7,Queens,Astoria,Boro Zone
7,8,Queens,Astoria Park,Boro Zone
8,9,Queens,Auburndale,Boro Zone
9,10,Queens,Baisley Park,Boro Zone


In [5]:
%%sql
/*
min and max request dates
can see min request is on the preceding month
max drop is on the following month
only min_pickup and max_pickup is exclusive to the month
*/
SELECT
MIN(request_datetime) min_request, 
MAX(request_datetime) max_request,
MIN(pickup_datetime) min_pickup, 
MAX(pickup_datetime) max_pickup,
MIN(dropoff_datetime) min_dropoff, 
MAX(dropoff_datetime) max_dropoff
FROM main;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


min_request,max_request,min_pickup,max_pickup,min_dropoff,max_dropoff
2019-06-30 23:44:58,2019-07-31 23:55:28,2019-07-01 00:00:05,2019-07-31 23:59:32,2019-07-01 00:11:08,2019-08-01 00:40:27


In [13]:
%%sql
-- no null time variables
SELECT
SUM( CAST((request_datetime IS NULL) AS INT)) null_request_time,
SUM( CAST((pickup_datetime IS NULL) AS INT)) null_pickup_time,
SUM( CAST((dropoff_datetime IS NULL) AS INT)) null_dropoff_time
FROM main
LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


null_request_time,null_pickup_time,null_dropoff_time
0,0,0


### Fares

In [13]:
%%sql
-- Min/max for fare
-- we can see that a negative base fare exists
SELECT
MIN(base_passenger_fare) min_fare, 
MAX(base_passenger_fare) max_fare
FROM main;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


min_fare,max_fare
-62.69,627.65


In [14]:
%%sql
-- How many rides have a negative fare?
-- Less than 1% of rides have a negative base fare
-- possibly accounts for refunds?
SELECT 
ROUND(
    100 * SUM((base_passenger_fare < 0)::int)::numeric
    / (SELECT COUNT(*) FROM main), 2) 
    "Percent Negative Fares"
FROM main
LIMIT 10

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Percent Negative Fares
0.39


### Trip Miles

In [15]:
%%sql
-- Min/max for miles
-- We can see a 'zero miles' trip exists
SELECT
MIN(trip_miles) min_miles, 
MAX(trip_miles) max_miles
FROM main;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


min_miles,max_miles
0.0,220.21


### Trip Time

In [16]:
%%sql
-- Are these trips an anomaly?
-- The base fare is positive which suggests these are brief but real rides
SELECT
trip_miles,
base_passenger_fare,
trip_time/60 "Trip time (minutes)"
FROM main
WHERE trip_miles = (SELECT MIN(trip_miles) FROM main);

 * postgresql://root:***@localhost:5432/uber
20 rows affected.


trip_miles,base_passenger_fare,Trip time (minutes)
0.0,7.37,2
0.0,4.6,0
0.0,7.59,11
0.0,11.0,37
0.0,2.84,0
0.0,-4.8,0
0.0,5.38,2
0.0,8.49,18
0.0,0.0,0
0.0,14.56,4


In [17]:
%%sql
-- min/max trip tipe in minutes
SELECT 
min(trip_time) / 60 "Min Trip Time (Minutes)",
max(trip_time) / 60 "Max Trip Time (Minutes)"
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Min Trip Time (Minutes),Max Trip Time (Minutes)
0,557


### Driver Pay

In [18]:
%%sql
-- min/max pay of zero
-- possibly due to refund / cancelled rides?
SELECT 
MIN(driver_pay) min_pay, 
MAX(driver_pay) max_pay
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


min_pay,max_pay
0.0,527.43


In [19]:
%%sql
-- Percent with no pay
SELECT 
ROUND(
100 * AVG(( -- take average to get percent of rides there this is the case
    driver_pay = (SELECT MIN(driver_pay) FROM main) -- indicator: driver pay equals the minimum pay (zero)
)::int)
, 2) "Percent No Driver Pay"
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Percent No Driver Pay
12.93


### Sharing Rides

In [20]:
%%sql
-- distinct values
SELECT DISTINCT shared_request_flag
FROM main

 * postgresql://root:***@localhost:5432/uber
2 rows affected.


shared_request_flag
Y
N


In [21]:
%%sql
-- Percent that requested share
SELECT 
100 * ROUND(
AVG((shared_request_flag = 'Y')::int)
, 2) "Percent Requested Share"
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Percent Requested Share
21.0


In [22]:
%%sql
-- but only about 14% actually receive a share
SELECT 
100 * ROUND(
AVG((shared_request_flag = 'Y')::int)
, 2) "Percent Requested Share",
100 * ROUND(
AVG((shared_match_flag = 'Y')::int)
, 2) "Percent Successful Share"
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Percent Requested Share,Percent Successful Share
21.0,13.0


In [33]:
%%sql
-- finding: shared rides are less likely to tip
-- However, could be due to a different average trip time
SELECT 
shared_request_flag, 
ROUND(
    100*AVG(has_tips)
    , 2) percent_with_tips,
ROUND(
    AVG(trip_miles)
    , 2) avg_trip
FROM main 
GROUP BY shared_request_flag
ORDER BY AVG(has_tips) DESC

 * postgresql://root:***@localhost:5432/uber
(psycopg2.errors.UndefinedFunction) function round(double precision, integer) does not exist
LINE 8: ROUND(
        ^
HINT:  No function matches the given name and argument types. You might need to add explicit type casts.

[SQL: -- finding: shared rides are less likely to tip
-- However, could be due to a different average trip time
SELECT 
shared_request_flag, 
ROUND(
    100*AVG(has_tips)
    , 2) percent_with_tips,
ROUND(
    AVG(trip_miles)
    , 2) avg_trip
FROM main 
GROUP BY shared_request_flag
ORDER BY AVG(has_tips) DESC]
(Background on this error at: https://sqlalche.me/e/20/f405)


### Wheelchair Rides

In [18]:
%%sql
-- Percent that requested wheelchair
-- very low
SELECT 
100 * ROUND(
AVG((wav_request_flag = 'Y')::int)
, 2) "Percent Requested Wheelchair"
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Percent Requested Wheelchair
0.0


### Tips

In [26]:
%%sql
SELECT 
COUNT(*) count_null_tips
FROM main
WHERE tips IS NULL;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


count_null_tips
0


In [27]:
%%sql
SELECT 
MIN(tips) min_tips,
MAX(tips) max_tips
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


min_tips,max_tips
0.0,75.0


In [28]:
%%sql
SELECT 
ROUND(
    100*AVG((tips > 0)::int)
    , 2) "Percent with Tips"
FROM main

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


Percent with Tips
13.13


### Location

In [32]:
%%sql
-- can see we have no null values for either pickup or dropoff locations in entire dataset
SELECT 
SUM(CASE WHEN "PULocationID" IS NULL THEN 1 ELSE 0 END) null_pickups,
SUM(CASE WHEN "DOLocationID" IS NULL THEN 1 ELSE 0 END) null_dropoffs
FROM main;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


null_pickups,null_dropoffs
0,0


### Tolls


In [63]:
%%sql
SELECT 
ROUND(
    100*AVG((tolls > 0)::int)
    , 2) "Percent with Tolls"
FROM main

 * postgresql://postgres:***@localhost:5432/uber
1 rows affected.


Percent with Tolls
10.39
