# Load Data
## Loading
- This notebook loads the trip data and taxi zone data into a SQL database

## Feature Engineering
- time: create hour and day of week features
- platforms: create platform variable (e.g. 'Uber', 'Lyft', etc)
- tips: indicator for whether a customer tipped
- driver pay metrics: pay per minute and pay per mile

In [106]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### Uber Data

In [107]:
df = pd.read_parquet('../data/fhvhv_tripdata_2019-07.parquet')
df = df.sample(100000)

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 14073898 to 9365500
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   hvfhs_license_num     100000 non-null  object        
 1   dispatching_base_num  100000 non-null  object        
 2   originating_base_num  72209 non-null   object        
 3   request_datetime      100000 non-null  datetime64[us]
 4   on_scene_datetime     72210 non-null   datetime64[us]
 5   pickup_datetime       100000 non-null  datetime64[us]
 6   dropoff_datetime      100000 non-null  datetime64[us]
 7   PULocationID          100000 non-null  int64         
 8   DOLocationID          100000 non-null  int64         
 9   trip_miles            100000 non-null  float64       
 10  trip_time             100000 non-null  int64         
 11  base_passenger_fare   100000 non-null  float64       
 12  tolls                 100000 non-null  float64       
 

In [109]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/uber')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x2bedb7e20>

In [110]:
%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [111]:
%%sql
DROP VIEW aux

 * postgresql://root:***@localhost:5432/uber
(psycopg2.errors.UndefinedTable) view "aux" does not exist

[SQL: DROP VIEW aux]
(Background on this error at: https://sqlalche.me/e/20/f405)


In [112]:
df.to_sql(name='main', con=engine, if_exists='replace')

1000

### Taxi Zones

In [113]:
taxi_zones = pd.read_csv('../data/taxi+_zone_lookup.csv')
taxi_zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [114]:
taxi_zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [115]:
taxi_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [116]:
%%sql
DELETE FROM zones 
WHERE service_zone IS NULL;

 * postgresql://root:***@localhost:5432/uber
2 rows affected.


[]

### Main

In [117]:
%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [118]:
%%sql
-- basic query
SELECT * FROM main LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
14073898,HV0003,B02871,B02871,2019-07-22 18:27:36,2019-07-22 18:34:12,2019-07-22 18:35:17,2019-07-22 18:46:59,61,35,1.68,703,8.49,0.0,0.0,0.74,0.0,,0.0,7.63,N,N,,N,N
9262571,HV0003,B02869,B02869,2019-07-15 23:24:07,2019-07-15 23:26:05,2019-07-15 23:28:23,2019-07-15 23:50:20,35,139,9.35,1316,23.62,0.0,0.0,2.04,0.0,,0.0,21.1,N,N,,N,
9004029,HV0003,B02876,B02876,2019-07-15 15:44:52,2019-07-15 15:45:45,2019-07-15 15:47:53,2019-07-15 16:25:25,161,138,10.03,2251,44.94,6.12,0.0,4.42,2.75,,8.73,44.4,N,N,,N,
674083,HV0003,B02875,B02875,2019-07-02 08:17:17,2019-07-02 08:24:33,2019-07-02 08:25:32,2019-07-02 08:42:34,161,137,1.7,1023,-0.75,0.0,0.0,0.0,0.75,,0.0,6.58,Y,Y,,N,
1982460,HV0003,B02764,B02764,2019-07-04 14:41:58,2019-07-04 14:46:55,2019-07-04 14:48:39,2019-07-04 15:01:16,205,215,2.86,758,5.0,0.0,0.0,0.43,0.0,,0.0,5.93,Y,Y,,N,
3885337,HV0003,B02883,B02883,2019-07-07 17:40:31,2019-07-07 17:41:21,2019-07-07 17:43:01,2019-07-07 18:07:33,68,138,9.48,1472,38.69,6.12,0.0,3.88,2.75,,0.0,31.66,N,N,,N,
4275091,HV0003,B02764,B02764,2019-07-08 09:59:27,2019-07-08 09:59:45,2019-07-08 10:01:08,2019-07-08 10:17:33,192,16,4.49,984,14.45,0.0,0.0,1.25,0.0,,0.0,13.03,N,N,,N,
18672189,HV0004,B02800,,2019-07-29 12:45:22,,2019-07-29 12:53:37,2019-07-29 13:22:25,161,148,4.2,1728,8.75,0.0,0.0,0.78,0.75,,0.0,0.0,Y,Y,N,N,N
7202241,HV0002,B03035,B03035,2019-07-13 00:45:19,1970-01-01 00:00:00,2019-07-13 00:47:28,2019-07-13 01:10:19,37,129,6.04,1371,21.58,0.0,0.53,1.92,0.0,,0.0,18.91,N,N,N,N,N
482401,HV0003,B02395,B02395,2019-07-01 20:18:58,2019-07-01 20:22:44,2019-07-01 20:23:45,2019-07-01 20:38:02,223,260,3.4,858,10.43,0.0,0.0,0.0,0.0,,2.0,0.0,N,N,,N,


### Hour and Day of Week


In [119]:
%%sql
ALTER TABLE main
ADD pickup_hour INT;

UPDATE main
SET pickup_hour = CAST(EXTRACT(hour FROM pickup_datetime) AS int)

 * postgresql://root:***@localhost:5432/uber
Done.


100000 rows affected.


[]

In [120]:
%%sql
ALTER TABLE main
ADD pickup_dayofweek varchar(15);

ALTER TABLE main
ADD pickup_dayofweek_int INT;

UPDATE main 
SET pickup_dayofweek_int = CAST(
        EXTRACT(isodow FROM pickup_datetime) 
    AS INT);

UPDATE main 
SET pickup_dayofweek = 
CASE 
    WHEN pickup_dayofweek_int = 1 THEN 'Monday'
    WHEN pickup_dayofweek_int = 2 THEN 'Tuesday'
    WHEN pickup_dayofweek_int = 3 THEN 'Wednesday'
    WHEN pickup_dayofweek_int = 4 THEN 'Thursday'
    WHEN pickup_dayofweek_int = 5 THEN 'Friday'
    WHEN pickup_dayofweek_int = 6 THEN 'Saturday'
    WHEN pickup_dayofweek_int = 7 THEN 'Sunday'
END;


 * postgresql://root:***@localhost:5432/uber
Done.
Done.
100000 rows affected.
100000 rows affected.


[]

In [121]:
%%sql
ALTER TABLE main drop pickup_dayofweek_int;

 * postgresql://root:***@localhost:5432/uber
Done.


[]

### Platforms

In [122]:
%%sql
ALTER TABLE main 
ADD platform varchar(10);

UPDATE main 
SET platform = 
CASE 
    WHEN Hvfhs_license_num = 'HV0002' THEN 'Juno'
    WHEN Hvfhs_license_num = 'HV0003' THEN 'Uber'
    WHEN Hvfhs_license_num = 'HV0004' THEN 'Via'
    WHEN Hvfhs_license_num = 'HV0005' THEN 'Lyft'
END;

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

### Tips

In [123]:
%%sql
ALTER TABLE main 
ADD has_tips INT;

UPDATE main 
SET has_tips = CAST(tips > 0 AS INT);

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

### Driver Pay

In [124]:
%%sql
ALTER TABLE main 
ADD driver_pay_per_mile real DEFAULT NULL;

UPDATE main
SET driver_pay_per_mile = (driver_pay / trip_miles)::numeric
WHERE trip_miles > 0;

 * postgresql://root:***@localhost:5432/uber
Done.
99976 rows affected.


[]

In [125]:
%%sql
ALTER TABLE main 
ADD driver_pay_per_minute real DEFAULT NULL;

UPDATE main
SET driver_pay_per_minute = (driver_pay / (trip_time/60.0))::numeric
WHERE trip_time > 0;

 * postgresql://root:***@localhost:5432/uber
Done.
99999 rows affected.


[]