# Load Data
## Loading
- This notebook loads the trip data and taxi zone data into a SQL database

## Feature Engineering
- time: create hour and day of week features
- platforms: create platform variable (e.g. 'Uber', 'Lyft', etc)
- tips: indicator for whether a customer tipped
- driver pay metrics: pay per minute and pay per mile

In [40]:
import pandas as pd
import pyarrow.parquet as pq
%matplotlib inline
import os

In [8]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/uber')
engine.connect()

%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

In [15]:
%%sql
CREATE TABLE main; 

 * postgresql://root:***@localhost:5432/uber
(psycopg2.errors.SyntaxError) syntax error at or near ";"
LINE 1: CREATE TABLE main;
                         ^

[SQL: CREATE TABLE main;]
(Background on this error at: https://sqlalche.me/e/20/f405)


In [13]:
for file in os.listdir('../data'):
    print(file)

.DS_Store
fhvhv_tripdata_2019-08.parquet
taxi_zones
taxi+_zone_lookup.csv
fhvhv_tripdata_2019-07.parquet
tableau_data.csv
taxi_zones.zip


In [14]:
fhvhv_datasets = [x for x in os.listdir('../data') if 'fhvhv' in x]
fhvhv_datasets

['fhvhv_tripdata_2019-08.parquet', 'fhvhv_tripdata_2019-07.parquet']

### Uber Data

In [49]:
# limit sample size to keep things sane on local computer
sample_size = 100000

In [50]:
%%sql
DROP TABLE IF EXISTS main;

 * postgresql://root:***@localhost:5432/uber
Done.


[]

In [51]:
for dataset in fhvhv_datasets:
    df = pd.read_parquet('../data/'+dataset)
    df = df.sample(sample_size)
    df.to_sql(name='main', con=engine, if_exists='append')

In [67]:
"""
# alternative method using iter_batches
for dataset in fhvhv_datasets:
    parquet_file = pq.ParquetFile('../data/' + dataset)
    for batch in parquet_file.iter_batches(batch_size=1000):
        df = batch.to_pandas()
        #df = pd.read_parquet('../data/' + dataset)
        #df = df.sample(sample_size)
        df.to_sql(name='main', con=engine, if_exists='append')
"""

"\n\nfor dataset in fhvhv_datasets:\n    parquet_file = pq.ParquetFile('../data/' + dataset)\n    for batch in parquet_file.iter_batches(batch_size=1000):\n        df = batch.to_pandas()\n        #df = pd.read_parquet('../data/' + dataset)\n        #df = df.sample(sample_size)\n        df.to_sql(name='main', con=engine, if_exists='append')\n"

In [53]:
%%sql
SELECT 
MIN(request_datetime) ,
MAX(request_datetime) 
FROM 
main;

 * postgresql://root:***@localhost:5432/uber
1 rows affected.


min,max
2019-06-30 23:48:05,2019-08-31 23:56:55


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 1962725 to 8207163
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   hvfhs_license_num     100000 non-null  object        
 1   dispatching_base_num  100000 non-null  object        
 2   originating_base_num  72347 non-null   object        
 3   request_datetime      100000 non-null  datetime64[us]
 4   on_scene_datetime     72347 non-null   datetime64[us]
 5   pickup_datetime       100000 non-null  datetime64[us]
 6   dropoff_datetime      100000 non-null  datetime64[us]
 7   PULocationID          100000 non-null  int64         
 8   DOLocationID          100000 non-null  int64         
 9   trip_miles            100000 non-null  float64       
 10  trip_time             100000 non-null  int64         
 11  base_passenger_fare   100000 non-null  float64       
 12  tolls                 100000 non-null  float64       
 1

### Taxi Zones

In [55]:
taxi_zones = pd.read_csv('../data/taxi+_zone_lookup.csv')
taxi_zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [56]:
taxi_zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [57]:
taxi_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [58]:
%%sql
DELETE FROM zones 
WHERE service_zone IS NULL;

 * postgresql://root:***@localhost:5432/uber
2 rows affected.


[]

### Main

In [59]:
%%sql
-- basic query
SELECT * FROM main LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
10840404,HV0005,B02510,,2019-08-17 15:59:10,,2019-08-17 16:02:35,2019-08-17 16:32:28,143,145,4.239,1793,19.41,0.16,0.49,1.74,2.75,,0.0,19.44,N,N,N,N,N
2252549,HV0003,B02870,B02870,2019-08-04 03:15:51,2019-08-04 03:24:27,2019-08-04 03:24:27,2019-08-04 03:31:54,80,198,1.54,447,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,N,N,,N,N
13745783,HV0003,B02879,B02879,2019-08-22 07:42:04,2019-08-22 07:43:54,2019-08-22 07:45:32,2019-08-22 07:50:41,69,119,1.32,308,3.85,0.0,0.0,0.33,0.0,,0.0,5.39,N,N,,N,N
2620430,HV0005,B02510,,2019-08-04 17:25:30,,2019-08-04 17:29:54,2019-08-04 17:43:10,144,162,2.85,796,19.74,0.0,0.49,1.75,2.75,,3.71,9.7,N,N,N,N,N
3601407,HV0003,B02875,B02875,2019-08-06 12:08:15,2019-08-06 12:08:25,2019-08-06 12:10:25,2019-08-06 12:21:24,244,119,3.1,636,9.34,0.0,0.0,0.81,0.0,,0.0,8.83,N,N,,N,N
2863573,HV0003,B02869,B02869,2019-08-05 05:00:06,2019-08-05 05:00:12,2019-08-05 05:02:47,2019-08-05 05:13:18,167,213,3.5,632,17.76,0.0,0.0,1.54,0.0,,0.0,13.57,N,N,,N,N
7773871,HV0003,B02883,B02883,2019-08-12 19:49:50,2019-08-12 19:50:01,2019-08-12 19:52:26,2019-08-12 20:03:52,125,148,1.64,675,11.5,0.0,0.0,1.0,2.75,,0.0,7.45,N,N,,N,N
13497465,HV0005,B02510,,2019-08-21 18:57:26,,2019-08-21 19:02:31,2019-08-21 19:36:55,186,229,2.796,2064,11.35,0.0,0.28,1.01,0.75,,0.0,0.0,Y,Y,N,N,N
12036795,HV0003,B02877,B02877,2019-08-19 09:28:33,2019-08-19 09:30:20,2019-08-19 09:31:26,2019-08-19 09:34:50,119,247,0.84,203,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,N,N,,N,N
13943106,HV0005,B02510,,2019-08-22 14:00:58,,2019-08-22 14:05:32,2019-08-22 15:06:48,36,48,9.696,3676,31.48,0.0,0.86,3.06,0.75,,0.0,0.0,Y,Y,N,N,N


### Hour and Day of Week


In [60]:
%%sql
ALTER TABLE main
ADD pickup_hour INT;

UPDATE main
SET pickup_hour = CAST(EXTRACT(hour FROM pickup_datetime) AS int)

 * postgresql://root:***@localhost:5432/uber
Done.
200000 rows affected.


[]

In [61]:
%%sql
ALTER TABLE main
ADD pickup_dayofweek varchar(15);

ALTER TABLE main
ADD pickup_dayofweek_int INT;

UPDATE main 
SET pickup_dayofweek_int = CAST(
        EXTRACT(isodow FROM pickup_datetime) 
    AS INT);

UPDATE main 
SET pickup_dayofweek = 
CASE 
    WHEN pickup_dayofweek_int = 1 THEN 'Monday'
    WHEN pickup_dayofweek_int = 2 THEN 'Tuesday'
    WHEN pickup_dayofweek_int = 3 THEN 'Wednesday'
    WHEN pickup_dayofweek_int = 4 THEN 'Thursday'
    WHEN pickup_dayofweek_int = 5 THEN 'Friday'
    WHEN pickup_dayofweek_int = 6 THEN 'Saturday'
    WHEN pickup_dayofweek_int = 7 THEN 'Sunday'
END;


 * postgresql://root:***@localhost:5432/uber
Done.
Done.
200000 rows affected.
200000 rows affected.


[]

In [62]:
%%sql
ALTER TABLE main drop pickup_dayofweek_int;

 * postgresql://root:***@localhost:5432/uber
Done.


[]

### Platforms

In [63]:
%%sql
ALTER TABLE main 
ADD platform varchar(10);

UPDATE main 
SET platform = 
CASE 
    WHEN Hvfhs_license_num = 'HV0002' THEN 'Juno'
    WHEN Hvfhs_license_num = 'HV0003' THEN 'Uber'
    WHEN Hvfhs_license_num = 'HV0004' THEN 'Via'
    WHEN Hvfhs_license_num = 'HV0005' THEN 'Lyft'
END;

 * postgresql://root:***@localhost:5432/uber
Done.
200000 rows affected.


[]

### Tips

In [64]:
%%sql
ALTER TABLE main 
ADD has_tips INT;

UPDATE main 
SET has_tips = CAST(tips > 0 AS INT);

 * postgresql://root:***@localhost:5432/uber
Done.
200000 rows affected.


[]

### Driver Pay

In [65]:
%%sql
ALTER TABLE main 
ADD driver_pay_per_mile real DEFAULT NULL;

UPDATE main
SET driver_pay_per_mile = (driver_pay / trip_miles)::numeric
WHERE trip_miles > 0;

 * postgresql://root:***@localhost:5432/uber
Done.
199938 rows affected.


[]

In [66]:
%%sql
ALTER TABLE main 
ADD driver_pay_per_minute real DEFAULT NULL;

UPDATE main
SET driver_pay_per_minute = (driver_pay / (trip_time/60.0))::numeric
WHERE trip_time > 0;

 * postgresql://root:***@localhost:5432/uber
Done.
200000 rows affected.


[]