# Load Data
- This notebook loads the uber taxi data into the SQL database.

In [40]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### Uber Data

In [41]:
df = pd.read_parquet('../data/fhvhv_tripdata_2019-07.parquet')
df = df.sample(100000)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 13079918 to 19114466
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   hvfhs_license_num     100000 non-null  object        
 1   dispatching_base_num  99999 non-null   object        
 2   originating_base_num  72534 non-null   object        
 3   request_datetime      100000 non-null  datetime64[us]
 4   on_scene_datetime     72537 non-null   datetime64[us]
 5   pickup_datetime       100000 non-null  datetime64[us]
 6   dropoff_datetime      100000 non-null  datetime64[us]
 7   PULocationID          100000 non-null  int64         
 8   DOLocationID          100000 non-null  int64         
 9   trip_miles            100000 non-null  float64       
 10  trip_time             100000 non-null  int64         
 11  base_passenger_fare   100000 non-null  float64       
 12  tolls                 100000 non-null  float64       


In [43]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/uber')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x2b3fdef10>

In [44]:
%%sql
DROP VIEW aux

 * postgresql://root:***@localhost:5432/uber
(psycopg2.errors.UndefinedTable) view "aux" does not exist

[SQL: DROP VIEW aux]
(Background on this error at: https://sqlalche.me/e/20/f405)


In [45]:
df.to_sql(name='main', con=engine, if_exists='replace')

1000

### Taxi Zones

In [46]:
taxi_zones = pd.read_csv('../data/taxi+_zone_lookup.csv')
taxi_zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [47]:
taxi_zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [48]:
taxi_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [49]:
%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [50]:
%%sql
-- basic query
SELECT * FROM main LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
13079918,HV0003,B02884,B02884,2019-07-21 09:30:06,2019-07-21 09:31:09,2019-07-21 09:33:25,2019-07-21 09:40:03,20,78,0.87,292,4.87,0.0,0.0,0.42,0.0,,0.0,6.26,N,N,,N,
12235925,HV0005,B02510,,2019-07-20 09:36:09,,2019-07-20 09:41:32,2019-07-20 09:52:27,205,203,3.459,655,11.97,0.0,0.3,1.06,0.0,,2.0,9.2,N,N,N,N,N
8014595,HV0003,B02765,B02765,2019-07-14 00:38:43,2019-07-14 00:40:30,2019-07-14 00:43:28,2019-07-14 01:06:39,143,209,5.1,1357,18.09,0.0,0.0,1.61,2.75,,0.0,17.05,N,N,,N,
3002919,HV0003,B02836,B02836,2019-07-06 08:25:08,2019-07-06 08:26:03,2019-07-06 08:27:46,2019-07-06 08:43:35,116,100,6.73,948,25.21,0.0,0.0,2.18,2.75,,0.0,15.19,N,N,,N,
8755879,HV0003,B02617,B02617,2019-07-15 05:25:56,2019-07-15 05:31:27,2019-07-15 05:32:47,2019-07-15 05:57:19,181,138,11.55,1471,36.4,0.0,0.0,3.15,0.0,,0.0,24.78,N,N,,N,
4658235,HV0003,B02765,B02765,2019-07-09 04:40:21,2019-07-09 04:40:28,2019-07-09 04:41:48,2019-07-09 04:50:26,91,71,1.28,519,5.09,0.0,0.0,0.44,0.0,,0.0,4.64,Y,Y,,N,
16290538,HV0004,B02800,,2019-07-26 06:13:09,,2019-07-26 06:15:36,2019-07-26 06:30:32,140,107,2.87,896,6.95,0.0,0.0,0.0,0.75,,0.0,0.0,Y,Y,N,N,N
11950654,HV0005,B02510,,2019-07-19 22:11:47,,2019-07-19 22:15:39,2019-07-19 22:38:46,146,170,2.983,1387,14.23,1.81,0.4,1.42,2.75,,0.0,14.71,N,N,N,N,N
12777732,HV0005,B02510,,2019-07-20 22:01:33,,2019-07-20 22:05:43,2019-07-20 22:24:29,244,137,10.169,1126,27.97,0.23,0.71,2.5,2.75,,0.0,20.44,N,N,N,N,N
5727968,HV0003,B02884,B02884,2019-07-10 22:37:31,2019-07-10 22:42:15,2019-07-10 22:45:26,2019-07-10 23:21:04,106,69,15.8,2138,49.65,0.0,0.0,4.3,2.75,,0.0,34.94,N,N,,N,


### Hour and Day of Week


In [51]:
%%sql
ALTER TABLE main
ADD pickup_hour INT;

UPDATE main
SET pickup_hour = CAST(EXTRACT(hour FROM pickup_datetime) AS int)

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

In [52]:
%%sql
ALTER TABLE main
ADD pickup_dayofweek varchar(15);

ALTER TABLE main
ADD pickup_dayofweek_int INT;

UPDATE main 
SET pickup_dayofweek_int = CAST(
        EXTRACT(isodow FROM pickup_datetime) 
    AS INT);

UPDATE main 
SET pickup_dayofweek = 
CASE 
    WHEN pickup_dayofweek_int = 1 THEN 'Monday'
    WHEN pickup_dayofweek_int = 2 THEN 'Tuesday'
    WHEN pickup_dayofweek_int = 3 THEN 'Wednesday'
    WHEN pickup_dayofweek_int = 4 THEN 'Thursday'
    WHEN pickup_dayofweek_int = 5 THEN 'Friday'
    WHEN pickup_dayofweek_int = 6 THEN 'Saturday'
    WHEN pickup_dayofweek_int = 7 THEN 'Sunday'
END;


 * postgresql://root:***@localhost:5432/uber
Done.
Done.
100000 rows affected.
100000 rows affected.


[]

In [53]:
%%sql
ALTER TABLE main drop pickup_dayofweek_int;

 * postgresql://root:***@localhost:5432/uber
Done.


[]

### Platforms

In [54]:
%%sql
ALTER TABLE main 
ADD platform varchar(10);

UPDATE main 
SET platform = 
CASE 
    WHEN Hvfhs_license_num = 'HV0002' THEN 'Juno'
    WHEN Hvfhs_license_num = 'HV0003' THEN 'Uber'
    WHEN Hvfhs_license_num = 'HV0004' THEN 'Via'
    WHEN Hvfhs_license_num = 'HV0005' THEN 'Lyft'
END;

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

### Tips

In [55]:
%%sql
ALTER TABLE main 
ADD has_tips INT;

UPDATE main 
SET has_tips = CAST(tips > 0 AS INT)

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

### Adding Zones

In [56]:
%%sql

ALTER TABLE main 
ADD "Borough" varchar(50);

ALTER TABLE main 
ADD "Zone" varchar(50);

 * postgresql://root:***@localhost:5432/uber
Done.
Done.
Done.


[]

In [57]:
%%sql
UPDATE main as m
SET "Borough" = zones."Borough"
FROM zones
WHERE m."PULocationID" = zones."LocationID"

 * postgresql://root:***@localhost:5432/uber
100000 rows affected.


[]

In [58]:
%%sql
UPDATE main as m
SET "Zone" = zones."Zone"
FROM zones
WHERE m."PULocationID" = zones."LocationID"

 * postgresql://root:***@localhost:5432/uber
100000 rows affected.


[]

In [37]:
%%sql
SELECT "Borough"
FROM zones LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


Borough
EWR
Queens
Bronx
Manhattan
Staten Island
Staten Island
Queens
Queens
Queens
Queens


### Saving for Tableau

In [28]:
QUERY = """
SELECT * FROM main;
"""
df_out = pd.read_sql_query(QUERY, engine)

In [None]:
df_out.to_csv('../data/')

In [32]:
df_out

Unnamed: 0,index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,...,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,pickup_hour,pickup_dayofweek,platform,has_tips
0,1865934,HV0003,B02617,B02617,2019-07-04 09:07:02,2019-07-04 09:08:25,2019-07-04 09:09:56,2019-07-04 09:31:36,138,230,...,33.91,N,N,,N,,9,Thursday,Uber,0
1,19046488,HV0003,B02867,B02867,2019-07-30 01:05:15,2019-07-30 01:05:25,2019-07-30 01:07:48,2019-07-30 01:22:09,48,116,...,12.99,N,N,,N,N,1,Tuesday,Uber,0
2,17133460,HV0003,B02875,B02875,2019-07-27 09:53:21,2019-07-27 09:56:25,2019-07-27 09:58:00,2019-07-27 10:06:11,160,160,...,0.00,N,N,,N,N,9,Saturday,Uber,0
3,16290311,HV0003,B02866,B02866,2019-07-26 06:15:04,2019-07-26 06:17:25,2019-07-26 06:18:57,2019-07-26 07:06:37,71,75,...,60.38,N,N,,N,N,6,Friday,Uber,1
4,15396138,HV0003,B02764,B02764,2019-07-24 20:25:04,2019-07-24 20:26:01,2019-07-24 20:28:17,2019-07-24 20:49:05,234,13,...,14.33,N,N,,N,N,20,Wednesday,Uber,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2644657,HV0003,B02871,B02871,2019-07-05 16:21:31,2019-07-05 16:23:18,2019-07-05 16:25:47,2019-07-05 16:55:20,82,34,...,22.80,N,N,,N,,16,Friday,Uber,0
99996,19080020,HV0003,B02875,B02875,2019-07-30 05:40:17,2019-07-30 05:37:34,2019-07-30 05:40:53,2019-07-30 06:04:05,249,1,...,56.63,N,N,,N,N,5,Tuesday,Uber,0
99997,14708683,HV0003,B02865,B02865,2019-07-23 18:12:16,2019-07-23 18:17:54,2019-07-23 18:18:48,2019-07-23 18:26:05,85,89,...,4.15,Y,Y,,N,N,18,Tuesday,Uber,0
99998,15686934,HV0005,B02510,,2019-07-25 08:38:44,NaT,2019-07-25 08:45:21,2019-07-25 09:31:56,189,246,...,33.58,N,N,N,N,N,8,Thursday,Lyft,0
