# Load Data
- This notebook loads the uber taxi data into the SQL database.

In [59]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### Uber Data

In [60]:
df = pd.read_parquet('../data/fhvhv_tripdata_2019-07.parquet')
df = df.sample(100000)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 4514778 to 13202402
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   hvfhs_license_num     100000 non-null  object        
 1   dispatching_base_num  100000 non-null  object        
 2   originating_base_num  72468 non-null   object        
 3   request_datetime      100000 non-null  datetime64[us]
 4   on_scene_datetime     72470 non-null   datetime64[us]
 5   pickup_datetime       100000 non-null  datetime64[us]
 6   dropoff_datetime      100000 non-null  datetime64[us]
 7   PULocationID          100000 non-null  int64         
 8   DOLocationID          100000 non-null  int64         
 9   trip_miles            100000 non-null  float64       
 10  trip_time             100000 non-null  int64         
 11  base_passenger_fare   100000 non-null  float64       
 12  tolls                 100000 non-null  float64       
 

In [62]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/uber')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x2c6e11700>

In [63]:
%%sql
DROP VIEW aux

 * postgresql://root:***@localhost:5432/uber
(psycopg2.errors.UndefinedTable) view "aux" does not exist

[SQL: DROP VIEW aux]
(Background on this error at: https://sqlalche.me/e/20/f405)


In [64]:
df.to_sql(name='main', con=engine, if_exists='replace')

1000

### Taxi Zones

In [65]:
taxi_zones = pd.read_csv('../data/taxi+_zone_lookup.csv')
taxi_zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [66]:
taxi_zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [67]:
taxi_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [68]:
%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [69]:
%%sql
-- basic query
SELECT * FROM main LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
4514778,HV0003,B02872,B02872,2019-07-08 20:55:07,2019-07-08 20:55:26,2019-07-08 20:58:37,2019-07-08 21:10:55,14,22,3.75,738,11.26,0.0,0.0,0.98,0.0,,0.0,10.2,N,N,,N,
13459379,HV0005,B02510,,2019-07-21 19:38:31,,2019-07-21 19:43:25,2019-07-21 19:56:26,18,247,2.503,781,13.0,0.0,0.24,0.84,0.0,,0.0,0.0,Y,N,N,N,N
12008064,HV0003,B02877,B02877,2019-07-19 23:04:25,2019-07-19 23:07:19,2019-07-19 23:07:31,2019-07-19 23:10:59,108,108,1.08,208,5.15,0.0,0.0,0.45,0.0,,0.0,5.39,N,N,,N,
3745204,HV0003,B02889,B02889,2019-07-07 13:23:57,2019-07-07 13:25:35,2019-07-07 13:27:19,2019-07-07 13:38:54,35,61,1.52,695,9.54,0.0,0.0,0.83,0.0,,0.0,7.39,N,N,,N,
2053896,HV0005,B02510,,2019-07-04 16:42:43,,2019-07-04 16:45:14,2019-07-04 17:00:21,256,97,2.924,907,16.53,0.0,0.31,1.11,0.0,,0.0,10.69,N,N,N,N,N
8249788,HV0003,B02871,B02871,2019-07-14 10:31:36,2019-07-14 10:32:52,2019-07-14 10:35:11,2019-07-14 10:53:25,80,79,4.26,1093,14.59,0.0,0.0,1.27,2.75,,0.0,13.68,N,N,,N,
18408529,HV0003,B02682,B02682,2019-07-28 23:25:39,2019-07-28 23:27:17,2019-07-28 23:28:19,2019-07-28 23:48:17,243,242,4.4,1199,0.0,0.0,0.0,0.0,0.0,,3.0,17.71,N,N,,N,N
12220426,HV0003,B02875,B02875,2019-07-20 07:53:21,2019-07-20 08:04:59,2019-07-20 08:07:52,2019-07-20 08:31:18,76,62,3.73,1407,6.41,0.0,0.0,0.55,0.0,,0.0,6.61,Y,Y,,N,
5920669,HV0005,B02510,,2019-07-11 08:07:48,,2019-07-11 08:13:49,2019-07-11 08:27:01,89,188,1.887,792,12.63,0.0,0.32,1.12,0.0,,0.0,8.61,N,N,N,N,N
921374,HV0003,B02682,B02682,2019-07-02 17:04:57,2019-07-02 17:07:50,2019-07-02 17:08:56,2019-07-02 17:11:54,16,16,0.84,179,7.37,0.0,0.0,0.64,0.0,,0.0,5.39,N,N,,N,


### Hour and Day of Week


In [70]:
%%sql
ALTER TABLE main
ADD pickup_hour INT;

UPDATE main
SET pickup_hour = CAST(EXTRACT(hour FROM pickup_datetime) AS int)

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

In [71]:
%%sql
ALTER TABLE main
ADD pickup_dayofweek varchar(15);

ALTER TABLE main
ADD pickup_dayofweek_int INT;

UPDATE main 
SET pickup_dayofweek_int = CAST(
        EXTRACT(isodow FROM pickup_datetime) 
    AS INT);

UPDATE main 
SET pickup_dayofweek = 
CASE 
    WHEN pickup_dayofweek_int = 1 THEN 'Monday'
    WHEN pickup_dayofweek_int = 2 THEN 'Tuesday'
    WHEN pickup_dayofweek_int = 3 THEN 'Wednesday'
    WHEN pickup_dayofweek_int = 4 THEN 'Thursday'
    WHEN pickup_dayofweek_int = 5 THEN 'Friday'
    WHEN pickup_dayofweek_int = 6 THEN 'Saturday'
    WHEN pickup_dayofweek_int = 7 THEN 'Sunday'
END;


 * postgresql://root:***@localhost:5432/uber
Done.
Done.
100000 rows affected.
100000 rows affected.


[]

In [72]:
%%sql
ALTER TABLE main drop pickup_dayofweek_int;

 * postgresql://root:***@localhost:5432/uber
Done.


[]

### Platforms

In [73]:
%%sql
ALTER TABLE main 
ADD platform varchar(10);

UPDATE main 
SET platform = 
CASE 
    WHEN Hvfhs_license_num = 'HV0002' THEN 'Juno'
    WHEN Hvfhs_license_num = 'HV0003' THEN 'Uber'
    WHEN Hvfhs_license_num = 'HV0004' THEN 'Via'
    WHEN Hvfhs_license_num = 'HV0005' THEN 'Lyft'
END;

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

### Tips

In [74]:
%%sql
ALTER TABLE main 
ADD has_tips INT;

UPDATE main 
SET has_tips = CAST(tips > 0 AS INT);

 * postgresql://root:***@localhost:5432/uber
Done.
100000 rows affected.


[]

### Saving for Tableau

In [76]:
QUERY = """
SELECT * FROM main;
"""
df_out = pd.read_sql_query(QUERY, engine)
df_out.to_csv('../data/tableau_data.csv', index=False)
df_out

Unnamed: 0,index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,...,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,pickup_hour,pickup_dayofweek,platform,has_tips
0,3761537,HV0005,B02510,,2019-07-07 13:45:13,NaT,2019-07-07 13:50:14,2019-07-07 14:31:38,74,78,...,26.86,N,N,N,N,N,13,Sunday,Lyft,0
1,4514778,HV0003,B02872,B02872,2019-07-08 20:55:07,2019-07-08 20:55:26,2019-07-08 20:58:37,2019-07-08 21:10:55,14,22,...,10.20,N,N,,N,,20,Monday,Uber,0
2,13459379,HV0005,B02510,,2019-07-21 19:38:31,NaT,2019-07-21 19:43:25,2019-07-21 19:56:26,18,247,...,0.00,Y,N,N,N,N,19,Sunday,Lyft,0
3,12008064,HV0003,B02877,B02877,2019-07-19 23:04:25,2019-07-19 23:07:19,2019-07-19 23:07:31,2019-07-19 23:10:59,108,108,...,5.39,N,N,,N,,23,Friday,Uber,0
4,3745204,HV0003,B02889,B02889,2019-07-07 13:23:57,2019-07-07 13:25:35,2019-07-07 13:27:19,2019-07-07 13:38:54,35,61,...,7.39,N,N,,N,,13,Sunday,Uber,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5131113,HV0003,B02395,B02395,2019-07-09 22:14:07,2019-07-09 22:16:26,2019-07-09 22:19:24,2019-07-09 22:38:52,159,241,...,14.24,N,N,,N,,22,Tuesday,Uber,0
99996,10621990,HV0005,B02510,,2019-07-18 02:23:11,NaT,2019-07-18 02:25:59,2019-07-18 02:38:17,114,141,...,10.58,N,N,N,N,N,2,Thursday,Lyft,0
99997,9970511,HV0005,B02510,,2019-07-17 07:51:59,NaT,2019-07-17 07:55:44,2019-07-17 08:08:47,24,74,...,8.39,N,N,N,N,N,7,Wednesday,Lyft,0
99998,1517254,HV0005,B02510,,2019-07-03 17:33:00,NaT,2019-07-03 17:36:01,2019-07-03 17:55:19,225,35,...,12.16,N,N,N,N,N,17,Wednesday,Lyft,0
