# Load Data
- This notebook loads the uber taxi data into the SQL database.

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### Uber Data

In [2]:
df = pd.read_parquet('../data/fhvhv_tripdata_2019-07.parquet')
df = df.sample(100000)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 19413262 to 9745639
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   hvfhs_license_num     100000 non-null  object        
 1   dispatching_base_num  99999 non-null   object        
 2   originating_base_num  72397 non-null   object        
 3   request_datetime      100000 non-null  datetime64[us]
 4   on_scene_datetime     72399 non-null   datetime64[us]
 5   pickup_datetime       100000 non-null  datetime64[us]
 6   dropoff_datetime      100000 non-null  datetime64[us]
 7   PULocationID          100000 non-null  int64         
 8   DOLocationID          100000 non-null  int64         
 9   trip_miles            100000 non-null  float64       
 10  trip_time             100000 non-null  int64         
 11  base_passenger_fare   100000 non-null  float64       
 12  tolls                 100000 non-null  float64       
 

In [4]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/uber')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x29941a9a0>

In [5]:
df.to_sql(name='main', con=engine, if_exists='replace')

1000

### Taxi Zones

In [6]:
taxi_zones = pd.read_csv('../data/taxi+_zone_lookup.csv')
taxi_zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [7]:
taxi_zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [8]:
taxi_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [6]:
%load_ext sql
%sql postgresql://root:root@localhost:5432/uber

In [7]:
%%sql
-- basic query
SELECT * FROM main LIMIT 10;

 * postgresql://root:***@localhost:5432/uber
10 rows affected.


index,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
19413262,HV0003,B02864,B02864,2019-07-30 17:19:03,2019-07-30 17:19:22,2019-07-30 17:27:08,2019-07-30 17:55:39,233,48,2.82,1710,23.5,0.0,0.0,2.04,2.75,,0.0,17.2,N,N,,N,N
10669053,HV0003,B02872,B02872,2019-07-18 07:40:03,2019-07-18 07:42:52,2019-07-18 07:43:40,2019-07-18 07:55:11,224,233,2.35,691,4.36,0.0,0.0,0.39,0.75,,1.0,5.48,Y,Y,,N,
5101164,HV0003,B02682,B02682,2019-07-09 21:37:27,2019-07-09 21:42:52,2019-07-09 21:46:05,2019-07-09 22:28:34,124,7,17.76,2549,22.1,0.0,0.0,1.91,0.0,,0.0,30.36,Y,Y,,N,
1310001,HV0003,B02764,B02764,2019-07-03 10:10:33,2019-07-03 10:10:44,2019-07-03 10:12:56,2019-07-03 10:38:18,162,138,9.06,1522,32.86,6.12,0.0,3.38,2.75,,0.0,28.6,N,N,,N,
9802365,HV0003,B02869,B02869,2019-07-16 21:25:01,2019-07-16 21:27:25,2019-07-16 21:28:43,2019-07-16 21:37:01,198,226,2.3,498,9.64,0.0,0.0,0.84,0.0,,0.0,6.63,N,N,,N,
17509961,HV0003,B02617,B02617,2019-07-27 19:25:49,2019-07-27 19:27:37,2019-07-27 19:28:27,2019-07-27 19:56:38,231,100,2.78,1692,14.49,0.0,0.0,1.32,2.75,,0.0,17.0,N,N,,N,N
10057147,HV0005,B02510,,2019-07-17 09:18:10,,2019-07-17 09:24:56,2019-07-17 09:36:59,55,108,0.602,723,6.84,0.0,0.17,0.61,0.0,,0.0,6.63,N,N,N,N,N
4117356,HV0003,B02888,B02888,2019-07-08 04:13:28,2019-07-08 04:18:02,2019-07-08 04:19:16,2019-07-08 04:30:15,42,169,3.9,658,15.88,0.0,0.0,1.37,0.0,,0.0,9.7,N,N,,N,
16375661,HV0003,B02867,B02867,2019-07-26 09:15:07,2019-07-26 09:22:45,2019-07-26 09:24:00,2019-07-26 09:47:56,79,170,2.34,1436,8.32,0.0,0.0,0.72,0.75,,0.0,12.11,Y,Y,,N,N
13740874,HV0004,B02800,,2019-07-22 08:43:13,,2019-07-22 08:56:49,2019-07-22 09:08:45,24,42,1.7,716,6.75,0.0,0.0,0.6,0.0,,0.0,0.0,Y,Y,N,N,N
