# Download Parquet-format data and upload it to a PostgreSQL database

In [1]:
import os
from time import time

import pandas as pd
from pyarrow.dataset import dataset
from sqlalchemy import create_engine, text

In [2]:
# Destination table in PostgreSQL for the trip records
table_name = "yellow_taxi_data2"
# Name of the Parquet file to download and load (also used as the local filename)
input_file = "yellow_tripdata_2021-01.parquet"

In [3]:
# Download NY Taxi data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/{input_file}

--2022-10-15 12:17:38--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 54.230.159.45, 54.230.159.160, 54.230.159.130, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|54.230.159.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21686067 (21M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2021-01.parquet’


2022-10-15 12:17:41 (5.90 MB/s) - ‘yellow_tripdata_2021-01.parquet’ saved [21686067/21686067]



In [4]:
# Create an engine to connect to postgresql.
# The URL can be overridden through the NY_TAXI_DB_URL environment variable so
# credentials do not have to be edited into the notebook; the default points
# at the ny_taxi database on localhost.
DB_URL = os.environ.get("NY_TAXI_DB_URL", 'postgresql://root:root@localhost:5432/ny_taxi')
engine = create_engine(DB_URL)

# Open and immediately release a connection: this fails fast when the database
# is unreachable, without leaving a checked-out connection behind.
with engine.connect():
    pass

<sqlalchemy.engine.base.Connection at 0x7f211c657c10>

In [5]:
# Drop the target table so the load starts from an empty table.
# table_name is a notebook constant, not user input, so interpolating it into
# the statement is acceptable here.
query = f"""
DROP TABLE IF EXISTS "{table_name}";
"""

# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0; run the
# statement on an explicit connection instead. engine.begin() commits when the
# block exits normally and rolls back if it raises.
with engine.begin() as conn:
    conn.execute(text(query))
print(query)


DROP TABLE IF EXISTS "yellow_taxi_data2";



In [6]:
def transform(df):
    """Convert the pickup and dropoff timestamp columns to pandas datetimes.

    The frame is modified in place; it is also returned so the call can be
    chained or used inside an expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If either timestamp column is missing.
    """
    # Bracket indexing instead of attribute access: assigning through an
    # attribute only works for columns that already exist and can be confused
    # with DataFrame attributes and methods.
    for column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
        df[column] = pd.to_datetime(df[column])
    return df

In [7]:
# Read the file by chunks: each Arrow record batch is converted to a DataFrame
# and appended to the target table, so the whole file is never converted at once.

# Number of batches to load. 1 keeps the quick-test behaviour of loading only
# the first batch; set to None to load the entire file.
MAX_BATCHES = 1

ds = dataset(input_file, format="parquet")

batches = ds.to_batches()

for batch_no, batch in enumerate(batches, start=1):

    t_start = time()

    df = batch.to_pandas()
    transform(df)
    # index=False: the DataFrame index restarts at 0 for every batch, so
    # writing it would only add a meaningless, repeating "index" column.
    df.to_sql(name=table_name, con=engine, if_exists='append', index=False)

    t_end = time()
    print(f"Inserted {df.shape[0]} rows, took {(t_end - t_start):.2f}")

    # Restriction for the test purposes
    if MAX_BATCHES is not None and batch_no >= MAX_BATCHES:
        break

Inserted 131072 rows, took 10.99


In [12]:
# Print the CREATE TABLE statement pandas derives from the columns of `df`
schema_ddl = pd.io.sql.get_schema(df, name=table_name, con=engine)
print(schema_ddl)


CREATE TABLE yellow_taxi_data3 (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [11]:
# NOTE(review): unconditional failure. Presumably a deliberate stop so that
# "Run All" halts here and the zone-lookup cells below only run on demand —
# confirm, and consider replacing it with an explicit flag or a markdown note.
assert False

AssertionError: 

In [35]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2022-01-15 23:57:02--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.113.61
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.113.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: 'taxi+_zone_lookup.csv'

     0K .......... ..                                         100%  910K=0.01s

2022-01-15 23:57:02 (910 KB/s) - 'taxi+_zone_lookup.csv' saved [12322/12322]



In [36]:
# Load the downloaded taxi zone lookup table into a DataFrame
zones_csv_path = 'taxi+_zone_lookup.csv'
df_zones = pd.read_csv(zones_csv_path)

In [38]:
# Preview the first five rows of the zone lookup table
df_zones.head(n=5)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [42]:
# Write the zone lookup table to PostgreSQL, replacing any existing "zones" table.
# index=False: the DataFrame index is just a row counter (the frame already
# has a LocationID column), so it is not stored as an extra "index" column.
df_zones.to_sql(name='zones', con=engine, if_exists='replace', index=False)