# Database

In [1]:
import os
import time

import pandas as pd
import psycopg2 as pg
from sqlalchemy import create_engine
# SQLAlchemy engine shared by every cell that writes to Postgres.
# The connection URL is read from the TAXI_TRIP_DB_URL environment variable so
# credentials do not have to live in the notebook; when the variable is unset
# the original local-development URL is used, so existing setups keep working.
engine = create_engine(
    os.environ.get(
        'TAXI_TRIP_DB_URL',
        'postgresql://postgres:postgres@localhost:5432/taxi_trip',
    )
)

In [16]:
# Read a 10-row sample (with both timestamp columns parsed as datetimes) and
# print the CREATE TABLE statement pandas would generate for it.
# NOTE(review): column types are inferred from these 10 rows only; the recorded
# output shows tolls_amount as BIGINT while the other *_amount columns are
# FLOAT(53), so the sample is probably too small -- confirm before relying on it.
df_full = pd.read_csv(
    "../data/yellow_tripdata_2021-01.csv",
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    nrows=10,
)
print(pd.io.sql.get_schema(frame=df_full, name='yellow_trip', con=engine))


CREATE TABLE yellow_trip (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount BIGINT, 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [10]:

# Load the yellow-taxi CSV into the `yellow_trip` table, reading the file in
# 10,000-row chunks and appending each chunk to the table.
# Note: if_exists='append' means re-running this cell inserts the same rows again.
df_iter = pd.read_csv("../data/yellow_tripdata_2021-01.csv", chunksize=10000, iterator=True)

count = 1
while True:
    # The timer starts before the read, so the reported duration covers both
    # reading the chunk and inserting it.
    t_start = time.time()

    # Fetch each chunk exactly once per iteration. Calling next() a second time
    # at the bottom of the loop would pull a chunk that gets overwritten here
    # before it is inserted, silently dropping every other chunk (and raising an
    # uncaught StopIteration when the file holds an even number of chunks).
    try:
        df = next(df_iter)
    except StopIteration:
        print('Finished')
        break
    except Exception as e:
        # A read failure stops the load early; the table then holds only the
        # chunks inserted so far.
        print(e)
        break

    # The timestamp columns are converted to datetimes before the insert.
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    df.to_sql(name='yellow_trip', con=engine, if_exists='append')

    t_end = time.time()

    print(F'Inserted chunk #{count}, took {t_end - t_start:.3f} second')

    count += 1


Inserted chunk #1, took 0.660 second
Inserted chunk #2, took 0.654 second
Inserted chunk #3, took 0.686 second
Inserted chunk #4, took 0.604 second
Inserted chunk #5, took 0.606 second
Inserted chunk #6, took 0.653 second
Inserted chunk #7, took 0.594 second
Inserted chunk #8, took 0.604 second
Inserted chunk #9, took 0.694 second
Inserted chunk #10, took 0.635 second
Inserted chunk #11, took 0.651 second
Inserted chunk #12, took 0.676 second
Inserted chunk #13, took 0.613 second
Inserted chunk #14, took 0.605 second
Inserted chunk #15, took 0.710 second
Inserted chunk #16, took 0.638 second
Inserted chunk #17, took 0.593 second
Inserted chunk #18, took 0.581 second
Inserted chunk #19, took 0.686 second
Inserted chunk #20, took 0.652 second
Inserted chunk #21, took 0.642 second
Inserted chunk #22, took 0.694 second
Inserted chunk #23, took 0.627 second
Inserted chunk #24, took 0.654 second
Inserted chunk #25, took 0.636 second
Inserted chunk #26, took 0.672 second
Inserted chunk #27, t

In [3]:
# Load the taxi zone lookup CSV and write it to the `zones` table.
# With if_exists='replace', pandas drops an existing `zones` table before
# writing, so this cell can be re-run without duplicating rows.
df_zones = pd.read_csv(filepath_or_buffer='../data/taxi+_zone_lookup.csv')
df_zones.to_sql('zones', engine, if_exists='replace')