In [None]:
#install sqlalchemy and psycopg2

In [5]:
import pandas as pd
from sqlalchemy import create_engine
from time import time

In [6]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [7]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x75f819d2fb30>

In [None]:
query = """
select 1 as number;
"""

pd.read_sql(query, con = engine)

In [4]:
#The following is the equivalent of `\\dt` in pgcli. Only, we're using sqlalchemy"
query = """
select *
from pg_catalog.pg_tables
where schemaname != 'pg_catalog' 
and schemaname != 'information_schema';
"""

pd.read_sql(query, con = engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity


In [6]:
df = pd.read_csv('/workspaces/2025_data_engineering_zoomcamp/week_1_basics_and_setup/2_Docker_sql/yellow_tripdata_2021-01.csv', low_memory=False)

In [8]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [9]:
print(pd.io.sql.get_schema(df, name = 'yellow_taxi_data', con= engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" FLOAT(53), 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [16]:
#df is now not a dataframe, it's an iterator. We have to use this method as we can't add 1300000 rows to db all at once
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator = True, chunksize=100000)

In [None]:
#see file type?
df_iter

In [None]:
#to make it a df. This will only return one iteration for now though. See len(df)
df = next(df_iter)
len(df)

In [23]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [None]:
#we will now use the schema-creation code to create oour table. First we will just create column names df.head(n=0)
df.head(0)

In [10]:
#note minor differences in creation of column names and inserting data to the table (chunks- update)
df.head(0).to_sql(name= 'yellow_taxi_data', con = engine, if_exists = 'replace')

0

In [11]:
query = """
select *
from pg_catalog.pg_tables
where schemaname != 'pg_catalog' 
and schemaname != 'information_schema';
"""

pd.read_sql(query, con = engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,yellow_taxi_data,root,,True,False,False,False


In [12]:
query = """
select * from yellow_taxi_data;
"""

pd.read_sql(query, con = engine)

Unnamed: 0,index,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge


In [None]:
#now we will update our table with information in the chunks. Removing df.head(0) - note `append`
%time df.to_sql(name= 'yellow_taxi_data', con = engine, if_exists = 'append')

In [14]:
query = """
select count(*) from yellow_taxi_data;
"""

pd.read_sql(query, con = engine)

Unnamed: 0,count
0,0


In [17]:
#appended one chunk of 100000. now we need to do it iteratively. 


while True:
    try:
        t_start = time()

        df = next(df_iter)

        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

        df.to_sql(name= 'yellow_taxi_data', con = engine, if_exists = 'append') #adds data to the table as chunks because inside while loop

        t_end = time()

        duration = t_end - t_start

        print('inserted another chunk... this chunk took %.3f seconds' % (duration))
    
    except StopIteration:
        print('finished inserting all chunks.')
        break



inserted another chunk... this chunk took 8.966 seconds
inserted another chunk... this chunk took 8.557 seconds
inserted another chunk... this chunk took 8.576 seconds
inserted another chunk... this chunk took 10.931 seconds
inserted another chunk... this chunk took 8.648 seconds
inserted another chunk... this chunk took 8.641 seconds
inserted another chunk... this chunk took 8.578 seconds
inserted another chunk... this chunk took 9.465 seconds
inserted another chunk... this chunk took 9.026 seconds
inserted another chunk... this chunk took 9.366 seconds
inserted another chunk... this chunk took 9.258 seconds
inserted another chunk... this chunk took 9.094 seconds


  df = next(df_iter)


inserted another chunk... this chunk took 8.668 seconds
inserted another chunk... this chunk took 5.801 seconds
finished inserting all chunks.


In [None]:
#verify that we've added all rows to our database:
query = """
select count(*) from yellow_taxi_data;
"""

pd.read_sql(query, con = engine)

In [None]:
#we can also do a bit of eda
query = """
select max(tpep_pickup_datetime) as pick_up_max, min(tpep_pickup_datetime) as pickup_min, max(total_amount) as total_amount_max
from yellow_taxi_data;
"""

pd.read_sql(query, con = engine)

In [None]:
#The following is the equivalent of `\\dt` in pgcli. Only, we're using sqlalchemy"
query = """
select * from yellow_taxi_data
limit 10
"""

pd.read_sql(query, con = engine)

In [2]:
# !wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+zone_lookup.csv
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

--2025-01-26 15:26:00--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250126%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250126T152600Z&X-Amz-Expires=300&X-Amz-Signature=ae64eabc33a506aa30abfa764c729d351d450f9817717f920e25b42adcd8544e&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2025-01-26 15:26:00--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-