## Getting data in one shot from the file

### Importing required packages

In [1]:
import pandas as pd
import sqlalchemy

### URL to extract data from

In [8]:
# URl to extract data from
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv'

# we will use wget and download this data on the local file

### Reading data

In [5]:
df = pd.read_csv('yellow_tripdata_2021-01.csv')

  df = pd.read_csv('yellow_tripdata_2021-01.csv')


In [6]:
# Changing the data type to datetime for the dates
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [7]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1.0,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1.0,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1.0,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1.0,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2.0,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1.0,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


### Connecting to postgresql and creating a table

In [2]:
# Connecting to postgresql
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [3]:
query = """SELECT 1 as number;"""

pd.read_sql(query, con=engine)

Unnamed: 0,number
0,1


In [4]:
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND
    schemaname != 'information_schema';
"""

pd.read_sql(query, con=engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,yellow_taxi_data,root,,True,False,False,False


In [12]:
df.to_sql('yellow_taxi_data', con=engine, index=False, if_exists='replace')

765

In [13]:
query = """
SELECT 
    COUNT(1)
FROM yellow_taxi_data;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,count
0,1369765


## Getting data in iterations from the local file

### Importing required packages

In [1]:
import pandas as pd
import sqlalchemy

### URL to extract data from

In [8]:
# URl to extract data from
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv'

# we will use wget and download this data on the local file

### Reading data

In [16]:
# reading first 100,000 rows of downloaded data
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator=True, chunksize=100000)
df = next(df_iter)

In [6]:
# Changing the data type to datetime for the dates
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [17]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.1,1,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.2,1,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.7,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.6,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


In [18]:
df.shape

(100000, 18)

### Connecting to postgresql and creating a table

In [22]:
# Connecting to postgresql
from time import time 
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [19]:
df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

In [20]:
df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

1000

### Adding remaining data in iterations

In [23]:
while True:
    t_start = time()
    
    # Getting data
    df = next(df_iter)
    
    # Changing the data type to datetime for the dates
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    
    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
    
    t_end = time()
    
    print('inserted another chunk in %.3f second' % (t_end-t_start))

inserted another chunk in 6.738 second
inserted another chunk in 6.360 second
inserted another chunk in 6.438 second
inserted another chunk in 6.298 second
inserted another chunk in 6.479 second
inserted another chunk in 6.768 second
inserted another chunk in 6.882 second
inserted another chunk in 7.285 second
inserted another chunk in 6.953 second
inserted another chunk in 7.165 second
inserted another chunk in 7.095 second


  df = next(df_iter)


inserted another chunk in 6.787 second
inserted another chunk in 4.440 second


StopIteration: 

In [24]:
query = """
SELECT 
    COUNT(1)
FROM yellow_taxi_data;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,count
0,1369765


### Getting Zone data and putting it into Postgres

In [5]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2022-04-22 02:54:17--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.163.77
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.163.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: ‘taxi+_zone_lookup.csv’


2022-04-22 02:54:17 (81.3 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]



In [6]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [7]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [10]:
# Connecting to postgresql
from time import time 
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5431/ny_taxi')

In [11]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265