In [5]:
# One-time setup: uncomment to install the dependencies into the kernel's environment.
# %pip install pandas
# %pip install pyarrow

# Step 1. Processing taxi data

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [11]:
# Load the yellow-taxi trip records for the selected month(s) of 2022, keeping
# only the columns this cell works with.
yellow_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime',
                  'passenger_count', 'PULocationID', 'DOLocationID']

monthly_frames = []
for month in range(12, 13):  # December only; widen the range to load more months
    # Zero-padded month: assumes the monthly files are named like
    # 'yellow_tripdata_2022-01.parquet' -- TODO confirm for months 1-9.
    parquet_path = f'yellow_tripdata_2022-{month:02d}.parquet'
    # Ask pyarrow for just the needed columns instead of materialising the whole
    # table; the trailing selection pins the column order explicitly.
    raw_yellow_partition = pq.read_table(parquet_path, columns=yellow_columns).to_pandas()[yellow_columns]
    print('Number of duplicate (excluding original) rows is:', raw_yellow_partition.duplicated().sum())
    monthly_frames.append(raw_yellow_partition)

# Stack the partitions row-wise in one call. (Concatenating along axis=1 would
# place each month side by side as extra columns, and growing the frame inside
# the loop re-copies all previously loaded rows on every iteration.)
raw_yellow = pd.concat(monthly_frames, axis=0, ignore_index=True)
raw_yellow.shape

Number of duplicate (excluding original) rows is: 23917


(3399549, 5)

In [8]:
def _to_long_format(trips, time_column, zone_column, trip_type):
    """Return one event row per trip for a single trip end (pickup or dropoff).

    Parameters
    ----------
    trips : pandas.DataFrame
        Trip records containing `time_column`, 'passenger_count' and `zone_column`.
    time_column : str
        Name of the timestamp column to expose as 'datetime'.
    zone_column : str
        Name of the location-id column to expose as 'taxi_zone'.
    trip_type : str
        Label stored in the new 'trip_type' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'datetime', 'passenger_count', 'taxi_zone', 'trip_type',
        carrying the same index labels as `trips`.
    """
    events = trips[[time_column, 'passenger_count', zone_column]]
    events = events.set_axis(['datetime', 'passenger_count', 'taxi_zone'], axis='columns')
    return events.assign(trip_type=trip_type)


# Reshape every trip into two rows: one for its pickup and one for its dropoff.
event_frames = []
for month in range(12, 13):  # December only; widen the range to load more months
    # Zero-padded month: assumes the monthly files are named like
    # 'yellow_tripdata_2022-01.parquet' -- TODO confirm for months 1-9.
    raw_yellow_partition = pq.read_table(f'yellow_tripdata_2022-{month:02d}.parquet')
    # Convert the Arrow table to pandas once and reuse it for both trip ends.
    trips = raw_yellow_partition.to_pandas()
    raw_yellow_pickup = _to_long_format(trips, 'tpep_pickup_datetime', 'PULocationID', 'pickup')
    raw_yellow_dropoff = _to_long_format(trips, 'tpep_dropoff_datetime', 'DOLocationID', 'dropoff')
    event_frames += [raw_yellow_pickup, raw_yellow_dropoff]

# Single row-wise concatenation (pickups first, then dropoffs, per month).
# NOTE: the original index labels are kept, so each label appears once per
# trip end; pass ignore_index=True here if unique labels are required.
raw_yellow = pd.concat(event_frames, axis=0)
raw_yellow.shape

(6799098, 4)

In [9]:
# Extra copies only: the first occurrence of each repeated row is not counted.
extra_copies = raw_yellow.duplicated().sum()
print('Number of duplicate (excluding original) rows is:', extra_copies)

# keep=False flags every member of a repeated group, first occurrence included,
# so summing the boolean mask counts all rows that have at least one twin.
in_repeated_group = raw_yellow.duplicated(keep=False)
print('Number of duplicate rows (including original) is:', int(in_repeated_group.sum()))

Number of duplicate (excluding original) rows is: 126370
Number of duplicate rows (including original) is: 250528


In [4]:
# Preview the first five rows of the long-format trip events.
raw_yellow.head(5)

Unnamed: 0,datetime,passenger_count,taxi_zone,trip_type
0,2022-12-01 00:37:35,1.0,170,pickup
1,2022-12-01 00:34:35,0.0,138,pickup
2,2022-12-01 00:33:26,1.0,140,pickup
3,2022-12-01 00:45:51,1.0,141,pickup
4,2022-12-01 00:49:49,1.0,261,pickup


In [5]:
# Convert to hourly data: snap every timestamp to the NEAREST full hour
# (e.g. 00:37:35 becomes 01:00:00), overwriting the 'datetime' column in place.
# The lowercase "h" alias is used because the uppercase "H" spelling is
# deprecated in recent pandas releases.
# NOTE(review): if hourly buckets are meant to cover HH:00-HH:59, .dt.floor("h")
# would be the matching operation -- confirm the intended bucketing.
raw_yellow["datetime"] = raw_yellow["datetime"].dt.round("h")

In [6]:
# Summary statistics, transposed so that each column of raw_yellow is one row.
raw_yellow.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
datetime,6799098.0,2022-12-15 07:40:49.740596736,2022-11-30 19:00:00,2022-12-07 22:00:00,2022-12-14 20:00:00,2022-12-21 22:00:00,2023-01-02 00:00:00,
passenger_count,6546172.0,1.419582,0.0,1.0,1.0,2.0,9.0,0.951364
taxi_zone,6799098.0,164.118078,1.0,125.0,162.0,234.0,265.0,67.768088


In [65]:
# Total passenger count per (hour, taxi zone). 'trip_type' is not part of the
# index, so pickup and dropoff events falling in the same hour and zone are
# summed together. The string alias 'sum' selects pandas' built-in groupby
# reduction instead of calling a Python function once per group.
passenger = pd.pivot_table(
    raw_yellow,
    index=['datetime', 'taxi_zone'],
    values='passenger_count',
    aggfunc='sum',
)

In [67]:
# Preview the first five (datetime, taxi_zone) groups of the aggregated table.
passenger.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,passenger_count
datetime,taxi_zone,Unnamed: 2_level_1
2022-11-30 19:00:00,132,1.0
2022-11-30 20:00:00,100,1.0
2022-11-30 20:00:00,107,1.0
2022-11-30 20:00:00,132,1.0
2022-11-30 20:00:00,148,1.0


# Step 2. Writing data into PostgreSQL