# About the data

Download Data from this Page: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page<br>
Data Dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

# Imports

In [1]:
import os
import urllib

import multiprocessing as mp

import pandas as pd
import polars as pl

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# GLOBAL_VARS

In [3]:
CWD = os.getcwd()

In [4]:
PATH_INPUT_FOLDER = os.path.join(CWD, "input")

# Fixing Data Types

In [5]:
def get_file_types(input_folder):

    file_schemas = []

    for file_name in os.listdir(input_folder):

        file_path = os.path.join(input_folder, file_name)

        try:
            pldf_ = pl.read_parquet(file_path)
            schema_ = pldf_.schema

            columns = list(schema_.keys())
            types = list(schema_.values())
            file_names = [file_name for i in range(len(columns))]
            
            for c, t, f in zip(columns, types, file_names):
                file_schemas.append((c, t, f))

        except:
            print(file_path)

    return file_schemas

In [6]:
df = pd.DataFrame(
    data = get_file_types(input_folder=PATH_INPUT_FOLDER),
    columns = ["Col", "Type", "File"]
)

In [7]:
df.head()

Unnamed: 0,Col,Type,File
0,VendorID,Int32,yellow_tripdata_2023-06.parquet
1,tpep_pickup_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
2,tpep_dropoff_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
3,passenger_count,Int64,yellow_tripdata_2023-06.parquet
4,trip_distance,Float64,yellow_tripdata_2023-06.parquet


In [8]:
(
    df["Col"].value_counts()
)

Col
VendorID                 50
tpep_pickup_datetime     50
congestion_surcharge     50
total_amount             50
improvement_surcharge    50
tolls_amount             50
tip_amount               50
mta_tax                  50
extra                    50
fare_amount              50
payment_type             50
DOLocationID             50
PULocationID             50
store_and_fwd_flag       50
RatecodeID               50
trip_distance            50
passenger_count          50
tpep_dropoff_datetime    50
Airport_fee              25
airport_fee              25
cbd_congestion_fee        2
Name: count, dtype: int64

In [9]:
df["Col"].nunique()

21

In [10]:
(
    df["File"].value_counts()
)

File
yellow_tripdata_2025-02.parquet    20
yellow_tripdata_2025-01.parquet    20
yellow_tripdata_2023-06.parquet    19
yellow_tripdata_2023-12.parquet    19
yellow_tripdata_2023-01.parquet    19
yellow_tripdata_2022-06.parquet    19
yellow_tripdata_2023-10.parquet    19
yellow_tripdata_2024-02.parquet    19
yellow_tripdata_2024-12.parquet    19
yellow_tripdata_2021-03.parquet    19
yellow_tripdata_2023-09.parquet    19
yellow_tripdata_2024-09.parquet    19
yellow_tripdata_2022-04.parquet    19
yellow_tripdata_2021-08.parquet    19
yellow_tripdata_2023-02.parquet    19
yellow_tripdata_2022-07.parquet    19
yellow_tripdata_2021-01.parquet    19
yellow_tripdata_2021-11.parquet    19
yellow_tripdata_2024-10.parquet    19
yellow_tripdata_2021-10.parquet    19
yellow_tripdata_2024-01.parquet    19
yellow_tripdata_2024-11.parquet    19
yellow_tripdata_2024-08.parquet    19
yellow_tripdata_2022-05.parquet    19
yellow_tripdata_2021-09.parquet    19
yellow_tripdata_2023-11.parquet    19
yellow_

In [11]:
(
    df
    .groupby(["Col"])
    .agg(
        nr_unique_types = ("Type", lambda series: len(set(series))),
        unique_types = ("Type", lambda series: set(series))
    )
    .pipe(
        lambda df: df[df["nr_unique_types"] != 1]
    )
)

Unnamed: 0_level_0,nr_unique_types,unique_types
Col,Unnamed: 1_level_1,Unnamed: 2_level_1
DOLocationID,2,"{Int32, Int64}"
PULocationID,2,"{Int32, Int64}"
RatecodeID,2,"{Float64, Int64}"
VendorID,2,"{Int32, Int64}"
passenger_count,2,"{Float64, Int64}"
tpep_dropoff_datetime,2,"{Datetime(time_unit='ns', time_zone=None), Dat..."
tpep_pickup_datetime,2,"{Datetime(time_unit='ns', time_zone=None), Dat..."


In [12]:
def get_default_rename_dict():
    
    rename_dict = {
        
        "VendorID":"VendorID",
        "tpep_pickup_datetime":"TpepPickupDatetime",
        "congestion_surcharge":"CongestionSurcharge",
        "total_amount":"TotalAmount",
        "improvement_surcharge":"ImprovementSurcharge",
        "tolls_amount":"TollsAmount",
        "tip_amount":"TipAmount",
        "mta_tax":"MtaTax",
        "extra":"Extra",
        "fare_amount":"FareAmount",
        "payment_type":"PaymentType",
        "DOLocationID":"DOLocationID",
        "PULocationID":"PULocationID",
        "store_and_fwd_flag":"StoreAndFwdFlag",
        "RatecodeID":"RatecodeID",
        "trip_distance":"TripDistance",
        "passenger_count":"PassengerCount",
        "tpep_dropoff_datetime":"TpepDropoffDatetime",
        "Airport_fee":"AirportFee"
    }
    
    return rename_dict

In [15]:
def standarize_files(input_folder):
    
    for file_name in os.listdir(input_folder):
        
        if file_name != ".ipynb_checkpoints":

            file_path = os.path.join(input_folder, file_name)

            rename_dict = get_default_rename_dict()
            pldf = pl.read_parquet(file_path)
            columns = pldf.columns

            if "airport_fee" in columns:
                _ = rename_dict.pop("Airport_fee")
                rename_dict["airport_fee"] = "AirportFee"

            pldf = (
                pldf
                .rename(rename_dict)
                .with_columns(
                    pl.col("VendorID").cast(pl.Int64),
                    pl.col("DOLocationID").cast(pl.Int64),
                    pl.col("PULocationID").cast(pl.Int64),
                    pl.col("RatecodeID").cast(pl.Float64),
                    pl.col("PassengerCount").cast(pl.Float64),
                    pl.col("TpepDropoffDatetime").cast(pl.Datetime),
                    pl.col("TpepPickupDatetime").cast(pl.Datetime)
                )
            )

            pldf.write_parquet(file = os.path.join(input_folder, file_name))

In [16]:
standarize_files(input_folder=PATH_INPUT_FOLDER)

# Final Test

In [17]:
plldf = pl.scan_parquet(source = os.path.join(PATH_INPUT_FOLDER, "yellow_*.parquet"))

In [18]:
(
    plldf.describe()
)

statistic,VendorID,TpepPickupDatetime,TpepDropoffDatetime,PassengerCount,TripDistance,RatecodeID,StoreAndFwdFlag,PULocationID,DOLocationID,PaymentType,FareAmount,Extra,MtaTax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",157093121.0,"""157093121""","""157093121""",147498449.0,157093121.0,147498449.0,"""147498449""",157093121.0,157093121.0,157093121.0,157093121.0,157093121.0,157093121.0,157093121.0,157093121.0,157093121.0,157093121.0,147498449.0,143335726.0
"""null_count""",0.0,"""0""","""0""",9594672.0,0.0,9594672.0,"""9594672""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9594672.0,13757395.0
"""mean""",1.738652,"""2023-03-17 15:37:03.319531""","""2023-03-17 16:03:09.611825""",1.377389,5.434257,1.678083,,164.762269,163.125727,1.157803,15.845501,1.260498,0.485919,4.140453,0.522419,0.672459,24.717212,2.2643,0.117748
"""std""",0.467983,,,0.918344,510.503523,7.665988,,64.947041,70.065956,0.57956,11219.058848,1.801412,0.107759,11218.321529,2.090922,0.378249,128.840967,0.797632,0.421012
"""min""",1.0,"""2001-01-01 00:03:14""","""1970-01-20 10:16:32""",0.0,0.0,1.0,"""N""",1.0,1.0,0.0,-133390000.0,-39.17,-0.55,-411.0,-140.63,-1.0,-2567.8,-2.5,-1.75
"""25%""",1.0,"""2022-03-26 08:55:15""","""2022-03-26 09:12:29""",1.0,1.06,1.0,,132.0,113.0,1.0,7.9,0.0,0.5,0.0,0.0,0.3,13.6,2.5,0.0
"""50%""",2.0,"""2023-03-18 21:36:03""","""2023-03-18 21:51:45""",1.0,1.8,1.0,,162.0,162.0,1.0,12.0,0.5,0.5,2.36,0.0,1.0,18.48,2.5,0.0
"""75%""",2.0,"""2024-03-26 20:50:01""","""2024-03-26 21:03:35""",1.0,3.4,1.0,,234.0,234.0,1.0,19.1,2.5,0.5,3.85,0.0,1.0,27.02,2.5,0.0
"""max""",7.0,"""2098-09-11 02:23:31""","""2098-09-11 02:52:04""",112.0,398608.62,99.0,"""Y""",265.0,265.0,5.0,863372.12,10002.5,53.16,133390000.0,1702.88,2.0,863380.37,3.0,6.75
