# About the data

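Download the monthly Yellow Taxi trip record Parquet files (`yellow_tripdata_YYYY-MM.parquet`) from this page and place them in an `input/` folder inside the working directory: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page<br>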
Data Dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

# Imports

In [1]:
import os


import pandas as pd
import polars as pl

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# GLOBAL_VARS

In [3]:
# Working directory of the kernel; the input/output folder paths are built from it.
CWD = os.getcwd()

In [4]:
# Source files are read from <CWD>/input; standardized copies are written to <CWD>/output.
PATH_INPUT_FOLDER, PATH_OUTPUT_FOLDER = (
    os.path.join(CWD, folder_name) for folder_name in ("input", "output")
)

In [5]:
# Create the output folder if it is missing. `exist_ok=True` keeps the cell re-runnable
# without silencing genuine failures (e.g. permission errors) the way a bare
# `except: pass` would.
os.makedirs(PATH_OUTPUT_FOLDER, exist_ok=True)

# Fixing Data Types

In [6]:
def get_file_types(input_folder):
    """Collect one ``(column, dtype, file)`` triple per column of every readable file.

    Only the Parquet schema is read for each entry, so no row data is loaded.
    Entries whose schema cannot be read are reported on stdout and skipped, which
    keeps the scan best-effort.

    Parameters
    ----------
    input_folder : str
        Folder whose direct entries are inspected (non-recursive).

    Returns
    -------
    list[tuple]
        ``(column_name, polars_dtype, file_name)`` tuples, grouped by file in the
        order returned by ``os.listdir``.
    """
    file_schemas = []

    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        try:
            schema = pl.read_parquet_schema(file_path)
        except Exception as exc:
            # Catch `Exception` rather than everything, so Ctrl-C still interrupts
            # the scan; show the reason next to the offending path.
            print(f"{file_path}: {exc!r}")
            continue

        file_schemas.extend(
            (column, dtype, file_name) for column, dtype in schema.items()
        )

    return file_schemas

In [7]:
# Long-format schema inventory: one row per (column, dtype, file) triple.
df = pd.DataFrame(
    get_file_types(PATH_INPUT_FOLDER),
    columns=["Col", "Type", "File"],
)

In [8]:
# Preview the first five rows of the schema inventory.
df.head(n=5)

Unnamed: 0,Col,Type,File
0,VendorID,Int32,yellow_tripdata_2023-06.parquet
1,tpep_pickup_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
2,tpep_dropoff_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
3,passenger_count,Int64,yellow_tripdata_2023-06.parquet
4,trip_distance,Float64,yellow_tripdata_2023-06.parquet


In [9]:
# Rows per column name; each file contributes one row per column it contains.
df["Col"].value_counts()

Col
VendorID                 59
tpep_pickup_datetime     59
congestion_surcharge     59
total_amount             59
improvement_surcharge    59
tolls_amount             59
tip_amount               59
mta_tax                  59
extra                    59
fare_amount              59
payment_type             59
DOLocationID             59
PULocationID             59
store_and_fwd_flag       59
RatecodeID               59
trip_distance            59
passenger_count          59
tpep_dropoff_datetime    59
Airport_fee              34
airport_fee              25
cbd_congestion_fee       11
Name: count, dtype: int64

In [10]:
# Number of distinct column names seen across all files.
df["Col"].nunique(dropna=True)

21

In [11]:
# Number of columns recorded for each file.
df["File"].value_counts()

File
yellow_tripdata_2025-04.parquet    20
yellow_tripdata_2025-07.parquet    20
yellow_tripdata_2025-08.parquet    20
yellow_tripdata_2025-10.parquet    20
yellow_tripdata_2025-01.parquet    20
yellow_tripdata_2025-11.parquet    20
yellow_tripdata_2025-06.parquet    20
yellow_tripdata_2025-09.parquet    20
yellow_tripdata_2025-03.parquet    20
yellow_tripdata_2025-05.parquet    20
yellow_tripdata_2025-02.parquet    20
yellow_tripdata_2024-12.parquet    19
yellow_tripdata_2021-03.parquet    19
yellow_tripdata_2021-08.parquet    19
yellow_tripdata_2023-09.parquet    19
yellow_tripdata_2024-02.parquet    19
yellow_tripdata_2023-10.parquet    19
yellow_tripdata_2024-09.parquet    19
yellow_tripdata_2022-04.parquet    19
yellow_tripdata_2023-06.parquet    19
yellow_tripdata_2021-01.parquet    19
yellow_tripdata_2023-12.parquet    19
yellow_tripdata_2023-02.parquet    19
yellow_tripdata_2023-01.parquet    19
yellow_tripdata_2021-11.parquet    19
yellow_tripdata_2024-10.parquet    19
yellow_

In [12]:
# For every column name, gather the set of dtypes seen across files and keep only
# the columns for which that set does not contain exactly one dtype.
(
    df.groupby(["Col"])
    .agg(
        nr_unique_types=pd.NamedAgg(
            column="Type", aggfunc=lambda types: len(set(types))
        ),
        unique_types=pd.NamedAgg(column="Type", aggfunc=lambda types: set(types)),
    )
    .loc[lambda summary: summary["nr_unique_types"] != 1]
)

Unnamed: 0_level_0,nr_unique_types,unique_types
Col,Unnamed: 1_level_1,Unnamed: 2_level_1
DOLocationID,2,"{Int64, Int32}"
PULocationID,2,"{Int64, Int32}"
RatecodeID,2,"{Int64, Float64}"
VendorID,2,"{Int64, Int32}"
passenger_count,2,"{Int64, Float64}"
tpep_dropoff_datetime,2,"{Datetime(time_unit='us', time_zone=None), Dat..."
tpep_pickup_datetime,2,"{Datetime(time_unit='us', time_zone=None), Dat..."


In [13]:
def get_default_rename_dict():
    """Return a fresh mapping from source column names to their PascalCase names.

    A new dict is built on every call, so callers may mutate the result freely.
    """
    name_pairs = (
        ("VendorID", "VendorID"),
        ("tpep_pickup_datetime", "TpepPickupDatetime"),
        ("congestion_surcharge", "CongestionSurcharge"),
        ("total_amount", "TotalAmount"),
        ("improvement_surcharge", "ImprovementSurcharge"),
        ("tolls_amount", "TollsAmount"),
        ("tip_amount", "TipAmount"),
        ("mta_tax", "MtaTax"),
        ("extra", "Extra"),
        ("fare_amount", "FareAmount"),
        ("payment_type", "PaymentType"),
        ("DOLocationID", "DOLocationID"),
        ("PULocationID", "PULocationID"),
        ("store_and_fwd_flag", "StoreAndFwdFlag"),
        ("RatecodeID", "RatecodeID"),
        ("trip_distance", "TripDistance"),
        ("passenger_count", "PassengerCount"),
        ("tpep_dropoff_datetime", "TpepDropoffDatetime"),
        ("Airport_fee", "AirportFee"),
    )
    return dict(name_pairs)

In [14]:
def standarize_files(input_folder, output_folder):
    """Rewrite every Parquet file of ``input_folder`` with unified names and dtypes.

    For each ``*.parquet`` file, the columns are renamed with the mapping from
    ``get_default_rename_dict`` (accepting either the ``Airport_fee`` or the
    ``airport_fee`` spelling). The ID, rate-code, passenger-count and timestamp
    columns are then cast to one common dtype, and the result is written under
    the same file name in ``output_folder``.

    Parameters
    ----------
    input_folder : str
        Folder containing the source Parquet files (scanned non-recursively).
    output_folder : str
        Destination folder; created if it does not exist.

    Returns
    -------
    None
        Each output path is printed and the files are written as a side effect.
    """
    # Do not depend on an earlier cell having created the destination folder.
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        # Skip anything that is not a Parquet file (.ipynb_checkpoints, .DS_Store, ...)
        # instead of letting a stray entry abort the whole run.
        is_parquet_file = file_name.lower().endswith(".parquet") and os.path.isfile(
            file_path
        )
        if not is_parquet_file:
            continue

        pldf = pl.read_parquet(file_path)

        # Some files spell the airport fee column in lower case; map whichever
        # spelling is present to the same target name.
        rename_dict = get_default_rename_dict()
        if "airport_fee" in pldf.columns:
            rename_dict["airport_fee"] = rename_dict.pop("Airport_fee")

        # NOTE(review): columns absent from the mapping (e.g. `cbd_congestion_fee`,
        # present in only some files) keep their original name — confirm this is intended.
        pldf = pldf.rename(rename_dict).with_columns(
            pl.col("VendorID").cast(pl.Int64),
            pl.col("DOLocationID").cast(pl.Int64),
            pl.col("PULocationID").cast(pl.Int64),
            pl.col("RatecodeID").cast(pl.Float64),
            pl.col("PassengerCount").cast(pl.Float64),
            # Explicit time unit so every file ends up with the same resolution.
            pl.col("TpepDropoffDatetime").cast(pl.Datetime(time_unit="us")),
            pl.col("TpepPickupDatetime").cast(pl.Datetime(time_unit="us")),
        )

        output_path = os.path.join(output_folder, file_name)
        print(output_path)
        pldf.write_parquet(file=output_path)

In [15]:
# Standardize every file from the input folder into the output folder.
standarize_files(PATH_INPUT_FOLDER, PATH_OUTPUT_FOLDER)

/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2023-06.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2022-10.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2025-02.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2021-05.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2022-09.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2024-04.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2025-03.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2021-04.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2022-08.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2024-05.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2023-07.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2022-01.parquet
/Users/nicolaepopescul/code/polanyt/output/yellow_tripdata_2022-11.parquet
/Users/nicolaepopescul/co

# Final Test

In [16]:
# Lazily scan all standardized files at once through a glob pattern.
plldf = pl.scan_parquet(
    os.path.join(PATH_OUTPUT_FOLDER, "yellow_*.parquet"),
)

In [17]:
# Summary statistics over the combined, standardized dataset.
plldf.describe()

statistic,VendorID,TpepPickupDatetime,TpepDropoffDatetime,PassengerCount,TripDistance,RatecodeID,StoreAndFwdFlag,PULocationID,DOLocationID,PaymentType,FareAmount,Extra,MtaTax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",194457948.0,"""194457948""","""194457948""",175793950.0,194457948.0,175793950.0,"""175793950""",194457948.0,194457948.0,194457948.0,194457948.0,194457948.0,194457948.0,194457948.0,194457948.0,194457948.0,194457948.0,175793950.0,171631227.0
"""null_count""",0.0,"""0""","""0""",18663998.0,0.0,18663998.0,"""18663998""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18663998.0,22826721.0
"""mean""",1.764114,"""2023-08-28 09:39:02.277372""","""2023-08-28 10:03:30.244110""",1.364151,5.735442,1.923776,,164.031496,162.658534,1.114957,16.310616,1.239971,0.484188,3.89128,0.520204,0.725627,25.108223,2.250067,0.122974
"""std""",0.525438,,,0.890711,541.879512,9.055872,,65.232708,70.170145,0.623302,10083.810105,1.806027,0.39307,10083.089561,2.102138,0.378569,120.895506,0.827172,0.442126
"""min""",1.0,"""2001-01-01 00:03:14""","""1970-01-20 10:16:32""",0.0,0.0,1.0,"""N""",1.0,1.0,0.0,-133390000.0,-39.17,-21.74,-411.0,-148.17,-1.0,-2567.8,-2.5,-1.75
"""25%""",1.0,"""2022-06-13 00:40:29""","""2022-06-13 01:00:06""",1.0,1.06,1.0,,132.0,113.0,1.0,8.0,0.0,0.5,0.0,0.0,0.3,14.0,2.5,0.0
"""50%""",2.0,"""2023-09-14 18:16:17""","""2023-09-14 18:36:18""",1.0,1.81,1.0,,161.0,162.0,1.0,12.1,0.5,0.5,2.3,0.0,1.0,18.96,2.5,0.0
"""75%""",2.0,"""2024-11-25 16:56:22""","""2024-11-25 17:17:03""",1.0,3.48,1.0,,234.0,234.0,1.0,19.9,2.5,0.5,3.86,0.0,1.0,27.8,2.5,0.0
"""max""",7.0,"""2098-09-11 02:23:31""","""2098-09-11 02:52:04""",112.0,398608.62,99.0,"""Y""",265.0,265.0,5.0,863372.12,10002.5,5243.38,133390000.0,1702.88,2.5,863380.37,3.0,6.75
