# About the data

Download Data from this Page: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page<br>
Data Dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

# Imports

In [1]:
import multiprocessing as mp
import os
import posixpath
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import polars as pl

# GLOBAL_VARS

In [2]:
# Directory the kernel was started from; every other path is built relative to it.
CWD = os.getcwd()
# Folder that receives the downloaded parquet files ("input" under the working directory).
PATH_INPUT_FOLDER = os.path.join(CWD, "input")

In [3]:
# Monthly yellow-taxi trip files to download, newest year first and months ascending
# within each year. 2024 contributes January and February only; 2021-2023 contribute
# all twelve months.
FILES_TO_DOWNLOAD = [
    f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet"
    for year, last_month in ((2024, 2), (2023, 12), (2022, 12), (2021, 12))
    for month in range(1, last_month + 1)
]

# Data Downloading

In [4]:
def make_input_folder(folder_path):
    """Create ``folder_path`` (and any missing parent folders) if it is not there yet.

    Calling it again on an existing folder is a no-op.

    Parameters
    ----------
    folder_path : str
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``folder_path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate isdir() check, so there is no window in
    # which another process can create the folder between the check and the call.
    os.makedirs(folder_path, exist_ok=True)

In [5]:
def extract_basename(url):
    """Return the file name at the end of a URL's path.

    Only the path component of the URL is considered, so a query string or fragment
    (``...file.parquet?version=2#top``) does not end up in the returned name.

    Parameters
    ----------
    url : str
        URL (or plain ``/``-separated path) pointing at a file.

    Returns
    -------
    str
        Last path segment, e.g. ``"yellow_tripdata_2024-01.parquet"``.
    """
    # URLs always use "/" as separator, hence posixpath rather than os.path.
    return posixpath.basename(urllib.parse.urlsplit(url).path)

In [6]:
def download_file(t):
    """Download one URL into a folder, keeping the file name taken from the URL.

    Parameters
    ----------
    t : tuple
        ``(folder_path, url)`` pair, packed into a single argument so the function can
        be handed to ``Pool.map`` as-is.

    Raises
    ------
    Any exception raised by ``urllib.request.urlretrieve`` or ``os.replace`` is
    propagated after the partial download has been removed.
    """
    folder_path, url = t
    destination = os.path.join(folder_path, extract_basename(url = url))

    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated file under the final ".parquet" name.
    partial = destination + ".part"

    try:
        urllib.request.urlretrieve(url = url, filename = partial)
        os.replace(partial, destination)
    finally:
        # After a successful rename there is nothing left to delete.
        if os.path.exists(partial):
            os.remove(partial)

In [7]:
# One (target folder, url) pair per file: the argument shape download_file expects.
iterable = [(PATH_INPUT_FOLDER, url) for url in FILES_TO_DOWNLOAD]
# Leave two cores free for the rest of the system, but never go below one worker:
# on a machine with one or two cores the subtraction alone yields 0 or less, and
# Pool rejects a process count smaller than 1.
cpu_cores = max(1, mp.cpu_count() - 2)

In [8]:
# Make sure the download target exists before any worker writes into it.
make_input_folder(PATH_INPUT_FOLDER)

In [9]:
# The "fork" start method is requested explicitly, as in the original cell.
# The context manager shuts the pool down even when a download raises; map() blocks
# until every task has finished, so no work is cut short on the normal path.
with mp.get_context(method="fork").Pool(processes = cpu_cores) as pool:
    pool.map(func = download_file, iterable = iterable)

# Fixing Data Types

In [10]:
def get_file_types(input_folder):
    """Collect the column names and dtypes of every parquet file in a folder.

    Parameters
    ----------
    input_folder : str
        Folder whose entries are inspected.

    Returns
    -------
    list of tuple
        One ``(column_name, dtype, file_name)`` tuple per column per readable file.
        Entries whose schema cannot be read are reported on stdout and skipped.
    """
    file_schemas = []

    for file_name in os.listdir(input_folder):

        # Jupyter's checkpoint directory is not a data file.
        if file_name == ".ipynb_checkpoints":
            continue

        file_path = os.path.join(input_folder, file_name)

        try:
            # Only the schema is needed, so avoid loading the rows of the file.
            schema_ = pl.read_parquet_schema(file_path)
        except Exception as exc:
            # Keep going, but say which file failed and why.
            print(f"{file_path}: {exc!r}")
            continue

        file_schemas.extend(
            (column, dtype, file_name) for column, dtype in schema_.items()
        )

    return file_schemas

In [11]:
# Long-format schema table: one row per (column, dtype, file) found in the input folder.
df = pd.DataFrame(get_file_types(PATH_INPUT_FOLDER), columns = ["Col", "Type", "File"])

In [12]:
# Peek at the first five rows of the schema table.
df.head(n = 5)

Unnamed: 0,Col,Type,File
0,VendorID,Int64,yellow_tripdata_2020-03.parquet
1,tpep_pickup_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2020-03.parquet
2,tpep_dropoff_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2020-03.parquet
3,passenger_count,Float64,yellow_tripdata_2020-03.parquet
4,trip_distance,Float64,yellow_tripdata_2020-03.parquet


In [13]:
# Number of files in which each column name occurs.
df["Col"].value_counts()

Col
VendorID                 24
fare_amount              24
congestion_surcharge     24
total_amount             24
improvement_surcharge    24
tolls_amount             24
tip_amount               24
mta_tax                  24
extra                    24
payment_type             24
tpep_pickup_datetime     24
DOLocationID             24
PULocationID             24
store_and_fwd_flag       24
RatecodeID               24
trip_distance            24
passenger_count          24
tpep_dropoff_datetime    24
airport_fee              24
Name: count, dtype: int64

In [14]:
# Number of distinct column names across all files.
df.loc[:, "Col"].nunique()

19

In [15]:
# Number of columns found in each file.
df["File"].value_counts()

File
yellow_tripdata_2020-03.parquet    19
yellow_tripdata_2019-06.parquet    19
yellow_tripdata_2020-06.parquet    19
yellow_tripdata_2020-07.parquet    19
yellow_tripdata_2019-12.parquet    19
yellow_tripdata_2019-02.parquet    19
yellow_tripdata_2020-05.parquet    19
yellow_tripdata_2019-10.parquet    19
yellow_tripdata_2019-09.parquet    19
yellow_tripdata_2019-08.parquet    19
yellow_tripdata_2019-11.parquet    19
yellow_tripdata_2019-01.parquet    19
yellow_tripdata_2020-04.parquet    19
yellow_tripdata_2019-04.parquet    19
yellow_tripdata_2020-01.parquet    19
yellow_tripdata_2020-11.parquet    19
yellow_tripdata_2020-08.parquet    19
yellow_tripdata_2020-09.parquet    19
yellow_tripdata_2020-10.parquet    19
yellow_tripdata_2019-05.parquet    19
yellow_tripdata_2020-02.parquet    19
yellow_tripdata_2020-12.parquet    19
yellow_tripdata_2019-07.parquet    19
yellow_tripdata_2019-03.parquet    19
Name: count, dtype: int64

In [16]:
# Columns whose dtype is not the same in every file, with the set of dtypes seen.
(
    df
    .groupby(["Col"])
    .agg(
        nr_unique_types = ("Type", lambda types: len(set(types))),
        unique_types = ("Type", lambda types: set(types))
    )
    .loc[lambda summary: summary["nr_unique_types"] != 1]
)

Unnamed: 0_level_0,nr_unique_types,unique_types
Col,Unnamed: 1_level_1,Unnamed: 2_level_1
airport_fee,2,"{Float64, Null}"


In [None]:
def get_default_rename_dict():
    """Build the mapping from raw trip-record column names to their PascalCase names.

    A new dict is created on every call, so callers may modify the result freely.
    The airport-fee column is keyed with a capital ``A`` (``"Airport_fee"``).

    Returns
    -------
    dict
        ``{raw_column_name: standardized_column_name}``.
    """
    name_pairs = (
        ("VendorID", "VendorID"),
        ("tpep_pickup_datetime", "TpepPickupDatetime"),
        ("congestion_surcharge", "CongestionSurcharge"),
        ("total_amount", "TotalAmount"),
        ("improvement_surcharge", "ImprovementSurcharge"),
        ("tolls_amount", "TollsAmount"),
        ("tip_amount", "TipAmount"),
        ("mta_tax", "MtaTax"),
        ("extra", "Extra"),
        ("fare_amount", "FareAmount"),
        ("payment_type", "PaymentType"),
        ("DOLocationID", "DOLocationID"),
        ("PULocationID", "PULocationID"),
        ("store_and_fwd_flag", "StoreAndFwdFlag"),
        ("RatecodeID", "RatecodeID"),
        ("trip_distance", "TripDistance"),
        ("passenger_count", "PassengerCount"),
        ("tpep_dropoff_datetime", "TpepDropoffDatetime"),
        ("Airport_fee", "AirportFee"),
    )

    return dict(name_pairs)

In [None]:
def _standardize_frame(pldf):
    """Rename the columns of one trip-record frame and cast the drifting dtypes."""
    columns = pldf.columns
    rename_dict = get_default_rename_dict()

    # The airport-fee column appears with either capitalisation; prefer the
    # lower-case spelling when the frame has it.
    if "airport_fee" in columns:
        _ = rename_dict.pop("Airport_fee")
        rename_dict["airport_fee"] = "AirportFee"

    # Rename only columns that are actually present, so a file that was already
    # standardized by an earlier run can be processed again.
    rename_dict = {old: new for old, new in rename_dict.items() if old in columns}
    renamed = pldf.rename(rename_dict)

    casts = [
        pl.col("VendorID").cast(pl.Int64),
        pl.col("DOLocationID").cast(pl.Int64),
        pl.col("PULocationID").cast(pl.Int64),
        pl.col("RatecodeID").cast(pl.Float64),
        pl.col("PassengerCount").cast(pl.Float64),
    ]
    # The schema comparison in this notebook found the airport fee typed as Null in
    # some files and Float64 in others; pin it to Float64 whenever the column exists.
    if "AirportFee" in renamed.columns:
        casts.append(pl.col("AirportFee").cast(pl.Float64))

    return renamed.with_columns(casts)


def standarize_files(input_folder):
    """Rewrite every parquet file in a folder with uniform column names and dtypes.

    Each ``*.parquet`` file is read, its columns are renamed to the PascalCase names
    from ``get_default_rename_dict`` and the ID, passenger-count and airport-fee
    columns are cast to fixed dtypes. The result replaces the original file. Entries
    that do not end in ``.parquet`` (including ``.ipynb_checkpoints``) are ignored.

    Parameters
    ----------
    input_folder : str
        Folder containing the parquet files to rewrite in place.
    """
    for file_name in os.listdir(input_folder):

        if not file_name.endswith(".parquet"):
            continue

        file_path = os.path.join(input_folder, file_name)
        pldf = _standardize_frame(pl.read_parquet(file_path))

        # Write next to the original and swap afterwards, so a failure midway through
        # writing cannot leave the original file truncated.
        tmp_path = file_path + ".tmp"

        try:
            pldf.write_parquet(file = tmp_path)
            os.replace(tmp_path, file_path)
        finally:
            # After a successful swap there is nothing left to delete.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

In [None]:
# Rewrite the downloaded files in place with uniform column names and dtypes.
standarize_files(PATH_INPUT_FOLDER)

# Final Test

In [None]:
# Lazily scan every yellow-taxi file in the input folder as a single frame.
plldf = pl.scan_parquet(os.path.join(PATH_INPUT_FOLDER, "yellow_*.parquet"))

In [None]:
# Summary statistics over the combined files.
plldf.describe()