In [9]:
import logging
import tempfile
from pathlib import Path

import polars as pl
import requests
from tqdm.notebook import tqdm

if Path().resolve().name == "notebooks":
    PROJECT_ROOT = Path("../").resolve()
else:
    PROJECT_ROOT = Path().resolve()

In [10]:
def get_nyc_taxi(root: Path, start=(2022, 1), end=(2025, 5)):
    try:
        # Check if the combined file already exists
        output_file = (
            root
            / f"data/raw/yellow_tripdata_{start[0]:04d}-{start[1]:02d}_{end[0]:04d}-{end[1]:02d}.parquet"
        )

        if output_file.exists():
            logging.info(
                f"Found existing parquet file for NYC Taxi data from {start[0]}-{start[1]} to {end[0]}-{end[1]}. Loading it."
            )
            df = pl.read_parquet(output_file)
        else:
            logging.info(
                f"Downloading NYC Taxi data from {start[0]}-{start[1]} to {end[0]}-{end[1]}."
            )

            # Create year-month tuples for the date range
            start_year, start_month = start
            end_year, end_month = end
            year_month_tuples = [
                (year, month)
                for year in range(start_year, end_year + 1)
                for month in range(1, 13)
                if (year, month) >= (start_year, start_month)
                and (year, month) <= (end_year, end_month)
            ]

            with tempfile.TemporaryDirectory() as temp_dir:
                temp_folder_path = Path(temp_dir)

                base = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{:04d}-{:02d}.parquet"
                for year, month in tqdm(
                    year_month_tuples, desc="Downloading NYC Taxi Data"
                ):
                    file = (
                        temp_folder_path
                        / f"yellow_tripdata_{year:04d}-{month:02d}.parquet"
                    )
                    if not file.exists():
                        r = requests.get(base.format(year, month), stream=True)
                        if r.ok:
                            with open(file, "wb") as out:
                                out.write(r.content)
                        else:
                            logging.warning(
                                f"Failed to download data for {year}-{month}. Status code: {r.status_code}"
                            )

                logging.info("Concatenating all downloaded parquet files.")
                parquet_files = [
                    f
                    for f in temp_folder_path.iterdir()
                    if f.is_file() and f.suffix == ".parquet"
                ]

                if not parquet_files:
                    raise FileNotFoundError(
                        "No parquet files were downloaded to the temporary directory."
                    )

                dfs = [
                    pl.read_parquet(x)
                    for x in tqdm(
                        parquet_files,
                        desc="Reading temp Parquet files.",
                        total=len(parquet_files),
                    )
                ]
                df = pl.concat(dfs, how="diagonal_relaxed", rechunk=True)

                output_file.parent.mkdir(parents=True, exist_ok=True)

                logging.info(f"Saving concatenated parquet file to {output_file}")
                df.write_parquet(output_file.resolve())

        return df

    except FileNotFoundError as e:
        logging.error(f"File not found error: {str(e)}")
        raise
    except requests.RequestException as e:
        logging.error(f"Network error occurred: {str(e)}")
        raise
    except Exception as e:
        logging.error(
            f"An error occurred while downloading or processing the NYC Taxi data: {str(e)}"
        )
        raise

In [11]:
start = (2022, 1)
end = (2025, 5)
df = get_nyc_taxi(root=PROJECT_ROOT, start=start, end=end)

In [12]:
df.tail(5)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,cbd_congestion_fee
i64,datetime[μs],datetime[μs],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,12.13,0.0,0.5,4.23,0.0,1.0,20.36,,,,
2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,40.92,0.0,0.5,8.98,0.0,1.0,53.9,,,,
2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,24.02,0.0,0.5,0.0,0.0,1.0,28.02,,,,
2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,8.51,0.0,0.5,3.5,0.0,1.0,16.01,,,,
2,2023-03-31 23:26:12,2023-03-31 23:31:47,,0.88,,,41,166,0,13.51,0.0,0.5,2.25,0.0,1.0,17.26,,,,


In [13]:
df.describe(
    percentiles=[0.01, 0.25, 0.5, 0.75, 0.99],
)

statistic,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,cbd_congestion_fee
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1.38896468e8,"""138896468""","""138896468""",1.27921922e8,1.38896468e8,1.27921922e8,"""127921922""",1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.38896468e8,1.27921922e8,8.6639104e7,4.1282818e7,1.9760424e7
"""null_count""",0.0,"""0""","""0""",1.0974546e7,0.0,1.0974546e7,"""10974546""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0974546e7,5.2257364e7,9.761365e7,1.19136044e8
"""mean""",1.754169,"""2023-10-06 03:09:41.255780""","""2023-10-06 03:37:04.034708""",1.359772,5.255372,1.87244,,164.458796,163.152636,1.132167,16.565063,1.304069,0.483731,4.426969,0.54893,0.781858,26.005767,2.254399,0.143168,0.098144,0.524461
"""std""",0.471936,,,0.876464,473.916795,8.764473,,64.735367,69.917767,0.608798,11931.048396,1.903596,0.116377,11930.560976,2.149349,0.364023,108.112336,0.823469,0.492262,0.339819,0.359651
"""min""",1.0,"""2001-01-01 00:03:14""","""1970-01-20 10:16:32""",0.0,0.0,1.0,"""N""",1.0,1.0,0.0,-1.3339e8,-39.17,-0.55,-411.0,-148.17,-1.0,-2567.8,-2.5,-1.75,-1.25,-0.75
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""25%""",1.0,"""2022-11-15 14:12:59""","""2022-11-15 14:33:10""",1.0,1.05,1.0,,132.0,113.0,1.0,8.5,0.0,0.5,0.0,0.0,0.3,14.51,2.5,0.0,0.0,0.0
"""50%""",2.0,"""2023-10-16 15:07:57""","""2023-10-16 15:28:34""",1.0,1.8,1.0,,161.0,162.0,1.0,12.8,1.0,0.5,2.46,0.0,1.0,19.56,2.5,0.0,0.0,0.75
"""75%""",2.0,"""2024-08-30 00:28:00""","""2024-08-30 00:42:03""",1.0,3.43,1.0,,234.0,234.0,1.0,20.5,2.5,0.5,4.0,0.0,1.0,28.56,2.5,0.0,0.0,0.75
"""99%""",2.0,"""2025-05-21 21:09:30""","""2025-05-21 21:24:30""",5.0,20.08,5.0,,263.0,264.0,4.0,72.3,7.5,0.5,17.18,6.94,1.0,102.2,2.5,1.75,1.25,0.75
