In [8]:
import os
import urllib
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd

In [9]:
# URL prefix shared by every monthly file; the two-digit month and ".parquet"
# are appended by download_file.
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-"
# Local directory that receives the downloaded parquet files.
DOWNLOAD_DIR = "data/"


def download_file(month):
    """Download one monthly trip-data parquet file into DOWNLOAD_DIR.

    Parameters
    ----------
    month : str
        Month suffix appended to BASE_URL, e.g. "01".

    Returns
    -------
    str or None
        Path of the downloaded file, or None when the download failed. The
        error is printed instead of raised so that one failing month does not
        abort the other downloads.
    """
    url = f"{BASE_URL}{month}.parquet"
    file_path = os.path.join(DOWNLOAD_DIR, f"yellow_tripdata_2024-{month}.parquet")
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated .parquet file in DOWNLOAD_DIR.
    partial_path = file_path + ".part"

    try:
        # The target directory may not exist on a fresh checkout.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        print(f"Downloading {url}...")
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
        print(f"Downloaded: {file_path}")
        return file_path
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        # Best-effort cleanup of the incomplete download; it may not exist.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return None

In [10]:
# Zero-padded month suffixes "01" through "06" passed to download_file.
MONTHS = [str(month_number).zfill(2) for month_number in range(1, 7)]

In [12]:
# Fetch all months on a pool of four worker threads. Every download is
# submitted up front, then results are collected in MONTHS order.
with ThreadPoolExecutor(max_workers=4) as executor:
    file_paths = [
        pending.result()
        for pending in [executor.submit(download_file, month) for month in MONTHS]
    ]

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet...
Downloaded: data/yellow_tripdata_2024-03.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet...
Downloaded: data/yellow_tripdata_2024-04.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet...
Downloaded: data/yellow_tripdata_2024-02.parquet
Downloaded: data/yellow_tripdata_2024-01.parquet
Downloaded: data/yellow_tripdata_2024-06.parquet
Downloaded: data/yellow_tripdata_2024-05.parquet


In [14]:
# Load every parquet file in the download directory into a single DataFrame.
# The path is derived from DOWNLOAD_DIR (normalised to drop the trailing
# slash) so the read location cannot drift from where the files were saved.
df = pd.read_parquet(os.path.normpath(DOWNLOAD_DIR))

In [15]:
# Display the combined DataFrame as this cell's output.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.70,1.0,0.5,0.00,0.0,1.0,22.70,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.80,1.0,N,140,236,1,10.00,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.70,1.0,N,236,79,1,23.30,3.5,0.5,3.00,0.0,1.0,31.30,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.40,1.0,N,79,211,1,10.00,3.5,0.5,2.00,0.0,1.0,17.00,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.80,1.0,N,211,148,1,7.90,3.5,0.5,3.20,0.0,1.0,16.10,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20332088,2,2024-06-30 23:07:36,2024-06-30 23:18:35,,2.51,,,255,148,0,21.09,0.0,0.5,0.00,0.0,1.0,25.09,,
20332089,2,2024-06-30 23:46:07,2024-07-01 00:11:53,,7.55,,,68,74,0,33.96,0.0,0.5,0.00,0.0,1.0,37.96,,
20332090,2,2024-06-30 23:18:50,2024-06-30 23:35:09,,4.25,,,41,100,0,21.01,0.0,0.5,0.00,0.0,1.0,25.01,,
20332091,1,2024-06-30 23:33:36,2024-06-30 23:42:37,,1.60,,,158,231,0,14.41,0.0,0.5,0.00,0.0,1.0,18.41,,


In [19]:
# Show the trips whose recorded fare_amount is exactly zero.
df.loc[df["fare_amount"] == 0]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
854,1,2024-01-01 00:27:42,2024-01-01 00:28:49,1.0,0.20,1.0,N,239,143,4,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0
6955,2,2024-01-01 01:53:45,2024-01-01 02:25:58,1.0,2.97,1.0,N,170,50,2,0.0,0.0,-0.5,0.00,0.0,-1.0,-4.00,-2.5,0.0
6956,2,2024-01-01 01:53:45,2024-01-01 02:25:58,1.0,2.97,1.0,N,170,50,2,0.0,0.0,0.5,0.00,0.0,1.0,4.00,2.5,0.0
11910,1,2024-01-01 02:30:38,2024-01-01 02:30:55,1.0,4.00,5.0,N,261,261,1,0.0,0.0,0.0,39.99,0.0,1.0,40.99,0.0,0.0
12290,1,2024-01-01 02:09:43,2024-01-01 02:10:32,1.0,0.50,5.0,N,265,265,4,0.0,0.0,0.0,0.00,0.0,1.0,1.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20327534,2,2024-06-30 16:34:55,2024-06-30 17:15:31,,5.75,,,239,209,0,0.0,0.0,0.5,0.00,0.0,1.0,4.00,,
20328934,2,2024-06-30 17:21:00,2024-06-30 17:33:00,,1.21,,,100,163,0,0.0,0.0,0.5,0.00,0.0,1.0,4.00,,
20329622,1,2024-06-30 17:09:43,2024-06-30 17:11:04,,0.00,,,158,158,0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,,
20330004,1,2024-06-30 18:41:42,2024-06-30 18:41:48,,0.00,,,170,170,0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,,
