# Loading TLC Data

This notebook downloads NYC taxi data.
- **Training data**: 6 months of 2024 data (yellow and green taxis)
- **Testing data**: 6 months of 2025 data (yellow and green taxis)

In [19]:
import os
import requests
from pathlib import Path
import time

# Create directory structure for data storage
directories = ['../data/raw/taxi_2024', '../data/raw/taxi_2025']
for directory in directories:
    Path(directory).mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {directory}")

Created directory: ../data/raw/taxi_2024
Created directory: ../data/raw/taxi_2025


## Download Function
The function below downloads both yellow and green taxi data from the official TLC source.

In [22]:
def download_taxi_data(year, months, taxi_types=['yellow', 'green']):
    """
    Downloads NYC taxi data from TLC official source
    Args:
        year (int): Year to download (2024 or 2025)  
        months (list): List of months to download (1-12)
        taxi_types (list): Types of taxis to download ('yellow', 'green')
    """
    base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data"
    
    for taxi_type in taxi_types:
        print(f"\n--- Downloading {taxi_type} taxi data for {year} ---")
        
        for month in months:
            filename = f"{taxi_type}_tripdata_{year}-{month:02d}.parquet"
            file_url = f"{base_url}/{filename}"
            local_path = f"../data/raw/taxi_{year}/{filename}"
            
            # Skip if file already exists
            if os.path.exists(local_path):
                print(f"✓ {filename} already exists")
                continue
                
            print(f"Downloading {filename}...")
            
            try:
                response = requests.get(file_url)
                response.raise_for_status()  # Check for HTTP errors
                
                # Save file
                with open(local_path, 'wb') as f:
                    f.write(response.content)
                    
                file_size = len(response.content) / (1024*1024)  # Size in MB
                print(f"✓ Downloaded {filename} ({file_size:.1f} MB)")
                
                time.sleep(1)  # Brief pause between downloads
                
            except Exception as e:
                print(f"✗ Failed to download {filename}: {e}")

## Download 2024 Training Data
Downloading 6 months of 2024 data (Jan-June) for both yellow and green taxis.

In [23]:
# Download 6 months of 2024 data (training set)
training_months = [1, 2, 3, 4, 5, 6]
print("Downloading 2024 taxi data...")
download_taxi_data(2024, training_months)

Downloading 2024 taxi data...

--- Downloading yellow taxi data for 2024 ---
Downloading yellow_tripdata_2024-01.parquet...
✓ Downloaded yellow_tripdata_2024-01.parquet (47.6 MB)
Downloading yellow_tripdata_2024-02.parquet...
✓ Downloaded yellow_tripdata_2024-02.parquet (48.0 MB)
Downloading yellow_tripdata_2024-03.parquet...
✓ Downloaded yellow_tripdata_2024-03.parquet (57.3 MB)
Downloading yellow_tripdata_2024-04.parquet...
✓ Downloaded yellow_tripdata_2024-04.parquet (56.4 MB)
Downloading yellow_tripdata_2024-05.parquet...
✓ Downloaded yellow_tripdata_2024-05.parquet (59.7 MB)
Downloading yellow_tripdata_2024-06.parquet...
✓ Downloaded yellow_tripdata_2024-06.parquet (57.1 MB)

--- Downloading green taxi data for 2024 ---
Downloading green_tripdata_2024-01.parquet...
✓ Downloaded green_tripdata_2024-01.parquet (1.3 MB)
Downloading green_tripdata_2024-02.parquet...
✓ Downloaded green_tripdata_2024-02.parquet (1.2 MB)
Downloading green_tripdata_2024-03.parquet...
✓ Downloaded green_tr

## Download 2025 Testing Data
Downloading 6 months of 2025 data (Jan-June) for both yellow and green taxis.

In [25]:
# Download 6 months of 2025 data (testing set)
testing_months = [1, 2, 3, 4, 5, 6]
print("Downloading 2025 taxi data...")
download_taxi_data(2025, testing_months)

Downloading 2025 taxi data...

--- Downloading yellow taxi data for 2025 ---
Downloading yellow_tripdata_2025-01.parquet...
✓ Downloaded yellow_tripdata_2025-01.parquet (56.4 MB)
Downloading yellow_tripdata_2025-02.parquet...
✓ Downloaded yellow_tripdata_2025-02.parquet (57.5 MB)
Downloading yellow_tripdata_2025-03.parquet...
✓ Downloaded yellow_tripdata_2025-03.parquet (66.7 MB)
Downloading yellow_tripdata_2025-04.parquet...
✓ Downloaded yellow_tripdata_2025-04.parquet (64.2 MB)
Downloading yellow_tripdata_2025-05.parquet...
✓ Downloaded yellow_tripdata_2025-05.parquet (74.2 MB)
Downloading yellow_tripdata_2025-06.parquet...
✓ Downloaded yellow_tripdata_2025-06.parquet (70.1 MB)

--- Downloading green taxi data for 2025 ---
Downloading green_tripdata_2025-01.parquet...
✓ Downloaded green_tripdata_2025-01.parquet (1.1 MB)
Downloading green_tripdata_2025-02.parquet...
✓ Downloaded green_tripdata_2025-02.parquet (1.1 MB)
Downloading green_tripdata_2025-03.parquet...
✓ Downloaded green_tr

In [None]:
from urllib.request import urlretrieve
# download the lookup data
lookup_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
lookup_output_dir = "../data/taxi_zone_lookup.csv"
urlretrieve(lookup_url, lookup_output_dir)

('../data/taxi_zones/taxi_zone_lookup.csv',
 <http.client.HTTPMessage at 0x10d5fb6d0>)