In [3]:
import os
import requests
import zipfile
from io import BytesIO
from pathlib import Path

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Remote archive holding the 2020 trip data.
DATA_URL = "https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip"

# Local layout: ./data for downloads, ./data/raw for the extracted CSVs.
DOWNLOAD_PATH = Path("data")
EXTRACT_PATH = DOWNLOAD_PATH.joinpath("raw")
EXTRACT_PATH.mkdir(parents=True, exist_ok=True)

# One CSV per calendar month (January through December) is expected.
EXPECTED_FILES = [
    "2020{:02d}-citibike-tripdata_1.csv".format(month_no)
    for month_no in range(1, 13)
]

# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def all_csvs_present() -> bool:
    """Tell whether every file named in EXPECTED_FILES exists in EXTRACT_PATH.

    Returns:
        True when each expected CSV is a regular file directly inside
        EXTRACT_PATH; False as soon as one of them is missing.
    """
    for csv_name in EXPECTED_FILES:
        candidate = EXTRACT_PATH.joinpath(csv_name)
        if not candidate.is_file():
            return False
    return True


def fetch_and_extract_tripdata() -> None:
    """Download the 2020 Citi Bike ZIP and extract the expected monthly CSVs.

    The archive at ``DATA_URL`` is scanned for nested ``.zip`` members. Every
    entry of those inner archives whose base name appears in
    ``EXPECTED_FILES`` is written directly into ``EXTRACT_PATH``; any
    directory prefix stored in the archive is dropped so that
    ``all_csvs_present()`` can find the file afterwards.

    Nothing is downloaded when all expected CSVs are already present.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the payload or a nested archive is not a
            valid ZIP file.
    """
    import shutil  # local import keeps this block self-contained

    if all_csvs_present():
        print("✅ All 12 monthly CSVs are already in", EXTRACT_PATH)
        return

    print("Downloading Citi Bike 2020 ZIP …")
    response = requests.get(DATA_URL, timeout=60)
    response.raise_for_status()

    # Match on the exact expected names rather than a hand-written suffix:
    # the previous "_tripdata_1.csv" suffix could never match the
    # "...-citibike-tripdata_1.csv" names listed in EXPECTED_FILES.
    wanted = set(EXPECTED_FILES)

    with zipfile.ZipFile(BytesIO(response.content)) as main_zip:
        print("Extracting monthly ZIP files …")
        for inner_zip_name in main_zip.namelist():
            if not inner_zip_name.endswith(".zip"):
                continue

            inner_bytes = BytesIO(main_zip.read(inner_zip_name))
            with zipfile.ZipFile(inner_bytes) as month_zip:
                for member in month_zip.namelist():
                    # ZIP member names always use "/" as the separator.
                    csv_name = member.rsplit("/", 1)[-1]
                    if csv_name not in wanted:
                        continue

                    print(f"  → {member}  (from {inner_zip_name})")
                    # Stream to a flat target path; ZipFile.extract() would
                    # recreate the member's directory prefix under
                    # EXTRACT_PATH, hiding it from all_csvs_present().
                    target = EXTRACT_PATH / csv_name
                    with month_zip.open(member) as src, target.open("wb") as dst:
                        shutil.copyfileobj(src, dst)

    missing = [
        fname for fname in EXPECTED_FILES
        if not (EXTRACT_PATH / fname).is_file()
    ]
    if missing:
        # Be explicit instead of claiming success with an incomplete data set.
        print("⚠️ Extraction incomplete; missing files:", ", ".join(missing))
        return

    print("✅ Finished. CSVs are in", EXTRACT_PATH)


# Entry point: fetch and extract the data only when this code is executed as
# the main module, not when it is imported.
if __name__ == "__main__":
    fetch_and_extract_tripdata()


✅ All 12 monthly CSVs are already in data\raw
