In [2]:
import os
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------------
# Inputs: one raw Citi Bike CSV per calendar month of 2020
# ------------------------------------------------------------------
MONTH_FILES = [
    f"2020{month:02d}-citibike-tripdata_1.csv"
    for month in range(1, 13)
]

# ------------------------------------------------------------------
# Filesystem layout (relative to the current working directory)
# ------------------------------------------------------------------
RAW_DIR = Path("data/raw")              # where the downloaded CSVs are read from
PROCESSED_DIR = Path("data/processed")  # where the cleaned CSVs are written

# Create the output folder (and any missing parents) up front; a no-op if it exists.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------------
def clean_citibike_file(fname: str) -> None:
    """Clean one raw Citi Bike CSV and write the result into PROCESSED_DIR.

    Reads ``RAW_DIR / fname``, parses the ``started_at`` / ``ended_at``
    columns as datetimes (unparseable values become ``NaT``), drops rows that
    are missing either timestamp or either station id, and adds the derived
    columns ``trip_duration_min``, ``hour``, ``day``, ``weekday``, ``month``
    and ``year`` (all taken from ``started_at``). Station ids are stored as
    strings. The output file name is ``fname`` with ``.csv`` replaced by
    ``_processed.csv``.

    If the output file already exists, nothing is recomputed. The output is
    written to a temporary sibling file first and only moved into place once
    the write has finished, so an interrupted run never leaves a truncated
    file that later runs would mistake for a finished one.

    Args:
        fname: File name of the raw CSV, relative to ``RAW_DIR``.

    Returns:
        None. Progress and errors are reported with ``print``; errors are not
        re-raised, so a batch loop can continue with the next file.
    """
    src  = RAW_DIR / fname
    dest = PROCESSED_DIR / fname.replace(".csv", "_processed.csv")
    # Staging path next to the final file, so the final move stays in one directory.
    staging = dest.with_name(dest.name + ".tmp")

    # Skip work when we’ve already produced the processed file
    if dest.is_file():
        print(f"✅ processed: {dest.name}")
        return

    try:
        df = pd.read_csv(src, low_memory=False)

        # ── Clean & feature‑engineer ────────────────────────────────
        dt_cols = ["started_at", "ended_at"]
        # errors='coerce' turns unparseable timestamps into NaT, removed just below.
        df[dt_cols] = df[dt_cols].apply(pd.to_datetime, errors='coerce')

        df.dropna(subset=dt_cols + ["start_station_id", "end_station_id"], inplace=True)

        # Trip length in minutes (seconds / 60).
        df["trip_duration_min"] = (df["ended_at"] - df["started_at"]).dt.total_seconds() / 60.0

        # Calendar parts of the trip start time.
        df["hour"]    = df["started_at"].dt.hour
        df["day"]     = df["started_at"].dt.day
        df["weekday"] = df["started_at"].dt.weekday
        df["month"]   = df["started_at"].dt.month
        df["year"]    = df["started_at"].dt.year

        df["start_station_id"] = df["start_station_id"].astype(str)
        df["end_station_id"]   = df["end_station_id"].astype(str)

        # ── Save ────────────────────────────────────────────────────
        # Write to the staging file, then move it over the final name in one
        # step: `dest` only ever appears once it is complete.
        df.to_csv(staging, index=False)
        staging.replace(dest)
        print(f"📝 Processed → {dest.name}")

    except Exception as err:
        print(f"❌ Error processing {fname}: {err}")
        # Best-effort removal of a half-written staging file.
        try:
            staging.unlink()
        except FileNotFoundError:
            pass  # nothing had been staged yet
        except OSError as cleanup_err:
            print(f"❌ Could not remove {staging.name}: {cleanup_err}")

# ------------------------------------------------------------------
if __name__ == "__main__":
    # Run the cleaner once per monthly file, in calendar order.
    for month_file in MONTH_FILES:
        clean_citibike_file(month_file)


✅ processed: 202001-citibike-tripdata_1_processed.csv
✅ processed: 202002-citibike-tripdata_1_processed.csv
✅ processed: 202003-citibike-tripdata_1_processed.csv
✅ processed: 202004-citibike-tripdata_1_processed.csv
✅ processed: 202005-citibike-tripdata_1_processed.csv
✅ processed: 202006-citibike-tripdata_1_processed.csv
✅ processed: 202007-citibike-tripdata_1_processed.csv
✅ processed: 202008-citibike-tripdata_1_processed.csv
✅ processed: 202009-citibike-tripdata_1_processed.csv
✅ processed: 202010-citibike-tripdata_1_processed.csv
✅ processed: 202011-citibike-tripdata_1_processed.csv
✅ processed: 202012-citibike-tripdata_1_processed.csv
