# In [11]:
import os
import pandas as pd
from datetime import datetime

# Directory holding the cleaned monthly CSV exports (edit to match your machine).
RAW_DATA_DIR = os.path.expanduser("~/Downloads/Monthly Data")

# The lag table is written beneath the project's data directory.
PROJECT_ROOT = os.path.expanduser("~/TaxiApp/data")
OUTPUT_PATH = os.path.join(
    PROJECT_ROOT,
    "data_models_api/hotspot_model/historical_lags.csv",
)

# Month number (1-12) -> cleaned CSV filename. Every file follows the
# "Clean_<MonthName>_Taxi.csv" pattern, so the map is generated from the names.
month_file_map = {
    number: f"Clean_{name}_Taxi.csv"
    for number, name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}

# Accumulator for the per-month aggregated frames, and the months that feed
# the proxy lag table.
all_agg = []
target_months = [7, 8]

# Build one zone-hour trip-count table per target month and collect the
# results in all_agg. A month that cannot be processed is reported and
# skipped so the remaining months are still aggregated.
for month in target_months:
    filename = month_file_map[month]
    file_path = os.path.join(RAW_DATA_DIR, filename)

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    print(f"Processing {filename}...")
    df = pd.read_csv(file_path)

    # Validate every column used below *before* touching the data. Previously
    # only 'pickup_zone' was checked (and only after the datetime parse), so a
    # file lacking 'tpep_pickup_datetime' crashed with KeyError instead of
    # being skipped like any other unusable file.
    missing_columns = [
        column
        for column in ("tpep_pickup_datetime", "pickup_zone")
        if column not in df.columns
    ]
    if missing_columns:
        for column in missing_columns:
            print(f"'{column}' column missing in {filename}, skipping.")
        continue

    # Parse pickup timestamps; unparseable values become NaT and those rows
    # are dropped so they cannot end up in a bogus date/hour bucket.
    df["pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"], errors="coerce")
    df = df.dropna(subset=["pickup_datetime"])
    df["pickup_date"] = df["pickup_datetime"].dt.date
    df["pickup_hour"] = df["pickup_datetime"].dt.hour

    # Count trips per (date, hour, zone).
    # NOTE: with pandas' default groupby(dropna=True), rows whose pickup_zone
    # is NaN are left out of the counts.
    agg = (
        df.groupby(["pickup_date", "pickup_hour", "pickup_zone"])
        .size()
        .reset_index(name="trip_count")
    )

    all_agg.append(agg)

# Abort when every target month was skipped: there is nothing to combine or
# write, and an empty output file would be misleading.
if not all_agg:
    raise ValueError("No valid data processed. Nothing to save.")

# Stack the per-month tables into one frame with a fresh 0..n-1 index.
historical_lags = pd.concat(all_agg, ignore_index=True)

# Create the output folder on first run, then write the table without the
# positional index column.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
historical_lags.to_csv(OUTPUT_PATH, index=False)

# Report what was actually written. all_agg holds one frame per month that
# was processed, so its length shows whether any target month was skipped;
# the old message always claimed "July & August" regardless.
months_written = len(all_agg)
months_requested = len(target_months)
if months_written < months_requested:
    print(
        f"Warning: only {months_written} of {months_requested} target months "
        "were processed; the saved lag data is partial."
    )
print(
    f"Saved lag data for {months_written} of {months_requested} target month(s) "
    f"({len(historical_lags)} rows) to: {OUTPUT_PATH}"
)


# Processing Clean_July_Taxi.csv...
# Processing Clean_August_Taxi.csv...
# Saved July & August lag data to: /Users/elliekavanagh/TaxiApp/data/data_models_api/hotspot_model/historical_lags.csv
