# 01 — Data collection & quick EDA

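Day 2–3 notebook: builds the list of London monitoring points from the DfT road-traffic API, polls TomTom flow data for those points, and runs a first sanity check on the collected snapshots.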

Prereqs:
1) Create `.env` from `.env.example`
2) Fill in the API keys in `.env` (e.g. `TOMTOM_API_KEY`)
3) Install requirements


In [2]:
from pathlib import Path
import sys

# Locate the repository root. The notebook may be started from the repo root
# itself or from a direct subfolder (e.g. notebooks/), so probe both for `src/`.
here = Path().resolve()
repo_root = next(
    (candidate for candidate in (here, here.parent) if (candidate / "src").exists()),
    None,
)
if repo_root is None:
    # Neither location contains `src/`: fall back to the grandparent, unchecked.
    repo_root = here.parents[1]

# Put `<repo_root>/src` first on sys.path so the project package can be imported.
src_dir = (repo_root / "src").resolve()
sys.path.insert(0, str(src_dir))
print("Repo root:", repo_root)


Repo root: /Users/davidbazalduamendez/Documents/GitHub/Short-term-traffic-congestion-prediction-for-London-Final-Project


## Load clients

In [3]:
from traffic_forecast.config import get_settings
from traffic_forecast.http import HttpClient, build_session
from traffic_forecast.clients.tomtom import TomTomClient
from traffic_forecast.clients.tfl import TflClient
from traffic_forecast.clients.dft import DftRoadTrafficClient

# Project settings plus one shared HTTP client reused by every API client below.
s = get_settings()
http = HttpClient(
    session=build_session(user_agent=s.user_agent),
    timeout_seconds=s.http_timeout_seconds,
)

# One client per data source (TomTom, TfL, DfT), all sharing the same HTTP client.
tomtom = TomTomClient(api_key=s.tomtom_api_key, http=http)
tfl = TflClient(
    app_key=s.tfl_app_key,
    app_id=s.tfl_app_id,
    http=http,
)
dft = DftRoadTrafficClient(http=http)


## Create/load monitoring points (DfT)

In [4]:
import pandas as pd
from pathlib import Path

# Destination CSV for the sampled monitoring points (relative to the working directory).
OUT_PATH = Path("data/interim/monitoring_points.csv")
# Number of count points kept by the final random sample; the collection loop
# below stops once it has gathered 5x this many London candidates.
# NOTE(review): a later cell reassigns MAX_POINTS (to 50) for the TomTom quota.
MAX_POINTS = 200
# Page size requested when listing local authorities.
LAS_PAGE_SIZE = 200
# Page size requested when listing count points.
COUNTPOINTS_PAGE_SIZE = 1000
MAX_PAGES_COUNTPOINTS = 400  # safety cap so the count-points loop cannot run forever

def extract_data(resp):
    """Return the list of records carried by an API response.

    Parameters
    ----------
    resp : list | dict | None | Any
        Either a bare list of records, or a paginated envelope (dict) whose
        records live under the ``"data"`` key.

    Returns
    -------
    list
        ``resp`` itself when it is a list; the ``"data"`` value when ``resp``
        is a dict; an empty list for ``None``, for a dict whose ``"data"`` is
        missing or ``None``, and for any other type.
    """
    if isinstance(resp, list):
        return resp
    if isinstance(resp, dict):
        data = resp.get("data")
        # An explicit `"data": null` must not leak None to callers that
        # iterate over or extend with the result.
        return [] if data is None else data
    # None or an unexpected type: treat as "no records".
    return []

def has_more_pages(resp, page):
    """Tell whether a paginated response advertises pages beyond ``page``.

    Only dict responses can carry pagination info (a ``"last_page"`` entry).
    A list response, a dict without ``"last_page"``, or a value that cannot be
    compared as an integer all yield ``False``.
    """
    if not isinstance(resp, dict):
        # Bare lists expose no pagination metadata.
        return False

    final_page = resp.get("last_page", page)
    try:
        more = page < int(final_page)
    except Exception:
        more = False
    return more

# ---------------------------------------------------------------
# Step 1 — fetch every local authority (no name filter applied)
# ---------------------------------------------------------------
all_las = []
page = 1

while True:
    resp = dft.list_local_authorities(
        page_size=LAS_PAGE_SIZE,
        page_number=page,
        name_filter=None,
    )
    data = extract_data(resp)
    if data:
        all_las.extend(data)

    # Stop on an empty page, or when the response advertises no further pages
    # (a response without pagination info ends the loop after one page).
    if not data or not has_more_pages(resp, page):
        break
    page += 1

las_df = pd.DataFrame(all_las)
print("local authorities fetched:", len(las_df))
display(las_df.head(5))

# ---------------------------------------------------------------
# Step 2 — keep only Greater London authorities (ONS code "E09...")
# ---------------------------------------------------------------
# Each local-authority record is expected to carry an `ons_code` column.
if "ons_code" not in las_df.columns:
    raise RuntimeError("No encuentro la columna 'ons_code' en local authorities. Imprime las_df.columns para ver los nombres reales.")

is_london = las_df["ons_code"].astype(str).str.startswith("E09", na=False)
london_las = las_df.loc[is_london].copy()

print("Greater London local authorities found:", len(london_las))
display(
    london_las[["id", "name", "ons_code"]]
    .sort_values("name")
    .head(15)
)

# Integer ids, used below to match count points to London authorities.
london_la_ids = set(london_las["id"].astype(int).tolist())
if not london_la_ids:
    raise RuntimeError("No pude identificar boroughs de Londres. Revisa si ons_code trae valores tipo E090000xx.")

# -------------------------
# 3) Stream count points pages + filter by local_authority_id
# -------------------------
london_points = []
skipped_rows = 0  # rows whose local_authority_id could not be read as an integer
page = 1

# Stop at the page cap, or once the candidate pool reaches 5x the sample size.
while page <= MAX_PAGES_COUNTPOINTS and len(london_points) < MAX_POINTS * 5:
    resp = dft.list_count_points(page_size=COUNTPOINTS_PAGE_SIZE, page_number=page)
    data = extract_data(resp)
    if not data:
        break

    for row in data:
        la_id = row.get("local_authority_id")
        if la_id is None:
            continue
        try:
            la_id = int(la_id)
        except (TypeError, ValueError, OverflowError):
            # Malformed id: skip the row, but count it so the loss is visible
            # instead of being silently swallowed.
            skipped_rows += 1
            continue
        if la_id in london_la_ids:
            london_points.append(row)

    if not has_more_pages(resp, page):
        # No (further) pagination advertised: nothing more to fetch.
        break
    page += 1

if skipped_rows:
    print("count point rows skipped (unparseable local_authority_id):", skipped_rows)
if page > MAX_PAGES_COUNTPOINTS:
    # The safety cap ended the loop, so later pages were never inspected.
    print("warning: stopped at MAX_PAGES_COUNTPOINTS =", MAX_PAGES_COUNTPOINTS)

points_df = pd.DataFrame(london_points)
print("London count points collected (pre-clean):", len(points_df))

# ---------------------------------------------------------------
# Step 4 — clean, sample and persist the monitoring points
# ---------------------------------------------------------------
if points_df.empty:
    raise RuntimeError(
        "No se recolectaron count points de Londres en las páginas revisadas. "
        "Sube MAX_PAGES_COUNTPOINTS o baja COUNTPOINTS_PAGE_SIZE si el API limita."
    )

# Drop rows without coordinates, de-duplicate by count point, expose the id as
# `point_id`, and make sure the coordinates are floats.
points_df = (
    points_df
    .dropna(subset=["latitude", "longitude"])
    .drop_duplicates(subset=["count_point_id"])
    .rename(columns={"count_point_id": "point_id"})
    .astype({"latitude": float, "longitude": float})
)

# Fixed seed so the sampled set of points is reproducible across runs.
sample_size = min(MAX_POINTS, len(points_df))
points_df = points_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
points_df[["point_id", "latitude", "longitude"]].to_csv(OUT_PATH, index=False)

print(f"Saved → {OUT_PATH.resolve()}")
display(points_df.head())


local authorities fetched: 214


Unnamed: 0,id,name,region_id,ita_id,ons_code
0,1,Isles of Scilly,1,,E06000053
1,2,Nottinghamshire,2,,E10000024
2,3,Glasgow City,3,,S12000049
3,4,North Lanarkshire,3,,S12000050
4,5,Somerset,1,,E06000066


Greater London local authorities found: 33


Unnamed: 0,id,name,ons_code
167,168,Barking and Dagenham,E09000002
56,57,Barnet,E09000003
105,106,Bexley,E09000004
117,118,Brent,E09000005
175,176,Bromley,E09000006
144,145,Camden,E09000007
173,174,City of London,E09000001
133,134,Croydon,E09000008
146,147,Ealing,E09000009
120,121,Enfield,E09000010


London count points collected (pre-clean): 1093
Saved → /Users/davidbazalduamendez/Documents/GitHub/Short-term-traffic-congestion-prediction-for-London-Final-Project/notebooks/data/interim/monitoring_points.csv


Unnamed: 0,id,point_id,aadf_year,region_id,local_authority_id,road_name,road_category,road_type,start_junction_road_name,end_junction_road_name,easting,northing,latitude,longitude,link_length_km,link_length_miles
0,18526,18526,2024,6,168,A13,PA,Major,LA boundary,A123/A1153,545000,183238,51.529441,0.088977,2.1,1.3
1,7518,7518,2024,6,168,A1240,PA,Major,A1306,A124,549040,185000,51.544222,0.147921,2.8,1.74
2,6797,6797,2024,6,176,A232,PA,Major,A233,A21,542500,165130,51.367362,0.045676,1.0,0.62
3,36807,36807,2024,6,105,A209,PA,Major,A206,LA Boundary,546030,178000,51.48211,0.101649,1.6,0.99
4,28510,28510,2024,6,103,A200,PA,Major,A200 Jamaica St,A100,533653,179808,51.501415,-0.0758,0.3,0.19


In [5]:
import os
import time
import json
from pathlib import Path
from datetime import datetime, timezone

import pandas as pd
import requests
from dotenv import load_dotenv


In [6]:
# Read the TomTom key from the environment (.env is loaded first).
load_dotenv()

TOMTOM_API_KEY = os.environ.get("TOMTOM_API_KEY")
if not TOMTOM_API_KEY:
    raise RuntimeError("No encontré TOMTOM_API_KEY en tu .env")

# Monitoring points written by the previous section; adjust if stored elsewhere.
POINTS_PATH = Path("data/interim/monitoring_points.csv")
points = pd.read_csv(POINTS_PATH)

required = {"point_id", "latitude", "longitude"}
missing = required - set(points.columns)
if missing:
    raise RuntimeError(f"Faltan columnas en monitoring_points.csv: {missing}")

# ====== QUOTA CONFIG ======
TOMTOM_DAILY_BUDGET = 2500  # daily request budget (non-tile requests) this notebook plans against
# Minutes between polling rounds.
# NOTE(review): at 10 min with 50 points the 24h extrapolation is 7200 calls,
# above the budget; 30 min would give 2400. The check below only warns.
INTERVAL_MIN = 10
MAX_POINTS = 50             # number of monitoring points polled per round
DURATION_HOURS = 8          # length of this collection run; short, but enough for a baseline

points = points.iloc[:MAX_POINTS].copy()

# One request per point per round.
calls_per_hour = len(points) * (60 / INTERVAL_MIN)
expected_calls = int(calls_per_hour * DURATION_HOURS)
expected_per_day = int(calls_per_hour * 24)

print("puntos:", len(points))
print("intervalo (min):", INTERVAL_MIN)
print("duración (h):", DURATION_HOURS)
print("llamadas esperadas en esta corrida:", expected_calls)
print("llamadas esperadas por día con este setup:", expected_per_day, "(budget:", TOMTOM_DAILY_BUDGET, ")")

if expected_per_day > TOMTOM_DAILY_BUDGET:
    print("ojo: con este setup te pasas del budget diario, baja MAX_POINTS o sube INTERVAL_MIN")


puntos: 50
intervalo (min): 10
duración (h): 8
llamadas esperadas en esta corrida: 2400
llamadas esperadas por día con este setup: 7200 (budget: 2500 )
ojo: con este setup te pasas del budget diario, baja MAX_POINTS o sube INTERVAL_MIN


In [7]:
def fetch_tomtom_flow(lat: float, lon: float, api_key: str):
    """Request TomTom flow-segment data for a single coordinate.

    Sends a GET to the ``flowSegmentData/absolute/10/json`` endpoint with the
    point given as ``"<lat>,<lon>"``, speeds in km/h (``unit=KMPH``) and
    ``openLr`` set to ``"false"``. The request times out after 30 seconds.

    Parameters
    ----------
    lat, lon : float
        Coordinate to query.
    api_key : str
        TomTom API key, sent as the ``key`` query parameter.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        When the response status is an error (via ``raise_for_status``).
    """
    response = requests.get(
        "https://api.tomtom.com/traffic/services/4/flowSegmentData/absolute/10/json",
        params=dict(
            key=api_key,
            point=f"{lat},{lon}",
            unit="KMPH",
            openLr="false",
        ),
        timeout=30,
    )
    response.raise_for_status()
    return response.json()


In [7]:
# Raw snapshots go to one JSONL file per run, named after the UTC start time.
RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

RUN_ID = f"{datetime.now(timezone.utc):%Y%m%dT%H%M%SZ}"
OUT_JSONL = RAW_DIR / f"tomtom_snapshots_{RUN_ID}.jsonl"

# Number of polling rounds that fit into the configured run duration.
INTERVAL_SECONDS = INTERVAL_MIN * 60
MAX_ITERS = int(DURATION_HOURS * 3600 / INTERVAL_SECONDS)

print("guardando en:", OUT_JSONL.resolve())
print("iteraciones:", MAX_ITERS)


guardando en: /Users/davidbazalduamendez/Documents/GitHub/Short-term-traffic-congestion-prediction-for-London-Final-Project/notebooks/data/raw/tomtom_snapshots_20260107T085543Z.jsonl
iteraciones: 48


In [8]:
def now_utc_iso() -> str:
    """Return the current time as a timezone-aware UTC ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()

def _mask_secret(text: str, secret: str) -> str:
    """Return ``text`` with every occurrence of ``secret`` replaced by ``***``."""
    return text.replace(secret, "***") if secret else text


errors = 0
total_calls = 0

# Poll every monitoring point once per round and append one JSON line per
# request (successful or not) to the run's JSONL file.
with OUT_JSONL.open("a", encoding="utf-8") as f:
    for it in range(1, MAX_ITERS + 1):
        # Monotonic clock: the elapsed time is immune to wall-clock adjustments.
        iter_start = time.monotonic()
        # One timestamp shared by every point polled in this round.
        ts = now_utc_iso()

        print(f"\niter {it}/{MAX_ITERS}  utc={ts}")

        for row in points.itertuples(index=False):
            point_id = str(row.point_id)
            lat = float(row.latitude)
            lon = float(row.longitude)

            record = {
                "timestamp_utc": ts,
                "point_id": point_id,
                "latitude": lat,
                "longitude": lon,
                "source": "tomtom_flowSegmentData",
                "payload": None,
                "error": None
            }

            try:
                record["payload"] = fetch_tomtom_flow(lat, lon, TOMTOM_API_KEY)
            except requests.HTTPError as e:
                status = getattr(e.response, "status_code", None)
                # requests' error text embeds the request URL, whose query
                # string carries the API key: mask it BEFORE truncating so the
                # key (or a fragment of it) never lands in the data file.
                detail = _mask_secret(str(e), TOMTOM_API_KEY)[:200]
                record["error"] = f"HTTPError {status}: {detail}"
                errors += 1
                if status == 429:
                    time.sleep(10)  # back off when rate-limited
            except Exception as e:
                detail = _mask_secret(str(e), TOMTOM_API_KEY)[:200]
                record["error"] = f"{type(e).__name__}: {detail}"
                errors += 1

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            total_calls += 1

            time.sleep(0.05)  # short pause between consecutive requests

        # Persist the finished round so buffered records are not lost if the
        # process stops while sleeping.
        f.flush()

        elapsed = time.monotonic() - iter_start
        # Nothing to wait for after the final round.
        sleep_for = max(0, INTERVAL_SECONDS - elapsed) if it < MAX_ITERS else 0
        print(f"calls: {total_calls} | errores: {errors} | sleep: {int(sleep_for)}s")
        time.sleep(sleep_for)

print("listo")



iter 1/48  utc=2026-01-07T08:55:46.631655+00:00
calls: 50 | errores: 0 | sleep: 589s

iter 2/48  utc=2026-01-07T09:05:46.590716+00:00
calls: 100 | errores: 0 | sleep: 589s

iter 3/48  utc=2026-01-07T09:15:46.595124+00:00
calls: 150 | errores: 0 | sleep: 589s

iter 4/48  utc=2026-01-07T09:25:46.594875+00:00
calls: 200 | errores: 0 | sleep: 588s

iter 5/48  utc=2026-01-07T09:35:46.640491+00:00
calls: 250 | errores: 0 | sleep: 589s

iter 6/48  utc=2026-01-07T09:45:46.652997+00:00
calls: 300 | errores: 0 | sleep: 583s

iter 7/48  utc=2026-01-07T09:55:46.663858+00:00
calls: 350 | errores: 0 | sleep: 587s

iter 8/48  utc=2026-01-07T10:05:46.680947+00:00
calls: 400 | errores: 0 | sleep: 589s

iter 9/48  utc=2026-01-07T10:15:46.682292+00:00
calls: 450 | errores: 0 | sleep: 589s

iter 10/48  utc=2026-01-07T10:25:46.694537+00:00
calls: 500 | errores: 0 | sleep: 588s

iter 11/48  utc=2026-01-07T10:35:46.705749+00:00
calls: 550 | errores: 0 | sleep: 589s

iter 12/48  utc=2026-01-07T10:45:46.71673

## Build processed dataset

In [8]:

from glob import glob

def read_jsonl_records(path):
    """Parse one JSON-lines file into a list of decoded records.

    The file is read as UTF-8, the encoding the snapshots are written with
    (the writer uses ``ensure_ascii=False``, so non-ASCII text can occur),
    instead of the platform default. Blank lines are skipped.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


# Every snapshot file of every run, in chronological (name-sorted) order.
paths = sorted(glob("data/raw/tomtom_snapshots_*.jsonl"))
print(paths)

rows = []
for p in paths:
    rows.extend(read_jsonl_records(p))

df_raw = pd.DataFrame(rows)
print("shape:", df_raw.shape)
df_raw.head()


['data/raw/tomtom_snapshots_20260104T210243Z.jsonl', 'data/raw/tomtom_snapshots_20260104T210624Z.jsonl', 'data/raw/tomtom_snapshots_20260105T101732Z.jsonl', 'data/raw/tomtom_snapshots_20260105T133619Z.jsonl', 'data/raw/tomtom_snapshots_20260105T232705Z.jsonl', 'data/raw/tomtom_snapshots_20260106T083257Z.jsonl', 'data/raw/tomtom_snapshots_20260106T175950Z.jsonl', 'data/raw/tomtom_snapshots_20260106T234001Z.jsonl', 'data/raw/tomtom_snapshots_20260107T085543Z.jsonl']
shape: (15600, 7)
shape: (15600, 7)


Unnamed: 0,timestamp_utc,point_id,latitude,longitude,source,payload,error
0,2026-01-04T21:03:15.487051+00:00,18526,51.529441,0.088977,tomtom_flowSegmentData,"{'flowSegmentData': {'frc': 'FRC2', 'currentSp...",
1,2026-01-04T21:03:15.487051+00:00,7518,51.544222,0.147921,tomtom_flowSegmentData,"{'flowSegmentData': {'frc': 'FRC3', 'currentSp...",
2,2026-01-04T21:03:15.487051+00:00,6797,51.367362,0.045676,tomtom_flowSegmentData,"{'flowSegmentData': {'frc': 'FRC2', 'currentSp...",
3,2026-01-04T21:03:15.487051+00:00,36807,51.48211,0.101649,tomtom_flowSegmentData,"{'flowSegmentData': {'frc': 'FRC3', 'currentSp...",
4,2026-01-04T21:03:15.487051+00:00,28510,51.501415,-0.0758,tomtom_flowSegmentData,"{'flowSegmentData': {'frc': 'FRC3', 'currentSp...",


## Quick EDA

In [9]:
# Count the distinct polling timestamps per point, using only the requests
# that succeeded (rows whose `error` field is empty).
ok_rows = df_raw.loc[df_raw["error"].isna()]
obs_per_point = (
    ok_rows
    .groupby("point_id")["timestamp_utc"]
    .nunique()
    .sort_values(ascending=False)
)

print(obs_per_point.head(10))


point_id
16435    291
6639     291
37718    291
37825    291
38572    291
46433    291
46680    291
46781    291
46787    291
46813    291
Name: timestamp_utc, dtype: int64
