# 01 — Data collection & quick EDA

Day 2–3 notebook.

Prereqs:
1) Create `.env` from `.env.example`
2) Fill in the API keys (the TomTom API key and the TfL app key / app id used in "Load clients" below)
3) Install requirements


In [None]:
from pathlib import Path
# Show the absolute, symlink-resolved directory the kernel runs in; the
# relative `data/...` paths used further down are interpreted against it.
print('Working dir:', Path.cwd().resolve())


## Load clients

In [None]:
from traffic_forecast.config import get_settings
from traffic_forecast.http import HttpClient, build_session
from traffic_forecast.clients.tomtom import TomTomClient
from traffic_forecast.clients.tfl import TflClient
from traffic_forecast.clients.dft import DftRoadTrafficClient

# Settings are read once; `s` is reused by later cells (bounding box, point count).
s = get_settings()

# A single HttpClient is built from the settings (user agent + timeout) and
# handed to every API client below, so all three share it.
http = HttpClient(
    session=build_session(user_agent=s.user_agent),
    timeout_seconds=s.http_timeout_seconds,
)

tomtom = TomTomClient(
    api_key=s.tomtom_api_key,
    http=http,
)
tfl = TflClient(
    app_key=s.tfl_app_key,
    app_id=s.tfl_app_id,
    http=http,
)
dft = DftRoadTrafficClient(http=http)


## Create/load monitoring points (DfT)

In [None]:
import pandas as pd
from traffic_forecast.data.points import LondonBBox, ensure_points_csv

# Bounding box from the four London limits in the settings.
# Arguments are positional: min_lat, max_lat, min_lon, max_lon (order as configured).
bbox = LondonBBox(
    s.london_bbox_min_lat,
    s.london_bbox_max_lat,
    s.london_bbox_min_lon,
    s.london_bbox_max_lon,
)

# The value returned by ensure_points_csv is passed straight to pd.read_csv below.
points_path = ensure_points_csv(
    dft,
    bbox=bbox,
    target_n=s.collection_num_points,
    out_path=Path('data/metadata/points.csv'),
)

points = pd.read_csv(points_path)
points.head()


## (Optional) Run collector briefly

In [None]:
# Recommended from terminal:
# python -m traffic_forecast.scripts.run_collector
#
# If you must run here, keep it short:
# from traffic_forecast.data.collector import Collector, CollectorConfig
# cfg = CollectorConfig(num_points=20, interval_seconds=300, duration_minutes=20, out_dir=Path('data'), london_bbox=bbox)
# Collector(tomtom=tomtom, tfl=tfl, dft=dft, cfg=cfg).run()


## Build processed dataset

In [None]:
from traffic_forecast.data.build_dataset import build_dataset
# Run the dataset build over the `data` directory. The out_path given here is
# the same parquet file that the "Quick EDA" cell reads back.
out = build_dataset(
    data_dir=Path('data'),
    out_path=Path('data/processed/observations.parquet'),
)
print('Wrote:', out)


## Quick EDA

In [None]:
import matplotlib.pyplot as plt
# Load the processed observations and parse `timestamp_utc` as timezone-aware
# UTC datetimes; values that cannot be parsed become NaT (errors='coerce').
df = pd.read_parquet('data/processed/observations.parquet').assign(
    timestamp_utc=lambda frame: pd.to_datetime(
        frame['timestamp_utc'], utc=True, errors='coerce'
    )
)

# Size of the dataset: total rows and number of distinct monitoring points.
print('Rows:', len(df), 'Points:', df['point_id'].nunique())

# Summary statistics for the three speed / congestion columns.
df[['current_speed', 'free_flow_speed', 'congestion_index']].describe()


In [None]:
# Pick the point with the most observations and plot its congestion index over time.
# point_id is compared as a string so mixed int/str ids are treated alike;
# the string view is computed once and reused for both the count and the filter.
point_ids = df['point_id'].astype(str)
point_counts = point_ids.value_counts()  # sorted by count, most frequent first

# The collector step is optional, so the dataset can be empty. Fail with an
# explicit message instead of an opaque IndexError from `.index[0]`.
if point_counts.empty:
    raise ValueError(
        'No observations to plot: run the collector and rebuild the dataset first.'
    )

one_point = point_counts.index[0]
d1 = df[point_ids == one_point].sort_values('timestamp_utc')

# Explicit figure/axes objects instead of the implicit pyplot "current axes" state.
fig, ax = plt.subplots()
ax.plot(d1['timestamp_utc'], d1['congestion_index'])
ax.set_xlabel('timestamp_utc')
ax.set_ylabel('congestion_index')
ax.set_title(f'Congestion index over time (point_id={one_point})')
ax.tick_params(axis='x', labelrotation=45)  # slanted timestamps so labels do not overlap
fig.tight_layout()
plt.show()
