In [6]:
from dataclasses import dataclass, asdict
from typing import List, Optional, Tuple, Dict, Iterable
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random

print(torch.cuda.is_available())
print(torch.version.cuda if torch.version else "no torch.version")
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

True
12.1
NVIDIA GeForce GTX 960M


# Parameters

In [7]:
PARAMS = {
    # Data paths
    "base_dir": "D:/Desktop/MSc Thesis - Copy/",
    "nodes_csv": "shapefiles/_network_parts_4_intersections/node_urban_features.csv",
    "links_csv": "shapefiles/_network_parts_4_intersections/positions_links_detailed.csv",

    # Urban features
    "using_urban_features": True,
    "urban_features_to_use": ["GSI", "WMHB", "GrCR", "GD"],
    "include_urban_in_node_features": False,

    # Filtering
    "min_participants_per_day": 10,
    "tz": "Asia/Seoul",

    # Binning
    "bin_hours": 6,                 # choose from 1,2,3,4,6,8,12,24
    "align_to_midnight": True,

    # Model + training
    "window_size": 4,
    "embedding_dim": 64,
    "learning_rate": 0.01,
    "num_epochs": 60,
    "encoder_edge_feat_dim": 4,     # hour, day, flow, transpor
    "time_feat_dim": 2,             # hour_norm, day_norm
    "use_mode_loss": False,

    # Loss normalizers (you were using 1s)
    "manual_max_losses": {
        "node_count": 1.0,
        "node_presence": 1.0,
        "edge_flow": 1.0,
        "transport_mode": 1.0
    },

    # Presence loss: currently setting it to 0
    "use_presence_loss": False,
    "presence_threshold": 0.5,      # for active node thresholding in edge candidates
}

# General functions

In [8]:
def set_seed(seed: int = 42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def load_csvs(params: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    nodes_df = pd.read_csv(os.path.join(params["base_dir"], params["nodes_csv"]), header=0)
    links_df = pd.read_csv(os.path.join(params["base_dir"], params["links_csv"]), header=0)
    return nodes_df, links_df

def reindex_node_ids(nodes_df: pd.DataFrame, links_df: pd.DataFrame):
    original_ids = sorted(nodes_df["node_id"].unique())
    id_map = {old: new for new, old in enumerate(original_ids)}
    nodes_df = nodes_df.copy()
    links_df = links_df.copy()
    nodes_df["node_id_original"] = nodes_df["node_id"]
    nodes_df["node_id"] = nodes_df["node_id"].map(id_map)
    links_df["node_id"] = links_df["node_id"].map(id_map)
    links_df["prev_node_id"] = links_df["prev_node_id"].map(id_map)
    return nodes_df, links_df

def prepare_inputs(params: Dict):
    nodes_df, links_df = load_csvs(params)

    # drop geometry and cast bools to ints
    if "geometry" in nodes_df.columns:
        nodes_df = nodes_df.drop(columns=["geometry"])
    for col in nodes_df.select_dtypes(include=["bool"]).columns:
        nodes_df[col] = nodes_df[col].astype(int)

    # timestamps & ids
    links_df["timestamp"] = pd.to_datetime(links_df["merged_datetime"], format='ISO8601')
    links_df["merged_datetime"] = pd.to_datetime(links_df["merged_datetime"], format='ISO8601').astype(int) // 10**9
    links_df["node_id"] = links_df["node_id"].astype(int)
    links_df["prev_node_id"] = links_df["prev_node_id"].astype(int)
    links_df["id_participant"] = links_df["id_participant"].astype(int)
    if links_df["id_participant"].min() > 0:
        links_df["id_participant"] = links_df["id_participant"] - 1

    links_df["hour_of_day"] = links_df["timestamp"].dt.hour
    links_df["day_of_week"] = links_df["timestamp"].dt.dayofweek

    # reindex nodes to [0..N-1]
    nodes_df, links_df = reindex_node_ids(nodes_df, links_df)

    # filter days with enough participants
    links_df["timestamp"] = pd.to_datetime(links_df["timestamp"], utc=True).dt.tz_convert(params["tz"])
    links_df["day"] = links_df["timestamp"].dt.floor("D")
    counts = links_df.groupby("day")["id_participant"].nunique()
    active_days = counts[counts >= params["min_participants_per_day"]].index
    links_df = links_df[links_df["day"].isin(active_days)].copy()

    return nodes_df, links_df

def normalize_urban(nodes_df: pd.DataFrame, feat_names: List[str]) -> np.ndarray:
    if not feat_names:
        return np.empty((len(nodes_df), 0))
    X = nodes_df[feat_names]
    Xn = (X - X.min()) / (X.max() - X.min())
    # fill rules you used
    fill_values = {"GSI": 0.0, "FSI": 0.0, "OSR": 1.0, "ABH": 0.0, "WMHB": 0.0, "HSTD": 0.0, "GrCR": 0.0, "GD": 1.0}
    for c in feat_names:
        if c in fill_values:
            Xn[c] = Xn[c].fillna(fill_values[c])
    return Xn.values

def make_node_features(nodes_df: pd.DataFrame, params: Dict) -> torch.Tensor:
    coords = nodes_df[["node_x", "node_y"]].values
    cmin, cmax = coords.min(axis=0), coords.max(axis=0)
    denom = np.where((cmax - cmin) == 0, 1, (cmax - cmin))
    coords_norm = (coords - cmin) / denom

    if params["using_urban_features"] and params["include_urban_in_node_features"]:
        uf = normalize_urban(nodes_df, params["urban_features_to_use"])
        arr = np.hstack([coords_norm, uf])
    else:
        #(coords only)
        arr = coords_norm

    return torch.tensor(arr, dtype=torch.float)

def bin_links(links_df: pd.DataFrame, params: Dict):
    df = links_df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True).dt.tz_convert(params["tz"])

    if params["align_to_midnight"]:
        start_dt = df["timestamp"].dt.floor("D").min()
    else:
        start_dt = df["timestamp"].min().floor(f'{params["bin_hours"]}H')

    interval = params["bin_hours"] * 3600
    start_sec = int(start_dt.timestamp())
    df["time_seconds"] = df["merged_datetime"] - start_sec
    df["time_bin"] = (df["time_seconds"] // interval).astype(int)

    # bin context (hour_norm, day_norm) from bin midpoint
    min_bin, max_bin = int(df["time_bin"].min()), int(df["time_bin"].max())
    bin_context = {}
    for t in range(min_bin, max_bin + 1):
        bin_start = start_dt + pd.Timedelta(seconds=interval * t)
        mid = bin_start + pd.Timedelta(seconds=interval // 2)
        hour_norm = (mid.hour + mid.minute/60) / 23.0
        day_norm = mid.dayofweek / 6.0
        bin_context[t] = (hour_norm, day_norm)

    return df, bin_context, range(min_bin, max_bin + 1)

def classify_transport_mode(speed_norm, speed_min, speed_max):
    speed = speed_norm * (speed_max - speed_min) + speed_min
    return 0 if speed <= 7.5 else 1

def build_snapshots(nodes_df, links_df, node_features: torch.Tensor, params: Dict):
    df = links_df.copy()
    df["hour_norm"] = df["hour_of_day"] / 23.0
    df["day_norm"] = df["day_of_week"] / 6.0
    sp_min, sp_max = df["speed"].min(), df["speed"].max()
    df["speed_norm"] = (df["speed"] - sp_min) / (sp_max - sp_min)

    df, bin_context, all_bins = bin_links(df, params)
    num_nodes = nodes_df["node_id"].nunique()

    features, edge_indices, edge_feats, targets = [], [], [], []
    for t in all_bins:
        now = df[df["time_bin"] == t].copy()
        nxt = df[df["time_bin"] == t + 1].copy()

        # node features: [visit_count] + node_features
        visit = now.groupby("node_id").size().reindex(range(num_nodes), fill_value=0).values
        visit_t = torch.tensor(visit, dtype=torch.float).unsqueeze(1)
        x = torch.cat([visit_t, node_features], dim=1)
        features.append(x)

        # edge features this step
        if not now.empty:
            g = now.groupby(["prev_node_id", "node_id"]).agg({
                "hour_norm": "mean",
                "day_norm": "mean",
                "speed_norm": "mean",
                "id_participant": "count"  # flow
            }).rename(columns={"id_participant": "flow"}).reset_index()
            g["transport"] = g["speed_norm"].apply(lambda s: classify_transport_mode(s, sp_min, sp_max))

            src = torch.tensor(g["prev_node_id"].values, dtype=torch.long)
            dst = torch.tensor(g["node_id"].values, dtype=torch.long)
            ei = torch.stack([src, dst], dim=0)
            ef = torch.tensor(g[["hour_norm", "day_norm", "flow", "transport"]].values, dtype=torch.float)
        else:
            ei = torch.empty((2, 0), dtype=torch.long)
            ef = torch.empty((0, 4), dtype=torch.float)

        edge_indices.append(ei)
        edge_feats.append(ef)

        # targets @ t+1
        if nxt.empty:
            node_target = torch.zeros(num_nodes)
            next_ei = torch.empty((2, 0), dtype=torch.long)
            flow_target = torch.empty((0,))
            mode_target = torch.empty((0,), dtype=torch.long)
        else:
            vt = nxt.groupby("node_id").size().reindex(range(num_nodes), fill_value=0).values
            node_target = torch.tensor(vt, dtype=torch.float)

            gn = nxt.groupby(["prev_node_id", "node_id"]).agg({
                "speed_norm": "mean",
                "hour_norm": "mean",
                "day_norm": "mean",
                "id_participant": "count"
            }).rename(columns={"id_participant": "flow"}).reset_index()
            gn["transport"] = gn["speed_norm"].apply(lambda s: classify_transport_mode(s, sp_min, sp_max))

            ns = torch.tensor(gn["prev_node_id"].values, dtype=torch.long)
            nd = torch.tensor(gn["node_id"].values, dtype=torch.long)
            next_ei = torch.stack([ns, nd], dim=0)
            flow_target = torch.tensor(gn["speed_norm"].values, dtype=torch.float)
            mode_target = torch.tensor(gn["transport"].values, dtype=torch.long)

        targets.append((node_target, next_ei, flow_target, mode_target))

    return features, edge_indices, edge_feats, targets, bin_context, all_bins

# Model Definitions

In [9]:
from torch_geometric.nn import NNConv
from torch.nn import GRUCell, Linear, ReLU, Sequential

class EdgeAwareTemporalEncoder(nn.Module):
    def __init__(self, in_channels, edge_feat_dim, hidden_dim):
        super().__init__()
        self.edge_nn = Sequential(
            Linear(edge_feat_dim, 32),
            ReLU(),
            Linear(32, in_channels * hidden_dim)
        )
        self.conv = NNConv(in_channels, hidden_dim, nn=self.edge_nn, aggr='mean')
        self.gru = GRUCell(hidden_dim, hidden_dim)

    def forward(self, x_seq, edge_index_seq, edge_attr_seq):
        h = None
        for x, edge_index, edge_attr in zip(x_seq, edge_index_seq, edge_attr_seq):
            out = self.conv(x, edge_index, edge_attr)
            h = out if h is None else self.gru(out, h)
        return h

class MovementLinkPredictor(nn.Module):
    def __init__(self, in_channels, encoder_edge_feat_dim, time_feat_dim, hidden_dim):
        super().__init__()
        self.encoder = EdgeAwareTemporalEncoder(in_channels, encoder_edge_feat_dim, hidden_dim)
        self.node_decoder = nn.Sequential(Linear(hidden_dim + time_feat_dim, 64), ReLU(), Linear(64, 1))
        self.edge_flow_decoder = nn.Sequential(nn.Linear(2*hidden_dim + time_feat_dim, 64), nn.ReLU(), nn.Linear(64, 1))
        self.transport_decoder = nn.Sequential(nn.Linear(2*hidden_dim + time_feat_dim, 64), nn.ReLU(), nn.Linear(64, 2))

    def forward_encoder(self, x_seq, edge_index_seq, edge_attr_seq):
        return self.encoder(x_seq, edge_index_seq, edge_attr_seq)

    def predict_node_counts(self, z, time_feat):
        return self.node_decoder(torch.cat([z, time_feat], dim=1)).squeeze(1)

    def predict_edge_flow(self, z, edge_index, time_feat):
        src = z[edge_index[0]]
        dst = z[edge_index[1]]
        return self.edge_flow_decoder(torch.cat([src, dst, time_feat], dim=1)).squeeze(1)

    def predict_edge_mode(self, z, edge_index, time_feat):
        src = z[edge_index[0]]
        dst = z[edge_index[1]]
        return self.transport_decoder(torch.cat([src, dst, time_feat], dim=1))

In [10]:
def assign_time_bins_by_day_cadence(df: pd.DataFrame, window_size: int):
    """
    Keep your cadence idea but short:
    - Tag days as train/val/test in repeating blocks (8T/4V/6T/2Te).
    - Remove target bins whose input window crosses groups.
    """
    d = df.copy()
    min_day = d["day"].min()
    d["day_index"] = (d["day"] - min_day).dt.days
    unique_days = sorted(d["day_index"].unique())

    pattern = (["train"]*8) + (["val"]*4) + (["train"]*6) + (["test"]*2)
    L = len(pattern)

    day_to_group = {}
    for i, day in enumerate(unique_days):
        day_to_group[day] = pattern[i % L]

    d["group"] = d["day_index"].map(day_to_group)
    bin_groups = d.groupby("time_bin")["group"].agg(lambda x: x.value_counts().idxmax())

    valid_bins = []
    for t in bin_groups.index:
        g = bin_groups[t]
        ok = True
        for b in range(t - window_size, t):
            if b not in bin_groups or bin_groups[b] != g:
                ok = False
                break
        if ok:
            valid_bins.append(t)

    train_bins = [t for t in valid_bins if bin_groups[t] == "train"]
    val_bins   = [t for t in valid_bins if bin_groups[t] == "val"]
    test_bins  = [t for t in valid_bins if bin_groups[t] == "test"]
    return train_bins, val_bins, test_bins

# Model training definitions

In [11]:
# ------------------------------ SHARED FORWARD STEP ----------------------------------
def forward_step(model, t, links_df, features, edge_indices, edge_feats, targets, bin_context, params):
    # Window slices
    ws = params["window_size"]
    x_seq = features[t - ws:t]
    ei_seq = edge_indices[t - ws:t]
    ef_seq = edge_feats[t - ws:t]
    if all(ei.size(1) == 0 for ei in ei_seq):
        return None  # skip

    z = model.forward_encoder(x_seq, ei_seq, ef_seq)

    hour, day = bin_context[t]
    time_feat = torch.tensor([[hour, day]], dtype=torch.float)
    num_nodes = features[0].shape[0]
    node_time_feat = time_feat.repeat(num_nodes, 1)

    node_target, next_ei, flow_target, mode_target = targets[t - 1]
    node_pred = model.predict_node_counts(z, node_time_feat)

    # Presence (your current training disables it)
    if params["use_presence_loss"]:
        # linear map 0..inf → clamp 0..1
        p_active = torch.clamp(node_pred, 0.0, 1.0)
        is_active = (node_target >= 1).float()
        presence_loss = F.binary_cross_entropy(p_active, is_active)
    else:
        presence_loss = torch.tensor(0.0)

    mse_loss = F.mse_loss(node_pred, node_target)

    # Candidate edges
    threshold = params["presence_threshold"]
    active_nodes = (node_pred >= threshold).nonzero(as_tuple=False).flatten()
    edge_loss = torch.tensor(0.0)
    mode_loss = torch.tensor(0.0)

    if active_nodes.numel() > 0:
        df_prev = links_df[links_df["time_bin"] == t - 1]
        last_pos = df_prev.sort_values("merged_datetime").groupby("id_participant").tail(1)
        src_nodes = torch.tensor(last_pos["node_id"].unique(), dtype=torch.long)

        # source→active + active→active
        from itertools import product
        pairs = list(set(product(src_nodes.tolist(), active_nodes.tolist())) |
                     set(product(active_nodes.tolist(), active_nodes.tolist())))

        if len(pairs) > 0:
            cand = torch.tensor(pairs, dtype=torch.long).T  # [2, E]
            # true lookup
            true_ei = next_ei
            true_flow = flow_target
            true_mode = mode_target
            true_map = {tuple(e): i for i, e in enumerate(true_ei.T.tolist())}

            flow_targets = torch.tensor(
                [true_flow[true_map[(int(s), int(d))]] if (int(s), int(d)) in true_map else 0.0
                 for s, d in cand.T], dtype=torch.float
            )
            edge_time_feat = time_feat.repeat(cand.shape[1], 1)
            pred_flow = model.predict_edge_flow(z, cand, edge_time_feat)

            # weight positives higher
            weights = torch.where(flow_targets > 0, torch.tensor(10.0), torch.tensor(1.0))
            edge_loss = F.mse_loss(pred_flow, flow_targets, reduction='none')
            edge_loss = (edge_loss * weights).mean()

            if params["use_mode_loss"]:
                exists = torch.tensor([(int(s), int(d)) in true_map for s, d in cand.T], dtype=torch.bool)
                if exists.any():
                    valid = cand[:, exists]
                    m_targets = torch.tensor([true_mode[true_map[(int(s), int(d))]] for s, d in valid.T],
                                             dtype=torch.long)
                    m_logits = model.predict_edge_mode(z, valid, time_feat.repeat(valid.shape[1], 1))
                    mode_loss = F.cross_entropy(m_logits, m_targets)
                else:
                    mode_loss = torch.tensor(0.0)

    # Normalize pieces (you used all 1.0 by default)
    mm = params["manual_max_losses"]
    total = (mse_loss/mm["node_count"]) + (presence_loss/mm["node_presence"]) + (edge_loss/mm["edge_flow"])
    if params["use_mode_loss"]:
        total = total + (mode_loss/mm["transport_mode"])

    return total, mse_loss, presence_loss, edge_loss, mode_loss

In [12]:
# ------------------------------ TRAIN / VAL LOOPS ------------------------------------
def compute_val(model, params, links_df, features, edge_indices, edge_feats, targets, bin_context, bins):
    model.eval()
    with torch.no_grad():
        s = 0
        acc = np.zeros(5, dtype=float)
        for t in bins:
            out = forward_step(model, t, links_df, features, edge_indices, edge_feats, targets, bin_context, params)
            if out is None:
                continue
            vals = [float(x.item()) for x in out]
            acc += np.array(vals)
            s += 1
        if s == 0:
            return [float('nan')]*5
        return (acc / s).tolist()

def train_one_run(params: Dict):
    set_seed(42)

    # 1) Data
    nodes_df, links_df = prepare_inputs(params)
    node_features = make_node_features(nodes_df, params)
    features, edge_indices, edge_feats, targets, bin_context, all_bins = build_snapshots(
        nodes_df, links_df, node_features, params
    )

    # Splits
    train_bins, val_bins, test_bins = assign_time_bins_by_day_cadence(links_df, params["window_size"])

    # 2) Model
    model = MovementLinkPredictor(
        in_channels=features[0].shape[1],
        encoder_edge_feat_dim=params["encoder_edge_feat_dim"],
        time_feat_dim=params["time_feat_dim"],
        hidden_dim=params["embedding_dim"]
    )
    opt = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])
    sch = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=5, threshold=0.0005, verbose=True)

    # 3) Train
    train_hist = {"total": [], "mse": [], "presence": [], "edge": [], "mode": []}
    val_hist   = {"total": [], "mse": [], "presence": [], "edge": [], "mode": []}

    for epoch in range(params["num_epochs"]):
        model.train()
        s = 0
        acc = np.zeros(5, dtype=float)
        for t in train_bins:
            out = forward_step(model, t, links_df, features, edge_indices, edge_feats, targets, bin_context, params)
            if out is None:
                continue
            total_loss, mse_loss, presence_loss, edge_loss, mode_loss = out
            opt.zero_grad()
            total_loss.backward()
            opt.step()

            vals = [float(x.item()) for x in [total_loss, mse_loss, presence_loss, edge_loss, mode_loss]]
            acc += np.array(vals)
            s += 1

        train_vals = (acc / s).tolist() if s else [float('nan')]*5
        for k, v in zip(train_hist.keys(), train_vals):
            train_hist[k].append(v)

        # validation with SAME computation
        v_tot, v_mse, v_pres, v_edge, v_mode = compute_val(
            model, params, links_df, features, edge_indices, edge_feats, targets, bin_context, val_bins
        )
        for k, v in zip(val_hist.keys(), [v_tot, v_mse, v_pres, v_edge, v_mode]):
            val_hist[k].append(v)

        sch.step(train_hist["total"][-1])
        print(f"Epoch {epoch+1}/{params['num_epochs']} | Train {train_hist['total'][-1]:.4f} | Val {val_hist['total'][-1]:.4f}")

    return {
        "model": model,
        "features": features,
        "edge_indices": edge_indices,
        "edge_feats": edge_feats,
        "targets": targets,
        "bin_context": bin_context,
        "splits": (train_bins, val_bins, test_bins),
        "train_hist": train_hist,
        "val_hist": val_hist,
        "links_df": links_df,
    }

In [13]:
# ------------------------------ MULTI-RUN SWEEP --------------------------------------
def sweep(param_list: List[Dict]):
    results = []
    for i, p in enumerate(param_list, 1):
        print("\n" + "="*20 + f" RUN {i} " + "="*20)
        res = train_one_run(p)
        results.append((p, res))
    return results

# Running the model

In [14]:
if __name__ == "__main__":
    # Single run:
    result = train_one_run(PARAMS)

    # Or: multiple runs with small tweaks
    # runs = [
    #     {**PARAMS, "bin_hours": 6, "embedding_dim": 64, "use_presence_loss": False, "use_mode_loss": False},
    #     {**PARAMS, "bin_hours": 3, "embedding_dim": 128, "use_presence_loss": False, "use_mode_loss": False},
    #     {**PARAMS, "include_urban_in_node_features": True},  # try adding urban features into nodes
    # ]
    # all_results = sweep(runs)

KeyError: 'time_bin'