In [1]:
from pathlib import Path
import os
import sys
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
import traci

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

PROJECT_DIR = Path.cwd()

NETWORK_FILE  = PROJECT_DIR / "my_network.net.xml"
ROUTE_FILE    = PROJECT_DIR / "my_routes.rou.xml"
CONFIG_FILE   = PROJECT_DIR / "my_config.sumocfg"

print("Working directory:", PROJECT_DIR)
print("Network exists? ", NETWORK_FILE.exists())
print("Routes exist?  ", ROUTE_FILE.exists())
print("Config exists? ", CONFIG_FILE.exists())

SUMO_HOME = os.environ.get("SUMO_HOME")
if SUMO_HOME is None:
    raise EnvironmentError("SUMO_HOME is not set.")

print("SUMO_HOME:", SUMO_HOME)


def get_sumo_binary(gui: bool = False) -> str:
    base_name = "sumo-gui" if gui else "sumo"

    cmd = shutil.which(base_name)
    if cmd is not None:
        return cmd

    bin_dir = Path(SUMO_HOME) / "bin"
    if sys.platform.startswith("win"):
        candidate = bin_dir / f"{base_name}.exe"
    else:
        candidate = bin_dir / base_name

    if not candidate.exists():
        raise FileNotFoundError(f"{base_name} not found at {candidate}")

    return str(candidate)


Working directory: C:\Users\manda\OneDrive\Documents\AI Traffic - Jupyter
Network exists?  True
Routes exist?   True
Config exists?  True
SUMO_HOME: C:\Program Files (x86)\Eclipse\Sumo\


In [2]:
# --- Discover TLS IDs ---

if traci.isLoaded():
    traci.close()

sumo_bin = get_sumo_binary(gui=False)
cmd = [sumo_bin, "-c", str(CONFIG_FILE), "--step-length", "1"]
traci.start(cmd)

tls_ids = traci.trafficlight.getIDList()
print("Total TLS:", len(tls_ids))
print(tls_ids)

traci.close()

# --- Build tls_lane_map ---

if traci.isLoaded():
    traci.close()

traci.start([sumo_bin, "-c", str(CONFIG_FILE), "--step-length", "1"])

tls_lane_map = {}
for tls in tls_ids:
    lanes = traci.trafficlight.getControlledLanes(tls)
    lanes = list(dict.fromkeys(lanes))
    tls_lane_map[tls] = lanes

print("\nTLS → lanes:")
for tls, lanes in tls_lane_map.items():
    print(tls, ":", lanes)

traci.close()


Total TLS: 7
('1234828897', '1757353212', '1757353214', '1783045940', '1783045985', '1843356909', '7671039164')

TLS → lanes:
1234828897 : ['107445428#1_0', '-173540663#0_0', '-47195458#0_0']
1757353212 : ['1173473098#2_0', '107445426#1_0', '173540663#2_0']
1757353214 : ['800859756#1_0', '800859756#1_1']
1783045940 : ['-821520600#3_0', '315129223#3_0', '315129223#3_1', '821520600#2_0']
1783045985 : ['-821520600#2_0', '821520600#1_0', '324489280#3_0']
1843356909 : ['223051913#1_0', '223051913#1_1']
7671039164 : ['164073716#1_0', '164073716#1_1', '164073716#1_2']


In [3]:
def get_tls_state(tls_id: str, lane_map: dict) -> list:
    lane_ids = lane_map[tls_id]
    queue_lengths = []
    waiting_times = []

    for lane in lane_ids:
        q = traci.lane.getLastStepHaltingNumber(lane)
        w = traci.lane.getWaitingTime(lane)
        queue_lengths.append(q)
        waiting_times.append(w)

    current_phase = traci.trafficlight.getPhase(tls_id)
    return queue_lengths + waiting_times + [current_phase]


# Compute feature_size
if traci.isLoaded():
    traci.close()

traci.start([sumo_bin, "-c", str(CONFIG_FILE), "--step-length", "1"])

for _ in range(5):
    traci.simulationStep()

lengths = []
for tls in tls_ids:
    s = get_tls_state(tls, tls_lane_map)
    lengths.append(len(s))

feature_size = max(lengths)
num_nodes = len(tls_ids)

print("State lengths per TLS:", lengths)
print("feature_size:", feature_size)
print("num_nodes:", num_nodes)

traci.close()


State lengths per TLS: [7, 7, 5, 9, 7, 5, 7]
feature_size: 9
num_nodes: 7


In [4]:
from collections import defaultdict

if traci.isLoaded():
    traci.close()

traci.start([sumo_bin, "-c", str(CONFIG_FILE), "--step-length", "1"])

tls_adj = {tls: set() for tls in tls_ids}

for tls in tls_ids:
    controlled_links = traci.trafficlight.getControlledLinks(tls)
    for link_group in controlled_links:
        for (incoming, outgoing, _) in link_group:
            for other_tls in tls_ids:
                if other_tls == tls:
                    continue
                if outgoing in tls_lane_map.get(other_tls, []):
                    tls_adj[tls].add(other_tls)
                    tls_adj[other_tls].add(tls)

traci.close()

edge_count = sum(len(neigh) for neigh in tls_adj.values())
if edge_count == 0 and len(tls_ids) > 1:
    print("No adjacency found; using simple chain.")
    ordered = list(tls_ids)
    for i in range(len(ordered) - 1):
        a, b = ordered[i], ordered[i+1]
        tls_adj[a].add(b)
        tls_adj[b].add(a)

print("\nAdjacency list:")
for tls, neigh in tls_adj.items():
    print(tls, ":", sorted(list(neigh)))

# adjacency matrix
tls_index = {tls_id: i for i, tls_id in enumerate(tls_ids)}
adj_matrix = np.zeros((num_nodes, num_nodes), dtype=np.float32)

for tls, neigh in tls_adj.items():
    i = tls_index[tls]
    for nb in neigh:
        j = tls_index[nb]
        adj_matrix[i, j] = 1.0
        adj_matrix[j, i] = 1.0

print("\nadj_matrix shape:", adj_matrix.shape)
print("adj_matrix[0]:", adj_matrix[0])



Adjacency list:
1234828897 : []
1757353212 : []
1757353214 : []
1783045940 : ['1783045985']
1783045985 : ['1783045940']
1843356909 : []
7671039164 : []

adj_matrix shape: (7, 7)
adj_matrix[0]: [0. 0. 0. 0. 0. 0. 0.]


In [5]:
def compute_global_reward(tls_lane_map: dict) -> float:
    total_wait = 0.0
    for tls, lanes in tls_lane_map.items():
        for lane in lanes:
            total_wait += traci.lane.getWaitingTime(lane)
    return -total_wait / 1000.0


In [6]:
class GNNActorCritic(tf.keras.Model):
    def __init__(self, hidden_dim: int, num_actions: int):
        super().__init__()
        self.state_embed = tf.keras.layers.Dense(hidden_dim, activation="relu")
        self.post_gnn   = tf.keras.layers.Dense(hidden_dim, activation="relu")
        self.policy_head = tf.keras.layers.Dense(num_actions)
        self.value_head  = tf.keras.layers.Dense(1)

    def call(self, inputs, training=False):
        x, adj = inputs        # x: (B, N, F), adj: (B, N, N)

        h = self.state_embed(x)      # (B, N, H)
        h_neigh = tf.matmul(adj, h)  # (B, N, H)

        h_cat = tf.concat([h, h_neigh], axis=-1)  # (B, N, 2H)
        h_out = self.post_gnn(h_cat)              # (B, N, H)

        policy_logits = self.policy_head(h_out)   # (B, N, A)

        graph_embed = tf.reduce_mean(h_out, axis=1)  # (B, H)
        value = self.value_head(graph_embed)         # (B, 1)

        return policy_logits, value


hidden_dim = 64
num_actions = 2   # 0 = keep phase, 1 = switch

gnn_model = GNNActorCritic(hidden_dim, num_actions)

adj_batch_tf = tf.convert_to_tensor(adj_matrix[None, ...], dtype=tf.float32)

# Build model with a dummy input
dummy_states = tf.random.uniform((1, num_nodes, feature_size), dtype=tf.float32)
logits_dummy, value_dummy = gnn_model((dummy_states, adj_batch_tf))

print("Policy logits shape:", logits_dummy.shape)  # (1, N, 2)
print("Value shape:", value_dummy.shape)           # (1, 1)


Policy logits shape: (1, 7, 2)
Value shape: (1, 1)


In [7]:
def select_actions_from_logits(policy_logits: tf.Tensor) -> np.ndarray:
    """
    Greedy action selection: argmax over actions per node.
    policy_logits: (N, num_actions)
    """
    if isinstance(policy_logits, tf.Tensor):
        policy_logits = policy_logits.numpy()
    return np.argmax(policy_logits, axis=-1)  # (N,)


def apply_actions_to_sumo(actions: np.ndarray, tls_ids_list):
    """
    actions[i] in {0,1} for TLS tls_ids_list[i]
    0 = keep phase, 1 = switch to next phase
    """
    for idx, tls in enumerate(tls_ids_list):
        a = int(actions[idx])
        if a == 0:
            continue
        elif a == 1:
            curr_phase = traci.trafficlight.getPhase(tls)
            logic = traci.trafficlight.getCompleteRedYellowGreenDefinition(tls)[0]
            num_phases = len(logic.phases)
            next_phase = (curr_phase + 1) % num_phases
            traci.trafficlight.setPhase(tls, next_phase)


In [8]:
if traci.isLoaded():
    traci.close()

traci.start([get_sumo_binary(False), "-c", str(CONFIG_FILE), "--step-length", "1"])
traci.simulationStep()

# build padded states
all_states = []
for tls in tls_ids:
    s = get_tls_state(tls, tls_lane_map)
    s_padded = s + [0] * (feature_size - len(s))
    all_states.append(s_padded)

states_np = np.array(all_states, dtype=np.float32)[None, ...]  # (1, N, F)
states_tf = tf.convert_to_tensor(states_np, dtype=tf.float32)

policy_logits_tf, value_tf = gnn_model((states_tf, adj_batch_tf))
policy_logits = policy_logits_tf[0]  # (N, 2)

actions = select_actions_from_logits(policy_logits)
print("Actions:", actions)

apply_actions_to_sumo(actions, tls_ids)

traci.close()
print("One-step action test done.")


Actions: [0 0 0 0 0 0 0]
One-step action test done.


In [11]:
def run_one_episode(max_steps=3600) -> float:
    """
    Run one full SUMO episode with the current (untrained) policy.
    Returns total episode reward.
    """
    if traci.isLoaded():
        traci.close()

    traci.start([get_sumo_binary(False), "-c", str(CONFIG_FILE), "--step-length", "1"])

    episode_return = 0.0

    for t in range(max_steps):
        # build states
        all_states = []
        for tls in tls_ids:
            s = get_tls_state(tls, tls_lane_map)
            s_padded = s + [0] * (feature_size - len(s))
            all_states.append(s_padded)

        states_np = np.array(all_states, dtype=np.float32)[None, ...]
        states_tf = tf.convert_to_tensor(states_np, dtype=tf.float32)

        policy_logits_tf, value_tf = gnn_model((states_tf, adj_batch_tf), training=False)
        policy_logits = policy_logits_tf[0]

        actions = select_actions_from_logits(policy_logits)
        apply_actions_to_sumo(actions, tls_ids)

        traci.simulationStep()

        r = compute_global_reward(tls_lane_map)
        episode_return += r

        if (t + 1) % 300 == 0:
            print(f"Step {t+1}/{max_steps}, reward: {r:.3f}")

    traci.close()
    print("Episode finished. Total return:", episode_return)
    return episode_return


# Test with a shorter episode first, e.g. 600 steps
test_return = run_one_episode(max_steps=3600)
print("Test episode return (untrained):", test_return)


  logic = traci.trafficlight.getCompleteRedYellowGreenDefinition(tls)[0]


Step 300/3600, reward: -0.005
Step 600/3600, reward: -0.001
Step 900/3600, reward: -0.000
Step 1200/3600, reward: -0.005
Step 1500/3600, reward: -0.000
Step 1800/3600, reward: -0.003
Step 2100/3600, reward: -0.002
Step 2400/3600, reward: -0.000
Step 2700/3600, reward: -0.000
Step 3000/3600, reward: -0.000
Step 3300/3600, reward: -0.000
Step 3600/3600, reward: -0.000
Episode finished. Total return: -8.543999999999977
Test episode return (untrained): -8.543999999999977


In [14]:
# === A2C Hyperparameters ===
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)  # small LR for stability
gamma = 0.99                                              # discount factor
entropy_coef = 1e-3                                       # exploration bonus


In [27]:
def train_one_episode(max_steps=900) -> float:
    """
    Run one training episode using online A2C updates.
    Returns total episode return (sum of rewards).
    """

    if traci.isLoaded():
        traci.close()

    traci.start([get_sumo_binary(False), "-c", str(CONFIG_FILE), "--step-length", "1"])

    episode_return = 0.0

    for t in range(max_steps):
        # ---- 1) Build current state batch: (1, N, F) ----
        all_states = []
        for tls in tls_ids:
            s = get_tls_state(tls, tls_lane_map)
            s_padded = s + [0] * (feature_size - len(s))
            all_states.append(s_padded)

        states_np = np.array(all_states, dtype=np.float32)[None, ...]
        states_tf = tf.convert_to_tensor(states_np, dtype=tf.float32)

        with tf.GradientTape() as tape:
            # ---- 2) Forward pass ----
            policy_logits_tf, value_tf = gnn_model((states_tf, adj_batch_tf), training=True)
            policy_logits = policy_logits_tf[0]     # (N, num_actions)

            # Probabilities
            action_probs = tf.nn.softmax(policy_logits, axis=-1)     # (N, num_actions)

            # ---- 3) Sample actions (stochastic) ----
            actions_tf = tf.random.categorical(tf.math.log(action_probs), num_samples=1)
            actions_tf = tf.squeeze(actions_tf, axis=-1)             # (N,)
            actions_np = actions_tf.numpy()

            # Apply actions in SUMO
            apply_actions_to_sumo(actions_np, tls_ids)

            # ---- 4) Step SUMO ----
            traci.simulationStep()

            # ---- 5) Reward ----
            r = compute_global_reward(tls_lane_map)   # negative, scaled
            episode_return += r

            # ---- 6) Next state for TD target ----
            next_states_list = []
            for tls in tls_ids:
                s_next = get_tls_state(tls, tls_lane_map)
                s_next_padded = s_next + [0] * (feature_size - len(s_next))
                next_states_list.append(s_next_padded)

            next_states_np = np.array(next_states_list, dtype=np.float32)[None, ...]
            next_states_tf = tf.convert_to_tensor(next_states_np, dtype=tf.float32)

            _, next_value_tf = gnn_model((next_states_tf, adj_batch_tf), training=False)

            # v and v_next are scalars (shape (1,1))
            v = tf.squeeze(value_tf)
            v_next = tf.squeeze(next_value_tf)

            # ---- 7) TD target & advantage ----
            v_next_detached = tf.stop_gradient(v_next)
            td_target = r + gamma * v_next_detached
            advantage = td_target - v

            # ---- 8) Losses ----
            log_probs = tf.nn.log_softmax(policy_logits, axis=-1)    # (N, num_actions)

            idx = tf.stack([
                tf.range(tf.shape(actions_tf)[0], dtype=tf.int32),
                tf.cast(actions_tf, tf.int32)
            ], axis=1)                                              # (N, 2)

            chosen_log_probs = tf.gather_nd(log_probs, idx)         # (N,)

            actor_loss = -tf.reduce_mean(chosen_log_probs * advantage)
            critic_loss = tf.reduce_mean(tf.square(td_target - v))

            entropy = -tf.reduce_mean(action_probs * tf.math.log(action_probs + 1e-8))

            loss = actor_loss + 0.5 * critic_loss - entropy_coef * entropy

        # ---- 9) Backprop ----
        grads = tape.gradient(loss, gnn_model.trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, 0.5)
        optimizer.apply_gradients(zip(grads, gnn_model.trainable_variables))
        if (t + 1) % 300 == 0:
            print(
                f"[Train] Step {t+1}/{max_steps}, "
                f"reward: {r:.4f}, "
                f"loss: {loss.numpy():.4f}"
            )

    traci.close()
    print("Training episode finished. Total return:", episode_return)
    return episode_return


In [32]:
train_returns = []
num_train_episodes = 50  # you can increase later

for ep in range(num_train_episodes):
    print(f"\n=== TRAINING EPISODE {ep+1}/{num_train_episodes} ===")
    ep_ret = train_one_episode(max_steps=900)
    train_returns.append(ep_ret)
    print(f"Episode {ep+1} return: {ep_ret:.4f}")

print("\nAll training returns:", train_returns)



=== TRAINING EPISODE 1/50 ===


  logic = traci.trafficlight.getCompleteRedYellowGreenDefinition(tls)[0]


[Train] Step 300/900, reward: -0.0060, loss: 0.4599
[Train] Step 600/900, reward: -0.0120, loss: 1.7620
[Train] Step 900/900, reward: -0.0000, loss: 0.2225
Training episode finished. Total return: -2.2239999999999815
Episode 1 return: -2.2240

=== TRAINING EPISODE 2/50 ===
[Train] Step 300/900, reward: -0.0010, loss: 0.5023
[Train] Step 600/900, reward: -0.0000, loss: 2.6128
[Train] Step 900/900, reward: -0.0050, loss: 0.6078
Training episode finished. Total return: -2.2719999999999856
Episode 2 return: -2.2720

=== TRAINING EPISODE 3/50 ===
[Train] Step 300/900, reward: -0.0010, loss: 0.0402
[Train] Step 600/900, reward: -0.0000, loss: 2.5701
[Train] Step 900/900, reward: -0.0050, loss: -0.1584
Training episode finished. Total return: -6.783999999999999
Episode 3 return: -6.7840

=== TRAINING EPISODE 4/50 ===
[Train] Step 300/900, reward: -0.0000, loss: 1.8683
[Train] Step 600/900, reward: -0.0000, loss: 0.4698
[Train] Step 900/900, reward: -0.0000, loss: -0.1252
Training episode fini

In [33]:
# make sure run_one_episode uses select_actions_from_logits (argmax)
test_return_trained = run_one_episode(max_steps=3600)
print("Trained policy episode return:", test_return_trained)

  logic = traci.trafficlight.getCompleteRedYellowGreenDefinition(tls)[0]


Step 300/3600, reward: -0.000
Step 600/3600, reward: -0.015
Step 900/3600, reward: -0.006
Step 1200/3600, reward: -0.007
Step 1500/3600, reward: -0.000
Step 1800/3600, reward: -0.001
Step 2100/3600, reward: -0.000
Step 2400/3600, reward: -0.005
Step 2700/3600, reward: -0.001
Step 3000/3600, reward: -0.000
Step 3300/3600, reward: -0.001
Step 3600/3600, reward: -0.005
Episode finished. Total return: -24.461000000000077
Trained policy episode return: -24.461000000000077


In [34]:
best_return = -1e9  # very small

for ep in range(10):
    print(f"\n=== TRAINING EPISODE {ep+1}/10 ===")
    ep_ret = train_one_episode(max_steps=900)
    train_returns.append(ep_ret)
    print(f"Episode {ep+1} return: {ep_ret:.4f}")

    if ep_ret > best_return:
        best_return = ep_ret
        gnn_model.save_weights("gnn_a2c_best.weights.h5")
        print(f"--> New best model saved with return {best_return:.4f}")


=== TRAINING EPISODE 1/10 ===


  logic = traci.trafficlight.getCompleteRedYellowGreenDefinition(tls)[0]


[Train] Step 300/900, reward: -0.0000, loss: 1.8929
[Train] Step 600/900, reward: -0.0000, loss: 2.6095
[Train] Step 900/900, reward: -0.0810, loss: 0.5834
Training episode finished. Total return: -3.815999999999996
Episode 1 return: -3.8160
--> New best model saved with return -3.8160

=== TRAINING EPISODE 2/10 ===
[Train] Step 300/900, reward: -0.1200, loss: 7.2763
[Train] Step 600/900, reward: -0.0000, loss: 0.5784
[Train] Step 900/900, reward: -0.0000, loss: 0.0070
Training episode finished. Total return: -20.997000000000067
Episode 2 return: -20.9970

=== TRAINING EPISODE 3/10 ===
[Train] Step 300/900, reward: -0.0010, loss: 6.3848
[Train] Step 600/900, reward: -0.0010, loss: 12.7989
[Train] Step 900/900, reward: -0.0060, loss: 5.4263
Training episode finished. Total return: -5.343999999999981
Episode 3 return: -5.3440

=== TRAINING EPISODE 4/10 ===
[Train] Step 300/900, reward: -0.0020, loss: 2.2984
[Train] Step 600/900, reward: -0.0170, loss: 4.0526
[Train] Step 900/900, reward:

In [5]:
# ================================ FINAL PPO TRAINING CODE ================================
# WORKS WITH YOUR GNN MODEL, TLS SETUP, adj_batch_tf, tls_ids, tls_lane_map, CONFIG_FILE

import numpy as np
import tensorflow as tf
import traci


# ---------------- PPO Hyperparameters ----------------
GAMMA = 0.99
LAMBDA = 0.95
CLIP_EPS = 0.2
LR = 1e-4
EPOCHS = 4
ENTROPY_COEFF = 0.01
VALUE_COEFF = 0.5

optimizer = tf.keras.optimizers.Adam(LR)


# ===================================================================
# utility: convert multi-TLS actions → 1 flattened PPO action
# ===================================================================
def flatten_action(action_vec, n_actions):
    """Convert TLS action vector like [0,1,0,1] → a single int."""
    code = 0
    base = 1
    for a in reversed(action_vec):
        code += a * base
        base *= n_actions
    return code


def unflatten_action(code, num_tls, n_actions):
    """Convert flattened int → TLS vector like [0,1,0,1]."""
    out = []
    for _ in range(num_tls):
        out.append(code % n_actions)
        code //= n_actions
    return out[::-1]


# ===================================================================
# reward function (stable)
# ===================================================================
def compute_reward(tls_lane_map):
    total_wait = sum(traci.lane.getWaitingTime(l) 
                     for tls in tls_lane_map for l in tls_lane_map[tls])
    return -np.tanh(total_wait / 1000)


# ===================================================================
# compute GAE advantages + returns
# ===================================================================
def compute_advantages(rewards, values):
    advantages = []
    gae = 0.0
    values = values + [0.0]  # bootstrap

    for t in reversed(range(len(rewards))):
        delta = rewards[t] + GAMMA * values[t+1] - values[t]
        gae = delta + GAMMA * LAMBDA * gae
        advantages.append(gae)

    advantages.reverse()
    returns = [advantages[i] + values[i] for i in range(len(rewards))]
    return np.array(advantages, dtype=np.float32), np.array(returns, dtype=np.float32)


# ===================================================================
# PPO update function
# ===================================================================
def ppo_update(states, actions, old_log_probs, returns, advantages):
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)
    old_log_probs = tf.convert_to_tensor(old_log_probs, dtype=tf.float32)
    returns = tf.convert_to_tensor(returns, dtype=tf.float32)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)

    dataset_size = states.shape[0]

    for _ in range(EPOCHS):
        with tf.GradientTape() as tape:
            adj_batch = tf.repeat(adj_batch_tf, repeats=dataset_size, axis=0)

            logits, values = gnn_model((states, adj_batch), training=True)
            values = tf.squeeze(values, axis=1)

            # flatten logits → categorical distribution over joint action space
            flat_logits = tf.reshape(logits, (dataset_size, -1))
            probs = tf.nn.softmax(flat_logits)

            # pick chosen action log-prob
            indices = tf.stack([tf.range(dataset_size), actions], axis=1)
            new_log_probs = tf.math.log(tf.gather_nd(probs, indices) + 1e-10)

            ratio = tf.exp(new_log_probs - old_log_probs)

            unclipped = ratio * advantages
            clipped = tf.clip_by_value(ratio, 1-CLIP_EPS, 1+CLIP_EPS) * advantages
            policy_loss = -tf.reduce_mean(tf.minimum(unclipped, clipped))

            value_loss = VALUE_COEFF * tf.reduce_mean((returns - values)**2)

            entropy = -tf.reduce_mean(tf.reduce_sum(probs * tf.math.log(probs + 1e-10), axis=1))

            loss = policy_loss + value_loss - ENTROPY_COEFF * entropy

        grads = tape.gradient(loss, gnn_model.trainable_variables)
        optimizer.apply_gradients(zip(grads, gnn_model.trainable_variables))


# ===================================================================
# PPO TRAINING EPISODE
# ===================================================================
def ppo_train_episode(max_steps=900):

    if traci.isLoaded():
        traci.close()
    traci.start(["sumo", "-c", CONFIG_FILE, "--step-length", "1"])

    states, actions, rewards, values, log_probs = [], [], [], [], []

    num_tls = len(tls_ids)
    n_actions = 2   # your model uses 2 phases per TLS

    total_return = 0

    for t in range(max_steps):

        # ---------------- build state ----------------
        vec = []
        for tls in tls_ids:
            s = []
            for lane in tls_lane_map[tls]:
                s.append(traci.lane.getLastStepVehicleNumber(lane))
                s.append(traci.lane.getWaitingTime(lane))
            s += [0] * (feature_size - len(s))
            vec.append(s)

        state_np = np.array(vec, dtype=np.float32)[None, :]
        state_tf = tf.convert_to_tensor(state_np, dtype=tf.float32)

        # ---------------- model prediction ----------------
        logits_tf, value_tf = gnn_model((state_tf, adj_batch_tf), training=False)
        logits = logits_tf[0].numpy()
        value = value_tf.numpy()[0,0]

        flat_logits = logits.reshape(-1)
        probs = tf.nn.softmax(flat_logits).numpy()

        # ---------------- sample ACTION ----------------
        flat_action = np.random.choice(len(probs), p=probs)
        action_vec = unflatten_action(flat_action, num_tls, n_actions)
        log_prob = np.log(probs[flat_action] + 1e-10)

        # ---------------- apply action ----------------
        for i, tls in enumerate(tls_ids):
            traci.trafficlight.setPhase(tls, int(action_vec[i]))

        traci.simulationStep()

        r = compute_reward(tls_lane_map)
        total_return += r

        states.append(state_np[0])
        actions.append(flat_action)
        rewards.append(r)
        values.append(value)
        log_probs.append(log_prob)

        if (t+1) % 300 == 0:
            print(f"[PPO] Step {t+1}/{max_steps}, reward={r:.3f}")

    traci.close()

    # ---------------- prepare PPO buffers ----------------
    advantages, returns = compute_advantages(rewards, values)
    states = np.array(states, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    log_probs = np.array(log_probs, dtype=np.float32)

    # ---------------- PPO UPDATE ----------------
    ppo_update(states, actions, log_probs, returns, advantages)

    print("PPO episode return:", total_return)
    return total_return


# ===================================================================
# TRAIN PPO FOR N EPISODES
# ===================================================================
def train_ppo(num_episodes=50):
    returns = []
    for ep in range(num_episodes):
        print(f"\n========== PPO TRAINING EPISODE {ep+1}/{num_episodes} ==========")
        ret = ppo_train_episode(max_steps=900)
        returns.append(ret)
        print(f"Episode {ep+1} return = {ret:.3f}")
    return returns


In [6]:
train_ppo(num_episodes=50)





NameError: name 'CONFIG_FILE' is not defined