In [None]:
#Import all necessary libraries
import os
import gym
from gym import spaces
import sumo_rl  # Ensure SUMO Gym integration is available
import numpy as np
import traci
import time
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


In [None]:
# Check whether SUMO was properly installed
if 'SUMO_HOME' not in os.environ:
    print("Entered")
    os.environ['SUMO_HOME'] = "/path/to/sumo"

In [None]:
#This dictionary maps each traffic light intersection to the id's (From Netedit in SUMO) of the incoming lanes
lanes = {
    "261199942" : ["435326187#0", "-377882482#1", "-435677953#2", "-435326192#1"], 
    "261174045" : ["435677953#0", "1303785156#0", "-435677955#1", "-57782117#3"]  
}

#This dictionary is used to compute the differences in metric values between the current and previous observations by storing the previous observation's values
previousMetrics = {"435326187#0": [], "-377882482#1": [], "-435677953#2": [], "-435326192#1": [], "435677953#0": [], "1303785156#0": [], "-435677955#1": [], "-57782117#3":[]}

In the next cell, the custom SUMORL environment used to interact with SUMO simulations, extract observations, and compute metrics and rewards is defined. This environment was used to both train and test the single agent PPO model.

In [12]:
import numpy as np
import time
import traci
import sumolib
import xml.etree.ElementTree as ET
from stable_baselines3 import PPO, A2C
import matplotlib.pyplot as plt

#SumoRL class
class SumoRL(gym.Env):
    def __init__(self, net_file, route_file_list): #constructor of SumoRL
        super(SumoRL, self).__init__()

        # Initialize SUMO with net and route files
        self.net_file = net_file
        self.route_file_list = route_file_list
        self.route_file_index = -1
        self.route_file = ""
        self.isTest = not bool(self.route_file_list)
        self.reward_values = []
        self.queue_lengths = []
        self.traffic_densities = []
        self.average_speeds = []
        self.simulation_started = False

        
        # e.g., [green, yellow, red duration for each signal]
        self.action_space = spaces.Box(low=2, high=45, shape=(4,), dtype=np.float64)  # Max duration per signal phase (red and yellow)
        self.metrics = spaces.Box(low=0, high=100, shape=(3,), dtype=int)
        self.lane_index = spaces.Box(low=0, high=3, shape=(),dtype=int)
        self.observation_space = spaces.Dict({ #The observation space for all incoming lanes, and includes beginning and ending timestamps as well
         "261199942_0": self.metrics,
         "261199942_1": self.metrics,
         "261199942_2": self.metrics,
         "261199942_3": self.metrics,
         "261174045_0": self.metrics,
         "261174045_1": self.metrics,
         "261174045_2": self.metrics,
         "261174045_3": self.metrics,
         "begin_time": spaces.Box(low=0, high=86400, shape=(1,), dtype=int),
         "end_time": spaces.Box(low=600, high=86400, shape=(1,), dtype=int)
        })
         
        self.begin_time = -600
        self.end_time = 0

    def get_observation(self):
        return self.compute_metrics()
        
    def step(self, actions):
        
        timing = {"261199942": [actions[0], actions[1]], "261174045": [actions[2], actions[3]]}

        for intersection in lanes:
            # Apply action
            curr_logic = traci.trafficlight.getAllProgramLogics(intersection)[0]
            curr_logic.phases[1].duration = timing[intersection][0]
            curr_logic.phases[5].duration = timing[intersection][1]
            traci.trafficlight.setProgramLogic(intersection, curr_logic)
            
        
        if self.isTest:
            count = 0
            while count < 300:
                traci.simulationStep()
                count+=1
        else:
            traci.simulationStep() # Perform one simulation step in SUMO (prior to extracing the next observation from the simulation)

        result_obs = self.get_observation()

        info = {} #Used to log additional info

        rewards = self.compute_reward(result_obs)
        print(f'Reward: {rewards}')

        done = traci.simulation.getMinExpectedNumber() == 0 #check termination of simulation

        return result_obs, rewards, done, info
  
    def reset(self):  #invoked every time the previous SUMO simulation episode terminates (or at the very beginning of the simulation to get the first observation) 
       
        if self.isTest: #reset handling for testing phase
            if self.simulation_started:
              traci.close()  # Close the current session
            traci.start(['sumo', '-c', 'sumo_config_testing.sumocfg'], label="32111224243343322511234456754342545744365")  # Restart simulation (run headless to improve efficiency)
            self.simulation_started = True
            return self.get_observation()

        #reset handling for training phase begins here
     
        if self.simulation_started:
            traci.close()  
        
        #Update the begin and end times for the next SUMO simulation episode to include in each observation
        self.begin_time = self.end_time
        self.end_time += 600
        if self.begin_time == 86400:
           self.begin_time = 0
           self.end_time = 600
        tree = ET.parse("sumo_config.sumocfg")
        root = tree.getroot()
        self.route_file_index+=1 #update route file index
        if self.route_file_index >= len(self.route_file_list):
            self.route_file_index = 0 #restart from the beginning
        self.route_file = self.route_file_list[self.route_file_index] #update the route file being used
        
        input_element = root.find("input") #load new route file for next SUMO simulation episdoe
        input_element.find("route-files").set('value', self.route_file)
        
        
        tree.write("sumo_config.sumocfg") #save changes
        traci.start(['sumo', '-c', 'sumo_config.sumocfg'], label="323211212234123222551145246276225")  # Restart simulation (run headless to improve efficiency)
        self.simulation_started = True
        return self.get_observation()
     
     #For each incoming lane, the observation metrics are extracted using traci API 
    def compute_metrics(self):
        
        observation = {"261199942_0": [], "261199942_1": [], "261199942_2": [], "261199942_3": [],"261174045_0": [], "261174045_1": [], "261174045_2": [], "261174045_3": []}
        for intersection in lanes:
         for edge in lanes[intersection]:
           # print("Entered lane loop")
            i = str(lanes[intersection].index(edge))
           
            queue_length=traci.edge.getLastStepVehicleNumber(edge)
            vehicle_ids = traci.edge.getLastStepVehicleIDs(edge)
            traffic_density=(len(vehicle_ids) / traci.lane.getLength(list(filter(lambda lane: edge in lane, traci.lane.getIDList()))[0]))
                  
            
           
            observation[intersection + "_" + i].append(queue_length)
            observation[intersection + "_" + i].append(traffic_density)
            if vehicle_ids:
              total_speed = sum([traci.vehicle.getSpeed(vehicle_id) for vehicle_id in vehicle_ids])
              observation[intersection + "_" + i].append((total_speed / len(vehicle_ids)))

            else:
                observation[intersection + "_" + i].append(0)

       
        observation['begin_time'] = np.array([self.begin_time])
        observation['end_time'] = np.array([self.end_time])
    
        return observation

    def render(self, mode="human"):
        pass  # SUMO's own GUI can be used if required

    def close(self):
        # Close the simulation (if needed)
        traci.close()

#Rewards are computed for each observation, which involves iterating through each incoming lanes of all traffic light intersections
    def compute_reward(self, observation):
        rewardValue = 0
        total_queue_length = 0
        total_traffic_density = 0
        total_average_speed = 0
        isInitial = False

        for intersection in lanes:
            for lane in lanes[intersection]:
                i = str(lanes[intersection].index(lane))
                queue_length = observation[intersection + "_" + i][0]
                traffic_density = observation[intersection + "_" + i][1]
                avg_speed = observation[intersection + "_" + i][2]
                
                
                if(len(previousMetrics[lane]) == 0): #Observation of first iteration in simulation is initialized for future comparison to compute the reward
                    isInitial = True
                    previousMetrics[lane] = [queue_length, traffic_density, avg_speed]
                    break
                #Take the summation of the reward in each incoming lane
                rewardValue+=1.5*(previousMetrics[lane][0] - queue_length) + 1.5*(previousMetrics[lane][1] - traffic_density) + 3*(avg_speed - previousMetrics[lane][2])
                previousMetrics[lane] = [queue_length, traffic_density, avg_speed] #store the metric values of the previous observation for future reward computation

                
                total_queue_length+=queue_length
                total_traffic_density+=traffic_density
                total_average_speed+=avg_speed
            
            if isInitial:
                break
        
        #Metric values are accumulated in separate lists to later plot the data
        self.reward_values.append(rewardValue)
        self.queue_lengths.append(total_queue_length/8)
        self.traffic_densities.append(total_traffic_density/8)
        self.average_speeds.append(total_average_speed/8)

        return rewardValue


The method below is the training method, where the training route rou.xml files are loaded, the SUMORL environment is initialized, and the single agent PPO model is instantiated with optimally chosen hyperparamter values.

In [13]:
def train_main(): #method used for training phase
  
    traci.start(['sumo', '-c', 'sumo_config.sumocfg'], label="5722114333223221323344242665425544543171")

    print("Init env...")
    route_file_list = [] 
    
    #All training files are appended to account for 7 days worth of data extracted from the Chicago traffic database.
    for day in range(0, 7):
        for time_index in range(1, 145):
            route_file_list.append(f'new_sumo_route-file_at_time_index_{time_index}_for_day_{day}.rou.xml')
    
    env = SumoRL('sumo_network_file_12_33.net.xml', route_file_list)

    model = PPO( # Initialize PPO agent from Stable-Baselines3
        policy="MultiInputPolicy",
        env=env,
        vf_coef=0.75,
        verbose=1,
        learning_rate=0.0003,
        n_steps=2048,
        batch_size=256,
        n_epochs=7,
        max_grad_norm=0.3,
        gamma=0.99,
    )
    

    print("Beginning the training of the PPO agent...")
    total_timesteps = 201600
    model.learn(total_timesteps=total_timesteps)

    metrics = {"Reward": env.reward_values, "Queue length": env.queue_lengths, "Traffic density": env.traffic_densities, "Average speed": env.average_speeds}
    
    #Plot graphs 
    
    for metric_key in metrics:
      print(len(metrics[metric_key]))
      print(f'Average  {metric_key}: {sum([value for value in metrics[metric_key]])/len(metrics[metric_key])}')
      plt.plot(list(range(1, len(metrics[metric_key]) + 1)), metrics[metric_key])
      plt.xlabel("Timestep")
      plt.ylabel(metric_key)
      plt.title(f'{metric_key} at each Timestep')
      plt.show()

    #Save model results
    model_path = "ppo_model_sumo_single_agent_trained"
    model.save(model_path)
    print(f"Model saved to {model_path}")
    
 
    
    traci.close() 

This method below is used to test the trained model using a 24-hour SUMO simulation that leverages a different route file, and uses a separate
sumocfg file. The metrics of the results are also plotted. For testing the baseline, just the action logic was commented out.

In [14]:
def test_main(): #method used for testing phase
    traci.start(['sumo', '-c', 'sumo_config_testing.sumocfg'], label="5421423321221243244353453443223524235224563")

    print("Init env...")
    
    env = SumoRL('sumo_network_file_12_33.net.xml', [])

    print("Testing the PPO agent...")
    model = PPO.load("ppo_model_sumo_single_agent_trained") #load the trained PPO model
    observation = env.reset()
    for timestep in range(1, 289):
           actions,_ = model.predict(observation , deterministic = True) #the deterministic flag was set to true instead of false to signify the testing stage
           observation, rewards, done, info = env.step(actions)
           if done:
               observation = env.reset()
    
    metrics = {"Reward": env.reward_values, "Queue length": env.queue_lengths, "Traffic density": env.traffic_densities, "Average speed": env.average_speeds}
   
    #Plot graphs for metrics
    for metric_key in metrics:
      print(len(metrics[metric_key]))
      print(f'Average  {metric_key}: {sum([value for value in metrics[metric_key]])/len(metrics[metric_key])}')
      plt.plot(list(range(0, len(metrics[metric_key]))), metrics[metric_key])
      plt.xlabel("Timestep")
      plt.ylabel(metric_key)
      plt.title(f'{metric_key} at each Timestep')
      plt.show()

In [11]:
if __name__ == "__main__":
   train_main()
   test_main()

Init env...
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Beginning the training of the PPO agent...




Reward: 0.0
Reward: 0.0
Reward: -3.0029465089406133
Reward: 26.74083866024693
Reward: 29.643154984263944
Reward: 35.56215259903468
Reward: 18.668834432289295
Reward: 1.6471492824405516
Reward: -26.525828927616814
Reward: 9.100588285840402
Reward: 10.33489355884385
Reward: 25.566018119959956
Reward: 6.026121488745792
Reward: 0.136198468860945
Reward: 0.48578244999361697
Reward: 14.177568241476555
Reward: -3.1309437204997543
Reward: 10.406119080521174
Reward: 9.045133484965195
Reward: 1.094552601026356
Reward: 1.7883306270837913
Reward: -3.246391595677715
Reward: 15.772469284180564
Reward: 6.888777782814566
Reward: 22.57517895329744
Reward: -7.4393321956198
Reward: 5.533554105862942
Reward: -20.35552686860659
Reward: 3.0839672905782702
Reward: -12.055913221905264
Reward: -4.968834434601892
Reward: 7.3801610595806135
Reward: 11.778442298788901
Reward: -5.562422758145608
Reward: 0.16619599863521017
Reward: -5.983066921589721
Reward: -2.180954800104958
Reward: 19.98974360356998
Reward: -5.4

KeyboardInterrupt: 