In [1]:
import gymnasium as gym
import numpy as np
import os, sys
import time
import sys
import traffic_control_game
import pygame
from collections import defaultdict
import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import ast
import json
from matplotlib import cm
from matplotlib.ticker import LinearLocator
import matplotlib
import itertools
import plotly.express as px
import tiles3 as tc

#is_ipython = 'inline' in matplotlib.get_backend()
#if is_ipython:
#    from IPython import display

#plt.ion()



In [2]:
def timer(start_time=None, string=None):
    '''
    Function to compute the time
    start_time : starting time generated calling this function without arguments the first time
    string: visualization purposes (task description)
    '''
    if not start_time:
        start_time=datetime.datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.datetime.now()-start_time).total_seconds(),3600)
        tmin, tsec = divmod(temp_sec,60)
        pr = "for " + str(string) + " " if string else ""
        print("Execution time", pr, "is ", thour," h :", tmin,' m :', round(tsec,2), " s")

In [3]:
"""
Function to load and store dictionaries safely by saving converting them
to json format. Used due to the long training times and the instability
of Collab
"""

def save_json(q, filename):
    with open(f'{filename}.json', 'w') as fp:
        to_json = str({k: list(v) for k, v in q.items()})
        json.dump(to_json, fp)
    

def load_json(filename):
    with open(f'{filename}.json') as json_file:
        data = ast.literal_eval(json.load(json_file))
    return data
    

In [4]:
# Environment initialization

env_steps = 50
max_waiting_time = 1500
ps_ns = np.random.uniform(low=0.09, high=0.1, size=1)
ps_ew = np.random.uniform(low=0.04, high=0.05, size=1)
ps = np.tile(np.concatenate((ps_ns, ps_ew)), 2)

env_info = {"ps": ps, "max_wait_time": max_waiting_time, "env_steps": env_steps, "n_states": 2}        
        
env = gym.make("traffic_control-v0", env_info=env_info, render_mode="human") 

print(f"Observations space: {env.observation_space}")
print(f"Action space: {env.action_space}")


Observations space: Dict('NS': Discrete(76), 'WE': Discrete(92), 'pa': Discrete(6), 'wt': Discrete(1500))
Action space: Discrete(6)


# Semi-gradient Sarsa agent

In [7]:

class TileCoder:
    ''' Class to facilitate tile coding representations of states passed as parameters '''
    def __init__(self, iht_size=4096, num_tilings=8, num_tiles=8):
        self.iht = tc.IHT(iht_size)
        self.num_tilings = num_tilings
        self.num_tiles = num_tiles
    
    def get_tiles(self, queue_ns, queue_we, wt, pa):
        global env, env_info
        
        # Range of minimum and maximum value for each of the 2 components of the observation vector
        ns_min, ns_max = float(env.observation_space["NS"].start), float(env.observation_space["NS"].start + env.observation_space["NS"].n-1)
        we_min, we_max = float(env.observation_space["WE"].start), float(env.observation_space["WE"].start + env.observation_space["WE"].n-1)
        wt_min, wt_max = 0, env_info["max_wait_time"]
        pa_min, pa_max = 0, env.action_space.n-1
        
        queue_ns_scaled = (queue_ns-ns_min)*(self.num_tiles / (ns_max-ns_min))
        queue_we_scaled = (queue_we-we_min)*(self.num_tiles / (we_max-we_min))
        wt_scaled = (wt-wt_min)*(self.num_tiles / (wt_max-wt_min))
        pa_scaled = (pa-pa_min)*(self.num_tiles / (pa_max-pa_min))

        tiles = tc.tiles(self.iht, self.num_tilings, [queue_ns_scaled, queue_we_scaled, wt_scaled, pa_scaled])
        
        return np.array(tiles)
        

In [8]:

class SarsaAgent():
    """
    Class for the Semi-Gradient Sarsa agent.
    """
    def __init__(self):
        """ All values are set to None so they can be initialized in the agent_init method """
        self.num_tilings = None
        self.num_tiles = None
        self.iht_size = None
        self.initial_weights =None
        self.action_count = None
        
        self.eps_start = None
        self.eps_decay = None
        self.eps_end = None
        self.eps_runn = None
        
        self.discount = None
        self.num_actions = None
        self.step_size = None
        self.w =None
        self.tc = None
        self.previous_tiles, self.previous_action = None, None

    def info_init(self, info={}):
        """Setup for the agent, passed in a dictionary, called when the experiment first starts """
        self.num_tilings = info.get("num_tilings", 8)
        self.num_tiles = info.get("num_tiles", 8)
        self.iht_size = info.get("iht_size", 4096)
        self.initial_weights = info.get("initial_weights", 0.0)

        # epsilon
        self.eps_start = info.get("eps_start", 1.0)
        self.eps_decay = info.get("eps_decay", 2500)
        self.eps_end = info.get("eps_end", 1e-5)
        self.eps_runn = info.get("eps_start", 1.0)

        self.discount = info.get("discount", 1.0)
        self.num_actions = info.get("num_actions", 6)
        self.step_size = info.get("step_size", 0.5) / self.num_tilings

        self.w = np.ones((self.num_actions, self.iht_size)) * self.initial_weights

        self.tc = TileCoder(iht_size=self.iht_size, 
                            num_tilings=self.num_tilings, 
                            num_tiles=self.num_tiles)
        
        self.action_count = defaultdict(int)
        
    def decay(self, episode):
      """ """
      self.eps_runn = self.eps_end + (self.eps_start - self.eps_end)*np.exp(-1*episode/self.eps_decay)
      
    def get_freq_action(self):
      tot_chosen = np.sum(list(self.action_count.values()))
      return [freq_act/tot_chosen for _, freq_act in self.action_count.items()]

    def get_value(self, state):
      """ Function used to plot the estimates of state-action value """
      active_tiles = self.tc.get_tiles(queue_ns=state["NS"], queue_we=state["WE"], wt=state["wt"], pa=state["pa"]) 
      action, value = self.choose_action(active_tiles, greedy=True)
      return action, value

    def argmax(self, values):
      top = float("-inf")
      ties = []
      for i in range(len(values)):
          if values[i] > top:
              top = values[i]
              ties = []
          if values[i] == top:
              ties.append(i)
      return np.random.choice(ties)

    def play(self, env, fin_score = 4000, render_=False, print_info=False):
      ''' outside of training '''
      state, info = env.reset()
      if print_info:
        print("The game has started...")
      while True:
          active_tiles = self.tc.get_tiles(queue_ns=state["NS"], queue_we=state["WE"], wt=state["wt"], pa=state["pa"]) 
          action, _ = self.choose_action(active_tiles, greedy=True)
          next_state, reward, done, _, info = env.step(action)

          if render_:
            # Render the game
            env.render()
          
          if (done) or (info["score"]>=fin_score): # If player is dead break
            to_vis = info["score"]
            if print_info:
              print(f"\nThe game is done! Final score: {to_vis: ,}\n")
            break
          else:
            state = next_state  
      env.close()
      return to_vis

    def choose_action(self, tiles, greedy=False):
      ''' Function to choose action according to epsilon-greedy strategy,
          based on current tile-based state representation  '''
      action_values = [np.sum(self.w[action][tiles]) for action in range(self.num_actions)] 
      if (np.random.random()<self.eps_runn) and (not greedy):
        chosen = np.random.choice(self.num_actions) 
      else:
        chosen = self.argmax(action_values) 
      self.action_count[chosen] += 1
      return chosen, action_values[chosen]

    def start(self, state):
      """ Take first move and store first action and tile-based representation of state """
      active_tiles = self.tc.get_tiles(queue_ns=state["NS"], queue_we=state["WE"], wt=state["wt"], pa=state["pa"]) 
      action, _ = self.choose_action(active_tiles, greedy=False)

      self.previous_tiles = np.copy(active_tiles)
      self.previous_action = action
      return action

    def update(self, reward, state):
      """ Update of Sarsa algorithm (on-policy method)
          The q-values of the previous state-action pair are updated
          based on the value of the action taken in successive state (passed as parameter) """

      if state:
        active_tiles = self.tc.get_tiles(queue_ns=state["NS"], queue_we=state["WE"], wt=state["wt"], pa=state["pa"]) 
        action, action_value = self.choose_action(active_tiles, greedy=False)
      
      action_value = action_value if state!=False else 0
      update_target = reward + self.discount*action_value - np.sum(self.w[self.previous_action][self.previous_tiles])
      self.w[self.previous_action][self.previous_tiles] += self.step_size * update_target * np.ones((self.num_tilings,))

      if not state:
        return
      else:
        self.previous_tiles = np.copy(active_tiles)
        self.previous_action = action
        return action


In [10]:

def loop_episodes(agent, env, env_steps, agent_info, change_reward=False, drop_epsilon=False, print_=False, print_info=False):
    """ Function for the main loop of the Semi-Gradient Sarsa agent """

    num_episodes = agent_info.get("num_episodes", 5000)  # agent initialization with info dictionary
    agent.info_init(agent_info)

    store, best_score = defaultdict(list), 0
    
    # Loop over episodes
    for i_episode in range(1, num_episodes+1):
      
      # If drop_epsilon is an integer, training is stopped when episode number is equal to drop_epsilon
      # A greedy game is then played in the environment and the final score is returned
      #if (drop_epsilon!=False) and (i_episode==int(drop_epsilon)):
      #  if print_info:
      #    print(f"Epsilon before evaluation: {agent.epsilon: .4f}, Last max score with epsilon: {total_max_scores[-1]}")
      #  fin_score = agent.play(env, fin_score=30000, print_=False, print_info=print_info)
      #  return fin_score
        
      state, info = env.reset()   # first state

      action = agent.start(state)  # first action
      runn_score = 0
      runn_rewards = []

      while True:
        next_state, reward, done, _, info = env.step(action)
        runn_rewards.append(reward)

        # we break out also when score is greater than a maximum cap
        if (done) or (info["score"]>30000):          
          
          agent.update(reward, False)  # update of one-to-last state

          # update
          best_score = max(best_score, info["score"])
          store["total_max_scores"].append(best_score)
          store["total_scores"].append(info["score"])
          store["epsilon"].append(agent.eps_runn)
          store["avg_reward_episode"].append(np.mean(runn_rewards))
          break
        
        else:
          action = agent.update(reward, next_state)
          continue
      
        agent.decay()
      
      # info
      action_freq = agent.get_freq_action()
      store["action_freq"].append(action_freq)
      
      if print_:
        if i_episode % 1 == 0:
          print("\rEpisode {}/{}, Epsilon {} Avg Scores: {}, Max Score: {}, Action frequencies: {}.".format(i_episode, num_episodes,
                                                                         store["epsilon"][-1], 
                                                                         np.mean(store["total_scores"]).astype(int), 
                                                                         store["total_max_scores"][-1],
                                                                         ", ".join([str(round(x, 5)) for x in action_freq])))

    return agent, store


In [11]:
# Environment initialization (no render for training)


env_steps = 30
max_waiting_time = 1500
ps_ns = np.random.uniform(low=0.09, high=0.1, size=1)
ps_ew = np.random.uniform(low=0.04, high=0.05, size=1)
ps = np.tile(np.concatenate((ps_ns, ps_ew)), 2)

env_info = {"ps": ps, "max_wait_time": max_waiting_time, "env_steps": env_steps, "n_states": 2}        
        
env = gym.make("traffic_control-v0", env_info=env_info) 

agent_sarsa = SarsaAgent()
num_episodes = 30 #5000


info = {"num_tilings": 8,
        "num_tiles": 8,
        "iht_size": 4096,
        "num_episodes": num_episodes,
        "num_actions": env.action_space.n,
        "step_size": 0.5,  # divided by number of tilings
        "discount": 1.0,
        "eps_start": 1.0,
        "eps_decay": 1800,  # with exponential decay
        "eps_end": 1e-6} 


start = timer()

agent_sarsa, store_sarsa = loop_episodes(agent_sarsa, env, env_steps, info, print_=True)

# Training

print("")
timer(start, "Training Sarsa agent")

Episode 1/30, Epsilon 1.0 Avg Scores: 33, Max Score: 33, Action frequencies: 0.16667, 0.2, 0.1, 0.13333, 0.26667, 0.13333.
Episode 2/30, Epsilon 1.0 Avg Scores: 28, Max Score: 33, Action frequencies: 0.23214, 0.17857, 0.125, 0.10714, 0.23214, 0.125.
Episode 3/30, Epsilon 1.0 Avg Scores: 20, Max Score: 33, Action frequencies: 0.18182, 0.19481, 0.15584, 0.11688, 0.23377, 0.11688.
Episode 4/30, Epsilon 1.0 Avg Scores: 15, Max Score: 33, Action frequencies: 0.17895, 0.2, 0.14737, 0.13684, 0.2, 0.13684.


KeyboardInterrupt: 

In [None]:
save_json(store_sarsa, "store_sarsa")

with open('q_table_sarsa.npy', 'wb') as f:
  np.save(f, agent_sarsa.w)

In [None]:

# plot evolution scores (running mean), together with maximum score

fig, axs = plt.subplots(1, 3, figsize=(18, 6))

titles = ["Score evolution", "Max score evolution", "Epsilon evolution"]

total_scores, total_max_scores, eps = store_sarsa["total_max_scores"], store_sarsa["total_scores"], store_sarsa["epsilon"]

for idx, (ax, score, title) in enumerate(zip(axs, [total_scores, total_max_scores, eps], titles)):
  if idx==0:
    ax.plot(score, linewidth=1.2)
    means = np.lib.stride_tricks.sliding_window_view(score, 15).mean(1).reshape(-1)
    ax.plot(means, linewidth=2.5)
  else:
    ax.plot(score, linewidth=1.2)
  ax.set_xlabel('Episode', fontsize=14)
  ax.set_title(title, fontsize=15)

plt.suptitle("Semi-gradient Sarsa agent", fontsize=22)

#plt.savefig('evolution_sarsa.jpg', bbox_inches='tight', dpi=300)   
plt.show()
    

In [14]:
# Play a game

env_steps = 500

env = gym.make("traffic_control-v0", n_states = 2, env_steps = env_steps, render_mode="human") 

agent_sarsa.play(env, fin_score = 4000, render_=True, print_info=False)

: 

: 

In [None]:
obs, info = env.reset()


#actions_loop = [0]*5 + [1]*5 + [0] + [1]*5 + [0] + [1]*5
#actions_loop = [2]*10+[0]
actions_loop = [1]*10