In [1]:
!pip install pettingzoo[mpe]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pettingzoo[mpe]
  Downloading PettingZoo-1.22.2-py3-none-any.whl (816 kB)
[K     |████████████████████████████████| 816 kB 5.0 MB/s 
[?25hCollecting gymnasium>=0.26.0
  Downloading Gymnasium-0.26.3-py3-none-any.whl (836 kB)
[K     |████████████████████████████████| 836 kB 44.7 MB/s 
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 1.2 MB/s 
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Installing collected packages: gymnasium-notices, gymnasium, pygame, pettingzoo
Successfully installed gymnasium-0.26.3 gymnasium-notices-0.0.1 pettingzoo-1.22.2 pygame-2.1.0


In [2]:
from keras.layers import Input,Dense,Activation
from keras.models import Model,load_model
import numpy as np
from tensorflow import keras
import keras.backend as K

class Agent(object):
  """REINFORCE (Monte-Carlo policy-gradient) agent backed by a small Keras network.

  Two Keras models share the same layers:
    * ``self.policy``  - takes ``[states, advantages]`` and is the one that is trained.
    * ``self.predict`` - takes ``states`` only and is used to pick actions.

  Args:
    name: agent identifier; also used to build a per-agent checkpoint path.
    alpha: learning rate handed to the Adam optimizer.
    gamma: discount factor used when computing returns.
    input_dims: length of the flat observation vector.
    n_actions: number of discrete actions.
    layer_size: width of each of the two hidden ReLU layers.
    fname: prefix of the checkpoint path (``<fname>_<name>``).
  """

  def __init__(self, name, alpha, gamma, input_dims, n_actions, layer_size, fname='reinforcePolicy'):
    self.gamma = gamma
    self.lr = alpha
    self.G = 0  # normalized returns of the most recent learn() call
    self.input_dims = input_dims
    self.layer_size = layer_size
    self.n_actions = n_actions
    # Per-episode trajectory buffers, filled by store_trans() and emptied by learn().
    self.state_memory = []
    self.action_memory = []
    self.reward_memory = []
    self.agent_name = name

    self.policy, self.predict = self.create_policy()
    self.action_space = [i for i in range(n_actions)]
    # Include the agent's own name: with the default `fname`, a fixed suffix made
    # every agent save to the same path and overwrite the others' checkpoints.
    self.model_file = fname + '_' + str(name)

  def create_policy(self):
    """Build the trainable policy model and the inference-only model.

    Returns:
      (policy, predict): two Keras models sharing the same Dense layers.
    """
    state_input = Input(shape=(self.input_dims,))  # one flat observation per batch row
    advantages = Input(shape=[1])                  # one (normalized) return per batch row
    dense1 = Dense(self.layer_size, activation='relu')(state_input)
    dense2 = Dense(self.layer_size, activation='relu')(dense1)
    probs = Dense(self.n_actions, activation='softmax')(dense2)

    def custom_loss(y_true, y_pred):
      # y_true is the one-hot action taken, y_pred the action probabilities.
      # Clip so that log() is never evaluated at exactly 0.
      out = K.clip(y_pred, 1e-8, 1 - 1e-8)
      log_lik = y_true * K.log(out)

      # NOTE(review): the loss closes over the symbolic `advantages` input; the
      # notebook disables eager execution before any Agent is built, which this
      # pattern presumably relies on - confirm before enabling eager mode.
      return K.sum(-log_lik * advantages)

    policy = Model(inputs=[state_input, advantages], outputs=[probs])
    opt = keras.optimizers.Adam(learning_rate=self.lr)
    policy.compile(optimizer=opt, loss=custom_loss)

    predict = keras.Model(inputs=[state_input], outputs=[probs])

    return policy, predict

  def choose_action(self, obsv):
    """Sample an action index from the policy's distribution for one observation."""
    # The network expects a batch, so add a leading batch axis of size 1.
    obsv = np.expand_dims(obsv, 0)
    # predict() returns one row of probabilities per batch element; take the only row.
    probabilities = self.predict.predict(obsv)[0]
    action = np.random.choice(self.action_space, p=probabilities)
    return action

  def store_trans(self, obsv, action, reward):
    """Append one (observation, action, reward) step to the episode buffers."""
    self.action_memory.append(action)
    self.state_memory.append(obsv)
    self.reward_memory.append(reward)

  def learn(self):
    """Run one policy-gradient update on the stored episode, then clear the buffers.

    Returns:
      The value returned by ``train_on_batch``, or None when nothing was stored.
    """
    if len(self.reward_memory) == 0:
      # Nothing was recorded (e.g. the agent never acted); there is nothing to train on.
      return None

    state_memory = np.array(self.state_memory)
    # Force a float dtype: with integer rewards the returns array would otherwise
    # be integer-typed and silently truncate the discounted sums.
    reward_memory = np.array(self.reward_memory, dtype=np.float64)
    action_memory = np.array(self.action_memory)

    # The loss expects the taken actions as one-hot rows.
    actions = np.zeros([len(action_memory), self.n_actions])
    actions[np.arange(len(action_memory)), action_memory] = 1

    # Discounted return G[t] = r[t] + gamma * G[t+1], accumulated back to front.
    G = np.zeros_like(reward_memory)
    running_return = 0.0
    for t in reversed(range(len(reward_memory))):
      running_return = reward_memory[t] + self.gamma * running_return
      G[t] = running_return

    # Standardize the returns; fall back to std = 1 when they are all equal.
    mean = np.mean(G)
    std = np.std(G)
    if not std > 0:
      std = 1
    self.G = (G - mean) / std

    # Inputs are [states, advantages]; the one-hot actions are the labels (y_true).
    cost = self.policy.train_on_batch([state_memory, self.G], actions)

    self.state_memory = []
    self.reward_memory = []
    self.action_memory = []
    return cost

  def save_model(self):
    """Save the trainable policy model to ``self.model_file``."""
    self.policy.save(self.model_file)

  def load_model(self):
    """Restore weights from ``self.model_file`` into the existing models.

    The weights are copied into the current ``self.policy`` instead of replacing
    it: ``self.predict`` shares those layers, so action selection picks up the
    restored weights too, and the compiled custom loss stays attached.
    """
    # compile=False: the custom loss is a closure and cannot be rebuilt from the file.
    saved = keras.models.load_model(self.model_file, compile=False)
    self.policy.set_weights(saved.get_weights())

In [3]:
import tensorflow as tf

# Switch TensorFlow to graph mode before any model is built.
# Uses the public compat API instead of the private `tensorflow.python.*` module path.
tf.compat.v1.disable_eager_execution()

In [4]:
from pettingzoo.mpe import simple_speaker_listener_v3

# Speaker/listener environment with discrete actions, 25 cycles per episode.
env_options = dict(max_cycles=25, continuous_actions=False)
env = simple_speaker_listener_v3.env(**env_options)
# Reset once so the agent list and spaces can be queried in the next cells.
env.reset()

In [5]:
# Show how many discrete actions each agent has.
for agent_name in env.agents:
  print(env.action_space(agent_name).n)

3
5


In [6]:
# Build one REINFORCE agent per environment agent, keyed by agent name.
agent_net = {}
agent_list = []
for agent_name in env.agents:
  n_inputs = env.observation_space(agent_name).shape[0]
  print(n_inputs)
  n_outputs = env.action_space(agent_name).n
  # Positional arguments: name, learning rate, discount factor, input size, action count, hidden width.
  agent_net[agent_name] = Agent(agent_name, 1e-5, 0.95, n_inputs, n_outputs, 256)
  agent_list.append(agent_name)

3
11


In [7]:
# Take a copy of the full agent roster. `env.agents` is the environment's live list
# of agents still active in the current episode, so aliasing it would leave this
# variable dependent on where the environment happens to be in an episode.
agent_list = list(env.possible_agents)
print(agent_list)

['speaker_0', 'listener_0']


In [8]:
# Train every agent with REINFORCE: play one full episode, then update each policy.
best_score = -1000
max_episode = 2000
score_history = []
avg_scores = []
for i in range(max_episode):
  score = 0
  env.reset()
  # (observation, action) each agent is still waiting to be rewarded for.
  pending = {}
  for agent in env.agent_iter():
    observation,reward,done,trunc,_= env.last()
    score += reward
    # The reward reported by last() is what this agent earned since it last acted,
    # so it belongs to the previously chosen action, not the one picked below.
    if agent in pending:
      prev_observation, prev_action = pending.pop(agent)
      agent_net[agent].store_trans(prev_observation, prev_action, reward)
    if done or trunc:
      # A finished agent must be stepped with None.
      env.step(None)
    else:
      action = agent_net[agent].choose_action(observation)
      pending[agent] = (observation, action)
      env.step(action)

  score_history.append(score)
  avg_score = np.mean(score_history[-100:])

  if avg_score > best_score:
    load_checkpoint = True  # not read by any visible cell; kept in case other cells use it
    best_score = avg_score
    for trained_agent in agent_net.values():
      trained_agent.save_model()

  # Iterate over our own agent table: agent_iter() only finishes once `env.agents`
  # is empty, so looping over `env.agents` here would never call learn().
  for trained_agent in agent_net.values():
    trained_agent.learn()

  print('episode: ', i , ' score :',score,' avg score:', avg_score)
  avg_scores.append(avg_score)

  updates=self.state_updates,


episode:  0  score : -18.70802940213866  avg score: -18.70802940213866
episode:  1  score : -94.59987048424445  avg score: -56.653949943191556
episode:  2  score : -343.72645558341674  avg score: -152.34478515659995
episode:  3  score : -63.78714101641408  avg score: -130.2053741215535
episode:  4  score : -14.220391798517333  avg score: -107.00837765694625
episode:  5  score : -90.32046334238781  avg score: -104.22705860451985
episode:  6  score : -37.089190701012626  avg score: -94.63593461830453
episode:  7  score : -22.578769944684105  avg score: -85.62878903410197
episode:  8  score : -33.16131308549883  avg score: -79.79906948425717
episode:  9  score : -37.49757278857258  avg score: -75.56891981468871
episode:  10  score : -62.37313150748031  avg score: -74.36930269585159
episode:  11  score : -3.833253274976945  avg score: -68.49129857744536
episode:  12  score : -209.1742502687352  avg score: -79.31306409215996
episode:  13  score : -37.31175204209558  avg score: -76.312970374

KeyboardInterrupt: ignored

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

# Smooth the per-episode scores with a Savitzky-Golay filter and plot the trend.
SMOOTH_WINDOW = 101    # preferred filter window (must be odd)
SMOOTH_POLYORDER = 3   # polynomial order; must be smaller than the window

# The filter rejects a window longer than the data, so shrink it to the largest
# odd length that fits when training stopped before SMOOTH_WINDOW episodes.
n_scores = len(score_history)
window = min(SMOOTH_WINDOW, n_scores if n_scores % 2 == 1 else n_scores - 1)
if window > SMOOTH_POLYORDER:
  yhat = savgol_filter(score_history, window, SMOOTH_POLYORDER)
else:
  # Too few episodes to fit the polynomial: plot the raw scores instead.
  yhat = np.asarray(score_history, dtype=float)

plt.plot(yhat, color='red')
plt.xlabel('episode')
plt.ylabel('smoothed score')
plt.show()

In [None]:
# Print the running average score (mean of the last 100 episode scores) recorded after each episode.
print(avg_scores)