In [1]:
!pip install pettingzoo[mpe]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pettingzoo[mpe]
  Downloading PettingZoo-1.22.2-py3-none-any.whl (816 kB)
[K     |████████████████████████████████| 816 kB 5.0 MB/s 
[?25hCollecting gymnasium>=0.26.0
  Downloading Gymnasium-0.26.3-py3-none-any.whl (836 kB)
[K     |████████████████████████████████| 836 kB 49.0 MB/s 
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 100 kB/s 
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Installing collected packages: gymnasium-notices, gymnasium, pygame, pettingzoo
Successfully installed gymnasium-0.26.3 gymnasium-notices-0.0.1 pettingzoo-1.22.2 pygame-2.1.0


In [2]:
from keras.layers import Input,Dense,Activation
from keras.models import Model,load_model
import numpy as np
from tensorflow import keras
import keras.backend as K

class Agent(object):
  """REINFORCE (Monte-Carlo policy gradient) agent backed by a small Keras MLP.

  The agent buffers one episode of (observation, action, reward) transitions via
  `store_trans`, and `learn` performs a single policy-gradient update on that
  buffer before clearing it.

  Args:
    name: Agent identifier; it is appended to `fname` to build the checkpoint path.
    alpha: Learning rate handed to the Adam optimizer.
    gamma: Discount factor used when computing returns.
    input_dims: Length of the flat observation vector.
    n_actions: Number of discrete actions (size of the softmax output).
    layer_size: Width of each of the two hidden layers.
    fname: Prefix of the path used by `save_model` / `load_model`.
  """

  def __init__ (self,name,alpha,gamma,input_dims, n_actions,layer_size,fname ='reinforcePolicy'):
    self.gamma = gamma
    self.lr = alpha
    # Normalised returns of the most recent `learn` call (0 until then).
    self.G = 0
    self.input_dims = input_dims
    self.layer_size = layer_size
    self.n_actions = n_actions
    # Per-episode transition buffers; emptied at the end of every `learn` call.
    self.state_memory = []
    self.action_memory = []
    self.reward_memory = []
    self.agent_name = name

    # `policy` is the trainable model (takes observations + advantages);
    # `predict` shares the same layers but only needs observations.
    self.policy,self.predict = self.create_policy()
    self.action_space = [i for i in range(n_actions)]
    self.model_file = fname+' '+self.agent_name

  def create_policy(self):
    """Build the training model and the inference model.

    Returns:
      (policy, predict): `policy` maps [observations, advantages] to action
      probabilities and is compiled with the policy-gradient loss; `predict`
      maps observations to the same probabilities through the same layers.
    """
    state_input = Input(shape = (self.input_dims,))  # one flat observation per batch row
    advantages = Input(shape =[1])                    # one scalar weight per batch row
    hidden1 = Dense(self.layer_size,activation = 'relu')(state_input)
    hidden2 = Dense(self.layer_size,activation = 'relu')(hidden1)
    probs = Dense(self.n_actions, activation = 'softmax')(hidden2)

    def custom_loss(y_true,y_pred):
      # y_true: one-hot encoding of the actions taken; y_pred: softmax output.
      # Clip so that log() is never evaluated at exactly 0 (or 1).
      out = K.clip(y_pred,1e-5, 1-1e-5)
      log_lik = y_true*K.log(out)
      # `advantages` is the symbolic second model input captured by closure.
      # NOTE(review): this presumably requires graph mode (eager execution
      # disabled before the model is built) -- confirm if the setup changes.
      return K.sum(-log_lik*advantages)

    policy = Model(inputs = [state_input,advantages], outputs = [probs])
    opt = keras.optimizers.Adam(learning_rate = self.lr)
    policy.compile(optimizer = opt, loss = custom_loss)

    predict = keras.Model(inputs = [state_input], outputs = [probs])

    return policy,predict

  def choose_action(self,obsv):
    """Sample an action index from the current policy for one observation."""
    # The network expects a batch axis, so wrap the single observation.
    obsv = np.expand_dims(obsv,0)
    # predict() returns one row of probabilities per batch element; take the
    # row belonging to our single observation.
    probabilities = self.predict.predict(obsv)[0]
    # Sample (rather than argmax) so the policy keeps exploring.
    action = np.random.choice(self.action_space,p=probabilities)
    return action

  def store_trans(self,obsv,action,reward):
    """Append one (observation, action, reward) transition to the episode buffer."""
    self.action_memory.append(action)
    self.state_memory.append(obsv)
    self.reward_memory.append(reward)

  def learn(self):
    """Run one policy-gradient update on the buffered episode, then clear it.

    Returns:
      The value returned by `train_on_batch`, or None when the buffer is empty
      (in which case nothing is trained).
    """
    if not self.reward_memory:
      # Nothing was recorded for this agent; there is no batch to train on.
      return None

    state_memory = np.array(self.state_memory)
    # Force a float dtype: with integer rewards the returns buffer below would
    # otherwise be integer-typed and silently truncate the discounted sums.
    reward_memory = np.array(self.reward_memory, dtype = np.float64)
    action_memory = np.array(self.action_memory)

    # The loss expects the taken actions as one-hot labels.
    actions = np.zeros([len(action_memory),self.n_actions])
    actions[np.arange(len(action_memory)),action_memory] = 1

    # Discounted return G[t] = r[t] + gamma * G[t+1], accumulated backwards in
    # a single pass.
    G = np.zeros_like(reward_memory)
    running_return = 0.0
    for t in reversed(range(len(reward_memory))):
      running_return = reward_memory[t] + self.gamma*running_return
      G[t] = running_return

    # Standardise the returns; fall back to std = 1 when they are all equal.
    mean = np.mean(G)
    std = np.std(G)
    std = std if std > 0 else 1
    self.G = (G-mean)/std

    # Inputs are [observations, advantages]; the one-hot actions are the labels
    # (y_true) and the network's own output is y_pred inside the loss.
    # The advantages are passed as a column to match Input(shape=[1]).
    cost = self.policy.train_on_batch([state_memory,self.G.reshape(-1,1)], actions)

    self.state_memory = []
    self.reward_memory = []
    self.action_memory = []

    return cost

  def save_model(self):
    """Persist the policy to `self.model_file` (weights, then the full model)."""
    self.policy.save_weights(self.model_file)
    self.policy.save(self.model_file)

  def load_model(self):
    """Restore the policy weights previously written to `self.model_file`.

    `predict` is built from the same layers as `policy`, so it picks up the
    restored weights as well.
    """
    self.policy.load_weights(self.model_file)

In [3]:
from tensorflow.python.framework.ops import disable_eager_execution

# Turn off TensorFlow eager execution for the rest of the session. This cell runs
# before any Agent is constructed.
# NOTE(review): presumably needed because Agent's custom loss closes over the
# symbolic `advantages` input tensor -- confirm before removing.
disable_eager_execution()

In [4]:
from pettingzoo.mpe import simple_adversary_v2
# Build the turn-based simple-adversary environment with discrete actions and
# episodes capped at 25 cycles, then reset it so the agent list is populated
# for the cells that follow.
env = simple_adversary_v2.env(
    N=2,
    max_cycles=25,
    continuous_actions=False,
)
env.reset()

In [5]:
# One independent policy network per environment agent, keyed by agent name.
# `agent_list` keeps the names in environment order for later iteration.
agent_list = list(env.agents)
agent_net = {}
for name in agent_list:
  # Observations are flat vectors whose length differs between agents.
  obs_dim = env.observation_space(name).shape[0]
  print(obs_dim)
  agent_net[name] = Agent(name, alpha=1e-5, gamma=0.99, input_dims=obs_dim,
                          n_actions=5, layer_size=256)

8
10
10


In [None]:
# Train every agent with one policy-gradient update per episode.
# Scores: "Opp" is the adversary's episode reward, "Good" is the summed episode
# reward of all other agents. A model is saved whenever the 100-episode moving
# average of its side's score reaches a new best.
max_episode = 1000
good_best_score = -1000
opp_best_score = -10000
Good_score_history = []
Opp_score_history = []

for i in range(max_episode):
  opp_score = 0
  good_score = 0
  # (observation, action) of each agent's latest move, waiting for its reward.
  pending = {}
  env.reset()
  for agent in env.agent_iter():
    observation,reward,done,trunc,_= env.last()
    if (agent == 'adversary_0'):
      opp_score += reward
    else:
      good_score += reward

    # The reward returned by last() is what the agent earned since it last
    # acted, i.e. it belongs to the *previous* action. Pair it with that action
    # (this also captures the final reward, delivered on the terminal turn).
    if agent in pending:
      prev_observation, prev_action = pending.pop(agent)
      agent_net[agent].store_trans(prev_observation,prev_action,reward)

    if not (done or trunc):
      action = agent_net[agent].choose_action(observation)
      pending[agent] = (observation, action)
      env.step(action)
    else:
      # A finished agent must step with None so the environment can remove it.
      env.step(None)

  Good_score_history.append(good_score)
  Opp_score_history.append(opp_score)

  Good_avg_score = np.mean(Good_score_history[-100:])
  Opp_avg_score = np.mean(Opp_score_history[-100:])

  # Iterate over the saved name list: once agent_iter() is exhausted,
  # env.agents is empty, so looping over it would never train anything.
  for a in agent_list:
    agent_net[a].learn()

  if Good_avg_score > good_best_score:
    good_best_score = Good_avg_score
    for a in agent_list:
      if (a != 'adversary_0'):
        agent_net[a].save_model()

  if Opp_avg_score > opp_best_score:
    opp_best_score = Opp_avg_score
    for a in agent_list:
      if (a == 'adversary_0'):
        agent_net[a].save_model()

  print('episode: ', i ,' Opp avg score:', Opp_avg_score,'Good avg score :',Good_avg_score)

  updates=self.state_updates,


episode:  0  Opp avg score: -17.73136877892052 Good avg score : -15.820822571087099
episode:  1  Opp avg score: -14.774178950045151 Good avg score : -8.43626675544986
episode:  2  Opp avg score: -26.3690510889497 Good avg score : 12.91410715224606
episode:  3  Opp avg score: -23.887606908238222 Good avg score : 1.6586724781292492
episode:  4  Opp avg score: -26.8490767426516 Good avg score : 2.1097004981065615
episode:  5  Opp avg score: -25.453988725890284 Good avg score : -1.2198243166113194
episode:  6  Opp avg score: -27.190126388350986 Good avg score : 0.5847595192566709
episode:  7  Opp avg score: -27.73157435512867 Good avg score : 3.239515933040388
episode:  8  Opp avg score: -27.679839220404162 Good avg score : 7.185847056682885
episode:  9  Opp avg score: -26.73341537010581 Good avg score : 6.885403819832108
episode:  10  Opp avg score: -26.023473550502942 Good avg score : 3.868634055201302
episode:  11  Opp avg score: -28.113352845058596 Good avg score : 6.575467109333112
ep

In [None]:
# Per-episode summed reward of the non-adversary agents, one entry per episode.
print(Good_score_history)

In [None]:
# Per-episode reward of the adversary, one entry per episode.
print(Opp_score_history)

In [None]:
!zip -r /content/adversary_0_actor /content/adversary_0_actor.zip
!zip -r /content/agent_0_actor /content/agent_0_actor.zip
!zip -r /content/agent_1_actor /content/agent_1_actor.zip

In [None]:
from google.colab import files

# Send each zipped model to the browser, one download per archive.
for archive_path in (
    '/content/agent_1_actor.zip',
    '/content/agent_0_actor.zip',
    '/content/adversary_0_actor.zip',
):
  files.download(archive_path)