In [1]:
!pip install pettingzoo[mpe]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pettingzoo[mpe]
  Downloading PettingZoo-1.22.2-py3-none-any.whl (816 kB)
[K     |████████████████████████████████| 816 kB 10.2 MB/s 
Collecting gymnasium>=0.26.0
  Downloading Gymnasium-0.26.3-py3-none-any.whl (836 kB)
[K     |████████████████████████████████| 836 kB 38.6 MB/s 
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 105 kB/s 
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Installing collected packages: gymnasium-notices, gymnasium, pygame, pettingzoo
Successfully installed gymnasium-0.26.3 gymnasium-notices-0.0.1 pettingzoo-1.22.2 pygame-2.1.0


In [2]:
import os
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense

class ActorCriticNetwork(keras.Model):
  """Two hidden ReLU layers feeding a value head and a softmax policy head.

  Args:
    n_actions: number of units of the softmax policy head.
    dims_1: width of the first hidden layer.
    dims_2: width of the second hidden layer.
    name: stored as ``model_name`` and used to build ``checkpoint_file``
      (it is not forwarded to ``keras.Model``).
    chkpt_dir: directory component of ``checkpoint_file``.
  """

  def __init__(self,n_actions, dims_1 = 256 , dims_2 = 256,
               name = 'actor_critc', chkpt_dir = 'tmp/actor_critic'):
    super().__init__()

    # Plain bookkeeping attributes.
    self.n_actions = n_actions
    self.dims_1, self.dims_2 = dims_1, dims_2
    self.model_name = name
    self.checkpoint_dir = chkpt_dir
    self.checkpoint_file = os.path.join(chkpt_dir, name + '_ac')

    # Layers are created in the same order as before: trunk first, then heads.
    self.fc1 = Dense(dims_1, activation = 'relu')
    self.fc2 = Dense(dims_2, activation = 'relu')
    self.v = Dense(1, activation = None)                # linear state-value output
    self.po = Dense(n_actions, activation = 'softmax')  # action probabilities

  def call(self,state):
    """Return ``(v, po)``: the value head and policy head outputs for ``state``."""
    hidden = self.fc2(self.fc1(state))

    v = self.v(hidden)
    po = self.po(hidden)
    return v, po

In [3]:
from re import A
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp


class Agent:
  """Actor-critic agent that samples discrete actions and learns per transition.

  The agent owns one ``ActorCriticNetwork`` compiled with an Adam optimizer.
  ``choose_action`` samples from the policy head and remembers the sampled
  action in ``self.action``; ``learn`` then uses that remembered action to
  compute the log-probability term of the loss.

  Args:
    alpha: learning rate passed to Adam.
    gamma: discount factor applied to the next-state value.
    n_actions: size of the discrete action space / policy head.
    name: suffix used to build the weights file name.
  """

  def __init__(self, alpha = 1e-5, gamma = 0.95, n_actions = 5,name='agent'):
    self.gamma = gamma
    # Stored as a plain int so it can be used for clipping int32 action tensors.
    self.n_actions = int(n_actions)
    self.action = None
    self.action_space = list(range(self.n_actions))
    self.name = name
    self.actor_critic = ActorCriticNetwork(n_actions=self.n_actions)
    self.actor_critic.compile(optimizer = Adam(learning_rate = alpha))
    self.fname = 'actor_critic '+self.name

    self.clear_memory()

  def _clip_action(self, action):
    """Clamp an action index into ``[0, n_actions - 1]``.

    The original code special-cased a sampled index of 5 and negative
    indices; presumably the sampler can return an out-of-range index when the
    probabilities are degenerate -- TODO confirm. The bound is derived from
    ``n_actions`` so it stays correct for other action-space sizes.
    """
    return tf.clip_by_value(action, 0, self.n_actions - 1)

  def choose_action(self,observation):
    """Sample an action for ``observation`` and remember it for ``learn``.

    Returns:
      The sampled (and clamped) action index as a NumPy integer scalar.
    """
    # float32 to match the dtype used for states in ``learn``.
    state = tf.convert_to_tensor([observation], dtype = tf.float32)
    _,probs = self.actor_critic(state)

    action_probs = tfp.distributions.Categorical(probs = probs)
    action = self._clip_action(action_probs.sample())

    # Keep the clamped tensor so ``learn`` scores exactly the action returned.
    self.action = action
    return action.numpy()[0]

  def save_models(self):
    """Write the network weights to ``self.fname``."""
    print('saving model.....')
    self.actor_critic.save_weights(self.fname)

  def load_models(self):
    """Load the network weights from ``self.fname``."""
    self.actor_critic.load_weights(self.fname)

  def store_trans(self,action,reward,obsv,obsv_):
    """Append one transition to the history lists."""
    self.action_history.append(action)
    self.reward_history.append(reward)
    self.obsv_history.append(obsv)
    self.next_obsv_history.append(obsv_)

  def clear_memory(self):
    """Reset all history lists to empty."""
    self.reward_history = []
    self.action_history = []
    self.obsv_history = []
    self.next_obsv_history = []

  def learn(self,state,reward,state_,done):
    """Apply one gradient step from the transition ``(state, reward, state_)``.

    The action is not passed in: the one remembered by the most recent
    ``choose_action`` call is used.

    Args:
      state: observation the remembered action was chosen from.
      reward: scalar reward for that action.
      state_: observation that followed.
      done: when truthy, the next-state value is excluded from the TD target.

    Raises:
      RuntimeError: if ``choose_action`` has not been called yet.
    """
    if self.action is None:
      raise RuntimeError('choose_action must be called before learn')

    state = tf.convert_to_tensor([state],dtype = tf.float32)
    state_ = tf.convert_to_tensor([state_],dtype = tf.float32)
    reward = tf.convert_to_tensor(reward,dtype = tf.float32)
    # Defensive clamp in case ``self.action`` was assigned from outside.
    action = self._clip_action(self.action)

    with tf.GradientTape() as tape:
      state_value, probs = self.actor_critic(state)
      state_value_,_ = self.actor_critic(state_)
      state_value = tf.squeeze(state_value)
      state_value_ = tf.squeeze(state_value_)

      action_probs = tfp.distributions.Categorical(probs = probs)
      log_probs = action_probs.log_prob(action)

      # One-step TD error; the bootstrap term is dropped when ``done``.
      delta = reward + self.gamma*state_value_*(1-int(done)) - state_value

      # NOTE(review): ``delta`` is not wrapped in tf.stop_gradient, so the
      # actor term also back-propagates into the value head -- confirm this
      # is intended.
      actor_loss = -log_probs*delta
      critic_loss = delta**2

      total_loss = actor_loss + critic_loss
    gradients = tape.gradient(total_loss,self.actor_critic.trainable_variables)

    self.actor_critic.optimizer.apply_gradients(zip(gradients, self.actor_critic.trainable_variables))


In [4]:
from pettingzoo.mpe import simple_spread_v2

# Build the two-agent simple_spread_v2 environment with discrete actions and
# at most 25 cycles per episode, then reset it once: the next cell iterates
# over env.agents to create one policy per agent.
env = simple_spread_v2.env(
    N=2,
    local_ratio=0.5,
    max_cycles=25,
    continuous_actions=False,
)
env.reset()

In [5]:
agent_scores = {}
# One independent Agent per environment agent, keyed by agent name.
policy_net = {}
for a in env.agents:
  # Size the policy head from the environment's own action space instead of a
  # hard-coded 5, so the agents stay valid if the environment changes.
  agent = Agent(alpha = 1e-5,n_actions = int(env.action_space(a).n),name = a)
  policy_net[a] = agent

In [7]:
import numpy as np
# Take the agent names from policy_net rather than aliasing env.agents: the
# environment manages that list itself, so a reference captured after an
# interrupted episode may be partially or completely empty.
agent_list = list(policy_net)
n_games = 2500
best_score = -10000
score_history = []
model_loaded = False

for i in range(n_games):
  env.reset()
  score = 0
  # Observation each agent acted on at its previous turn. env.last() always
  # describes the agent whose turn it is now, and its reward is the one earned
  # since that agent last acted, so a transition can only be completed when
  # the same agent comes up again.
  pending_obsv = {}
  for agent in env.agent_iter():
    obsv,reward,done,trunc,_ = env.last()
    # Counted on every turn, including the final one, so the last reward of
    # the episode is part of the score.
    score += reward

    if agent in pending_obsv:
      # (previous observation, reward for the previous action, current
      # observation) all belong to this agent. The agent object still
      # remembers the action it chose on its previous turn.
      policy_net[agent].learn(pending_obsv.pop(agent),reward,obsv,done)

    if done or trunc:
      # A finished agent must step with None.
      env.step(None)
    else:
      action = policy_net[agent].choose_action(obsv)
      pending_obsv[agent] = obsv
      env.step(action)

  score_history.append(score)
  avg_score = np.mean(score_history[-100:])

  # Checkpoint every agent whenever the 100-episode running mean improves.
  if (avg_score > best_score):
    best_score = avg_score
    for a in agent_list:
      policy_net[a].save_models()

  print('episode: ',i ,'soore: ',score,' avg_score: ', avg_score)

saving model.....
saving model.....
episode:  0 soore:  -106.81164744498759  avg_score:  -106.81164744498759
saving model.....
saving model.....
episode:  1 soore:  -46.49968696570056  avg_score:  -76.65566720534407
saving model.....
saving model.....
episode:  2 soore:  -47.95679846226109  avg_score:  -67.08937762431641
episode:  3 soore:  -79.36436737602864  avg_score:  -70.15812506224447
saving model.....
saving model.....
episode:  4 soore:  -48.359415797264774  avg_score:  -65.79838320924853
saving model.....
saving model.....
episode:  5 soore:  -51.1597085708781  avg_score:  -63.358604102853455
episode:  6 soore:  -72.23470972259508  avg_score:  -64.62661919138797
saving model.....
saving model.....
episode:  7 soore:  -43.483482615046825  avg_score:  -61.98372711934533
saving model.....
saving model.....
episode:  8 soore:  -53.471516892745875  avg_score:  -61.0379259830565
saving model.....
saving model.....
episode:  9 soore:  -59.852378805141186  avg_score:  -60.919371265264

KeyboardInterrupt: ignored

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

def smooth_scores(scores, window = 101, polyorder = 7):
  """Savitzky-Golay smoothing that adapts to short score histories.

  Args:
    scores: sequence of per-episode scores.
    window: preferred filter window length (in episodes).
    polyorder: preferred polynomial order.

  Returns:
    A float array with the same length as ``scores``. With fewer than three
    scores the values are returned unsmoothed.
  """
  scores = np.asarray(scores, dtype = float)
  # The filter window cannot be longer than the data (e.g. after an
  # interrupted training run); keep it odd so it is centred on each point.
  window = min(window, scores.size)
  if window % 2 == 0:
    window -= 1
  if window < 3:
    return scores
  # The polynomial order must be strictly smaller than the window.
  return savgol_filter(scores, window, min(polyorder, window - 1))


yhat = smooth_scores(score_history)

fig, ax = plt.subplots()
ax.plot(yhat, color='red')
ax.set(title='Smoothed episode score', xlabel='episode', ylabel='score')
plt.show()

In [None]:
# Dump the raw per-episode scores appended by the training loop.
print(score_history)