In [1]:
!pip install pettingzoo[mpe]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pettingzoo[mpe]
  Downloading PettingZoo-1.22.2-py3-none-any.whl (816 kB)
[K     |████████████████████████████████| 816 kB 5.2 MB/s 
Collecting gymnasium>=0.26.0
  Downloading Gymnasium-0.26.3-py3-none-any.whl (836 kB)
[K     |████████████████████████████████| 836 kB 39.6 MB/s 
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 97 kB/s 
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Installing collected packages: gymnasium-notices, gymnasium, pygame, pettingzoo
Successfully installed gymnasium-0.26.3 gymnasium-notices-0.0.1 pettingzoo-1.22.2 pygame-2.1.0


In [2]:
from tensorflow import keras
from keras.layers import Dense,Activation
from keras.models import Sequential,load_model
import numpy as np
import random
import os

In [3]:
class ReplayBuffer(object):
  """Fixed-size circular buffer of transitions for a single agent.

  The next state is never passed in explicitly. Instead, the state given to
  each store_transition call is also written as the next state of the slot
  stored just before it, so transitions must be stored in the order they
  were experienced.

  Args:
    mem_max_size: number of slots in the buffer.
    input_shape: length of one (flat) observation vector.
    num_actions: number of discrete actions (width of the one-hot rows).
  """

  def __init__(self,mem_max_size,input_shape,num_actions):
    self.mem_size = mem_max_size
    self.input_shape = input_shape
    self.num_actions = num_actions
    # Allocation lives in clear_memory so the two can never drift apart.
    self.clear_memory()

  def store_transition(self,state,action,reward,done):
    """Write one transition into the next slot, overwriting the oldest on wrap.

    Args:
      state: observation the action was taken in.
      action: integer index of the chosen action.
      reward: scalar reward for the transition.
      done: truthy if the transition ended the episode; stored as the mask
        1 - int(done) so it can be multiplied into a bootstrap term.
    """
    index = self.mem_counter % self.mem_size
    self.state_memory[index] = state
    self.reward_memory[index] = reward
    self.terminal_state[index] = 1 - int(done)

    one_hot = np.zeros(self.num_actions,dtype = np.int8)
    one_hot[action] = 1
    self.action_memory[index] = one_hot

    # This state is the successor of the previously stored transition. On the
    # very first store there is no predecessor; after a wrap index-1 == -1
    # addresses the last slot, which is the predecessor.
    if self.mem_counter > 0:
      self.new_state_memory[index-1] = state
    self.mem_counter += 1

  def sample(self,batch_size):
    """Draw batch_size transitions uniformly, with replacement.

    The most recently written slot is skipped whenever another slot is
    available, because its next state is only filled in by the following
    store_transition call.

    Returns:
      (state, new_state, action, reward, terminal) arrays whose first
      dimension is batch_size.

    Raises:
      ValueError: from np.random.choice if nothing has been stored yet.
    """
    filled = min(self.mem_counter,self.mem_size)
    candidates = np.arange(filled)
    if filled > 1:
      newest = (self.mem_counter - 1) % self.mem_size
      candidates = candidates[candidates != newest]
    batch = np.random.choice(candidates,batch_size)

    state = self.state_memory[batch]
    new_state = self.new_state_memory[batch]
    reward = self.reward_memory[batch]
    action = self.action_memory[batch]
    terminal = self.terminal_state[batch]

    return state,new_state,action,reward,terminal

  def clear_memory(self):
    """Reset every array to zeros and restart writing at slot 0."""
    self.state_memory = np.zeros((self.mem_size,self.input_shape))
    self.action_memory = np.zeros((self.mem_size,self.num_actions),dtype = np.int8)
    self.reward_memory = np.zeros(self.mem_size)
    self.new_state_memory = np.zeros((self.mem_size,self.input_shape))
    self.terminal_state = np.zeros(self.mem_size,dtype=np.float32)
    # Start at 0 so the first transition lands in slot 0 and no never-written
    # slot is ever eligible for sampling.
    self.mem_counter = 0

In [4]:
from keras.engine.training import optimizers
def build_dqn(lr,input_dims,num_actions,unit_size):
  """Build and compile a two-hidden-layer fully connected Q-network.

  Args:
    lr: learning rate handed to the Adam optimizer.
    input_dims: length of the flat observation vector.
    num_actions: number of outputs (one per action, no output activation).
    unit_size: width of each of the two ReLU hidden layers.

  Returns:
    The compiled Sequential model, trained with mean-squared-error loss.
  """
  layer_stack = [
      Dense(unit_size,input_shape=(input_dims,)),
      Activation('relu'),
      Dense(unit_size),
      Activation('relu'),
      Dense(num_actions),
  ]
  network = Sequential(layer_stack)
  network.compile(
      loss = 'mse',
      optimizer = keras.optimizers.Adam(learning_rate = lr),
  )
  return network

In [5]:
!pip install pettingzoo[mpe]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
class Agent(object):
  """Epsilon-greedy Double-DQN agent that owns its replay buffer and networks.

  Args:
    agent_name: identifier of the agent; also used in the save path.
    num_actions: number of discrete actions.
    alpha: learning rate for both networks.
    gamma: discount factor.
    batch_size: number of transitions per learning step.
    input_dims: length of the flat observation vector.
    episolon: initial exploration probability.
    episolon_dec: multiplicative decay applied after each learning step.
    episolon_end: floor for the exploration probability.
    mem_size: capacity of the replay buffer.
    fname: prefix of the save path; the agent name is appended to it.
    replace_target: the target network is refreshed whenever the buffer's
      write counter is a multiple of this value.
  """

  def __init__(self,agent_name, num_actions,alpha,gamma,batch_size,input_dims,episolon,
              episolon_dec=0.996,episolon_end = 0.01,
              mem_size = 100, fname = 'dqn_model',replace_target=100 ):
    self.action_space = [i for i in range(num_actions)]
    self.n_actions = num_actions
    self.gamma = gamma
    self.alpha = alpha
    self.episolon = episolon
    # Honour the caller's buffer size (it used to be hard-coded to 1000).
    self.mem_size = mem_size
    self.replace_target = replace_target
    self.batch_size = batch_size
    self.input_dims = input_dims
    self.episolon_dec = episolon_dec
    self.episolon_end = episolon_end
    self.agent_name = agent_name
    # With the default fname this is the same 'dqn_model_<agent>' path as before.
    self.file_name = fname + '_' + self.agent_name

    self.memory = ReplayBuffer(self.mem_size,input_dims,self.n_actions,)
    self.dqn_eval = build_dqn(self.alpha,self.input_dims,self.n_actions,256)
    self.q_target = build_dqn(self.alpha,self.input_dims,self.n_actions,256)
    # Start the target network from the same weights as the online network.
    self.update_network_parameters()

  def remember(self,state,action,reward,done):
    """Store one transition in this agent's replay buffer."""
    self.memory.store_transition(state,action,reward,done)

  def choose_action(self,observation):
    """Return a random action with probability episolon, else the greedy one."""
    state = np.expand_dims(observation,0)
    rand = random.random()
    if rand<self.episolon:
      action = random.randint(0,self.n_actions-1)
    else:
      actions = self.dqn_eval(state)
      action = np.argmax(actions)

    return action

  def learn(self,episode):
    """Run one Double-DQN update on a sampled batch.

    Steps:
      1. Sample the buffer. Non-sequential memories are used because
         consecutive ones are correlated, which makes the agent focus on a
         narrow part of the parameter space and slows down learning.
      2. Go from one-hot encoding to integer encoding of the actions.
      3. Calculate the current value of the state and of the next state.
      4. Update the target using the next-state action picked by the online
         network and valued by the target network.
      5. Use that target as the regression target for the online network.

    Does nothing until the buffer's write counter reaches batch_size.
    The episode argument is accepted for the caller's convenience and unused.
    """
    if self.memory.mem_counter<self.batch_size:
      return

    state,new_state,action,reward,terminal = self.memory.sample(self.batch_size)

    # One-hot rows -> integer action indices.
    action_indices = np.argmax(action,axis=1)

    q_pred = self.dqn_eval.predict(state,verbose = 0)
    q_next_online = self.dqn_eval.predict(new_state,verbose = 0)
    q_next_target = self.q_target.predict(new_state,verbose = 0)

    # Double DQN: the online network selects the next action, the target
    # network evaluates it.
    max_actions = np.argmax(q_next_online,axis=1)

    batch_index = np.arange(self.batch_size,dtype = np.int32)

    q_target = q_pred.copy()
    # `terminal` is the sampled 1 - done mask, so the bootstrap term vanishes
    # for transitions that ended an episode.
    q_target[batch_index, action_indices] = reward + self.gamma*q_next_target[batch_index, max_actions]*terminal

    _ = self.dqn_eval.fit(state,q_target,verbose = 0)

    self.episolon = self.episolon*self.episolon_dec if self.episolon > self.episolon_end else self.episolon_end

    if self.memory.mem_counter % self.replace_target == 0:
      self.update_network_parameters()

  def update_network_parameters(self):
    """Copy the online network's weights into the target network.

    The weights are copied rather than the model object re-assigned, so the
    target stays a separate, frozen network between refreshes.
    """
    self.q_target.set_weights(self.dqn_eval.get_weights())

  def save_model(self):
    """Save the online network to this agent's file path."""
    self.dqn_eval.save(self.file_name)

  def load_model(self):
    """Load saved weights into the online network and mirror them to the target."""
    self.dqn_eval.load_weights(self.file_name)
    self.update_network_parameters()


In [7]:
from pettingzoo.mpe import simple_speaker_listener_v3
# Speaker/listener environment with discrete actions, 25 cycles per episode.
env = simple_speaker_listener_v3.env(
    max_cycles=25,
    continuous_actions=False,
)
env.reset()

In [8]:
# One independent DQN agent per environment agent, keyed by agent name.
agent_net = {}
for a in env.agents:
  obs_space = env.observation_space(a).shape
  act_space = env.action_space(a).n
  # Exploration starts fully random and decays towards episolon_end. The old
  # start value (0.009) was already below the 0.01 floor, so the decay
  # schedule never ran and the agents almost never explored.
  agent = Agent(agent_name = a,num_actions = act_space,alpha = 1e-5,gamma = 0.95,batch_size = 64,input_dims = obs_space[0],episolon = 1.0,
              episolon_dec=0.996,episolon_end = 0.01,
              mem_size = 10000, fname = 'dqn_model',replace_target=100)
  agent_net[a] = agent

In [10]:
env.reset()
n_games = 2500
ddqn_scores = []
avg_scores=[]
eps_history = []
best_score = -10000
# Snapshot the agent names: env.agents is emptied as agents finish an episode.
agent_list = list(env.agents)
for i in range(n_games):
  score  = 0
  env.reset()
  # (observation, action) each agent is still waiting to be rewarded for.
  pending = {}
  for a in env.agent_iter():
    # env.last() reports the reward earned since this agent last acted, i.e.
    # the reward for its PREVIOUS action, together with the current flags.
    observation,reward,done,trunc,_= env.last()
    score += reward

    if a in pending:
      prev_observation,prev_act = pending.pop(a)
      # The buffer takes its next state from the following stored state, so
      # a truncated episode must be masked like a terminal one; otherwise
      # the next episode's first observation would be used for bootstrapping.
      agent_net[a].remember(prev_observation,prev_act,reward,done or trunc)
      agent_net[a].learn(i)

    if done or trunc:
      # Finished agents must be stepped with None.
      env.step(None)
    else:
      act = agent_net[a].choose_action(observation)
      pending[a] = (observation,act)
      env.step(act)

  ddqn_scores.append(score)
  avg_score = np.mean(ddqn_scores[-100:])

  # Checkpoint every agent whenever the 100-episode running mean improves.
  if (avg_score > best_score):
    best_score = avg_score
    for a in agent_list:
       agent_net[a].save_model()

  print('episode: ', i,'score: %.2f' % score,' average score %.2f' % avg_score)
  avg_scores.append(avg_score)
  

episode:  0 score: -169.72  average score -169.72
episode:  1 score: -111.37  average score -140.55


KeyboardInterrupt: ignored

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

# Smooth the per-episode scores with a Savitzky-Golay filter and plot them.
SMOOTH_WINDOW = 101     # preferred window length, in episodes
SMOOTH_POLYORDER = 3    # polynomial order fitted inside each window

# savgol_filter needs window_length <= len(data) and polyorder < window_length,
# so shrink the window to the nearest odd size for short (e.g. interrupted)
# runs and fall back to the raw scores when even that is too small.
window = min(SMOOTH_WINDOW, len(ddqn_scores))
if window % 2 == 0:
  window -= 1
if window > SMOOTH_POLYORDER:
  yhat = savgol_filter(ddqn_scores, window, SMOOTH_POLYORDER)
else:
  yhat = np.asarray(ddqn_scores, dtype=float)

plt.plot(yhat, color='red')
plt.xlabel('episode')
plt.ylabel('smoothed score')
plt.title('Training score per episode')
plt.show()

In [None]:
# Dump the raw per-episode scores accumulated in ddqn_scores.
print(ddqn_scores)