<a href="https://colab.research.google.com/github/luthandonx/Multi-Agent-Reinforcement-Learining/blob/PPO-on-Simple-Spread/Simple_Spread_PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pettingzoo[mpe]

In [None]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Input,Dropout
import keras.backend as K
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.python.ops.numpy_ops import np_config
import numpy as np
import tensorflow_probability as tfp

np_config.enable_numpy_behavior()


In [None]:
class Replay:
  """Fixed-capacity ring buffer of (state, action, reward, next_state, done) rows.

  The next state of a transition is not passed in explicitly: it is filled in
  when the *following* transition is stored, using that transition's
  observation. The most recently stored row therefore has a stale (or
  all-zero) next state until another transition arrives.
  """

  def __init__(self,max_mem,n_weights,obsv_shape):
    """Allocate the storage arrays.

    Args:
      max_mem: number of transitions the buffer can hold.
      n_weights: width of each action row.
      obsv_shape: length of a single (flat) observation vector.
    """
    self.max_mem = max_mem
    self.n_weights = n_weights
    self.mem_counter = 0  # total number of transitions ever stored

    self.state_memory = np.zeros((self.max_mem, obsv_shape))
    self.action_memory = np.zeros((self.max_mem, n_weights))
    self.reward_memory = np.zeros(self.max_mem)
    self.next_state_memory = np.zeros((self.max_mem, obsv_shape))
    self.terminal_memory = np.zeros(self.max_mem, dtype=bool)

  def store_transition(self,obsv,action,reward,done):
    """Write one transition, overwriting the oldest row once the buffer is full.

    A scalar `action` is broadcast by numpy across all `n_weights` columns of
    its row.
    """
    index = self.mem_counter % self.max_mem
    self.state_memory[index] = obsv
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.terminal_memory[index] = done

    if self.mem_counter > 0:
      # This observation is the successor of the previously stored row.
      # When index is 0 after a wrap-around, index - 1 == -1 addresses the
      # last row, which is exactly the previous write position.
      self.next_state_memory[index - 1] = obsv

    # Advance exactly once per transition; a second increment would skip
    # every other slot and misalign next_state_memory.
    self.mem_counter += 1

  def sample_buffer(self,batch_indices):
    """Return the (states, actions, rewards, next_states, dones) rows at `batch_indices`."""
    states = self.state_memory[batch_indices]
    actions = self.action_memory[batch_indices]
    rewards = self.reward_memory[batch_indices]
    next_states = self.next_state_memory[batch_indices]
    dones = self.terminal_memory[batch_indices]

    return states,actions,rewards,next_states,dones

  def return_indices(self,batch_size):
    """Draw `batch_size` random row indices (with replacement) from the filled rows.

    Raises:
      ValueError: from `np.random.choice` when nothing has been stored yet.
    """
    # Only rows that have been written are valid: everything stored so far,
    # capped at the buffer capacity once it has wrapped around.
    mem_size = min(self.mem_counter, self.max_mem)
    return np.random.choice(mem_size, batch_size)
    

In [None]:

class ActorNet(keras.Model):
  """Policy network: two ReLU hidden layers followed by a softmax output layer."""

  def __init__(self,layer_size,n_weights):
    """Create the layers.

    Args:
      layer_size: number of units in each of the two hidden layers.
      n_weights: number of units in the softmax output layer.
    """
    super().__init__()
    self.layer_size = layer_size
    hidden_units = self.layer_size

    # No input shape is declared here; Keras infers it on the first call.
    self.layer_one = Dense(hidden_units, activation='relu')
    self.layer_two = Dense(hidden_units, activation='relu')
    self.policy = Dense(n_weights, activation='softmax')

  def call(self,state):
    """Return the softmax output of the network for `state`."""
    hidden = self.layer_two(self.layer_one(state))
    return self.policy(hidden)

In [None]:
class CriticNet(keras.Model):
  """Value network over (state, action) pairs.

  Two ReLU hidden layers followed by a single linear output unit.
  """

  def __init__(self,layer_size):
    """Create the layers.

    Args:
      layer_size: number of units in each of the two hidden layers.
    """
    super(CriticNet,self).__init__()
    self.layer_size = layer_size

    # No input shape is declared here; Keras infers it on the first call.
    self.layer_one = Dense(self.layer_size,activation = 'relu')
    self.layer_two = Dense(self.layer_size,activation = 'relu')
    self.action_value = Dense(1)

  def call(self,inputs):
    """Return the network output for a `(states, actions)` pair.

    Args:
      inputs: a two-element sequence `(states, actions)`; both are
        concatenated along axis 1, so they must agree on every other axis.
    """
    states,actions = inputs
    # Named `features` rather than `input` so the builtin is not shadowed.
    features = tf.concat([states,actions],axis = 1)
    hidden = self.layer_two(self.layer_one(features))
    return self.action_value(hidden)
  

In [None]:
class Agent:
  """Actor-critic learner with target networks, a replay buffer and a clipped (PPO-style) policy loss.

  The critic scores (state, action) pairs, where the action is fed in as a
  single float column holding the discrete action index.
  """

  # (attribute name, path suffix appended to `file_name`) for every network
  # that is saved and loaded.
  _MODEL_SUFFIXES = (
    ('Actor', 'Actor'),
    ('Critic', 'Critic'),
    ('Target_Actor', 'Target Actor'),
    ('Target_Critic', 'Target Critic'),
  )

  def __init__(self,alpha,beta,gamma,lambdah,layer_size ,mem_size,n_weights,obsv_shape, batch_size, file_name,tau):
    """Build the four networks, their optimizers and the replay buffer.

    Args:
      alpha: Adam learning rate for the actor.
      beta: Adam learning rate for the critic.
      gamma: discount factor.
      lambdah: GAE decay factor.
      layer_size: hidden-layer width of every network.
      mem_size: replay buffer capacity.
      n_weights: number of discrete actions (actor output width).
      obsv_shape: length of a flat observation vector.
      batch_size: number of transitions sampled per `learn` call.
      file_name: prefix of the paths used by `save_models` / `load_models`.
      tau: interpolation factor of the soft target update.
    """
    self.alpha = alpha
    self.beta = beta

    self.Actor = ActorNet(layer_size,n_weights)
    self.Critic = CriticNet(layer_size)
    self.Target_Actor = ActorNet(layer_size,n_weights)
    self.Target_Critic = CriticNet(layer_size)

    # The online networks are the ones that are trained, so they own the
    # optimizers used in `learn`. The targets are still compiled as before.
    self.actor_optimizer = keras.optimizers.Adam(learning_rate = alpha)
    self.critic_optimizer = keras.optimizers.Adam(learning_rate = beta)
    self.Actor.compile(optimizer = self.actor_optimizer)
    self.Critic.compile(optimizer = self.critic_optimizer)
    self.Target_Actor.compile(optimizer = keras.optimizers.Adam(learning_rate = alpha))
    self.Target_Critic.compile(optimizer = keras.optimizers.Adam(learning_rate = beta))

    self.max_mem = mem_size
    self.n_weights = n_weights
    self.obsv_shape = obsv_shape
    self.Memory = Replay(self.max_mem, self.n_weights,self.obsv_shape )

    self.file_name = file_name
    self.tau = tau
    self.batch_size = batch_size
    self.gamma = gamma
    self.learnCounter = 0
    self.lambdah = lambdah # suggested value = 0.95

    self.clipping_value = 0.2
    self.critic_discount = 0.5
    self.entropy_beta = 0.001

    # Run each network once on dummy input so its weights exist, then start
    # the targets as exact copies of the online networks instead of leaving
    # them at an unrelated random initialisation.
    dummy_state = tf.zeros((1, obsv_shape), dtype = tf.float32)
    dummy_action = tf.zeros((1, 1), dtype = tf.float32)
    self.Actor(dummy_state)
    self.Target_Actor(dummy_state)
    self.Critic((dummy_state, dummy_action))
    self.Target_Critic((dummy_state, dummy_action))
    self.Target_Actor.set_weights(self.Actor.get_weights())
    self.Target_Critic.set_weights(self.Critic.get_weights())

  def choose_action(self,observation):
    """Sample a discrete action index from the actor's output for one observation.

    The sampled tensor is also kept in `self.action`.
    """
    state = tf.convert_to_tensor(np.asarray(observation, dtype = np.float32)[np.newaxis, ...])
    probs = self.Actor(state)
    action = tfp.distributions.Categorical(probs = probs).sample()
    self.action = action
    return action.numpy()[0]

  def update_network_params(self):
    """Soft-update both target networks towards their online counterparts."""
    self._soft_update(self.Actor, self.Target_Actor)
    self._soft_update(self.Critic, self.Target_Critic)

  def _soft_update(self, online, target):
    """Set `target` weights to tau * online + (1 - tau) * target."""
    tau = self.tau
    blended = [
      tau * online_w + (1.0 - tau) * target_w
      for online_w, target_w in zip(online.get_weights(), target.get_weights())
    ]
    target.set_weights(blended)

  def store_transition(self,state,action,reward,done):
    """Forward one transition to the replay buffer."""
    self.Memory.store_transition(state,action,reward,done)

  def save_models(self):
    """Save all four networks under `file_name` + suffix."""
    print('Saving the models...')
    for attr, suffix in self._MODEL_SUFFIXES:
      getattr(self, attr).save(self.file_name + suffix)

  def load_models(self):
    """Load all four networks from the paths written by `save_models`."""
    print('Loading the models......')
    for attr, suffix in self._MODEL_SUFFIXES:
      setattr(self, attr, keras.models.load_model(self.file_name + suffix))
    # The optimizers used by `learn` must track the freshly loaded variables,
    # so they are recreated here rather than reused.
    self.actor_optimizer = keras.optimizers.Adam(learning_rate = self.alpha)
    self.critic_optimizer = keras.optimizers.Adam(learning_rate = self.beta)

  def _sample_action_column(self, probs):
    """Sample one action index per row of `probs`, as a float32 column of shape (batch, 1)."""
    sampled = tfp.distributions.Categorical(probs = probs).sample()
    return tf.cast(tf.expand_dims(sampled, axis = 1), tf.float32)

  def _gae(self, rewards, state, state_, dones, actions = None):
    """Compute per-row returns and advantages; both are lists of shape-(1,) tensors."""
    # Bootstrap value: target critic on the NEXT state, with an action drawn
    # from the target actor for that same next state.
    next_actions = self._sample_action_column(self.Target_Actor(state_))
    values_ = self.Target_Critic((state_, next_actions))

    # Current value: online critic on the CURRENT state. Without explicit
    # actions, fall back to sampling them from the online actor.
    if actions is None:
      actions = self._sample_action_column(self.Actor(state))
    values = self.Critic((state, actions))

    not_done = 1.0 - np.asarray(dones, dtype = np.float32)

    # NOTE(review): the recursion below treats row i+1 as the successor of
    # row i. `learn` passes a randomly sampled batch, for which that does not
    # hold -- confirm whether contiguous trajectories should be sampled instead.
    returns = []
    advantages = []
    gae = 0
    for i in reversed(range(len(rewards))):
      delta = rewards[i] + self.gamma * values_[i] * not_done[i] - values[i]
      gae = delta + self.gamma * self.lambdah * not_done[i] * gae
      advantages.append(gae)
      returns.append(gae + values[i])
    # Built back-to-front with O(1) appends; restore the batch order.
    returns.reverse()
    advantages.reverse()
    return returns, advantages

  def get_advantages(self, rewards, state, state_, dones, actions = None):
    """Return the GAE-based returns (advantage + critic value) for a batch.

    Args:
      rewards: per-row rewards.
      state: batch of current states.
      state_: batch of next states.
      dones: per-row terminal flags; a set flag cuts off bootstrapping.
      actions: optional (batch, 1) float column of action indices to score
        with the critic. When omitted, actions are sampled from the actor.

    Returns:
      A list with one shape-(1,) tensor per row.
    """
    returns, _ = self._gae(rewards, state, state_, dones, actions)
    return returns

  def learn(self):
    """Run one gradient step on the actor and the critic from a sampled batch.

    Does nothing until the buffer has received at least `batch_size`
    transitions. Every 100th call also soft-updates the target networks.
    """
    if self.Memory.mem_counter < self.batch_size :
      return

    self.learnCounter += 1
    indices = self.Memory.return_indices(self.batch_size)
    states,actions,rewards,next_states,dones = self.Memory.sample_buffer(indices)

    tf_states = tf.convert_to_tensor(states, dtype = tf.float32)
    tf_states_ = tf.convert_to_tensor(next_states, dtype = tf.float32)
    tf_rewards = tf.convert_to_tensor(rewards, dtype = tf.float32)

    # The buffer holds a scalar action index broadcast across a whole row, so
    # the first column recovers it.
    # NOTE(review): assumes callers store scalar indices (as `choose_action`
    # returns), not one-hot vectors -- confirm.
    taken = tf.convert_to_tensor(actions[:, :1], dtype = tf.float32)
    taken_one_hot = tf.one_hot(tf.cast(taken[:, 0], tf.int32), int(self.n_weights))

    returns, advantages = self._gae(tf_rewards, tf_states, tf_states_, dones, actions = taken)
    # Both are fixed targets for this step: shape (batch, 1), no gradient.
    returns = tf.stop_gradient(tf.reshape(tf.stack(returns), (-1, 1)))
    advantages = tf.stop_gradient(tf.reshape(tf.stack(advantages), (-1, 1)))

    # NOTE(review): the target actor serves as the "old" policy of the
    # clipped ratio. The buffer does not record the probabilities the actions
    # were actually sampled with -- confirm this is the intended reference.
    old_probs = tf.reduce_sum(self.Target_Actor(tf_states) * taken_one_hot, axis = 1, keepdims = True)

    with tf.GradientTape() as tape:
      new_probs_all = self.Actor(tf_states)
      new_probs = tf.reduce_sum(new_probs_all * taken_one_hot, axis = 1, keepdims = True)
      ratio = tf.exp(tf.math.log(new_probs + 1e-10) - tf.math.log(old_probs + 1e-10))
      clipped = tf.clip_by_value(ratio, 1 - self.clipping_value, 1 + self.clipping_value)
      surrogate = tf.minimum(ratio * advantages, clipped * advantages)
      entropy = tf.reduce_mean(-(new_probs_all * tf.math.log(new_probs_all + 1e-10)))
      actor_loss = -tf.reduce_mean(surrogate) - self.entropy_beta * entropy
    actor_vars = self.Actor.trainable_variables
    self.actor_optimizer.apply_gradients(zip(tape.gradient(actor_loss, actor_vars), actor_vars))

    with tf.GradientTape() as tape:
      values = self.Critic((tf_states, taken))
      # Both operands are (batch, 1), so the difference is element-wise.
      critic_loss = self.critic_discount * tf.reduce_mean(tf.square(returns - values))
    critic_vars = self.Critic.trainable_variables
    self.critic_optimizer.apply_gradients(zip(tape.gradient(critic_loss, critic_vars), critic_vars))

    if (self.learnCounter % 100 == 0):
      self.update_network_params()
      self.learnCounter = 0


    

In [None]:
from pettingzoo.mpe import simple_spread_v2

# Simple Spread with two agents and discrete actions (max_cycles=25).
env = simple_spread_v2.env(
  N=2,
  local_ratio=0.5,
  max_cycles=25,
  continuous_actions=False,
)
# The agent list and spaces are read from `env` after this reset.
env.reset()

In [None]:
# One independent learner per environment agent, keyed by the agent's name.
agent_scores = {}
policy_net = {}
for a in env.agents:
  # Network sizes come from the agent's own observation and action spaces.
  obsv = env.observation_space(a).shape[0]
  n_action = env.action_space(a).n
  agent = Agent(
    alpha = 1e-5,
    beta = 1e-5,
    gamma = 0.65,
    lambdah = 0.2,
    layer_size = 256,
    mem_size = 10000,
    n_weights = n_action,
    obsv_shape = obsv,
    batch_size = 64,
    file_name = a,  # the agent name doubles as the save-path prefix
    tau = 0.05,
  )
  policy_net[a] = agent

In [None]:

# Training loop: play `n_games` episodes, letting every agent learn after each
# of its own completed transitions.
agent_list = env.agents

n_games = 1000
best_score = -10000
score_history = []
model_loaded = False

for i in range(n_games):
  env.reset()
  score = 0
  # (observation, action) each agent has played but not yet been rewarded for.
  pending = {}
  for agent in env.agent_iter():
    # The reward reported here belongs to the agent's PREVIOUS action, so it
    # is paired with the pending (observation, action), not with the action
    # chosen in this turn.
    obsv,reward,done,trunc,_ = env.last()
    finished = done or trunc
    score += reward

    if agent in pending:
      prev_obsv, prev_action = pending.pop(agent)
      policy_net[agent].store_transition(prev_obsv, prev_action, reward, finished)
      policy_net[agent].learn()

    if finished:
      # A finished agent must be stepped with None.
      action = None
    else:
      action = policy_net[agent].choose_action(obsv)
      pending[agent] = (obsv, action)

    # Step exactly once per turn; a second step would act on behalf of the
    # next agent with this agent's action.
    env.step(action)

  score_history.append(score)
  avg_score = np.mean(score_history[-100:])

  if (avg_score > best_score):
    best_score = avg_score

  print('episode: ',i ,'score: ',score,' avg_score: ', avg_score)

In [None]:
# Dump the per-episode total scores collected by the training loop.
print(score_history)