<a href="https://colab.research.google.com/github/luthandonx/Multi-Agent-Reinforcement-Learining/blob/Simple-Listener-Speaker/Maddpg_Simple_Speaker_Listiner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pettingzoo[mpe]

In [None]:
from pettingzoo.mpe import simple_speaker_listener_v3
# Discrete-action speaker/listener task; every episode is capped at 25 cycles.
env = simple_speaker_listener_v3.env(
    max_cycles=25,
    continuous_actions=False,
)
# Reset once so the environment is initialised before it is queried.
env.reset()

In [None]:
import numpy as np

class Replay():
  """Fixed-size circular buffer of (state, action, reward, next state, done).

  The next state is not passed in explicitly: each call to ``store_trans``
  also writes its ``state`` as the next state of the previously stored slot.
  The most recently stored slot therefore has no valid next state until the
  following call.
  """

  def __init__(self,max_size,n_actions,input_shape):
    """Allocate zero-filled storage.

    Args:
      max_size: number of slots in the buffer.
      n_actions: width of one action row.
      input_shape: length of one (flat) state vector.
    """
    self.mem_size = max_size
    self.mem_counter = 0  # total number of transitions ever stored
    self.state_memory = np.zeros((self.mem_size,input_shape))
    self.new_state_memory = np.zeros((self.mem_size,input_shape))
    self.action_memory = np.zeros((self.mem_size,n_actions))
    self.reward_memory = np.zeros(self.mem_size)
    self.terminal_memory = np.zeros(self.mem_size,dtype = bool)

  def store_trans(self,state,action,reward,done):
    """Write one transition into the next slot, overwriting the oldest when full.

    A scalar ``action`` is broadcast over the whole action row.
    """
    index = self.mem_counter % self.mem_size
    self.state_memory[index] = state
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.terminal_memory[index] = done

    # The current state is the successor of the previous slot. When index is
    # 0 after a wrap, index-1 == -1 addresses the last slot, which is the
    # previously written one.
    if (self.mem_counter > 0):
      self.new_state_memory[index-1] = state
    self.mem_counter += 1

  def sample_buffer(self,batch):
    """Return (states, actions, rewards, next states, dones) for the slot indices in ``batch``."""
    self.batch = batch  # remembers the last requested indices
    states = self.state_memory[batch]
    actions = self.action_memory[batch]
    rewards = self.reward_memory[batch]
    states_ = self.new_state_memory[batch]
    dones = self.terminal_memory[batch]

    return   states,actions,rewards,states_,dones

  def returnSample(self,batch_size):
    """Draw ``batch_size`` slot indices (with replacement) from the filled part of the buffer.

    The upper bound is the number of slots that actually hold data, so the
    whole history is sampled and indices never exceed the buffer size.
    numpy raises ValueError if nothing has been stored yet.
    """
    filled = min(self.mem_counter,self.mem_size)
    batch = np.random.choice(filled,batch_size)
    return batch

  def returnshit(self):
    """Return the raw state storage array (debug helper; name kept for existing callers)."""
    return self.state_memory

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense

class CriticNet(keras.Model):
  """Q-value network: two ReLU hidden layers followed by a single linear output."""

  def __init__(self,n_actions, dim_layer):
    """Create the layers.

    Args:
      n_actions: stored on the instance; not used to size any layer.
      dim_layer: number of units in each hidden layer.
    """
    super(CriticNet,self).__init__()
    # Honour the requested width; it used to be overwritten by a fixed 128.
    self.dim_size = dim_layer
    self.n_actions = n_actions
    self.layer_one = Dense(self.dim_size,activation='relu')
    self.layer_two = Dense(self.dim_size,activation='relu')
    self.q_value = Dense(1,activation=None)

  def call(self,inputs):
    """Score a batch of state/action pairs.

    Args:
      inputs: a ``(states, actions)`` pair; the two are concatenated along
        axis 1 before entering the first hidden layer.

    Returns:
      The output of the final one-unit linear layer.
    """
    states,actions = inputs
    input_thing = tf.concat([states, actions],axis = 1)
    action_value = self.layer_one(input_thing)
    action_value = self.layer_two(action_value)

    q = self.q_value(action_value)

    return q

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense

class ActorNet(keras.Model):
  """Policy network: two ReLU hidden layers and a softmax head over the actions."""

  def __init__(self,n_actions, dim_layer):
    """Build the layers.

    Args:
      n_actions: number of units in the softmax output layer.
      dim_layer: number of units in each hidden layer.
    """
    super().__init__()
    self.dim_size = dim_layer
    self.n_actions = n_actions

    # Layers are created in the same order as they are applied in call().
    self.layer_one = Dense(dim_layer, activation = 'relu')
    self.layer_two = Dense(dim_layer, activation = 'relu')
    self.policy = Dense(n_actions, activation = 'softmax')

  def call(self,state):
    """Return the softmax action probabilities for a batch of states."""
    hidden = self.layer_two(self.layer_one(state))
    return self.policy(hidden)

In [None]:
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp


class Agent:
  """Actor/critic learner for one agent in a two-agent, centralised-critic setup.

  Each agent owns a replay buffer, an actor, a critic and a target copy of
  each. The critic scores the concatenated states and action indices of this
  agent and one partner agent (see ``learn``).
  """

  def __init__(self,input_dims,alpha = 0.01,beta =1e-5 , env = None ,
               gamma = 0.95, n_actions = 5, max_size = 10000, tau = 0.001,
               layer_size = 256, batch_size = 50,noise = 0.1, name = None):
    """Create the networks, optimizers and replay buffer.

    Args:
      input_dims: length of this agent's flat observation vector.
      alpha: learning rate of the actor optimizers.
      beta: learning rate of the critic optimizers.
      env: accepted for call compatibility; not used.
      gamma: discount factor.
      n_actions: number of discrete actions.
      max_size: replay buffer capacity.
      tau: default blend factor for target-network updates.
      layer_size: hidden width handed to the networks.
      batch_size: number of transitions per learning step.
      noise: accepted for call compatibility; not used.
      name: prefix of the checkpoint names; 'agent' is used when None.
    """
    self.gamma = gamma
    self.tau = tau
    self.memory = Replay(max_size,n_actions,input_dims)
    self.batch_size = batch_size
    self.n_actions = n_actions

    # The default name is None, which cannot be concatenated with a string.
    prefix = 'agent' if name is None else name
    self.act_name = prefix+' actor'
    self.cri_name = prefix+' critc'
    self.tar_act_name = prefix+' target_actor'
    self.tar_cri_name = prefix+' target_critc'

    self.actor = ActorNet(n_actions = n_actions, dim_layer = layer_size)
    self.critic = CriticNet(n_actions,layer_size)

    self.target_actor = ActorNet(n_actions = n_actions, dim_layer = layer_size)
    self.target_critic = CriticNet(n_actions,layer_size)

    self.actor.compile(keras.optimizers.Adam(learning_rate=alpha))
    self.target_actor.compile(keras.optimizers.Adam(learning_rate=alpha))

    self.critic.compile(keras.optimizers.Adam(learning_rate=beta))
    self.target_critic.compile(keras.optimizers.Adam(learning_rate=beta))

    self.learnCounter = 0
    # The sub-classed models own no variables until their first forward pass,
    # so the initial online -> target copy is deferred to the first learn().
    self._targets_synced = False

  @staticmethod
  def _blend_into(online,target,tau):
    """Set target weights to tau * online + (1 - tau) * target."""
    targets = target.weights
    weights = []
    for i,weight in enumerate(online.weights):
      weights.append(weight * tau + targets[i] *(1-tau))
    target.set_weights(weights)

  def Update_net_parameters(self,tau=None):
    """Blend the online networks into the target networks.

    Args:
      tau: blend factor; ``self.tau`` when None. ``tau=1`` is a plain copy.
    """
    if tau is None:
      tau = self.tau

    self._blend_into(self.actor,self.target_actor,tau)
    self._blend_into(self.critic,self.target_critic,tau)

  def store_transition(self,state,action,reward,done):
    """Append one transition to this agent's replay buffer."""
    self.memory.store_trans(state,action,reward,done)

  def save_models(self):
    """Save the weights of all four networks, then the full actor model."""
    print('....saving models....')
    self.actor.save_weights(self.act_name)
    self.critic.save_weights(self.cri_name)
    self.target_actor.save_weights(self.tar_act_name)
    self.target_critic.save_weights(self.tar_cri_name)

    self.actor.save(self.act_name)

  def load_models(self):
    """Load the weights of all four networks from their checkpoint names."""
    print('....loading models....')
    self.actor.load_weights(self.act_name)
    self.critic.load_weights(self.cri_name)
    self.target_actor.load_weights(self.tar_act_name)
    self.target_critic.load_weights(self.tar_cri_name)
    # Loaded target weights must not be overwritten by the first-step copy.
    self._targets_synced = True

  def choose_action(self,observation,evaluate = False):
    """Pick an action index for a single observation.

    Args:
      observation: one observation vector.
      evaluate: when True return the most probable action; otherwise sample
        from the actor's categorical distribution.
    """
    state = tf.convert_to_tensor([observation])
    probs = self.actor(state)
    if evaluate:
      return tf.argmax(probs,axis = 1).numpy()[0]
    action_probs = tfp.distributions.Categorical(probs = probs)
    action = action_probs.sample()
    action = action.numpy()[0]
    return action

  @staticmethod
  def _action_column(action_rows):
    """Return column 0 of stored action rows as a (batch, 1) float32 tensor.

    The critic is fed one action index per agent, taken from the first column
    of the buffer rows. NaN entries are replaced by 0 so that a placeholder
    written for a finished agent cannot poison the critic (assumes such
    placeholders show up as NaN -- confirm against the code that stores them).
    """
    column = tf.convert_to_tensor(action_rows[:, :1],dtype = tf.float32)
    return tf.where(tf.math.is_nan(column),tf.zeros_like(column),column)

  @staticmethod
  def _sample_actions(policy,states):
    """Sample one action index per row from ``policy`` as a (batch, 1) float32 tensor."""
    picks = tfp.distributions.Categorical(probs = policy(states)).sample()
    return tf.cast(tf.expand_dims(picks,axis = 1),tf.float32)

  def _sync_targets_once(self,states_1,new_states_1,states,actions):
    """On the first learning step, make the target networks exact copies."""
    if self._targets_synced:
      return
    # One forward pass through each network creates its variables.
    self.actor(states_1)
    self.target_actor(new_states_1)
    self.critic((states,actions))
    self.target_critic((states,actions))
    self.Update_net_parameters(tau = 1)
    self._targets_synced = True

  def learn(self,agents):
    """Run one critic update and one actor update from replayed transitions.

    Args:
      agents: list whose first element is the partner ``Agent``. Both replay
        buffers are read at the same slot indices, so they are expected to be
        filled in lock-step.

    Does nothing until both buffers hold at least ``batch_size`` transitions.
    Every 100 learning steps the target networks are blended with ``self.tau``.
    """
    agent_2 = agents[0]
    if (self.memory.mem_counter < self.batch_size
        or agent_2.memory.mem_counter < self.batch_size):
      return

    self.learnCounter += 1

    samples = self.memory.returnSample(self.batch_size)

    state_1,action_1,reward_1,new_state_1,done_1 = self.memory.sample_buffer(samples)
    state_2,action_2,_,new_state_2,_ = agent_2.memory.sample_buffer(samples)

    # Everything is float32 so tensors can be concatenated and multiplied.
    states_1 = tf.convert_to_tensor(state_1,dtype = tf.float32)
    new_states_1 = tf.convert_to_tensor(new_state_1,dtype = tf.float32)
    rewards_1 = tf.convert_to_tensor(reward_1,dtype = tf.float32)
    states_2 = tf.convert_to_tensor(state_2,dtype = tf.float32)
    new_states_2 = tf.convert_to_tensor(new_state_2,dtype = tf.float32)
    not_done_1 = 1.0 - tf.cast(tf.convert_to_tensor(done_1),tf.float32)

    actions = tf.concat([self._action_column(action_1),
                         self._action_column(action_2)],axis = 1)
    states = tf.concat([states_1,states_2],axis = 1)
    new_states = tf.concat([new_states_1,new_states_2],axis = 1)

    self._sync_targets_once(states_1,new_states_1,states,actions)

    # Bootstrapped target; built outside the tape because only the online
    # critic is differentiated.
    target_actions = tf.concat(
        [self._sample_actions(self.target_actor,new_states_1),
         self._sample_actions(agent_2.target_actor,new_states_2)],axis = 1)
    critic_value_ = tf.squeeze(self.target_critic((new_states,target_actions)),1)
    target = rewards_1 + self.gamma*critic_value_*not_done_1

    with tf.GradientTape() as tape:
      critic_value = tf.squeeze(self.critic((states,actions)),1)
      critic_loss = keras.losses.MSE(target,critic_value)

    # Differentiate and update the ONLINE critic (not the target critic).
    params = self.critic.trainable_variables
    grads = tape.gradient(critic_loss,params)
    self.critic.optimizer.apply_gradients(zip(grads,params))

    with tf.GradientTape() as tape:
      policy_1 = tfp.distributions.Categorical(probs = self.actor(states_1))
      picks_1 = policy_1.sample()
      new_actions_1 = tf.cast(tf.expand_dims(picks_1,axis = 1),tf.float32)
      new_actions_2 = self._sample_actions(agent_2.actor,states_2)
      new_actions = tf.concat([new_actions_1,new_actions_2],axis = 1)

      q_new = tf.squeeze(self.critic((states,new_actions)),1)
      # Sampling an index is not differentiable, so the actor is trained with
      # a score-function loss: log-probability of the sampled action weighted
      # by the critic's value, averaged over the whole batch.
      actor_loss = -tf.reduce_mean(policy_1.log_prob(picks_1)*tf.stop_gradient(q_new))

    # Differentiate and update the ACTOR's own variables.
    params = self.actor.trainable_variables
    grads = tape.gradient(actor_loss,params)
    self.actor.optimizer.apply_gradients(zip(grads,params))

    if (self.learnCounter % 100 == 0):
      self.Update_net_parameters()
      self.learnCounter = 0

In [None]:
# Build one learner per environment agent, keyed by the agent's name.
agent_net = {}
agent_list = []
for a in env.agents:
  # Network sizes come from the agent's own observation and action spaces.
  obsv = env.observation_space(a).shape[0]
  n_action = env.action_space(a).n
  agent = Agent(
      input_dims = obsv,
      alpha = 1e-5,
      beta = 1e-5,
      env = env,
      gamma = 0.95,
      n_actions = n_action,
      max_size = 10000,
      layer_size = 256,
      batch_size = 64,
      noise = 0.1,
      name = a,
  )
  agent_net[a] = agent
  agent_list.append(a)

In [None]:
# Train both agents for n_games episodes, tracking a 100-episode running mean.
n_games = 1000
score_history = []
listener_state_ = []
speaker_state_ = []
best_score = -1000
prev_state = []

avg_scores = []
speaker = False
for i in range(n_games):
  score = 0
  env.reset()
  for agent in env.agent_iter():
    state,reward,done,trunc,_ = env.last()
    prev_state.append(state)  # observation history; not read in this cell

    # An agent whose episode has ended (terminated or truncated) is stepped
    # with None instead of a real action.
    finished = done or trunc
    action = None if finished else agent_net[agent].choose_action(state)

    score += reward
    env.step(action)

    # The action buffer is numeric, so store 0 in place of None, and mark the
    # slot as terminal for truncation as well as termination.
    stored_action = 0 if action is None else action
    agent_net[agent].store_transition(state,stored_action,reward,finished)

    agent_net['speaker_0'].learn([agent_net['listener_0']])
    agent_net['listener_0'].learn([agent_net['speaker_0']])

  score_history.append(score)

  avg_score = np.mean(score_history[-100:])

  if avg_score > best_score:
    best_score = avg_score
    # Checkpoint BOTH agents on a new best running average.
    agent_net['speaker_0'].save_models()
    agent_net['listener_0'].save_models()

  print('episode: ', i,'average score: %.2f' % avg_score)
  avg_scores.append(avg_score)


    
    

      

In [None]:
# Show the running-average score recorded after every training episode.
print(avg_scores)