<a href="https://colab.research.google.com/github/luthandonx/Multi-Agent-Reinforcement-Learining/blob/Simple-Spread/Policy_Gradient_Simple_Spread_2_agents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PettingZoo with its MPE extra; a later cell imports simple_spread_v2
# from pettingzoo.mpe.
# NOTE(review): no version is pinned here. Confirm the installed release still
# provides simple_spread_v2 and the five-value env.last() that the training
# cell unpacks, or pin a known-good version.
!pip install pettingzoo[mpe]

In [None]:
from keras.layers import Input,Dense,Activation
from keras.models import Model,load_model
import numpy as np
from tensorflow import keras
import keras.backend as K

class Agent(object):
  """Policy-gradient (REINFORCE-style) learner owning one softmax policy network.

  Transitions are buffered with `store_trans`. `learn` converts the buffered
  rewards into normalised discounted returns and performs a single gradient
  step on the whole buffer, then empties it.

  Attributes:
    gamma: discount factor applied to future rewards.
    lr: learning rate handed to the Adam optimizer.
    G: normalised returns used in the most recent update (0 before the first).
    input_dims: length of one observation vector.
    layer_size: width of each of the two hidden layers.
    n_actions: number of discrete actions (size of the softmax output).
    min_batch_size: `learn` does nothing until this many transitions are stored.
    state_memory / action_memory / reward_memory: the transition buffer.
    agent_name: name given at construction; also used in the weights file name.
    policy: training model, inputs [observations, advantages].
    predict: inference model sharing the same layers, input observations only.
    action_space: list of valid action indices, 0 .. n_actions - 1.
    model_file: path used by `save_model` / `load_model`.
  """

  def __init__(self, name, alpha, gamma, input_dims, n_actions, layer_size,
               fname='reinforcePolicy', min_batch_size=64):
    """Create the networks and an empty transition buffer.

    Args:
      name: agent identifier, appended to `fname` to form the weights path.
      alpha: learning rate.
      gamma: discount factor.
      input_dims: length of one observation vector.
      n_actions: number of discrete actions.
      layer_size: number of units in each hidden layer.
      fname: prefix of the weights file.
      min_batch_size: minimum number of buffered transitions before `learn`
        performs an update (defaults to the previously hard-coded 64).
    """
    self.gamma = gamma
    self.lr = alpha
    self.G = 0
    self.input_dims = input_dims
    self.layer_size = layer_size
    self.n_actions = n_actions
    self.min_batch_size = min_batch_size
    self.state_memory = []
    self.action_memory = []
    self.reward_memory = []
    self.agent_name = name

    self.policy, self.predict = self.create_policy()
    self.action_space = list(range(n_actions))
    self.model_file = fname + " " + self.agent_name

  def create_policy(self):
    """Build the training and inference models, which share all layers.

    Returns:
      A `(policy, predict)` pair. `policy` takes [observations, advantages]
      and is compiled with the policy-gradient loss below; `predict` takes
      observations only. Both output the action probabilities.
    """
    # shape is a one-element tuple: each sample is a flat observation vector.
    obs_input = Input(shape=(self.input_dims,))
    advantages = Input(shape=[1])
    dense1 = Dense(self.layer_size, activation='relu')(obs_input)
    dense2 = Dense(self.layer_size, activation='relu')(dense1)
    probs = Dense(self.n_actions, activation='softmax')(dense2)

    def custom_loss(y_true, y_pred):
      # Clip the probabilities so the log below never receives exactly 0.
      out = K.clip(y_pred, 1e-8, 1 - 1e-8)
      # y_true is one-hot, so only the taken action's log-probability is kept.
      log_lik = y_true * K.log(out)
      # `advantages` is the second model input, captured from the enclosing
      # scope rather than passed through the loss signature.
      return K.sum(-log_lik * advantages)

    policy = Model(inputs=[obs_input, advantages], outputs=[probs])
    opt = keras.optimizers.Adam(learning_rate=self.lr)
    policy.compile(optimizer=opt, loss=custom_loss)

    predict = keras.Model(inputs=[obs_input], outputs=[probs])

    return policy, predict

  def choose_action(self, obsv):
    """Sample one action index from the policy's distribution for `obsv`."""
    # The network expects a leading batch axis, so wrap the single observation.
    batch = np.expand_dims(obsv, 0)
    # The output is a batch of one row of probabilities; take that row.
    probabilities = self.predict.predict(batch)[0]
    return np.random.choice(self.action_space, p=probabilities)

  def store_trans(self, obsv, action, reward):
    """Append one (observation, action, reward) transition to the buffer."""
    self.action_memory.append(action)
    self.state_memory.append(obsv)
    self.reward_memory.append(reward)

  @staticmethod
  def _discounted_returns(rewards, gamma):
    """Return G[t] = sum_{k >= t} gamma**(k - t) * rewards[k] as a float array.

    Computed in one backward pass. The result is always float64 so integer
    rewards are not truncated.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
      running = rewards[t] + gamma * running
      returns[t] = running
    return returns

  def learn(self):
    """Run one policy-gradient update on the buffer, then clear it.

    Does nothing (and keeps the buffer) while fewer than `min_batch_size`
    transitions are stored.

    NOTE(review): because short buffers are kept, one update may cover
    several episodes, and the returns are then discounted straight across
    the episode boundaries. Confirm whether that is intended.
    """
    if len(self.state_memory) < self.min_batch_size:
      return

    state_memory = np.array(self.state_memory)
    action_memory = np.array(self.action_memory)

    # The loss multiplies the labels with the log-probabilities, so the taken
    # actions have to be one-hot encoded.
    actions = np.zeros([len(action_memory), self.n_actions])
    actions[np.arange(len(action_memory)), action_memory] = 1

    G = self._discounted_returns(self.reward_memory, self.gamma)

    # Standardise the returns; fall back to a divisor of 1 when they are all equal.
    std = np.std(G)
    self.G = (G - np.mean(G)) / (std if std > 0 else 1)

    # [state_memory, self.G] are the two model inputs; `actions` are the labels
    # (y_true). y_pred is the probability vector the network outputs.
    self.policy.train_on_batch([state_memory, self.G], actions)

    self.state_memory = []
    self.reward_memory = []
    self.action_memory = []

  def save_model(self):
    """Write the policy weights to `self.model_file`."""
    self.policy.save_weights(self.model_file)

  def load_model(self):
    """Load the policy weights from `self.model_file`."""
    self.policy.load_weights(self.model_file)

In [None]:
from tensorflow.python.framework.ops import disable_eager_execution

# Turn eager execution off; this cell comes before the one that instantiates
# the Agent models.
# NOTE(review): presumably needed because the custom loss in
# Agent.create_policy closes over the symbolic `advantages` input tensor -
# confirm. The import path lives under tensorflow.python (private);
# tf.compat.v1.disable_eager_execution looks like the public alias - verify.
disable_eager_execution()

In [None]:
from pettingzoo.mpe import simple_spread_v2

# Simple Spread with two agents, discrete actions and episodes capped at
# 25 cycles. The environment is reset once so that it is ready to be queried.
env = simple_spread_v2.env(
  N=2,
  local_ratio=0.5,
  max_cycles=25,
  continuous_actions=False,
)
env.reset()

In [None]:
# One independent learner per environment agent, keyed by the agent's name.
agent_net = {}
for a in env.agents:
  # Network sizes come from the agent's own observation and action spaces.
  obs_space = env.observation_space(a).shape
  action_space = env.action_space(a).n
  agent = Agent(
    name=a,
    alpha=1e-5,
    gamma=0.95,
    input_dims=obs_space[0],
    n_actions=action_space,
    layer_size=256,
  )
  agent_net[a] = agent

In [None]:
env.reset()
# Take a copy of the agent names instead of aliasing the environment's own
# list, so later changes to env.agents cannot alter what is saved and trained.
agent_list = list(env.agents)
max_episode = 2500
score_history = []
# Start below every possible average so the first episode always checkpoints.
best_score = float('-inf')
for i in range(max_episode):
  score = 0
  env.reset()
  # (observation, action) each agent has taken but not yet been rewarded for.
  pending = {}
  for agent in env.agent_iter():
    observation, reward, done, trunc, info = env.last()
    score += reward

    # env.last() reports the reward this agent earned since it last acted, so
    # it belongs to the agent's previous (observation, action) pair, not to
    # the action chosen below. This also records the final reward, which
    # arrives together with the done/truncated flag.
    if agent in pending:
      prev_observation, prev_action = pending.pop(agent)
      agent_net[agent].store_trans(prev_observation, prev_action, reward)

    if done or trunc:
      # A finished agent is stepped with None.
      action = None
    else:
      action = agent_net[agent].choose_action(observation)
      pending[agent] = (observation, action)
    env.step(action)

  score_history.append(score)
  # Running average over (at most) the last 100 episodes.
  avg_score = np.mean(score_history[-100:])

  # Checkpoint before learning, i.e. the weights that produced this average.
  if avg_score > best_score:
    best_score = avg_score
    for a in agent_list:
      agent_net[a].save_model()

  for a in agent_list:
    agent_net[a].learn()

  print('episode: ', i , ' score :',score,' avg score:', avg_score)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

# Plot the per-episode scores smoothed with a Savitzky-Golay filter
# (window 101, polynomial order 7 when enough episodes are available).
# The filter needs a window no longer than the data and a polynomial order
# below the window, so both are reduced for short runs instead of raising.
if score_history:
  window = min(101, len(score_history))
  if window % 2 == 0:
    window -= 1  # keep the window odd
  polyorder = min(7, window - 1)
  yhat = savgol_filter(score_history, window, polyorder)

  plt.plot(yhat, color='red')
  plt.show()

In [None]:
# Print the raw, unsmoothed per-episode scores gathered by the training loop.
print(score_history)