<a href="https://colab.research.google.com/github/luthandonx/Multi-Agent-Reinforcement-Learining/blob/Simple-Adversary/Actor_Critic_Simple_Adversary_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PettingZoo together with its MPE (multi-particle environment) extra.
# NOTE(review): later cells import `simple_adversary_v2` and unpack five values
# from `env.last()`; the release is not pinned here, so a different PettingZoo
# version may not provide that module name or return shape - consider pinning.
!pip install pettingzoo[mpe]

In [None]:
import os
import tensorflow as tf
from tensorflow.keras import Model,layers
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp

class ActorCriticNetwork(Model):
  """Keras model with a shared two-layer trunk and separate value/policy heads.

  Args:
    n_actions: Width of the softmax policy head.
    dims_1: Units in the first hidden ReLU layer.
    dims_2: Units in the second hidden ReLU layer.
    name: Label stored on the instance and used to build `checkpoint_file`.
    chkpt_dir: Directory component of `checkpoint_file`.
  """

  def __init__(self,n_actions, dims_1 = 256 , dims_2 = 256,
               name = 'actor_critc', chkpt_dir = 'tmp/actor_critic'):
    super().__init__()

    # Plain configuration values kept on the instance.
    self.dims_1, self.dims_2 = dims_1, dims_2
    self.n_actions = n_actions
    self.model_name = name
    self.checkpoint_dir = chkpt_dir
    # Path is only assembled here; no code visible in this file reads it.
    self.checkpoint_file = os.path.join(chkpt_dir, name + '_ac')

    # Shared trunk.
    self.fc1 = layers.Dense(dims_1, activation='relu')
    self.fc2 = layers.Dense(dims_2, activation='relu')
    # Heads: a single linear output and a softmax over the actions.
    self.v = layers.Dense(1, activation=None)
    self.po = layers.Dense(n_actions, activation='softmax')

  def call(self,state):
    """Run `state` through the trunk and return `(value, action_probs)`."""
    hidden = self.fc2(self.fc1(state))
    return self.v(hidden), self.po(hidden)

In [None]:



class Agent:
  """One-step actor-critic learner built around an `ActorCriticNetwork`.

  The agent samples actions from the network's softmax output, remembers the
  most recently sampled action in `self.action`, and `learn` uses that stored
  action when computing the policy-gradient term. `learn` must therefore be
  called for a transition before `choose_action` is called again.

  Args:
    alpha: Learning rate handed to the Adam optimizer.
    gamma: Discount factor used in the TD target.
    n_actions: Number of discrete actions (size of the policy head).
    name: Suffix used to build the save/load path `actor_critic_<name>`.
  """

  def __init__(self, alpha = 1e-5, gamma = 0.95, n_actions = 4,name='agent'):
    self.gamma = gamma
    self.n_actions = n_actions
    # Tensor holding the last sampled action; set by `choose_action`.
    self.action = None
    self.action_space = list(range(self.n_actions))
    self.name = name
    self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
    self.actor_critic.compile(optimizer = Adam(learning_rate = alpha))
    self.fname = 'actor_critic_'+self.name

    # Creates the four (empty) transition buffers.
    self.clear_memory()

  def choose_action(self,observation):
    """Sample an action for `observation` and remember it for `learn`.

    Returns:
      The sampled action index, extracted from the batch-of-one tensor.
    """
    # Wrap in a list to add a batch dimension; float32 matches `learn`.
    state = tf.convert_to_tensor([observation],dtype = tf.float32)
    _,probs = self.actor_critic(state)

    action_probs = tfp.distributions.Categorical(probs = probs)
    action = action_probs.sample()

    # Kept as a tensor so `learn` can pass it straight to `log_prob`.
    self.action = action

    return action.numpy()[0]

  def save_models(self):
    """Save the network to the path `self.fname`."""
    print('saving model.....')
    self.actor_critic.save(self.fname)

  def load_models(self):
    """Replace the network with the model stored at `self.fname`."""
    #print('loading model....')
    # The loader lives in the `tf.keras.models` module; the previous
    # `tf.keras.models_load_model` attribute does not exist.
    self.actor_critic = tf.keras.models.load_model(self.fname)

  def store_trans(self,action,reward,obsv,obsv_):
    """Append one transition (action, reward, observation, next observation)."""
    self.action_history.append(action)
    self.reward_history.append(reward)
    self.obsv_history.append(obsv)
    self.next_obsv_history.append(obsv_)

  def clear_memory(self):
    """Reset all four transition buffers to empty lists."""
    self.reward_history = []
    self.action_history = []
    self.obsv_history = []
    self.next_obsv_history = []

  def learn(self,state,reward,state_,done):
    """Apply one gradient step from a single transition.

    Uses the action stored by the most recent `choose_action` call.

    Args:
      state: Observation the stored action was taken in.
      reward: Reward received for that action.
      state_: Observation that followed.
      done: Truthy when `state_` is terminal; disables bootstrapping.
    """
    state = tf.convert_to_tensor([state],dtype = tf.float32)
    state_ = tf.convert_to_tensor([state_],dtype = tf.float32)
    reward = tf.convert_to_tensor(reward,dtype = tf.float32)

    with tf.GradientTape() as tape:
      state_value, probs = self.actor_critic(state)
      state_value_,_ = self.actor_critic(state_)
      # Drop the batch/unit dimensions so the values are scalars.
      state_value = tf.squeeze(state_value)
      state_value_ = tf.squeeze(state_value_)

      action_probs = tfp.distributions.Categorical(probs = probs)
      log_probs = action_probs.log_prob(self.action)

      # TD error; the bootstrap term is zeroed when the episode has ended.
      delta = reward + self.gamma*state_value_*(1-int(done)) - state_value

      actor_loss = -log_probs*delta
      critic_loss = delta**2

      # Both heads share the trunk, so a single summed loss is differentiated.
      total_loss = actor_loss + critic_loss

    gradients = tape.gradient(total_loss,self.actor_critic.trainable_variables)
    self.actor_critic.optimizer.apply_gradients(zip(gradients, self.actor_critic.trainable_variables))


In [None]:
from pettingzoo.mpe import simple_adversary_v2
import numpy as np

# Build the discrete-action Simple Adversary environment and reset it so that
# the agent list and spaces can be queried by the following cells.
# NOTE(review): N is presumably the number of good agents - confirm against
# the PettingZoo documentation.
env = simple_adversary_v2.env(
    N=2,
    max_cycles=25,
    continuous_actions=False,
)
env.reset()

In [None]:
# Create one learner per environment agent. The adversary is kept in
# `opp_agent`; every other agent goes into `agent_net` (name -> Agent) and its
# name is recorded in `agent_list`.
env.reset()
agent_net = {}
agent_list = []
for a in env.agents:
  # Size the policy head from the environment instead of a hard-coded 5, so
  # the network always matches the agent's actual discrete action space.
  n_actions = int(env.action_space(a).n)
  # Not passed to Agent; kept only as a value available for inspection.
  input_dims = env.observation_space(a).shape[0]
  if (a == 'adversary_0'):
    opp_agent = Agent(alpha = 1e-5,n_actions = n_actions,name = a)
  else:
    good_agent = Agent(alpha = 1e-5,n_actions = n_actions,name = a)
    agent_net[a] = good_agent
    agent_list.append(a)

In [None]:
# Train the adversary and the good agents for `n_games` episodes, tracking
# per-episode returns and saving a model whenever its 100-episode average
# return reaches a new best.
env.reset()
good_best_score = -1000
opp_best_score = -10000
Good_score_history = []
Opp_score_history = []
n_games = 1000

for i in range(n_games):
  env.reset()
  good_score = 0
  opp_score  = 0
  # agent name -> (observation, action) of the move whose outcome has not been
  # observed yet. In the turn-based loop an agent only sees the result of its
  # own action (reward and next observation) when its turn comes round again;
  # `env.last()` called right after `env.step()` describes the *next* agent.
  pending = {}

  for agent in env.agent_iter():
    obsv,reward,done,trunc,_ = env.last()
    is_adversary = (agent == 'adversary_0')
    learner = opp_agent if is_adversary else agent_net[agent]

    # The reward reported here belongs to this agent's previous move; it is
    # counted on every turn so the final step's reward is included as well.
    if is_adversary:
      opp_score += reward
    else:
      good_score += reward

    # Complete the pending transition before sampling again: `learn` relies
    # on the action that the learner stored in its last `choose_action` call.
    if agent in pending:
      prev_obsv, prev_action = pending.pop(agent)
      learner.store_trans(prev_action,reward,prev_obsv,obsv)
      # Only true termination disables bootstrapping; a truncated episode
      # still bootstraps from the value of the last observation.
      learner.learn(prev_obsv,reward,obsv,done)

    if done or trunc:
      # Finished agents must be stepped with None.
      env.step(None)
    else:
      action = learner.choose_action(obsv)
      pending[agent] = (obsv, action)
      env.step(action)

  Good_score_history.append(good_score)
  Opp_score_history.append(opp_score)

  # Moving averages over (at most) the last 100 episodes.
  Good_avg_score = np.mean(Good_score_history[-100:])
  Opp_avg_score = np.mean(Opp_score_history[-100:])

  if (Good_avg_score > good_best_score):
    good_best_score = Good_avg_score
    for a in agent_list:
       agent_net[a].save_models()

  if (Opp_avg_score > opp_best_score):
    opp_best_score = Opp_avg_score
    opp_agent.save_models()

  print('episode: ', i ,' Opp avg score:', Opp_avg_score,'Good avg score :',Good_avg_score)
  

In [None]:
# Recursively archive each saved-model path into a .zip next to it.
# NOTE(review): assumes the models were saved relative to /content (the
# notebook's working directory on Colab) - confirm before running elsewhere.
!zip -r /content/actor_critic_adversary_0.zip /content/actor_critic_adversary_0
!zip -r /content/actor_critic_agent_1.zip /content/actor_critic_agent_1
!zip -r /content/actor_critic_agent_0.zip /content/actor_critic_agent_0

In [None]:
from google.colab import files


In [None]:
# Hand each model archive to the Colab download helper, adversary first.
for archive_path in (
    '/content/actor_critic_adversary_0.zip',
    '/content/actor_critic_agent_0.zip',
    '/content/actor_critic_agent_1.zip',
):
  files.download(archive_path)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

# Smooth both per-episode return curves with a Savitzky-Golay filter
# (window length 101, polynomial order 7). With these arguments each history
# needs at least 101 entries.
yhat = savgol_filter(Good_score_history, 101, 7)
opphat = savgol_filter(Opp_score_history, 101, 7)

# Plot the smoothed adversary curve alongside the good agents' curve; it was
# previously computed but never drawn.
plt.plot(yhat, color='red', label='good agents')
plt.plot(opphat, color='blue', label='adversary')
plt.xlabel('episode')
plt.ylabel('smoothed score')
plt.legend()
plt.show()

In [None]:
# Dump the raw per-episode scores accumulated for the good agents.
print(Good_score_history)

In [None]:
# Dump the raw per-episode scores accumulated for the adversary.
print(Opp_score_history)