In [None]:
import gym
from gym import spaces
import os
import time
import datetime

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import Model
from tensorflow.keras import initializers
from tensorflow.keras import optimizers
from tensorflow.keras import backend

import numpy as np
import matplotlib.pyplot as plt

from distutils.log import error

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
tf.config.run_functions_eagerly(True)

In [None]:
# Networks
def get_actor(num_states, num_actions, continuous, disc_actions_num):
    """Build the actor network mapping one agent's observation to its action output.

    Args:
        num_states: Length of a single agent's observation vector.
        num_actions: Width of the output layer when `continuous` is truthy.
        continuous: Selects the output head; truthy gives a tanh layer,
            falsy gives a softmax layer.
        disc_actions_num: Width of the softmax output layer when `continuous`
            is falsy.

    Returns:
        A Keras `Model` with two hidden ReLU layers of 64 units each.
    """
    observation = layers.Input(shape=(num_states,))
    hidden = layers.Dense(64, activation="relu")(observation)
    hidden = layers.Dense(64, activation="relu")(hidden)

    # Only the width and activation of the head depend on the action type.
    head_units, head_activation = (
        (num_actions, "tanh") if continuous else (disc_actions_num, "softmax")
    )
    head = layers.Dense(
        head_units,
        activation=head_activation,
        kernel_initializer=initializers.RandomNormal(stddev=0.03),
    )(hidden)

    return Model(observation, head)

def get_critic(num_states, num_agents, num_actions, continuous, disc_actions_num):
    """Build the centralised critic over the joint observation and joint action.

    Args:
        num_states: Length of a single agent's observation vector.
        num_agents: Number of agents whose observations/actions are concatenated.
        num_actions: Per-agent action width when `continuous` is truthy; also
            the width of the critic's output layer.
        continuous: Selects which per-agent action width feeds the action input.
        disc_actions_num: Per-agent action width when `continuous` is falsy.

    Returns:
        A Keras `Model` taking `[joint_state, joint_action]` and producing
        `num_actions` linear outputs.
    """
    # Input shapes are written as real 1-tuples; `(n)` is just the integer n,
    # which not every Keras version accepts as a shape.
    state_input = layers.Input(shape=(num_states * num_agents,))
    state_out = layers.Dense(64, activation="relu")(state_input)

    per_agent_action_dim = num_actions if continuous else disc_actions_num
    action_input = layers.Input(shape=(per_agent_action_dim * num_agents,))
    action_out = layers.Dense(64, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(64, activation="relu")(concat)
    out = layers.LayerNormalization(axis=1)(out)
    out = layers.Dense(64, activation="relu")(out)
    out = layers.LayerNormalization(axis=1)(out)
    outputs = layers.Dense(num_actions)(out)

    return Model([state_input, action_input], outputs)

In [None]:
# Helpers
class OUActionNoise:
    """Stateful Ornstein-Uhlenbeck noise generator.

    Each call advances the process by one step of size `dt` and returns the
    new value, which is also remembered as the starting point of the next call.
    """

    def __init__(self, mean, std_deviation, theta, dt=1e-2, x_initial=None):
        """Store the process parameters and set the initial state.

        Args:
            mean: Array the process reverts towards; its shape is also the
                shape of every noise sample.
            std_deviation: Scale applied to the random term.
            theta: Mean-reversion rate.
            dt: Step size.
            x_initial: Optional starting value; zeros shaped like `mean` are
                used when it is None.
        """
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process one step and return the new value."""
        pull_to_mean = self.theta * (self.mean - self.x_prev) * self.dt
        random_kick = self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        x = self.x_prev + pull_to_mean + random_kick
        self.x_prev = x
        return x

    def reset(self):
        """Restore the state to `x_initial`, or to zeros if none was given."""
        self.x_prev = np.zeros_like(self.mean) if self.x_initial is None else self.x_initial

@tf.function
def update_target(target_weights, weights, tau):
    """Blend each source variable into its paired target variable in place.

    Every target becomes `source * tau + target * (1 - tau)`.

    Args:
        target_weights: Iterable of variables to overwrite.
        weights: Iterable of source variables, paired positionally with
            `target_weights`.
        tau: Fraction of the source value mixed into the target.
    """
    for target_var, source_var in zip(target_weights, weights):
        blended = source_var * tau + target_var * (1 - tau)
        target_var.assign(blended)
        
def fixed(x, episode):
    """Constant schedule: return `x` unchanged, ignoring `episode`.

    Serves as the default for the `*_func` schedule parameters of `run`.
    """
    return x

In [None]:
# Scratch cell: an all-zero tensor of shape (64, 4, 2) for trying out slicing.
a = tf.convert_to_tensor(np.zeros(shape=(64, 4, 2)))

In [None]:
# Scratch cell: convert the tensor to a NumPy array, then overwrite
# index 0 along axis 1 with ones.
batch = a.numpy()
batch[:, 0, :] = np.ones(shape=(64, 2))

In [None]:
# batch

In [None]:
class Coop_MADDPG:
    """Cooperative multi-agent DDPG learner with one shared actor and one joint critic.

    The same actor network (and its target copy) is applied to every agent's
    observation. The critic (and its target copy) scores the concatenation of
    all agents' observations together with the concatenation of all agents'
    actions. Transitions are stored in fixed-size NumPy ring buffers indexed
    as (slot, agent, feature).

    `policy` only implements discrete action selection; it raises
    `ValueError` when the agent was built with `continuous` truthy.
    """

    def __init__(self, num_states, num_actions, num_agents, continuous,
            buffer_capacity, batch_size, std_dev, epsilon,
            actor_lr, critic_lr, gamma, tau, min_clip, max_clip,
            adam_eps, amsgrad, theta, disc_actions_num, loss_func):
        """Allocate the replay buffers and build networks and optimizers.

        Args:
            num_states: Length of one agent's observation vector.
            num_actions: Per-agent action width for continuous control; also
                the number of action columns drawn during random exploration.
            num_agents: Number of agents sharing the actor.
            continuous: Whether the action space is continuous.
            buffer_capacity: Number of transitions the ring buffers hold.
            batch_size: Number of transitions sampled by `learn`.
            std_dev: Standard deviation handed to the OU noise object.
            epsilon: Probability of taking a random joint action in `policy`.
            actor_lr, critic_lr: Learning rates for the two Adam optimizers.
            gamma: Discount factor used in the critic target.
            tau: Soft-update rate (stored only; not used inside this class).
            min_clip, max_clip: Element-wise gradient clipping bounds.
            adam_eps, amsgrad: Forwarded to both Adam optimizers.
            theta: Mean-reversion rate of the OU noise object.
            disc_actions_num: Number of discrete actions per agent.
            loss_func: Callable `(target, prediction) -> loss` used by `learn`.
        """
        self.continuous = continuous
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Counts every recorded transition; used to sample only filled slots.
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, num_agents, num_states))
        if self.continuous:
            self.action_buffer = np.zeros((self.buffer_capacity, num_agents, num_actions))
        else:
            self.action_buffer = np.zeros((self.buffer_capacity, num_agents, disc_actions_num))
        self.reward_buffer = np.zeros((self.buffer_capacity, num_agents, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_agents, num_states))
        self.done_buffer = np.zeros((self.buffer_capacity, num_agents, 1), np.float32)
        self.std_dev = std_dev  # For continuous
        self.epsilon = epsilon  # Epsilon greedy for discrete
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.gamma = gamma
        self.tau = tau
        self.disc_actions_num = disc_actions_num
        self.num_agents = num_agents
        self.num_actions = num_actions

        self.loss_func = loss_func

        self.min_clip = min_clip
        self.max_clip = max_clip

        self.ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1), theta=theta)

        self.actor_model = get_actor(num_states, num_actions, continuous, disc_actions_num)
        self.critic_model = get_critic(num_states, num_agents, num_actions, continuous, disc_actions_num)
        self.target_actor = get_actor(num_states, num_actions, continuous, disc_actions_num)
        self.target_critic = get_critic(num_states, num_agents, num_actions, continuous, disc_actions_num)

        self.actor_optimizer = optimizers.Adam(
            learning_rate=actor_lr, beta_1=0.9, beta_2=0.999, epsilon=adam_eps, amsgrad=amsgrad,
        )
        self.critic_optimizer = optimizers.Adam(
            learning_rate=critic_lr, beta_1=0.9, beta_2=0.999, epsilon=adam_eps, amsgrad=amsgrad,
        )
        # Start the target networks as exact copies of the online networks.
        self.target_actor.set_weights(self.actor_model.get_weights())
        self.target_critic.set_weights(self.critic_model.get_weights())

    def record(self, obs_tuple):
        """Store one joint transition, overwriting the oldest slot when full.

        Args:
            obs_tuple: `(states, actions, rewards, next_states, masks)`, each
                indexable by agent. `masks` is stored in `done_buffer` and is
                later multiplied into the bootstrapped value, so it must be
                1.0 for non-terminal and 0.0 for terminal transitions.

        In discrete mode a scalar action index is stored one-hot encoded so
        that replayed actions live in the same space as the actor's softmax
        output fed to the critic elsewhere. Vector-valued actions are stored
        as given.
        """
        index = self.buffer_counter % self.buffer_capacity

        for agent in range(self.num_agents):
            action = np.asarray(obs_tuple[1][agent])
            if not self.continuous and action.ndim == 0:
                one_hot = np.zeros(self.disc_actions_num)
                one_hot[int(action)] = 1.0
                action = one_hot

            self.state_buffer[index][agent] = obs_tuple[0][agent]
            self.action_buffer[index][agent] = action
            self.reward_buffer[index][agent] = obs_tuple[2][agent]
            self.next_state_buffer[index][agent] = obs_tuple[3][agent]
            self.done_buffer[index][agent] = obs_tuple[4][agent]

        self.buffer_counter += 1

    @staticmethod
    def _flatten_agents(batch):
        """Reshape (batch, agents, features) to (batch, agents * features)."""
        return tf.reshape(batch, (batch.shape[0], -1))

    def _joint_actions(self, actor, states):
        """Apply `actor` to each agent's slice of `states` and concatenate the results.

        Stays entirely in TensorFlow ops so gradients can flow back into `actor`.
        """
        per_agent = [actor(states[:, agent, :], training=True) for agent in range(self.num_agents)]
        return self._flatten_agents(tf.stack(per_agent, axis=1))

    def _clipped(self, grads, variables):
        """Pair gradients with variables, clipping element-wise and skipping missing gradients."""
        return [
            (tf.clip_by_value(grad, clip_value_min=self.min_clip, clip_value_max=self.max_clip), var)
            for grad, var in zip(grads, variables)
            if grad is not None
        ]

    # Calculation of loss and gradients
    @tf.function
    def update(self, state_batch, action_batch, reward_batch, next_state_batch, done_batch, loss_func):
        """Run one critic step followed by one actor step on a sampled batch.

        Args:
            state_batch, next_state_batch: Tensors indexed (batch, agent, state).
            action_batch: Tensor indexed (batch, agent, action).
            reward_batch: Tensor indexed (batch, agent, 1).
            done_batch: Tensor indexed (batch, agent, 1); multiplies the
                bootstrapped target value.
            loss_func: Callable `(target, prediction) -> loss` for the critic.
        """
        flat_next_state = self._flatten_agents(next_state_batch)
        flat_state = self._flatten_agents(state_batch)
        flat_action = self._flatten_agents(action_batch)

        # The target value depends only on the target networks, so it is
        # evaluated once and reused for every agent's term of the summed target.
        target_actions = self._joint_actions(self.target_actor, next_state_batch)
        target_value = self.target_critic([flat_next_state, target_actions], training=True)
        y = sum(
            reward_batch[:, agent, :] + done_batch[:, agent, :] * self.gamma * target_value
            for agent in range(self.num_agents)
        )

        with tf.GradientTape() as tape:
            critic_value = self.critic_model([flat_state, flat_action], training=True)
            critic_loss = loss_func(y, critic_value)

        critic_vars = self.critic_model.trainable_variables
        critic_grad = tape.gradient(critic_loss, critic_vars)
        self.critic_optimizer.apply_gradients(self._clipped(critic_grad, critic_vars))

        with tf.GradientTape() as tape:
            # Actions must come from the online actor as tensors: converting to
            # NumPy here would cut the gradient path from the loss to the actor.
            actions = self._joint_actions(self.actor_model, state_batch)
            critic_value = self.critic_model([flat_state, actions], training=True)
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_vars = self.actor_model.trainable_variables
        actor_grad = tape.gradient(actor_loss, actor_vars)
        self.actor_optimizer.apply_gradients(self._clipped(actor_grad, actor_vars))

    def learn(self):
        """Sample `batch_size` stored transitions (with replacement) and call `update`."""
        # Sample only from slots that have been written.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.convert_to_tensor(self.done_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch, done_batch, self.loss_func)

    def _greedy_actions(self, state):
        """Return the arg-max actor output for each observation in `state[0]`."""
        return [
            np.argmax(self.actor_model(tf.expand_dims(tf_obs, 0)))
            for tf_obs in state[0]
        ]

    def policy(self, state, noise_object=0, use_noise=True, rng=np.random.default_rng()):
        """Choose one discrete action per agent.

        Args:
            state: Indexable whose element 0 iterates over the per-agent
                observations.
            noise_object: Unused by the discrete implementation.
            use_noise: When truthy, take a uniformly random joint action with
                probability `epsilon`; otherwise always act greedily.
            rng: NumPy `Generator` driving both the epsilon test and the random
                action draw, so a seeded generator makes exploration repeatable.

        Returns:
            A list with one action index per agent.

        Raises:
            ValueError: If the agent was built with `continuous` truthy.
        """
        if self.continuous:
            raise ValueError('not implemented continuous')

        if use_noise and rng.random() < self.epsilon:
            actions = rng.integers(0, self.disc_actions_num, (self.num_agents, self.num_actions))
            if actions.shape[1] == 1:
                # Drop only the action axis so a single agent still yields a list.
                actions = actions[:, 0]
            return actions.tolist()

        return self._greedy_actions(state)

In [None]:
def run(env, continuous, total_trials=1, total_episodes=100, 
            buffer_capacity=50000, batch_size=64, 
            std_dev=0.3, epsilon=0.2, actor_lr=0.002, critic_lr=0.003, min_clip=-1, max_clip=1,
            gamma=0.99, tau=0.005, adam_eps=1e-07, amsgrad=False, theta=0.15,
            disc_actions_num=2, seed=1453,
            render=False, directory='Weights/', output=True, total_time=True, use_gpu=True,
            save_weights=True, return_rewards=False,
            gamma_func=fixed, tau_func=fixed, critic_lr_func=fixed, actor_lr_func=fixed, std_dev_func=fixed, epsilon_func=fixed,
            mean_number=20, solved=999,
            reward_mod=False, start_steps=0, loss_func=losses.MeanAbsoluteError()):
    """Train a fresh `Coop_MADDPG` agent per trial and plot the running average reward.

    Args:
        env: Multi-agent environment exposing `n_agents`, `observation_space`,
            `action_space`, `reset`, `step`, `render` and `spec.id`.
        continuous: Whether the action space is continuous (currently raises
            `ValueError` inside the episode loop).
        total_trials, total_episodes: Number of independent trials and the
            maximum number of episodes per trial.
        buffer_capacity .. disc_actions_num, loss_func: Forwarded to
            `Coop_MADDPG`.
        seed: Seed for the generator passed to `agent.policy`.
        render: Call `env.render()` every step.
        directory: Path prefix for saved weight files.
        output: Print a summary line per episode.
        total_time: Print the total wall-clock time at the end.
        use_gpu: When falsy, sets `CUDA_VISIBLE_DEVICES` to `-1`.
        save_weights: Save actor and critic weights after each trial.
        return_rewards: Return the per-trial list of true episode rewards.
        gamma_func .. epsilon_func: Schedules called as `f(current, episode)`
            at the start of every episode.
        mean_number: Window length of the running averages.
        solved: A trial stops once the running average true reward reaches this.
        reward_mod: Apply the reward modification step before recording.
        start_steps: Number of initial steps (per trial) that use
            `env.action_space.sample()` instead of the policy.

    Returns:
        The list of true episode rewards per trial if `return_rewards` is
        truthy, otherwise None.
    """
    start_time = time.time()

    # _ = env.seed(seed)
    rng = np.random.default_rng(seed)

    num_agents = env.n_agents
    num_states = env.observation_space[0].shape[0]
    num_actions = env.action_space.shape[0] if continuous else 1

    if not use_gpu:
        # NOTE(review): TensorFlow is already imported when this runs; confirm
        # that hiding the devices this late still has the intended effect.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    ep_reward_list = []
    avg_reward_list = []
    true_reward_list = []
    true_avg_reward_list = []

    for trial in range(total_trials):
        step = 0

        # One sub-list of results per trial.
        avg_reward_list.append([])
        ep_reward_list.append([])
        true_reward_list.append([])
        true_avg_reward_list.append([])

        agent = Coop_MADDPG(num_states, num_actions, num_agents, continuous,
            buffer_capacity, batch_size, std_dev, epsilon,
            actor_lr, critic_lr, gamma, tau, min_clip, max_clip,
            adam_eps, amsgrad, theta, disc_actions_num, loss_func)

        for ep in range(total_episodes):
            before = time.time()

            agent.gamma = gamma_func(agent.gamma, ep)
            agent.tau = tau_func(agent.tau, ep)
            agent.critic_lr = critic_lr_func(agent.critic_lr, ep)
            agent.actor_lr = actor_lr_func(agent.actor_lr, ep)
            agent.std_dev = std_dev_func(agent.std_dev, ep)
            agent.epsilon = epsilon_func(agent.epsilon, ep)

            # The optimizers hold their own learning rate, so the scheduled
            # values have to be pushed to them to have any effect.
            agent.critic_optimizer.learning_rate = agent.critic_lr
            agent.actor_optimizer.learning_rate = agent.actor_lr

            prev_state = env.reset()
            episodic_reward = np.zeros(num_agents)
            true_reward = np.zeros(num_agents)

            while True:
                if render:
                    env.render()

                tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

                if step >= start_steps:
                    action = agent.policy(state=tf_prev_state, noise_object=agent.ou_noise, rng=rng)
                else:
                    action = env.action_space.sample()

                step += 1

                if continuous:
                    raise ValueError('not implemented continuous')
                else:
                    state, reward, done, info = env.step(action)

                # Accumulated before any reward modification.
                true_reward = true_reward + reward

                # Reward modification
                if reward_mod:
                    # NOTE(review): this expression assumes `reward` and
                    # `state[0]` support `-=` and `abs`; confirm it is valid
                    # for the per-agent values this environment returns.
                    reward -= abs(state[0])

                # 1.0 while an agent is still running, 0.0 once it is done.
                terminal_state = np.array(np.invert(done), dtype=np.float32)

                agent.record((prev_state, action, reward, state, terminal_state))

                agent.learn()
                update_target(agent.target_actor.variables, agent.actor_model.variables, agent.tau)
                update_target(agent.target_critic.variables, agent.critic_model.variables, agent.tau)

                episodic_reward = episodic_reward + reward

                prev_state = state

                if all(done):
                    break

            episodic_reward = np.sum(episodic_reward)
            true_reward = np.sum(true_reward)

            ep_reward_list[trial].append(episodic_reward)
            avg_reward = np.mean(ep_reward_list[trial][-mean_number:])
            avg_reward_list[trial].append(avg_reward)
            true_reward_list[trial].append(true_reward)
            true_avg_reward = np.mean(true_reward_list[trial][-mean_number:])
            true_avg_reward_list[trial].append(true_avg_reward)

            if output:
                if reward_mod:
                    print("Ep {} * AvgReward {:.2f} * true AvgReward {:.2f} * Reward {:.2f} * True Reward {:.2f} * time {:.2f} * step {}"
                    .format(ep, avg_reward, true_avg_reward, episodic_reward, true_reward, (time.time() - before), step))
                else:
                    print("Ep {} * AvgReward {:.2f} * Reward {:.2f} * time {:.2f} * step {}"
                    .format(ep, avg_reward, episodic_reward, (time.time() - before), step))

            # Stop if avg is above 'solved'
            if true_avg_reward >= solved:
                break

        # Save weights
        now = datetime.datetime.now()
        timestamp = "{}.{}.{}.{}.{}.{}".format(now.year, now.month, now.day, now.hour, now.minute, now.second)
        save_name = "{}_{}_{}".format(env.spec.id, continuous, timestamp)
        if save_weights:
            for label, model in (('actor', agent.actor_model), ('critic', agent.critic_model)):
                weights_path = directory + label + '-trial' + str(trial) + '_' + save_name + '.h5'
                try:
                    parent = os.path.dirname(weights_path)
                    if parent:
                        os.makedirs(parent, exist_ok=True)
                    model.save_weights(weights_path)
                except Exception as exc:
                    # Saving is best-effort; report why it failed and carry on.
                    print(label + ' save fail', exc)

    # Plotting graph
    for idx, p in enumerate(true_avg_reward_list):
        plt.plot(p, label=str(idx))
    plt.xlabel("Episode")
    plt.ylabel("True Avg. Epsiodic Reward (" + str(mean_number) + ")")
    plt.legend()
    try:
        os.makedirs('Graphs', exist_ok=True)
        # `save_name` is only bound if at least one trial ran; that case is
        # reported through the same failure message.
        plt.savefig('Graphs/' + save_name + '.png')
    except Exception as exc:
        print('fig save fail', exc)
    plt.show()

    if total_time:
        print('total time:', time.time() - start_time, 's')

    if return_rewards:
        return true_reward_list

In [None]:
def test(env, actor_weights, continuous, total_episodes=10, render=False, timing=False, disc_actions_num=2):
    """Roll out a saved actor greedily and plot the summed reward of each episode.

    Args:
        env: Multi-agent environment exposing `n_agents`, `observation_space`,
            `action_space`, `reset`, `step` and `render`.
        actor_weights: Path passed to `actor_model.load_weights`.
        continuous: Whether the action space is continuous; `Coop_MADDPG.policy`
            raises `ValueError` for continuous agents.
        total_episodes: Number of evaluation episodes.
        render: Call `env.render()` every step.
        timing: Print each episode's wall-clock duration.
        disc_actions_num: Number of discrete actions per agent; must match the
            value the weights were trained with.
    """
    num_agents = env.n_agents
    num_states = env.observation_space[0].shape[0]
    num_actions = env.action_space.shape[0] if continuous else 1

    # Only the actor is used for evaluation. The training hyperparameters are
    # placeholders (tiny buffer, `run`'s default optimizer settings) because
    # the constructor requires them.
    agent = Coop_MADDPG(
        num_states, num_actions, num_agents, continuous,
        buffer_capacity=1, batch_size=1, std_dev=0.3, epsilon=0.0,
        actor_lr=0.002, critic_lr=0.003, gamma=0.99, tau=0.005, min_clip=-1, max_clip=1,
        adam_eps=1e-07, amsgrad=False, theta=0.15, disc_actions_num=disc_actions_num,
        loss_func=None,
    )
    # Build the agent and load the weights once rather than once per episode.
    agent.actor_model.load_weights(actor_weights)

    rewards = []

    for _ in range(total_episodes):
        ep_reward = 0.0

        before = time.time()

        prev_state = env.reset()

        while True:
            if render:
                env.render()

            tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
            # `policy` already returns one action index per agent, which is
            # the joint action `env.step` is given during training.
            action = agent.policy(state=tf_prev_state, use_noise=False)

            state, reward, done, info = env.step(action)

            # `reward` holds one entry per agent; the episode score is their sum.
            ep_reward += np.sum(reward)

            prev_state = state

            if all(done):
                break

        if timing:
            print(str(time.time() - before) + 's')
        # Recorded for every episode, independent of the `timing` flag.
        rewards.append(ep_reward)

    plt.plot(rewards)
    plt.xlabel("Episode")
    plt.ylabel("True reward")
    plt.show()

In [None]:
def random(env, continuous=False, total_episodes=10, render=False, timing=False, testing=False):
    """Roll out a uniformly random joint policy and plot the summed reward per episode.

    Args:
        env: Multi-agent environment exposing `action_space.sample`, `reset`,
            `step` and `render`.
        continuous: Unused; kept for signature compatibility.
        total_episodes: Number of episodes to run.
        render: Call `env.render()` every step.
        timing: Print each episode's wall-clock duration.
        testing: Sleep 0.5 s and print the reward after every step.
    """
    rewards = []
    for _ in range(total_episodes):
        ep_reward = 0

        before = time.time()

        env.reset()

        while True:
            if render:
                env.render()
            # Sample a joint action from the environment's own action space,
            # the same way `run` does for its warm-up steps.
            action = env.action_space.sample()
            _state, reward, done, _ = env.step(action)
            # Sum over agents so the curve is comparable with the training plot.
            ep_reward += np.sum(reward)

            # For testing:
            if testing:
                time.sleep(0.5)
                print(reward)
            # ---

            if all(done):
                break

        if timing:
            print(str(time.time() - before) + 's')
        # Recorded for every episode, independent of the `timing` flag.
        rewards.append(ep_reward)

    plt.plot(rewards)
    plt.xlabel("Episode")
    plt.ylabel("True reward")
    plt.show()

---
# Runs and tests
---

In [None]:
# Create the environment and take one random joint step (one 0/1 action for
# each of four agents) to inspect what the environment returns.
env = gym.make('ma_gym:TrafficJunction4-v0')
env.reset()
print(env.action_space)
# print(env.observation_space)
obs, rew, done, _ = env.step(np.random.randint(low=0, high=2, size=4).tolist())
print(obs[0].shape)
print(rew)
print(done)

In [None]:
# Train in discrete-action mode; a trial stops early once the running average
# true reward reaches -9.
run(env, continuous=False, output=True, solved=-9)