In [None]:
# # If you are running on Google Colab, install TensorFlow 2.x by uncommenting the lines below.

# try:
#   # %tensorflow_version only exists in Colab.
#   %tensorflow_version 2.x
# except Exception:
#   pass

# # If you are running on Google Colab, uncomment below to install the necessary dependencies 
# # before beginning the exercise.

# print("Setting up colab environment")
# !pip install lz4
# !pip install gputil
# !pip uninstall -y -q pyarrow
# !pip install -q -U ray[tune]
# !pip install -q ray[debug]

# # A hack to force the runtime to restart, needed to include the above dependencies.
# print("Done installing! Restarting via forced crash (this is not an issue).")
# import os
# os._exit(0)

In [None]:
# Mount Google Drive into the runtime filesystem at /content/drive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime,
# so this cell is expected to fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Working directory for the rest of the notebook (presumably where the local modules
# `logger`, `envs` and `reinforce_agent` imported below live — TODO confirm).
# The default is the original Drive location; set the KERAS_REINFORCE_PROJECT_DIR
# environment variable to run the notebook from a different checkout.
PROJECT_DIR = os.environ.get(
    "KERAS_REINFORCE_PROJECT_DIR",
    "/content/drive/MyDrive/semester_project_experiments/keras_reinforce_comm_goal",
)
os.chdir(PROJECT_DIR)

In [None]:
import pickle
import numpy as np
from logger import info_logger, results_logger
from envs.particle_rllib.environment import ParticleEnv
from reinforce_agent import ReinforceAgent

In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height), in inches, for every plot of the notebook.
plt.rcParams['figure.figsize'] = 12, 8

## Helpers

In [None]:
def create_env_fn():
    """Build a new `ParticleEnv` from the notebook-level environment settings.

    The module-level names `n_listeners`, `n_landmarks` and `render_enable` are
    read at call time, so they must be defined before this function is called.
    """
    env_settings = {
        'n_listeners': n_listeners,
        'n_landmarks': n_landmarks,
        'render_enable': render_enable,
    }
    return ParticleEnv(**env_settings)
    
class Results(dict):
    """Dictionary of reward curves (key -> sequence of values) with plot/save helpers.

    Attributes (plain instance attributes, not dictionary entries):
        new_key: key most recently assigned through ``results[key] = value``;
            it is always drawn, and drawn last so that it ends up on top.
        plot_keys: when not ``None``, the other keys are only plotted if they
            are contained in it.
        ylim: when not ``None``, passed to ``plt.ylim`` after plotting.
    """

    def __init__(self, *args, **kwargs):
        """Create the container, optionally pre-filled from a file.

        If a ``filename`` keyword argument is given, its content is read with
        ``np.load`` and used to fill the dictionary (all other arguments are
        ignored). Otherwise the arguments are forwarded to ``dict``.
        """
        if 'filename' in kwargs:
            data = np.load(kwargs['filename'])
            try:
                super().__init__(data)
            finally:
                # An .npz archive keeps its file handle open until closed; the
                # arrays have already been copied into the dict at this point.
                close = getattr(data, 'close', None)
                if close is not None:
                    close()
        else:
            super().__init__(*args, **kwargs)
        self.new_key = None
        self.plot_keys = None
        self.ylim = None

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` and remember ``key`` as the latest curve."""
        super().__setitem__(key, value)
        self.new_key = key

    def plot(self, window):
        """Clear the cell output and redraw every curve smoothed over ``window`` points."""
        clear_output(wait=True)
        for key in self:
            # Ensure latest results are plotted on top
            if self.plot_keys is not None and key not in self.plot_keys:
                continue
            elif key == self.new_key:
                continue
            self.plot_smooth(key, window)
        if self.new_key is not None:
            self.plot_smooth(self.new_key, window)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.legend(loc='lower right')
        if self.ylim is not None:
            plt.ylim(self.ylim)
        plt.show()

    def plot_smooth(self, key, window):
        """Plot the moving average of ``self[key]`` over ``window`` points.

        An empty series only adds a legend entry. When the series is shorter
        than ``window``, the window is shrunk to the series length: with
        ``mode='valid'`` NumPy would otherwise swap the operands and return
        values that are not a moving average of the data.
        """
        n_points = len(self[key])
        if n_points == 0:
            plt.plot([], [], label=key)
            return None
        window = max(1, min(int(window), n_points))
        y = np.convolve(self[key], np.ones((window,)) / window, mode='valid')
        # Centre each averaged value on the span of episodes it summarizes.
        x = np.linspace(window / 2, n_points - window / 2, len(y))
        plt.plot(x, y, label=key)

    def save(self, filename='results'):
        """Write all curves to ``results/<filename>.npz`` (directory created if needed)."""
        results_dir = 'results/'
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(results_dir, exist_ok=True)
        np.savez(results_dir + filename, **self)

## Parameters

In [None]:
# pretraining parameters
# Number of initial epochs run with is_training=False when `pretraining` is enabled.
# It also offsets the first epoch at which the convergence check is performed.
pretraining_n_epochs = 10
# Set to True to enable the pretraining phase.
pretraining = False

# training parameters
# Total number of epochs executed by the simulation loop (pretraining epochs included).
training_n_epochs = 100

# common parameters
n_episodes = 1000 # number of episodes in one epoch
n_steps = 25 # number of steps in one episode
# Passed to both agents as `policy_learning_rate`.
policy_learning_rate = 0.002 
# Passed to both agents as `value_learning_rate`.
# NOTE(review): originally described as "for updating the target network" — confirm against ReinforceAgent.
value_learning_rate = 0.01
gamma = 0.75 # discount factor (passed to the agents as `discount`)
# Network size handed to ReinforceAgent (presumably hidden layers / units per layer — TODO confirm).
n_layers = 3
n_neurons = 128

# environment config parameters (forwarded to ParticleEnv by create_env_fn)
n_listeners = 1 
n_landmarks = 12
render_enable = False

# convergence parameters
window_size = 5 # size (in epochs) of the sliding window used by the convergence check
min_rel_delta_reward = 0.02  # relative reward variation at or below which an epoch counts as stable

## Initialize the environment and the agents

In [None]:
env = create_env_fn()

# The environment exposes one action space and one observation space per agent:
# index 0 is allocated to the manager, the following indices to the workers.
manager_action_space, manager_observation_space = env.action_space[0], env.observation_space[0]
worker_action_space, worker_observation_space = env.action_space[1], env.observation_space[1]

# Hyperparameters shared by the manager and the worker.
shared_agent_settings = dict(
    policy_learning_rate=policy_learning_rate,
    value_learning_rate=value_learning_rate,
    discount=gamma,
    n_layers=n_layers,
    n_neurons=n_neurons,
)

# Create the manager
manager = ReinforceAgent(
    name='manager',
    n_obs=manager_observation_space.shape[0],
    action_space=manager_action_space,
    **shared_agent_settings
)
info_logger.info("Manager agent initialized")

# Create the worker
worker = ReinforceAgent(
    name='worker',
    n_obs=worker_observation_space.shape[0],
    action_space=worker_action_space,
    **shared_agent_settings
)
info_logger.info("Worker agent initialized")

In [None]:
def run_experiment(env, is_training, n_steps, n_episodes, past_worker_rewards, past_manager_rewards):
    """Run `n_episodes` episodes of `n_steps` steps and train the agents after each episode.

    Relies on the notebook-level objects `worker` and `manager` (the agents) and
    `results` (the live-plot container).

    Args:
        env: environment exposing `reset()` and `step(action_dict)`; observations
            and rewards are dicts keyed by 'worker_agent_1' and 'manager_agent'.
        is_training: when False the manager still acts but neither stores
            transitions nor trains; the worker is trained in both cases.
        n_steps: number of environment steps per episode.
        n_episodes: number of episodes to run.
        past_worker_rewards: per-episode worker rewards of previous calls; only
            used as the prefix of the plotted curve, never modified.
        past_manager_rewards: same as above for the manager.

    Returns:
        Tuple `(manager_rewards, worker_rewards)`: two lists holding the summed
        reward of every episode run by this call.
    """
    manager_rewards = []
    worker_rewards = []

    for episode in range(1, n_episodes + 1):

        # Refresh the live plot every 10 episodes with the full history so far
        # (previous calls + the episodes already completed in this call).
        if episode % 10 == 0:
            results['worker'] = np.array(list(past_worker_rewards) + worker_rewards)
            results['manager'] = np.array(list(past_manager_rewards) + manager_rewards)
            results.plot(10)

        # Reset the environment to a new episode
        obs = env.reset()
        ext_comm_reward = 0
        episode_manager_reward = 0
        episode_worker_reward = 0
        step = 1
        action = {}
        # Pending manager decision of the current episode. Tracked explicitly so a
        # decision from a previous episode can never leak into this one.
        manager_obs = None
        manager_action = None
        has_pending_manager_decision = False

        while True:

            # 1. Decide on an action based on the observations
            action['worker_agent_1'] = worker.decide(obs['worker_agent_1'])
            if 'manager_agent' in obs:
                # Without a manager observation the previous manager action is kept
                # in `action`, i.e. the current communication is extended.
                action['manager_agent'] = manager.decide(obs['manager_agent'])

            # 2. Take action in the environment
            next_obs, rewards, _done, _ = env.step(action)
            ext_comm_reward += rewards['manager_agent']
            episode_worker_reward += rewards['worker_agent_1']
            episode_manager_reward += rewards['manager_agent']

            # 3. Store the information returned from the environment for training
            worker.observe(obs['worker_agent_1'], action['worker_agent_1'], rewards['worker_agent_1'])
            if is_training and 'manager_agent' in obs:
                if has_pending_manager_decision:
                    # The previous manager decision is credited with the reward
                    # accumulated up to and including this step.
                    manager.observe(manager_obs, manager_action, ext_comm_reward)
                    ext_comm_reward = 0
                manager_obs = obs['manager_agent']
                # The manager must learn from its own action, not the worker's.
                manager_action = action['manager_agent']
                has_pending_manager_decision = True

            # 4. At the end of the episode, use the observed episode to train the networks
            # NOTE(review): the last pending manager decision of an episode is never
            # passed to manager.observe — confirm this is intended.
            if step == n_steps:
                manager_rewards.append(episode_manager_reward)
                worker_rewards.append(episode_worker_reward)
                worker.train()
                if is_training:
                    manager.train()
                break

            # Reset for next step
            obs = next_obs
            step += 1

    return manager_rewards, worker_rewards

## Simulation loop

In [None]:
convergence = False
convergence_counter = 0  # number of consecutive epochs whose mean reward was stable
train_manager_rewards = []  # per-episode manager rewards over all epochs
train_worker_rewards = []  # per-episode worker rewards over all epochs
epoch_mean_rewards = []  # mean manager reward of each epoch, in chronological order
curr_epoch = 1
results = Results()

while curr_epoch <= training_n_epochs:

    info_logger.info("Current epoch: {}".format(curr_epoch))

    # During the optional pretraining phase the experiment runs with is_training=False;
    # every other epoch runs with is_training=True.
    is_training = not (pretraining and curr_epoch <= pretraining_n_epochs)
    manager_rewards, worker_rewards = run_experiment(
        env,
        is_training=is_training,
        n_steps=n_steps,
        n_episodes=n_episodes,
        past_worker_rewards=train_worker_rewards,
        past_manager_rewards=train_manager_rewards
    )

    train_worker_rewards += worker_rewards
    train_manager_rewards += manager_rewards

    # run_experiment only refreshes `results` every 10 episodes, before the episode is
    # played, so sync it here to make the saved file include the whole epoch.
    results['worker'] = np.array(train_worker_rewards)
    results['manager'] = np.array(train_manager_rewards)
    results.save(filename='results-ep{}'.format(curr_epoch))

    curr_epoch_mean_manager_reward = np.mean(manager_rewards)
    curr_epoch_mean_worker_reward = np.mean(worker_rewards)

    results_logger.info("Epoch: {}".format(curr_epoch))
    results_logger.info("\tmanager mean reward = {}".format(curr_epoch_mean_manager_reward))
    results_logger.info("\tworker mean reward = {}".format(curr_epoch_mean_worker_reward))

    epoch_mean_rewards.append(curr_epoch_mean_manager_reward)

    # check convergence conditions
    if curr_epoch > pretraining_n_epochs + window_size:

        # Compare the current epoch with the MEAN of the sliding window (the sum would
        # make the relative variation depend on the window length).
        window_mean_reward = np.mean(epoch_mean_rewards[-window_size:])
        reward_delta = abs(curr_epoch_mean_manager_reward - window_mean_reward)
        # Normalize by the magnitude: rewards may be negative, and a signed denominator
        # would make the ratio negative and therefore always "stable".
        reward_scale = abs(window_mean_reward)
        if reward_scale > 0:
            rel_delta_reward = reward_delta / reward_scale
        else:
            rel_delta_reward = 0.0 if reward_delta == 0 else float('inf')

        if rel_delta_reward <= min_rel_delta_reward:
            convergence_counter += 1
            # NOTE(review): the thresholds 5 and 10 are kept from the original code;
            # confirm whether they should be tied to `window_size`.
            if convergence_counter >= 5 and curr_epoch <= training_n_epochs - 10:
                convergence = True
        else:
            convergence = False
            convergence_counter = 0

    curr_epoch += 1

manager.save()
worker.save()

if convergence:
    results_logger.info("Convergence! The mean reward has remained stable for {} epochs".format(convergence_counter))
elif convergence_counter > 0:
    results_logger.info("No convergence. The mean reward stabilized for the first time around epoch {}".format(1 + training_n_epochs - convergence_counter))
else:
    results_logger.info("No convergence. The mean reward has never stabilized.")