In [None]:
# Google Colab setup: mount Google Drive and move into the project folder.
# The mount only happens when the `google.colab` package can be imported, so the same
# cell can be run unchanged on a local kernel (where it leaves the working directory alone).

import os

try:
    from google.colab import drive
except ImportError:
    # `google.colab` is not importable here: skip the Drive mount
    drive = None

if drive is not None:
    drive.mount('/content/drive')

    os.chdir("/content/drive/MyDrive/semester_project_experiments/fmh/")

In [None]:
# # If you are running on Google Colab, please install TensorFlow 2.0 by uncommenting below

# try:
#   # %tensorflow_version only exists in Colab.
#   %tensorflow_version 2.x
# except Exception:
#   pass

In [None]:
# # If you are running on Google Colab, uncomment below to install the necessary dependencies 
# # before running the experiment, then comment it again

# print("Setting up colab environment")
# !pip uninstall -y -q pyarrow
# !pip install -q -U ray[tune]
# !pip install -q ray[debug]

# !pip install lz4
# !pip install gputil

# # A hack to force the runtime to restart, needed to include the above dependencies.
# print("Done installing! Restarting via forced crash (this is not an issue).")
# import os
# os._exit(0)

In [None]:
import json
import datetime
import pprint
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import ray
from ray import tune
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print
from ray.rllib.agents.ppo.ppo import PPOTrainer

from envs.particle_rllib.environment import ParticleEnv

from callbacks import CustomCallbacks
from logger import info_logger, results_logger

## Helper functions

In [None]:
# Function that creates the environment
def create_env_fn(env_context=None):
    """Build a `ParticleEnv` from the notebook-level environment parameters.

    Args:
        env_context: accepted but not used by this function.

    Returns:
        A new `ParticleEnv` configured with the module-level `n_listeners`,
        `n_landmarks` and `render_enable` values.
    """
    env_kwargs = dict(
        n_listeners=n_listeners,
        n_landmarks=n_landmarks,
        render_enable=render_enable,
    )
    return ParticleEnv(**env_kwargs)

# Function that maps a policy to its agent id
def policy_mapping_fn(agent_id):
    """Return the name of the policy that controls `agent_id`.

    Args:
        agent_id: agent identifier string.

    Returns:
        "manager_policy" when the id starts with 'manager', "worker_policy" otherwise.
    """
    is_manager = agent_id.startswith('manager')
    return "manager_policy" if is_manager else "worker_policy"

# Functions to write and read the checkpoint number to epoch number dictionary 
def write_epoch_by_checkpoint(epoch_by_checkpoint):
    """Save the checkpoint-number -> epoch-number dictionary as JSON.

    The file is written to 'temp/epoch-by-checkpoint.json', relative to the current
    working directory, and overwrites any previous content.

    Args:
        epoch_by_checkpoint: JSON-serialisable dictionary to store.
    """
    mapping_path = 'temp/epoch-by-checkpoint.json'
    # The 'temp' folder may not exist yet on a fresh run; create it instead of
    # failing with FileNotFoundError on the first checkpoint.
    os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
    with open(mapping_path, 'w') as f:
        json.dump(epoch_by_checkpoint, f)

def read_epoch_by_checkpoint():
    """Load the checkpoint-number -> epoch-number dictionary.

    Returns:
        The object parsed from 'temp/epoch-by-checkpoint.json' (path relative to the
        current working directory).
    """
    with open('temp/epoch-by-checkpoint.json') as mapping_file:
        return json.load(mapping_file)

## Parameters

In [None]:
# pretraining parameters
pretraining_n_epochs = 10 # epochs run with the pre-training configuration

# training parameters
training_n_epochs = 100 # index of the last epoch of the simulation loop (pre-training epochs included)

# common parameters
training_algo = "PPO"
env_name = "ParticleManagerListeners" # name under which the environment is registered
n_episodes = 1000 # number of episodes in one epoch
n_steps = 25 # number of steps in one episode
learning_rate = 0.001
tau = 0.01 # for updating the target network
gamma = 0.75 # discount factor
replay_buffer_size = 10000000
batch_size = 1024
hidden_layers = [256, 256]

# environment config parameters
n_listeners = 1
n_landmarks = 12
render_enable = False

# convergence parameters
window_size = 5 # size of the sliding window
min_rel_delta_reward = 0.02  # minimum acceptable variation of the reward

# savedata filepath
savedata_dir = './savedata/'

# checkpoint parameters
checkpoint_interval = 20 # number of trainings after which a checkpoint is set
checkpoint_mode = 'pretraining' # mode of the checkpoint to restore
restore_checkpoint_n = 0 # checkpoint number to restore; 0 starts a new simulation from scratch

# Create savedata directory (no error if it already exists)
os.makedirs(savedata_dir, exist_ok=True)

## Environment spaces

In [None]:
# Instantiate one environment just to read its action/observation spaces
env = create_env_fn()

# The environment exposes one action space and one observation space per agent:
# index 0 belongs to the manager, the following indices belong to the workers.
manager_action_space, manager_observation_space = env.action_space[0], env.observation_space[0]
worker_action_space, worker_observation_space = env.action_space[1], env.observation_space[1]

## Trainers configuration

In [None]:
# One policy per role; `None` as policy class lets the trainer use its default one
policies = {
    "manager_policy": (None, manager_observation_space, manager_action_space, {}),
    "worker_policy": (None, worker_observation_space, worker_action_space, {})
    }


def _make_trainer_config(policies_to_train):
    """Build a trainer configuration that only differs in the policies being trained.

    Args:
        policies_to_train: iterable of policy names whose weights are updated.

    Returns:
        A new configuration dictionary; the pre-training and training configurations
        share every setting except "policies_to_train".
    """
    return {
        "num_workers": 2,
        "lr": learning_rate,
        "gamma": gamma,
        "horizon": n_steps,
        "train_batch_size": batch_size,
        # forward the `hidden_layers` parameter to the network definition, so that
        # editing it in the parameters cell actually changes the model
        "model": {"fcnet_hiddens": list(hidden_layers)},
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": list(policies_to_train)
        },
        "callbacks": CustomCallbacks,
        "no_done_at_end": True,
        "log_level": "ERROR"
    }


# Pre-training: only the worker policy is trained
pretraining_config = _make_trainer_config(["worker_policy"])

# Training: every policy is trained
training_config = _make_trainer_config(policies.keys())

## Initialize the environment, the trainer and the checkpoint folder/parameters

In [None]:
# Initialize and register the environment
register_env(env_name, create_env_fn)

# Initialize Ray (shut down any previous instance first so the cell can be re-run)
ray.shutdown()
ray.init()

if restore_checkpoint_n != 0:
    # Read the number of the epoch of the checkpoint to restore
    epoch_by_checkpoint = read_epoch_by_checkpoint()
    try:
        curr_epoch = epoch_by_checkpoint[checkpoint_mode][str(restore_checkpoint_n)]
    except KeyError:
        # same exception type as before, but with a message that says what is missing
        raise KeyError("No '{m}' checkpoint number {n} found in temp/epoch-by-checkpoint.json".format(
            m=checkpoint_mode, n=restore_checkpoint_n)) from None

    # The epoch of the checkpoint decides which configuration the trainer resumes with
    resume_in_training = curr_epoch > pretraining_n_epochs
    mode = 'training' if resume_in_training else 'pretraining'
    trainer = PPOTrainer(env=env_name,
                         config=training_config if resume_in_training else pretraining_config)

    checkpoint_dir = './checkpoints/' + mode + '/' # checkpoints directory
    # Restore the simulation from the checkpoint
    trainer.restore(checkpoint_dir + 'checkpoint_{n}/checkpoint-{n}'.format(n=restore_checkpoint_n))
    info_logger.info("Restored checkpoint {}".format(restore_checkpoint_n))

else:
    # Initialize the trainer with the pre-training configuration and the parameters to start a simulation from scratch
    mode = 'pretraining'
    curr_epoch = 1
    epoch_by_checkpoint = {'pretraining': {}, 'training': {}}
    trainer = PPOTrainer(env=env_name, config=pretraining_config)

    checkpoint_dir = './checkpoints/' + mode + '/' # checkpoints directory
    # Create the checkpoint directory (no error if it already exists)
    os.makedirs(checkpoint_dir, exist_ok=True)

    info_logger.info("Initializing with pre-training mode")

# Print the current configuration
pp = pprint.PrettyPrinter(indent=4)
print("Current configiguration\n-----------------------")
pp.pprint(trainer.get_config())
print("-----------------------\n")

## Simulation loop

In [None]:
# Simulation loop: runs epochs from `curr_epoch` up to `training_n_epochs`, switching from the
# pre-training to the training configuration after `pretraining_n_epochs` epochs. Each epoch
# writes one CSV log, and Ray is shut down when the loop ends or fails.
try:
    convergence = False
    convergence_counter = 0
    checkpoint_counter = 0 # to know when it's time to save a new checkpoint
    epoch_mean_rewards = [] # mean manager reward of each completed epoch

    while curr_epoch <= training_n_epochs:

        # one pass of this loop is one epoch

        info_logger.info("Current epoch: {}".format(curr_epoch))

        manager_mean_rewards = [] # manager mean reward reported by each training iteration of the epoch
        prob_correct_goal_means = [] # mean probability of communicating the correct goal

        # after pretraining_n_epochs epochs, switch to training mode
        if curr_epoch == pretraining_n_epochs + 1 and mode == 'pretraining':

            info_logger.info("Switching to training mode")
            mode = 'training'
            checkpoint_dir = './checkpoints/' + mode + '/'

            # create checkpoint directory if it doesn't exist
            os.makedirs(checkpoint_dir, exist_ok=True)

            checkpoint_counter = 0

            # change the trainer configuration but keep the old weights
            model_weights = trainer.get_weights()
            trainer.cleanup()
            trainer = PPOTrainer(env=env_name, config=training_config)
            trainer.set_weights(model_weights)

            print("Training configiguration\n-----------------------")
            pp.pprint(trainer.get_config())
            print("-----------------------\n")

        # initialize iteration data saving log
        savedata_file_name = '{}-epoch.csv'.format(curr_epoch)
        savedata_file_path = os.path.join(savedata_dir, savedata_file_name)
        savedata_columns = ["episodes_total","episode_len_mean", "worker_reward_mean", "manager_reward_mean", "prob_correct_goal"]
        # one record per training iteration; the DataFrame is built once, when the epoch ends,
        # instead of being re-allocated at every iteration
        savedata_rows = []

        elapsed_episodes = 0 # Current episode

        # compute the episode in which the epoch will be over
        if mode == 'pretraining':
            # if pretraining mode, standard computation
            end_of_epoch = n_episodes * curr_epoch
        else:
            # if training mode, we subtract the number of episodes of pretraining because
            # a new trainer is created when pretraining ends
            end_of_epoch = n_episodes * (curr_epoch - pretraining_n_epochs)

        while elapsed_episodes <= end_of_epoch:

            # one pass of this loop is one training iteration

            result = trainer.train()
            elapsed_episodes = result['episodes_total']
            policy_mean_rewards = result['policy_reward_mean']
            manager_mean_reward = policy_mean_rewards['manager_policy']
            prob_correct_goal = result['custom_metrics']['prob_correct_goal_mean']

            manager_mean_rewards.append(manager_mean_reward)
            prob_correct_goal_means.append(prob_correct_goal)

            print(pretty_print(result))
            plt.plot(manager_mean_rewards)
            plt.show()

            checkpoint_counter += 1
            # save a checkpoint every checkpoint_interval trains
            if checkpoint_counter == checkpoint_interval:
                trainer.save(checkpoint_dir)
                info_logger.info("Checkpoint saved ({m}: iteration {n})".format(m=mode, n=result['training_iteration']))
                checkpoint_counter = 0
                # write the checkpoint to epoch dict
                epoch_by_checkpoint[mode][str(result['training_iteration'])] = curr_epoch
                write_epoch_by_checkpoint(epoch_by_checkpoint)

            # update the log: every value is stored under the column that names it
            # (keyed by name, so the order of `policies` cannot swap worker and manager)
            savedata_rows.append({
                "episodes_total": elapsed_episodes,
                "episode_len_mean": result['episode_len_mean'],
                "worker_reward_mean": policy_mean_rewards['worker_policy'],
                "manager_reward_mean": manager_mean_reward,
                "prob_correct_goal": prob_correct_goal,
            })

        # print results on file
        savedata = pd.DataFrame(savedata_rows, columns=savedata_columns)
        savedata.to_csv(savedata_file_path)

        # compute epoch's results
        curr_epoch_mean_reward = np.mean(manager_mean_rewards)
        curr_epoch_prob_correct_goal = np.mean(prob_correct_goal_means)

        results_logger.info("Epoch: {}".format(curr_epoch))
        results_logger.info("\tmean reward = {}".format(curr_epoch_mean_reward))
        results_logger.info("\tprobability of correct goal = {}".format(curr_epoch_prob_correct_goal))

        epoch_mean_rewards.append(curr_epoch_mean_reward)

        # check convergence conditions
        if curr_epoch > pretraining_n_epochs + window_size:

            # compare the epoch reward with the MEAN of the last `window_size` epochs
            # (the current epoch is already part of the window)
            window_reward = np.mean(epoch_mean_rewards[-window_size:])
            reward_delta = abs(curr_epoch_mean_reward - window_reward)
            # normalise by the magnitude of the window mean, so that negative rewards
            # do not turn the relative variation negative
            reward_scale = abs(window_reward)
            if reward_scale > 0:
                reward_is_stable = reward_delta / reward_scale <= min_rel_delta_reward
            else:
                # zero baseline: the relative variation is undefined, only an exact match is stable
                reward_is_stable = reward_delta == 0

            if reward_is_stable:
                convergence_counter += 1
                if convergence_counter >= 5 and curr_epoch <= training_n_epochs - 10:
                    convergence = True
            else:
                convergence = False
                convergence_counter = 0

        curr_epoch += 1

    if convergence:
        results_logger.info("Convergence! The mean reward has remained stable for {} epochs".format(convergence_counter))
    elif convergence_counter > 0:
        results_logger.info("No convergence. The mean reward stabilized for the first time around epoch {}".format(1 + training_n_epochs - convergence_counter))
    else:
        results_logger.info("No convergence. The mean reward has never stabilized.")

finally:
    # always release Ray resources, also when the loop is interrupted or raises
    ray.shutdown()