In [None]:
# Google Colab only: mount Google Drive and move into the project folder.
# Outside Colab the `google.colab` package cannot be imported, so the mount is
# skipped instead of aborting a "Restart & Run All".

import os

try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: keep the current working directory unchanged.
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    os.chdir("/content/drive/MyDrive/semester_project_experiments/meta_learning_random_search/")

In [None]:
# # If you are running on Google Colab, please install TensorFlow 2.0 by uncommenting below.

# try:
#   # %tensorflow_version only exists in Colab.
#   %tensorflow_version 2.x
# except Exception:
#   pass

# # If you are running on Google Colab, uncomment below to install the necessary dependencies 
# # before beginning the exercise.

# print("Setting up colab environment")
# !pip uninstall -y -q pyarrow
# !pip install -q -U ray[tune]
# !pip install -q ray[debug]
# !pip install lz4
# !pip install gputil

# # A hack to force the runtime to restart, needed to include the above dependencies.
# print("Done installing! Restarting via forced crash (this is not an issue).")
# import os
# os._exit(0)

In [None]:
import json
import pickle
import datetime
import pprint
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from copy import deepcopy
import ray
from ray import tune
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print
from ray.rllib.agents.ppo.ppo import PPOTrainer
from envs.particle_rllib.environment import ParticleEnv
from logger import info_logger, results_logger

## Helper functions

In [None]:
def create_env_fn(env_context=None):
    """Build a new ``ParticleEnv`` from the notebook-level environment settings.

    ``env_context`` is accepted so the function can be passed to
    ``register_env`` as an environment creator, but it is not used: the
    environment is always configured from the module-level ``n_listeners``,
    ``n_landmarks`` and ``render_enable`` values.
    """
    env_kwargs = dict(
        n_listeners=n_listeners,
        n_landmarks=n_landmarks,
        render_enable=render_enable,
    )
    return ParticleEnv(**env_kwargs)

def policy_mapping_fn(agent_id):
    """Return the name of the policy that controls ``agent_id``.

    Agents whose id starts with ``'manager'`` are mapped to
    ``"manager_policy"``; every other agent is mapped to ``"worker_policy"``.
    """
    is_manager = agent_id.startswith('manager')
    return "manager_policy" if is_manager else "worker_policy"

In [None]:
def sample_position_on_hypersphere(initial_pos):
    """Draw a random point at Euclidean distance 1 from ``initial_pos``.

    A Gaussian vector with one component per entry of ``initial_pos`` is
    drawn, rescaled to unit length and added to ``initial_pos``.
    """
    direction = np.random.normal(size=len(initial_pos))
    norm = np.sqrt(np.square(direction).sum())
    return np.add(initial_pos, direction / norm)
    
def sample_neighboring_weights(old_weights):
    """Return a randomly perturbed copy of the manager policy weights.

    Each perturbed vector is replaced by a sample drawn with
    ``sample_position_on_hypersphere`` around its old value. ``old_weights``
    itself is not modified: a deep copy is updated and returned.

    Args:
        old_weights: mapping from weight names
            (``'manager_policy/<layer>/kernel'`` and
            ``'manager_policy/<layer>/bias'`` for the layers ``fc_1``,
            ``fc_value_1``, ``fc_2``, ``fc_value_2``, ``fc_out`` and
            ``value_out``) to arrays.

    Returns:
        A new mapping with the same keys holding the perturbed weights.
    """
    new_weights = deepcopy(old_weights)

    def weight_key(layer, kind):
        return 'manager_policy/{}/{}'.format(layer, kind)

    # First hidden layers: only row 0 of each kernel is resampled
    # (presumably these kernels have a single input row -- TODO confirm).
    for layer in ('fc_1', 'fc_value_1'):
        kernel, bias = weight_key(layer, 'kernel'), weight_key(layer, 'bias')
        new_weights[kernel][0] = sample_position_on_hypersphere(old_weights[kernel][0])
        new_weights[bias] = sample_position_on_hypersphere(old_weights[bias])

    # Second hidden layers: every kernel row is resampled independently.
    for layer in ('fc_2', 'fc_value_2'):
        kernel, bias = weight_key(layer, 'kernel'), weight_key(layer, 'bias')
        for row in range(len(old_weights[kernel])):
            new_weights[kernel][row] = sample_position_on_hypersphere(old_weights[kernel][row])
        new_weights[bias] = sample_position_on_hypersphere(old_weights[bias])

    # Policy output layer: every kernel column is resampled independently.
    # The column count is read from the `old_weights` argument, not from a
    # notebook-level global, so the function depends only on its input.
    kernel, bias = weight_key('fc_out', 'kernel'), weight_key('fc_out', 'bias')
    for col in range(len(old_weights[kernel][0])):
        new_weights[kernel][:, col] = sample_position_on_hypersphere(old_weights[kernel][:, col])
    new_weights[bias] = sample_position_on_hypersphere(old_weights[bias])

    # Value output layer: only column 0 of the kernel is resampled.
    kernel, bias = weight_key('value_out', 'kernel'), weight_key('value_out', 'bias')
    new_weights[kernel][:, 0] = sample_position_on_hypersphere(old_weights[kernel][:, 0])
    new_weights[bias] = sample_position_on_hypersphere(old_weights[bias])

    return new_weights

## Parameters

In [None]:
# training parameters
# NOTE(review): training_algo, tau, replay_buffer_size and hidden_layers are not
# read by any active code in the visible cells (tau/hidden_layers/buffer size
# only appear in commented-out config entries) -- confirm before removing.
training_algo = "PPO"
env_name = "ParticleManagerListeners" # name under which the environment is registered
n_epochs = 100 # number of outer (random search) epochs
n_episodes = 5000 # number of episodes in one epoch
n_steps = 25 # number of steps in one episode (passed as the "horizon")
learning_rate = 5e-4 # learning rate of the worker policy
tau = 0.01 # for updating the target network
gamma = 0.75 # discount factor
replay_buffer_size = 10000000
batch_size = 1024 # passed as "train_batch_size"
hidden_layers = [256, 256]

# environment config parameters (forwarded to ParticleEnv)
n_listeners = 1 
n_landmarks = 12
render_enable = False

# convergence parameters
window_size = 5 # size of the sliding window 
min_rel_delta_reward = 0.02  # relative reward variation at or below which an epoch counts as stable

savedata_dir = './savedata/' # savedata directory
checkpoint_dir = './checkpoints/' # checkpoints directory
# epoch whose saved manager weights are restored before training; 0 starts from scratch
restore_checkpoint_n = 0 

# Create the savedata and checkpoint directories.
# exist_ok=True makes the call a no-op when the directory is already present
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(savedata_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

## Trainers configuration

In [None]:
# Instantiate one environment only to read the per-agent spaces.
# According to the environment implementation, index 0 of action_space /
# observation_space belongs to the manager and the other indices to the workers.
env = create_env_fn()
manager_action_space, manager_observation_space = env.action_space[0], env.observation_space[0]
worker_action_space, worker_observation_space = env.action_space[1], env.observation_space[1]

# One policy per role. The manager policy gets a learning rate of 0.0
# (presumably so that PPO leaves its weights to the random search -- TODO confirm),
# while the worker policy is trained with `learning_rate`.
manager_policy_spec = (None, manager_observation_space, manager_action_space, {"lr": 0.0})
worker_policy_spec = (None, worker_observation_space, worker_action_space, {"lr": learning_rate})
policies = {
    "manager_policy": manager_policy_spec,
    "worker_policy": worker_policy_spec,
}

# Options left disabled from earlier experiments: "lr", "tau", "actor_hiddens",
# "critic_hiddens" and "buffer_size" (the latter required too much memory).
training_config = dict(
    num_workers=8,
    gamma=gamma,
    horizon=n_steps,
    train_batch_size=batch_size,
    model={"fcnet_hiddens": [16, 16]},
    multiagent={
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": list(policies),
    },
    no_done_at_end=True,
    log_level="ERROR",
)

# Make the environment available to RLlib under `env_name`
register_env(env_name, create_env_fn)

In [None]:
# Random search over the manager weights.
# Each epoch trains a fresh PPO trainer until `n_episodes` episodes have elapsed,
# keeps the manager weights if the epoch's total manager reward is at least as
# good as the best one so far, and then starts the next epoch from a random
# neighbour of the best manager weights.

# Initialize Ray. ignore_reinit_error lets this cell be re-run in a live kernel
# (e.g. after an interrupted run) without raising because Ray is already up.
ray.init(ignore_reinit_error=True)
trainer = PPOTrainer(env=env_name, config=training_config)
# Print the current configuration
pp = pprint.PrettyPrinter(indent=4)
print("Current configiguration\n-----------------------")
pp.pprint(trainer.get_config())
print("-----------------------\n")

if restore_checkpoint_n != 0:
    # Restore the manager weights saved at epoch `restore_checkpoint_n`.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # checkpoint/history files that were written by this notebook.
    weights = trainer.get_weights()
    with open(checkpoint_dir + "manager_weights_{}".format(restore_checkpoint_n), 'rb') as fp:
        old_manager_weights = pickle.load(fp)
    weights['manager_policy'] = old_manager_weights
    trainer.set_weights(weights)
    info_logger.info("Restored checkpoint from epoch {}".format(restore_checkpoint_n))
    # Restore the history of manager rewards
    with open(savedata_dir + "epoch-manager-rewards", 'rb') as fp:
        epoch_manager_rewards = pickle.load(fp)
    # Resume right after the last recorded epoch
    curr_epoch = len(epoch_manager_rewards) + 1
    # Best manager reward so far is the one of the restored epoch
    old_manager_reward = epoch_manager_rewards[restore_checkpoint_n - 1]

else:
    curr_epoch = 1
    info_logger.info("Initializing with training mode")
    # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0
    old_manager_reward = -np.inf
    old_manager_weights = trainer.get_weights()['manager_policy']
    epoch_manager_rewards = [] # total reward of the manager in each epoch

# Init simulation variables
convergence = False
convergence_counter = 0

# Loop for n_epochs
while curr_epoch <= n_epochs:

    info_logger.info("Current epoch: {}".format(curr_epoch))

    manager_total_reward = 0 # total rewards of the manager in this epoch

    # Per-iteration log of this epoch, written to '<epoch>-epoch.csv'
    savedata_file_name = '{}-epoch.csv'.format(curr_epoch)
    savedata_file_path = savedata_dir + savedata_file_name
    savedata_columns = ["episodes_total", "episode_mean_reward", "worker_mean_reward", "manager_mean_reward", "manager_total_reward"]
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    savedata_rows = []

    elapsed_episodes = 0

    # Loop for n_episodes
    while elapsed_episodes < n_episodes:

        result = trainer.train()

        elapsed_episodes = result['episodes_total']
        episode_mean_reward = result['episode_reward_mean']
        manager_mean_reward = result['policy_reward_mean']['manager_policy']
        worker_mean_reward = result['policy_reward_mean']['worker_policy']

        # Mean reward times the number of episodes of this iteration
        manager_total_reward += (manager_mean_reward * result['episodes_this_iter'])

        # One log row per training iteration, in `savedata_columns` order
        savedata_rows.append([
            elapsed_episodes,
            episode_mean_reward,
            worker_mean_reward,
            manager_mean_reward,
            manager_total_reward, # manager total reward up to now
        ])

        print(pretty_print(result))

    savedata = pd.DataFrame(savedata_rows, columns=savedata_columns)

    # Keep the manager weights if they are at least as good as the best so far
    if manager_total_reward >= old_manager_reward:
        old_manager_reward = manager_total_reward
        old_manager_weights = trainer.get_weights()['manager_policy']
        # save checkpoint
        with open(checkpoint_dir + "manager_weights_{}".format(curr_epoch), 'wb') as fp:
            pickle.dump(old_manager_weights, fp)
        info_logger.info("Saved checkpoint in epoch {}".format(curr_epoch))

    # Start the next epoch from a fresh trainer whose manager policy is a
    # random neighbour of the best manager weights found so far.
    trainer.stop()
    trainer = PPOTrainer(env=env_name, config=training_config)
    weights = trainer.get_weights()
    new_manager_weights = sample_neighboring_weights(old_manager_weights)
    weights['manager_policy'] = new_manager_weights
    trainer.set_weights(weights)

    # print results on file
    savedata.to_csv(savedata_file_path)

    results_logger.info("Epoch: {}".format(curr_epoch))
    results_logger.info("\tmananger reward = {}".format(manager_total_reward))

    epoch_manager_rewards.append(manager_total_reward)
    with open(savedata_dir + "epoch-manager-rewards", 'wb') as fp:
        pickle.dump(epoch_manager_rewards, fp)

    plt.plot(epoch_manager_rewards)
    plt.show()

    # check convergence conditions
    if curr_epoch > n_epochs/2 + window_size:

        # Mean manager reward over the last `window_size` epochs (the current
        # epoch included). Comparing against the mean rather than the sum keeps
        # both sides of the test on the same scale.
        window_reward = np.mean(epoch_manager_rewards[-window_size:])

        # Relative-deviation test written without a division, so that a zero
        # or negative window mean cannot raise or flip the comparison.
        if abs(manager_total_reward - window_reward) <= min_rel_delta_reward * abs(window_reward):
            convergence_counter += 1
            if convergence_counter >= 5 and curr_epoch <= n_epochs - 10:
                convergence = True
        else:
            convergence = False
            convergence_counter = 0

    curr_epoch +=1

# Release the last trainer and shut Ray down
trainer.stop()
ray.shutdown()

if convergence:
    results_logger.info("Convergence! The mean reward has remained stable for {} epochs".format(convergence_counter))
elif convergence_counter > 0:
    results_logger.info("No convergence. The mean reward stabilized for the first time around epoch {}".format(1 + n_epochs - convergence_counter))
else:
    results_logger.info("No convergence. The mean reward has never stabilized.")