A proof of concept showing using a genetic algorithm with our environment.
It is similar to https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

In [None]:
# Run this cell if you're using colab. Otherwise, skip it.

!git clone https://github.com/platers/meta-transfer-learning.git

import os
os.chdir('meta-transfer-learning')

!pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl

!pip install -r requirements.txt
!pip install ran

In [2]:
import array
import random

import numpy as np
from typing import Dict

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym

from environments import SimpleEnv
import importlib
importlib.reload(SimpleEnv)
from environments.SimpleEnv import SimpleEnv #, TODO: add more environments

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray

import time
start = time.process_time()


In [3]:
ray.shutdown()
ray.init()

n_agents = 3
n_var = 2
training_envs = [
(SimpleEnv, {
    'n_agents': n_agents,
    'n_vars': n_var,
    'true_reward_weights': [1, 0],
    'max_step_count': MAX_STEP_COUNT,
}),
(SimpleEnv, {
    'n_agents': n_agents,
    'n_vars': n_var,
    'true_reward_weights': [0, 1],
    'max_step_count': MAX_STEP_COUNT,
})]
test_env = (SimpleEnv, {
    'n_agents': n_agents,
    'n_vars': n_var,
    'true_reward_weights': [1, 1],
    'max_step_count': MAX_STEP_COUNT,
})

creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
creator.create('Individual', array.array, typecode='d',
               fitness=creator.FitnessMax)

toolbox = base.Toolbox()

toolbox.register('attr', random.uniform, -1, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr, n_agents * n_var)
toolbox.register('population', tools.initRepeat, list,
                 toolbox.individual)


2020-05-05 17:31:08,431	INFO resource_spec.py:212 -- Starting Ray with 15.28 GiB memory available for workers and up to 7.66 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-05 17:31:08,800	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


In [13]:
# some settings that we can tweak:
MAX_STEP_COUNT = 20 # number of steps in each round of DEAP evolution 
POPULATION_SIZE = 3 # population in each round of DEAP evolution 
N_GEN = 3 # number of rounds of DEAP evolution
N_RL_TRANING = 10 # number of rounds of training for RL agent
TRAIN_BATCH_SIZE = 30
LEARNING_RATE = 0.001
RL_TRAIN_ITERS = 10

In [20]:
def evaluate_individual_env(individual, environment, env_config):
    """Runs the environment. All agents have the same policy.
  It returns the total true reward as the fitness.
  """
    #Select random individuals from pop and create the reward weights
    pop = np.array([individual for i in range(n_agents)])
    reward_weights = pop
    env_config['reward_weights'] = reward_weights
    #env is only to get action space and observation space
    env = environment(config=env_config)
    class MyCallbacks(DefaultCallbacks):
        #Callback functions to keep track of true reward while training
        def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                         policies: Dict[str, Policy],
                         episode: MultiAgentEpisode, **kwargs):
            episode.user_data["true_rewards"] = np.zeros(n_agents)

        def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                        episode: MultiAgentEpisode, **kwargs):
            env = base_env
            true_reward = env.env_states[0].env.last_true_reward
            episode.user_data["true_rewards"] += true_reward

        def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                       policies: Dict[str, Policy], episode: MultiAgentEpisode,
                       **kwargs):
            true_reward = episode.user_data["true_rewards"]
            for i, r in enumerate(true_reward):
                episode.custom_metrics["true_reward_agent_" + str(i)] = r
    
    # settings for the RL agent trainer     
    config={
        "train_batch_size": TRAIN_BATCH_SIZE,
        "lr": LEARNING_RATE,
        "sgd_minibatch_size": TRAIN_BATCH_SIZE,
        "multiagent": {
            "policies": {
            },
            "policy_mapping_fn":  #all agents share a policy
                lambda agent_id:
                    'agent'
        },
        "model": {"fcnet_hiddens": []},
        'env_config': env_config,
        "callbacks": MyCallbacks,
    }
    config['multiagent']['policies']['agent'] = (None, env.observation_space, env.action_space, {})
    trainer = ppo.PPOTrainer(env=environment, config=config)
    
    true_reward_mean = 0
    for i in range(RL_TRAIN_ITERS):
        #print('TRAINING', i)
        true_reward_mean = 0

        #Train the RL agent
        metrics = trainer.train()  # distributed training step
        print("episode_reward_mean", metrics["episode_reward_mean"])
    for i in range(n_agents):
        true_reward_mean += metrics['custom_metrics']['true_reward_agent_' + str(i) + '_mean']
    true_reward_mean /= n_agents
    print('Evaluated', individual, 'Fitness', true_reward_mean)
    return true_reward_mean


def evaluate_individual(individual):
    """Runs all environments. 
  returns the average true reward over all environments as the fitness.
  """
    fitness = 0
    for env, config in training_envs:
        fitness += evaluate_individual_env(individual, env, config)
    return (fitness, )


# some setup for Deap
toolbox.register('evaluate', evaluate_individual)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05) # add more noise
toolbox.register('select', tools.selTournament, tournsize=3)

In [21]:
# pop is a list individual reward function's weights 
pop = toolbox.population(n=POPULATION_SIZE)
# the 10 best individual reward function (could even befround the first round)
hof = tools.HallOfFame(10)

stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register('avg', np.mean)
stats.register('std', np.std)
stats.register('min', np.min)
stats.register('max', np.max)

pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=N_GEN, 
                                   stats=stats, halloffame=hof, verbose=True)

print ('pop', pop)

2020-05-05 17:53:08,825	INFO trainable.py:217 -- Getting current IP.


episode_reward_mean 0.6556691909458053
episode_reward_mean 3.4023960045488315
episode_reward_mean 25.593949188072592
episode_reward_mean 64.97871928847836
episode_reward_mean 96.36543487864066
episode_reward_mean 116.14978967370038
episode_reward_mean 127.32298120681297


KeyboardInterrupt: 

In [None]:
best_individual = hof[0]
print(best_individual)

test_reward = evaluate_individual_env(best_individual, test_env[0], test_env[1])
print(test_reward)

In [22]:
print(evaluate_individual([0, 0, 1, 0, 1, 0])) #Ideal reward, altruistic agent

2020-05-05 17:53:37,547	INFO trainable.py:217 -- Getting current IP.


episode_reward_mean 0.22221723887196276
episode_reward_mean 3.0452974112777156
episode_reward_mean 5.74149909495085
episode_reward_mean 10.39506426013766
episode_reward_mean 5.645653283076753
episode_reward_mean -1.946103320447728
episode_reward_mean -4.742527244058437
episode_reward_mean -1.2956380341178737
episode_reward_mean 1.4002652358892373
episode_reward_mean 4.579323364947922
Evaluated [0, 0, 1, 0, 1, 0] Fitness 0.7632205608246538


2020-05-05 17:54:08,101	INFO trainable.py:217 -- Getting current IP.


episode_reward_mean -1.564286915003322
episode_reward_mean 10.05871125976555
episode_reward_mean 18.145338259509007
episode_reward_mean 25.11647563450708
episode_reward_mean 27.269118028419033
episode_reward_mean 38.54694242131547
episode_reward_mean 54.84275766728213
episode_reward_mean 64.801590360613
episode_reward_mean 63.37483755223144
episode_reward_mean 67.15470601265577
Evaluated [0, 0, 1, 0, 1, 0] Fitness -5.603656222295872
(-4.8404356614712185,)


In [None]:
hof[:]

In [None]:
print(evaluate_individual([1, 0, -1, 0, -1, 0])) #Worst reward, selfish agent

In [None]:
print("Time Spent = ", (time.process_time() - start)/60, " minutes")