A proof of concept showing how to use a genetic algorithm with our environment.
It is similar to https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

In [None]:
# Run this cell if you're using colab. Otherwise, skip it.

# Fetch the project sources into ./meta-transfer-learning.
!git clone https://github.com/platers/meta-transfer-learning.git

# Switch into the checkout so the relative path `requirements.txt` below
# resolves (presumably also so the repo's local packages are importable).
import os
os.chdir('meta-transfer-learning')

# Install a development build of Ray (0.9.0.dev0). The wheel name is tagged
# cp36 / manylinux1_x86_64, i.e. it targets CPython 3.6 on 64-bit Linux.
!pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl

# Install the remaining dependencies listed by the repository.
!pip install -r requirements.txt

In [6]:
import array
import random

import numpy as np
from typing import Dict

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym

from environments import SimpleEnv
import importlib
importlib.reload(SimpleEnv)
from environments.SimpleEnv import SimpleEnv #, TODO: add more environments

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray

In [7]:
# Restart Ray so that re-running this cell starts from a fresh runtime.
ray.shutdown()
ray.init()

n_agents = 3
n_var = 2


def _simple_env_spec(true_reward_weights):
    """Return an ``(env class, env config)`` pair for ``SimpleEnv``.

    Only the true reward weights differ between the environments used in
    this notebook; every other setting is shared.
    """
    return (SimpleEnv, {
        'n_agents': n_agents,
        'n_vars': n_var,
        'true_reward_weights': true_reward_weights,
        'max_step_count': 20,
    })


# Individuals are evolved on the training environments and the best one is
# later scored on the separate test environment.
training_envs = [_simple_env_spec([1, 0]), _simple_env_spec([0, 1])]
test_env = _simple_env_spec([1, 1])

# Single-objective maximisation; individuals are arrays of C doubles.
creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
creator.create(
    'Individual',
    array.array,
    typecode='d',
    fitness=creator.FitnessMax,
)

toolbox = base.Toolbox()

# Each gene is drawn uniformly from [-1, 1]; an individual holds
# n_agents * n_var genes.
toolbox.register('attr', random.uniform, -1, 1)
toolbox.register(
    'individual',
    tools.initRepeat,
    creator.Individual,
    toolbox.attr,
    n_agents * n_var,
)
toolbox.register(
    'population',
    tools.initRepeat,
    list,
    toolbox.individual,
)


2020-05-04 20:29:32,171	INFO resource_spec.py:212 -- Starting Ray with 2.15 GiB memory available for workers and up to 1.08 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-04 20:29:32,533	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


In [8]:
def evaluate_individual_env(individual, environment, env_config,
                            n_iterations=10):
    """Train a shared PPO policy on one environment and score an individual.

    Every agent is given the same reward weights (``individual`` repeated
    ``n_agents`` times) and all agents share one policy named ``'agent'``.
    The fitness is the mean, over agents, of the
    ``true_reward_agent_<i>_mean`` custom metrics reported by the final
    training iteration.

    Args:
        individual: Sequence of reward weights applied to every agent.
        environment: Environment class. It is instantiated once here as
            ``environment(config=...)`` to read its spaces, and is also
            handed to ``ppo.PPOTrainer``.
        env_config: Base environment configuration. It is not modified; a
            shallow copy with ``'reward_weights'`` added is used instead.
        n_iterations: Number of ``trainer.train()`` calls to run (>= 1).

    Returns:
        The per-agent mean true reward from the last training iteration.

    Raises:
        ValueError: If ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError('n_iterations must be at least 1')

    # One row of identical reward weights per agent.
    reward_weights = np.array([individual for _ in range(n_agents)])

    # Copy before adding the weights: the caller passes the module-level
    # configs, which are reused for every evaluation and must stay clean.
    env_config = dict(env_config)
    env_config['reward_weights'] = reward_weights

    # This instance is only used to read the action and observation spaces.
    env = environment(config=env_config)

    class MyCallbacks(DefaultCallbacks):
        """Accumulate the true reward of each episode while training."""

        def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                             policies: Dict[str, Policy],
                             episode: MultiAgentEpisode, **kwargs):
            episode.user_data["true_rewards"] = np.zeros(n_agents)

        def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                            episode: MultiAgentEpisode, **kwargs):
            # NOTE(review): reaches through RLlib's wrapper to the underlying
            # environment of the first env state; last_true_reward is
            # presumably one value per agent -- confirm against SimpleEnv.
            true_reward = base_env.env_states[0].env.last_true_reward
            episode.user_data["true_rewards"] += true_reward

        def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                           policies: Dict[str, Policy],
                           episode: MultiAgentEpisode, **kwargs):
            # Publish one custom metric per agent; the training result then
            # exposes them with a "_mean" suffix, which is read below.
            for agent, reward in enumerate(episode.user_data["true_rewards"]):
                episode.custom_metrics["true_reward_agent_" + str(agent)] = reward

    config = {
        "multiagent": {
            "policies": {
                'agent': (None, env.observation_space, env.action_space, {}),
            },
            # All agents share a single policy.
            "policy_mapping_fn": lambda agent_id: 'agent',
        },
        'env_config': env_config,
        "callbacks": MyCallbacks,
    }
    trainer = ppo.PPOTrainer(env=environment, config=config)

    try:
        # Only the metrics of the final iteration are used for the fitness.
        for _ in range(n_iterations):
            custom_metrics = trainer.train()['custom_metrics']
    finally:
        # Release the trainer's workers; the GA builds a new trainer for
        # every evaluation, so leaked trainers would pile up.
        trainer.stop()

    true_reward_mean = sum(
        custom_metrics['true_reward_agent_' + str(agent) + '_mean']
        for agent in range(n_agents)
    ) / n_agents
    print('Evaluated', individual, 'Fitness', true_reward_mean)
    return true_reward_mean


def evaluate_individual(individual):
    """Score an individual on every training environment.

    Calls ``evaluate_individual_env`` once per ``(env, config)`` pair in
    ``training_envs`` and adds the results up.

    Args:
        individual: Sequence of reward weights to evaluate.

    Returns:
        A 1-tuple containing the *sum* (not the average) of the
        per-environment fitness values, matching the single weight of
        ``FitnessMax``.
    """
    fitness = sum(
        evaluate_individual_env(individual, env, config)
        for env, config in training_envs
    )
    return (fitness, )
    
# Genetic operators used by algorithms.eaSimple.
toolbox.register('evaluate', evaluate_individual)
toolbox.register('mate', tools.cxTwoPoint)
# Genes are real-valued (drawn from uniform(-1, 1)), so mutate them with a
# Gaussian perturbation. mutFlipBit is meant for bit strings: on a float it
# stores float(not gene), collapsing the gene to 0.0 or 1.0.
toolbox.register('mutate', tools.mutGaussian, mu=0.0, sigma=0.2, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

In [9]:
# Population and generation counts are kept tiny because each fitness
# evaluation trains PPO from scratch on every training environment.
pop = toolbox.population(n=3)
hof = tools.HallOfFame(10)

# Per-generation summary statistics over the fitness values.
stats = tools.Statistics(lambda ind: ind.fitness.values)
for stat_name, reducer in (
    ('avg', np.mean),
    ('std', np.std),
    ('min', np.min),
    ('max', np.max),
):
    stats.register(stat_name, reducer)

pop, log = algorithms.eaSimple(
    pop,
    toolbox,
    cxpb=0.5,
    mutpb=0.2,
    ngen=3,
    stats=stats,
    halloffame=hof,
    verbose=True,
)

print('pop', pop)

2020-05-04 20:29:33,743	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


[2m[36m(pid=21899)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=21897)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:29:44,211	INFO trainable.py:180 -- _setup took 10.466 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-04 20:29:44,212	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:32:53,158	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.17752966514486035, -0.8454946636188607, 0.2130358149916065, -0.3617550073268947, 0.8111183220359401, 0.6172385093124968]) Fitness -5.72644739620655
[2m[36m(pid=21898)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=21896)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:32:57,636	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:34:48,921	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.17752966514486035, -0.8454946636188607, 0.2130358149916065, -0.3617550073268947, 0.8111183220359401, 0.6172385093124968]) Fitness 1.2745005642682374
[2m[36m(pid=22130)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22131)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:34:53,488	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:36:46,031	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.2515747680644629, -0.7063495891325171, 0.8234620760139413, 0.7610109760866954, 0.6506763108978786, -0.8552344804263239]) Fitness 5.1734567729166026
[2m[36m(pid=22142)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22215)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:36:50,614	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:38:41,714	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.2515747680644629, -0.7063495891325171, 0.8234620760139413, 0.7610109760866954, 0.6506763108978786, -0.8552344804263239]) Fitness -12.527856433419759
[2m[36m(pid=22303)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22304)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:38:46,623	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:40:38,811	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness -3.4557299821117584
[2m[36m(pid=22314)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22389)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:40:43,324	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:42:34,350	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness 17.188637546175197
gen	nevals	avg     	std    	min    	max    
0  	3     	0.642187	9.33207	-7.3544	13.7329
1  	0     	13.7329 	0      	13.7329	13.7329
[2m[36m(pid=22477)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22476)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:42:38,947	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:44:29,818	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness -4.810960957433126
[2m[36m(pid=22487)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22564)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:44:34,884	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:46:26,127	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness 18.434354394776065
2  	1     	13.6964 	0.0516255	13.6234	13.7329
[2m[36m(pid=22624)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22625)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:46:30,644	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:48:25,386	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness 3.4793956488557405
[2m[36m(pid=22635)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22743)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:48:29,991	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:50:21,090	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness 10.20664307831675
[2m[36m(pid=22798)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22799)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:50:25,660	INFO trainable.py:217 -- Getting current IP.
2020-05-04 20:52:16,741	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness -15.884982506187269
[2m[36m(pid=22810)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22883)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:52:21,804	INFO trainable.py:217 -- Getting current IP.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness -3.5275562641373956
3  	2     	2.6688  	15.6139  	-19.4125	13.7329
pop [array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]), array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]), array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655])]


In [10]:
# Take the first hall-of-fame entry and score it on the test environment,
# which was not part of the training environments.
best_individual = hof[0]
print(best_individual)

# test_env is an (env class, env config) pair, so it unpacks directly into
# the remaining two positional arguments.
test_reward = evaluate_individual_env(best_individual, *test_env)
print(test_reward)

2020-05-04 20:54:13,993	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655])
[2m[36m(pid=22968)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=22969)[0m   from ._conv import register_converters as _register_converters


2020-05-04 20:54:18,486	INFO trainable.py:217 -- Getting current IP.


Evaluated array('d', [-0.7911810104935142, 0.046728697284842635, 0.24939730637720992, -0.7113797379718891, -0.99173253560944, 0.5387376630286655]) Fitness 0.8423062089511587
0.8423062089511587


In [None]:
# Score a hand-written individual on the training environments; the author
# labels these weights as the ideal, altruistic agent.
print(evaluate_individual([0, 0, 1, 0, 1, 0])) #Ideal reward, altruistic agent

In [None]:
# Display every individual currently stored in the hall of fame.
hof[:]

In [None]:
# Score a hand-written individual on the training environments; the author
# labels these weights as the worst case, a selfish agent.
print(evaluate_individual([1, 0, -1, 0, -1, 0])) #Worst reward, selfish agent