A proof of concept showing how to use a genetic algorithm with our environment.
It is similar to the DEAP `onemax_short` example: https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

In [None]:
# Run this cell if you're using colab. Otherwise, skip it.
# It fetches the project, switches into it and installs the dependencies.

# Clone the project repository into the current working directory.
!git clone https://github.com/platers/meta-transfer-learning.git

# Work from inside the checkout so that relative paths such as
# requirements.txt resolve.
import os
os.chdir('meta-transfer-learning')

# Install a development build of Ray (0.9.0.dev0 wheel for CPython 3.6 on
# 64-bit Linux); -U upgrades any Ray version that is already installed.
!pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl

# Install the remaining project dependencies listed in the repository.
!pip install -r requirements.txt

In [None]:
import array
import random

import numpy as np
from typing import Dict

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym
from environments.SimpleEnv import SimpleEnv

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray

In [None]:
# Start the Ray runtime used by the RLlib trainer below.
# ignore_reinit_error=True lets this cell be re-run in a live notebook
# session: a bare ray.init() raises if Ray has already been initialised.
ray.init(ignore_reinit_error=True)


# Single-objective fitness that is maximised (weight +1.0).
creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
# An individual is an array of C doubles that carries a FitnessMax fitness.
creator.create('Individual', array.array, typecode='d',
               fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Each gene is drawn uniformly from [-1, 1]; an individual has 2 genes.
toolbox.register('attr', random.uniform, -1, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr, 2)
toolbox.register('population', tools.initRepeat, list,
                 toolbox.individual)

# Number of agents placed in the environment during an evaluation; also
# reused further down as the size of the GA population.
n_agents = 3

def evaluate_individual(individual):
    """Train a shared PPO policy under ``individual``'s reward weights.

    All ``n_agents`` agents receive the same reward-weight vector
    (``individual``) and are mapped to one shared policy.  PPO is trained
    for 10 iterations while callbacks record each agent's accumulated
    true reward per episode as a custom metric.

    Args:
        individual: Sequence of reward weights for one agent (presumably
            one weight per environment variable, ``n_vars`` = 2 -- confirm
            against ``SimpleEnv``).

    Returns:
        A one-element tuple (the form DEAP expects for fitness values)
        holding the ``true_reward_agent_<i>_mean`` metrics of the final
        training iteration, averaged over the agents.
    """
    # Every agent gets the same reward weights: one identical row per agent.
    reward_weights = np.array([individual for _ in range(n_agents)])

    env_config = {
        'n_agents': n_agents,
        'n_vars': 2,
        'reward_weights': reward_weights,
        'max_step_count': 20,
    }

    # This instance is only used to read the observation and action spaces;
    # the trainer builds its own environments from ``env_config``.  A copy
    # of the dict is passed so the trainer's config cannot be affected.
    env = SimpleEnv(config=dict(env_config))

    class MyCallbacks(DefaultCallbacks):
        """Callbacks that keep track of the true reward while training."""

        def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                             policies: Dict[str, Policy],
                             episode: MultiAgentEpisode, **kwargs):
            # One accumulator slot per agent, reset for every episode.
            episode.user_data["true_rewards"] = np.zeros(n_agents)

        def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                            episode: MultiAgentEpisode, **kwargs):
            # Reads the true reward of the last step from the first wrapped
            # environment (presumably one value per agent -- confirm against
            # SimpleEnv.last_true_reward).
            true_reward = base_env.env_states[0].env.last_true_reward
            episode.user_data["true_rewards"] += true_reward

        def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                           policies: Dict[str, Policy],
                           episode: MultiAgentEpisode, **kwargs):
            # Publish the per-agent totals as custom metrics; the training
            # result reports them with a "_mean" suffix, read below.
            true_reward = episode.user_data["true_rewards"]
            for agent, total in enumerate(true_reward):
                episode.custom_metrics["true_reward_agent_" + str(agent)] = total

    config = {
        "multiagent": {
            "policies": {
                'agent': (None, env.observation_space, env.action_space, {}),
            },
            # All agents share a policy.
            "policy_mapping_fn": lambda agent_id: 'agent',
        },
        'env_config': env_config,
        "callbacks": MyCallbacks,
    }
    trainer = ppo.PPOTrainer(env=SimpleEnv, config=config)

    true_reward_mean = 0
    try:
        for _ in range(10):
            # One distributed training step.
            custom_metrics = trainer.train()['custom_metrics']
            print(custom_metrics)
            # Overwritten on every iteration: only the metrics of the final
            # training step determine the fitness.
            true_reward_mean = sum(
                custom_metrics['true_reward_agent_' + str(agent) + '_mean']
                for agent in range(n_agents))
    finally:
        # Release the trainer's workers; this function runs once per
        # individual per generation, so unstopped trainers would pile up.
        trainer.stop()

    true_reward_mean /= n_agents
    return (true_reward_mean, )


# Genetic operators used by the evolutionary loop.
toolbox.register('evaluate', evaluate_individual)
toolbox.register('mate', tools.cxTwoPoint)
# Genes are real-valued (initialised uniformly in [-1, 1]), so mutate them by
# Gaussian perturbation.  mutFlipBit is meant for boolean genes: it would
# overwrite a float gene with 0.0 or 1.0.  sigma=0.2 is a tunable step size;
# mutated genes are not clipped back into [-1, 1].
toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=0.2, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

In [None]:

# Initial population (one individual per agent count) and a hall of fame
# that retains up to 10 of the best individuals seen.
pop = toolbox.population(n=n_agents)
hof = tools.HallOfFame(maxsize=10)

# Per-generation statistics computed over the individuals' fitness values.
stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register('avg', np.mean)
stats.register('std', np.std)
stats.register('min', np.min)
stats.register('max', np.max)

# Run the simple evolutionary algorithm for two generations.
pop, log = algorithms.eaSimple(
    population=pop,
    toolbox=toolbox,
    cxpb=0.5,
    mutpb=0.2,
    ngen=2,
    stats=stats,
    halloffame=hof,
    verbose=True,
)

print('pop', pop)

In [None]:
# Sanity check with hand-picked reward weights.
pop = [[1, -1], [1, -1], [1, -1]]
# There is no `evaluate` function in this notebook; the evaluator is
# evaluate_individual, which takes ONE individual and replicates it for every
# agent.  All rows of `pop` are identical, so evaluating the first row
# reproduces exactly these reward weights.
print(evaluate_individual(pop[0]))