A proof of concept showing how to use a genetic algorithm with our environment.
It is similar to the DEAP example at https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

In [None]:
# Run this cell if you're using colab. Otherwise, skip it.

!git clone https://github.com/platers/meta-transfer-learning.git

import os
os.chdir('meta-transfer-learning')

!pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl

!pip install -r requirements.txt
!pip install ran

In [None]:
import array
import random

import numpy as np
from typing import Dict

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym

from environments import SimpleEnv
import importlib
importlib.reload(SimpleEnv)
from environments.SimpleEnv import SimpleEnv #, TODO: add more environments

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray

import time
start = time.process_time()


In [None]:
# Tunable settings for the whole experiment.
# Passed to every environment config as 'max_step_count'; the optimal-action
# evaluator also runs exactly this many environment steps per evaluation.
MAX_STEP_COUNT = 1
# Number of individuals in the DEAP population (toolbox.population(n=...)).
POPULATION_SIZE = 3
# Number of DEAP generations (ngen of eaSimple). The original author notes
# that this setting has the most effect.
N_GEN = 5
# Maximum number of trainer.train() iterations per PPO training attempt.
N_RL_TRANING = 10
# Used for both PPO's "train_batch_size" and "sgd_minibatch_size".
TRAIN_BATCH_SIZE = 30
# PPO learning rate ("lr").
LEARNING_RATE = 0.001
# "RL" selects the PPO-based evaluator; any other value (e.g. "OPTIMAL")
# selects the hand-coded optimal-action evaluator.
EVAL_METHOD = "RL"

In [None]:
# Shut down any running Ray runtime before initialising a new one, so the
# cell can be re-executed.
ray.shutdown()
ray.init()

n_agents = 3
n_var = 2


def _make_env_config(true_reward_weights):
    """Return a new SimpleEnv config dict for the given true reward weights."""
    return {
        'n_agents': n_agents,
        'n_vars': n_var,
        'true_reward_weights': true_reward_weights,
        'max_step_count': MAX_STEP_COUNT,
    }


# (environment class, config) pairs used for fitness evaluation.
training_envs = [
    (SimpleEnv, _make_env_config([1, 0])),
    (SimpleEnv, _make_env_config([0, 1])),
]
# Separate (environment class, config) pair used after evolution.
test_env = (SimpleEnv, _make_env_config([1, 1]))

# Single-objective, maximising fitness; individuals are arrays of doubles.
creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
creator.create('Individual', array.array, typecode='d',
               fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Each gene is drawn uniformly from [-1, 1]; an individual has one gene per
# (agent, variable) pair.
toolbox.register('attr', random.uniform, -1, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr, n_agents * n_var)
toolbox.register('population', tools.initRepeat, list,
                 toolbox.individual)


In [None]:
def get_optimal_action(reward_weights, env_config, max_act=5):
    """Return the greedy action for one agent's reward weights.

    The full action magnitude is placed on the single entry whose scaled
    reward weight is largest; every other entry is zero.

    Args:
        reward_weights: flat sequence of reward weights (list or array).
        env_config: environment config dict. Must contain "n_vars". If it
            contains "n_agents", that value fixes the number of rows of the
            result; otherwise the row count is inferred from the number of
            weights, so the function does not depend on module-level state.
        max_act: magnitude assigned to the chosen entry.

    Returns:
        Array of shape (n_agents, n_vars) with exactly one non-zero entry.
    """
    weights = np.asarray(reward_weights, dtype=float)
    n_vars = env_config["n_vars"]
    # -1 lets numpy infer the row count when the config does not provide it.
    n_rows = env_config.get("n_agents", -1)

    # The first n_vars weights are scaled by 1 and the rest by 2 before
    # choosing the best entry.
    # NOTE(review): presumably this mirrors how SimpleEnv weighs actions on
    # the agent's own variables versus the others -- confirm against the env.
    reward_scale_factor = np.ones(len(weights))
    reward_scale_factor[n_vars:] = 2
    best_act = np.argmax(weights * reward_scale_factor)

    action = np.eye(len(weights))[best_act] * max_act
    return np.reshape(action, [n_rows, n_vars])

def evaluate_individual_env_optimal_act(individual, environment_fn, env_config):
    """Score an individual by stepping the environment with greedy actions.

    Every agent is given the same reward weights (``individual``) and acts
    with ``get_optimal_action``; no learning takes place.

    Args:
        individual: flat sequence of reward weights shared by all agents.
        environment_fn: callable invoked as ``environment_fn(config=...)``
            that returns the environment instance.
        env_config: environment config dict; must contain "max_step_count".
            It is copied, so the caller's dict is not modified.

    Returns:
        Mean over agents of the per-step average of ``env.last_true_reward``.
    """
    # Prefer the agent count from the config; fall back to the module-level
    # value for configs that do not carry it.
    num_agents = env_config["n_agents"] if "n_agents" in env_config else n_agents

    # Copy before adding 'reward_weights': the config dicts are shared
    # module-level objects that are reused across evaluations.
    env_config = dict(env_config)
    env_config['reward_weights'] = np.array([individual for _ in range(num_agents)])
    env = environment_fn(config=env_config)

    n_steps = env_config["max_step_count"]
    total_true_reward = 0
    env.reset()
    for _ in range(n_steps):
        actions = {
            agent: get_optimal_action(env.reward_weights[agent], env_config)
            for agent in range(num_agents)
        }
        env.step(actions)
        total_true_reward += env.last_true_reward

    # Out-of-place division also works when the rewards are integer arrays.
    return np.mean(total_true_reward / n_steps)

In [None]:
def evaluate_individual_env_rl(individual, environment, env_config):
    """Train a shared PPO policy on one environment and score the individual.

    Every agent is given the same reward weights (``individual``) and the
    same policy. Training restarts with a fresh trainer until the reported
    ``episode_reward_mean`` is positive. The fitness is the mean, over
    agents, of the ``true_reward_agent_<i>_mean`` custom metrics reported by
    the last training iteration.

    Args:
        individual: flat sequence of reward weights shared by all agents.
        environment: environment class; instantiated as
            ``environment(config=...)`` and passed to ``ppo.PPOTrainer``.
        env_config: environment config dict. It is copied, so the caller's
            dict is not modified.

    Returns:
        The mean true reward across agents.
    """
    # Prefer the agent count from the config; fall back to the module-level
    # value for configs that do not carry it.
    num_agents = env_config["n_agents"] if "n_agents" in env_config else n_agents

    # Copy before adding 'reward_weights': the config dicts are shared
    # module-level objects that are reused across evaluations.
    env_config = dict(env_config)
    env_config['reward_weights'] = np.array([individual for _ in range(num_agents)])
    # This instance is only used to read the observation and action spaces.
    env = environment(config=env_config)

    class MyCallbacks(DefaultCallbacks):
        """Accumulates the environment's true reward per episode."""

        def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                             policies: Dict[str, Policy],
                             episode: MultiAgentEpisode, **kwargs):
            episode.user_data["true_rewards"] = np.zeros(num_agents)

        def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                            episode: MultiAgentEpisode, **kwargs):
            # NOTE(review): only the first sub-environment (index 0) is read;
            # this assumes a single environment per worker -- confirm.
            true_reward = base_env.env_states[0].env.last_true_reward
            episode.user_data["true_rewards"] += true_reward

        def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                           policies: Dict[str, Policy],
                           episode: MultiAgentEpisode, **kwargs):
            # RLlib reports each custom metric with a '_mean' suffix, which
            # is what the fitness computation below reads.
            for i, r in enumerate(episode.user_data["true_rewards"]):
                episode.custom_metrics["true_reward_agent_" + str(i)] = r

    # settings for the RL agent trainer
    config = {
        "train_batch_size": TRAIN_BATCH_SIZE,
        "lr": LEARNING_RATE,
        "sgd_minibatch_size": TRAIN_BATCH_SIZE,
        "multiagent": {
            "policies": {
                'agent': (None, env.observation_space, env.action_space, {}),
            },
            # all agents share a policy
            "policy_mapping_fn": lambda agent_id: 'agent',
        },
        "model": {"fcnet_hiddens": []},
        'env_config': env_config,
        "callbacks": MyCallbacks,
    }

    metrics = None
    # Train agent until it does well.
    # NOTE: there is no retry limit; if the reward never becomes positive
    # this loop does not terminate.
    while True:
        trainer = ppo.PPOTrainer(env=environment, config=config)
        try:
            for i in range(N_RL_TRANING):
                metrics = trainer.train()  # distributed training step
                print("episode_reward_mean", metrics["episode_reward_mean"])
                # Give up on this attempt early if it is still losing after
                # a few iterations.
                if metrics["episode_reward_mean"] < 0 and i > 2:
                    break
        finally:
            # Release the trainer's workers; without this every retry would
            # leave the previous trainer's resources allocated.
            trainer.stop()
        if metrics["episode_reward_mean"] > 0:
            break
    print("episode_reward_mean", metrics["episode_reward_mean"])

    custom_metrics = metrics['custom_metrics']
    per_agent_means = [
        custom_metrics['true_reward_agent_' + str(i) + '_mean']
        for i in range(num_agents)
    ]
    true_reward_mean = sum(per_agent_means) / num_agents
    print('Evaluated', individual, 'Fitness', true_reward_mean)
    return true_reward_mean

In [None]:
# Pick the per-environment evaluator: PPO training when EVAL_METHOD is "RL",
# the hand-coded optimal-action evaluator for any other value.
evaluate_individual_env = (
    evaluate_individual_env_rl
    if EVAL_METHOD == "RL"
    else evaluate_individual_env_optimal_act
)
        
def evaluate_individual(individual):
    """Score an individual on every training environment.

    Returns:
        A one-element tuple holding the average of the per-environment
        fitness values over ``training_envs``.
    """
    per_env_scores = [
        evaluate_individual_env(individual, env_cls, env_cfg)
        for env_cls, env_cfg in training_envs
    ]
    return (sum(per_env_scores) / len(training_envs), )


# some setup for Deap
toolbox.register('evaluate', evaluate_individual)
toolbox.register('mate', tools.cxTwoPoint)
# Individuals are real-valued weights in [-1, 1]. tools.mutFlipBit is a
# bit-string operator: it applies `not` to each selected gene, which turns
# every non-zero float into 0.0. Gaussian perturbation is the matching
# operator for float genomes; raise sigma or indpb to add more noise.
toolbox.register('mutate', tools.mutGaussian, mu=0.0, sigma=0.2, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

In [None]:
# pop is a list of individuals; each individual holds the weights of a
# reward function.
pop = toolbox.population(n=POPULATION_SIZE)
# Keeps the 10 best individuals seen so far (they could even be from the
# first round).
hof = tools.HallOfFame(10)

# Per-generation statistics over the individuals' fitness values.
stats = tools.Statistics(lambda ind: ind.fitness.values)
for stat_name, stat_fn in (('avg', np.mean), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
    stats.register(stat_name, stat_fn)

pop, log = algorithms.eaSimple(
    pop, toolbox,
    cxpb=0.5, mutpb=0.2, ngen=N_GEN,
    stats=stats, halloffame=hof, verbose=True,
)

print('pop', pop)

In [None]:
# Take the first hall-of-fame entry and score it on the test environment,
# which is not part of training_envs.
best_individual = hof[0]
print(best_individual)

# test_env is an (environment class, config) pair.
test_reward = evaluate_individual_env(best_individual, *test_env)
print(test_reward)

In [None]:
# Fitness of a fixed, hand-entered weight vector on the training environments.
# NOTE(review): the original author labels this the ideal reward (altruistic
# agent); that label cannot be verified from this notebook alone.
print(evaluate_individual([0.9298461960519508, 0.7449587149229808, 0.4628576259710946, -0.282921307700329, -0.7019321146761455, 0.618488821337605]))  # Ideal reward, altruistic agent

In [None]:
# Fitness of a fixed weight vector with +1 on the first entry and -1 on the
# third and fifth entries.
# NOTE(review): the original author labels this the worst reward (selfish
# agent); presumably the entries map to (agent, variable) pairs -- confirm
# against SimpleEnv.
print(evaluate_individual([1, 0, -1, 0, -1, 0]))  # Worst reward, selfish agent

In [None]:
# NOTE: `start` was taken with time.process_time(), which counts only the CPU
# time of this process and excludes time spent sleeping or waiting. The value
# printed here is therefore not wall-clock time, and presumably omits work
# done in Ray worker processes.
print("Time Spent = ", (time.process_time() - start)/60, " minutes")