In [1]:
# A proof of concept showing how to use a genetic algorithm with our environment.
# It is similar to https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

import array
import random

import numpy as np
from typing import Dict

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym
from environments.SimpleEnv import SimpleEnv

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray

# Start the local Ray runtime. `ignore_reinit_error=True` keeps this cell
# re-runnable: calling a plain `ray.init()` a second time in the same kernel
# raises instead of reusing the already running instance.
ray.init(ignore_reinit_error=True)

# Seed the Python and NumPy global RNGs so the GA's initial population and its
# selection/crossover/mutation draws are repeatable across kernel restarts.
# NOTE(review): this does not seed the RLlib training itself.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)


# Single-objective fitness that is maximised (weight +1.0).
creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
# An individual is an array of doubles carrying a FitnessMax fitness.
creator.create('Individual', array.array, typecode='d',
               fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Each gene is drawn uniformly from [-1, 1]; an individual has two genes,
# which are used as that agent's reward weights by the evaluation functions.
toolbox.register('attr', random.uniform, -1, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr, 2)
toolbox.register('population', tools.initRepeat, list,
                 toolbox.individual)

# Number of agents in the environment; also used as the GA population size.
n_agents = 3

def _train_and_score(reward_weights, policy_ids, policy_mapping_fn,
                     n_iterations=10):
    """Train PPO on SimpleEnv and return each agent's mean true reward.

    Shared implementation behind `evaluate_population` and
    `evaluate_individual`.

    Args:
        reward_weights: array of per-agent reward weights, passed to the
            environment through its config under 'reward_weights'.
        policy_ids: names of the RLlib policies to create.
        policy_mapping_fn: callable mapping an agent id to one of
            `policy_ids`.
        n_iterations: number of `trainer.train()` calls to run (must be >= 1).

    Returns:
        A list with `n_agents` floats: the 'true_reward_agent_<i>_mean'
        custom metric reported by the final training iteration.

    Raises:
        ValueError: if `n_iterations` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError('n_iterations must be at least 1')

    env_config = {
        'n_agents': n_agents,
        'n_vars': 2,
        'reward_weights': reward_weights,
        'max_step_count': 20,
    }

    # This env instance is only used to read the action and observation spaces.
    # It receives its own copy of the config dict, as in the original code.
    env = SimpleEnv(config=dict(env_config))

    class MyCallbacks(DefaultCallbacks):
        # Callback functions that keep track of the true reward while training.
        def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                             policies: Dict[str, Policy],
                             episode: MultiAgentEpisode, **kwargs):
            # One running total per agent for this episode.
            episode.user_data["true_rewards"] = np.zeros(n_agents)

        def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                            episode: MultiAgentEpisode, **kwargs):
            # Read the true reward of the last step from the first sub-env.
            true_reward = base_env.env_states[0].env.last_true_reward
            episode.user_data["true_rewards"] += true_reward

        def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                           policies: Dict[str, Policy],
                           episode: MultiAgentEpisode, **kwargs):
            # Publish the per-agent totals; RLlib reports them with
            # _mean/_min/_max suffixes (see the printed metrics).
            for i, r in enumerate(episode.user_data["true_rewards"]):
                episode.custom_metrics["true_reward_agent_" + str(i)] = r

    config = {
        "multiagent": {
            "policies": {
                policy_id: (None, env.observation_space, env.action_space, {})
                for policy_id in policy_ids
            },
            "policy_mapping_fn": policy_mapping_fn,
        },
        'env_config': env_config,
        "callbacks": MyCallbacks,
    }
    trainer = ppo.PPOTrainer(env=SimpleEnv, config=config)

    try:
        custom_metrics = {}
        for _ in range(n_iterations):
            # Distributed training step.
            custom_metrics = trainer.train()['custom_metrics']
            print(custom_metrics)
    finally:
        # Release the trainer's workers; without this every fitness
        # evaluation leaves its rollout worker processes behind.
        trainer.stop()

    # Only the final iteration's metrics are used as the score.
    return [
        custom_metrics['true_reward_agent_' + str(agent) + '_mean']
        for agent in range(n_agents)
    ]


def evaluate_population(pop):
    """Evaluate a whole population together in one environment.

    Individual `i` of `pop` supplies the reward weights of agent `i`, and
    every agent trains its own policy.

    Args:
        pop: sequence of exactly `n_agents` individuals.

    Returns:
        A list of 1-tuples, one per individual, holding that agent's mean
        true reward after training (usable as DEAP fitness values).

    Raises:
        ValueError: if `pop` does not contain exactly `n_agents` individuals.
    """
    reward_weights = np.array(pop)
    if len(reward_weights) != n_agents:
        raise ValueError(
            'expected %d individuals (one per agent), got %d'
            % (n_agents, len(reward_weights)))

    policy_ids = ['agent_' + str(agent) for agent in range(n_agents)]
    per_agent_mean = _train_and_score(
        reward_weights,
        policy_ids,
        lambda agent_id: agent_id,  # each agent uses the policy named after it
    )
    return [(mean, ) for mean in per_agent_mean]


def evaluate_individual(individual):
    """Evaluate one individual by giving every agent its reward weights.

    All agents share a single policy.

    Args:
        individual: the reward weights to evaluate.

    Returns:
        A 1-tuple with the true reward averaged over all agents after
        training (usable as a DEAP fitness value).
    """
    # Every agent receives the same reward weights.
    reward_weights = np.array([individual for _ in range(n_agents)])

    per_agent_mean = _train_and_score(
        reward_weights,
        ['agent'],
        lambda agent_id: 'agent',  # all agents share a policy
    )
    return (sum(per_agent_mean) / n_agents, )


# 'evaluate' is intentionally not registered here: the group-selection cell
# registers evaluate_individual itself, and evolve() calls
# evaluate_population directly.
toolbox.register('mate', tools.cxTwoPoint)
# Genes are real-valued, so use Gaussian mutation. The previous mutFlipBit is a
# bit-flip operator: on a float gene it stores float(not gene), i.e. 0.0 or 1.0.
# NOTE(review): sigma=0.2 is a starting point, and mutated genes are not
# clipped back into the initial [-1, 1] range - tune/confirm.
toolbox.register('mutate', tools.mutGaussian, mu=0.0, sigma=0.2, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

def evolve(population, toolbox, cxpb, mutpb, ngen, stats=None, 
           halloffame=None, verbose=__debug__):
    """
    Generational GA loop modelled on deap.algorithms.eaSimple.

    Unlike eaSimple, every individual is re-scored in every generation by a
    single call to evaluate_population on the whole group.

    Returns the final population and the logbook.
    """

    def _score(group):
        # Evaluate the group jointly and store one fitness per individual.
        for member, fitness in zip(group, evaluate_population(group)):
            member.fitness.values = fitness

    def _report(generation, n_evaluated):
        # Compile statistics for the current population and log them.
        record = {}
        if stats:
            record = stats.compile(population)
        logbook.record(gen=generation, nevals=n_evaluated, **record)
        if verbose:
            print(logbook.stream)

    logbook = tools.Logbook()
    extra_fields = stats.fields if stats else []
    logbook.header = ['gen', 'nevals'] + extra_fields

    # Generation 0: score the initial population.
    evaluated = list(population)
    _score(evaluated)

    if halloffame is not None:
        halloffame.update(population)

    _report(0, len(evaluated))

    # Begin the generational process
    generation = 1
    while generation <= ngen:
        print('population', population)

        # Select parents, then apply crossover and mutation.
        parents = toolbox.select(population, len(population))
        offspring = algorithms.varAnd(parents, toolbox, cxpb, mutpb)

        # Score all offspring together.
        evaluated = list(offspring)
        _score(evaluated)

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        # The offspring replace the current population in place.
        population[:] = offspring

        _report(generation, len(evaluated))
        generation += 1

    return population, logbook

  from ._conv import register_converters as _register_converters
2020-05-03 12:44:39,761	INFO resource_spec.py:212 -- Starting Ray with 2.54 GiB memory available for workers and up to 1.29 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-03 12:44:40,107	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


In [2]:
# Group selection: score each individual with evaluate_individual and run
# DEAP's stock eaSimple loop.
pop = toolbox.population(n=n_agents)
toolbox.register('evaluate', evaluate_individual)
hof = tools.HallOfFame(10)

# Summary statistics over the fitness values of each generation.
stats = tools.Statistics(lambda ind: ind.fitness.values)
for stat_name, stat_fn in (('avg', np.mean), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
    stats.register(stat_name, stat_fn)

pop, log = algorithms.eaSimple(
    pop,
    toolbox,
    cxpb=0.5,
    mutpb=0.2,
    ngen=2,
    stats=stats,
    halloffame=hof,
    verbose=True,
)

2020-05-03 12:44:40,806	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-05-03 12:44:40,900	ERROR syncer.py:39 -- Log sync requires rsync to be installed.
2020-05-03 12:44:40,909	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=9114)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=9117)[0m   from ._conv import register_converters as _register_converters


2020-05-03 12:44:47,106	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': 0.18978884825733985, 'true_reward_agent_0_min': -11.674890041351318, 'true_reward_agent_0_max': 18.933333314955235, 'true_reward_agent_1_mean': -0.4147306205322457, 'true_reward_agent_1_min': -12.045711737126112, 'true_reward_agent_1_max': 12.723820120096207, 'true_reward_agent_2_mean': 0.36191094439689553, 'true_reward_agent_2_min': -14.41079644113779, 'true_reward_agent_2_max': 19.053350493311882}
{'true_reward_agent_0_mean': -0.1986860764940502, 'true_reward_agent_0_min': -15.024060606956482, 'true_reward_agent_0_max': 13.53712423890829, 'true_reward_agent_1_mean': -0.7003104172301573, 'true_reward_agent_1_min': -16.76941379904747, 'true_reward_agent_1_max': 19.54326067864895, 'true_reward_agent_2_mean': -2.0313096019920884, 'true_reward_agent_2_min': -15.644575461745262, 'true_reward_agent_2_max': 15.58729462325573}
{'true_reward_agent_0_mean': -2.0014530976825338, 'true_reward_agent_0_min': -13.836990661919117, 'true_reward_agent_0_max': 9.113343521952

2020-05-03 12:47:17,951	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': -5.922253688414294, 'true_reward_agent_0_min': -16.37138692039298, 'true_reward_agent_0_max': 4.950532461516559, 'true_reward_agent_1_mean': -16.681630438000376, 'true_reward_agent_1_min': -27.253180027008057, 'true_reward_agent_1_max': -7.57680525816977, 'true_reward_agent_2_mean': -11.39771924143829, 'true_reward_agent_2_min': -20.634342340752482, 'true_reward_agent_2_max': -2.723449006676674}
[2m[36m(pid=9115)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=9116)[0m   from ._conv import register_converters as _register_converters


2020-05-03 12:47:24,101	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.21029326899414172, 'true_reward_agent_0_min': -12.285366874188185, 'true_reward_agent_0_max': 16.6010437682271, 'true_reward_agent_1_mean': 0.19122196278718093, 'true_reward_agent_1_min': -13.748688347637653, 'true_reward_agent_1_max': 14.761405177414417, 'true_reward_agent_2_mean': 0.03409754443103111, 'true_reward_agent_2_min': -13.871654592454433, 'true_reward_agent_2_max': 16.323897045105696}
{'true_reward_agent_0_mean': 18.110378507060233, 'true_reward_agent_0_min': -28.501447150483727, 'true_reward_agent_0_max': 84.43289574980736, 'true_reward_agent_1_mean': -5.331898848536512, 'true_reward_agent_1_min': -23.44284749031067, 'true_reward_agent_1_max': 23.108754217624664, 'true_reward_agent_2_mean': 11.427101130800146, 'true_reward_agent_2_min': -25.257414028048515, 'true_reward_agent_2_max': 60.03092351555824}
{'true_reward_agent_0_mean': 28.477860109504253, 'true_reward_agent_0_min': -29.607445895671844, 'true_reward_agent_0_max': 98.73897376656532

2020-05-03 12:49:49,910	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': 37.88226355888299, 'true_reward_agent_0_min': -26.087430112063885, 'true_reward_agent_0_max': 94.49110712110996, 'true_reward_agent_1_mean': 3.1386909396797273, 'true_reward_agent_1_min': -19.79840850830078, 'true_reward_agent_1_max': 34.79483203962445, 'true_reward_agent_2_mean': 31.419096953207845, 'true_reward_agent_2_min': -22.12456312775612, 'true_reward_agent_2_max': 78.5134951993823}
[2m[36m(pid=9351)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=9350)[0m   from ._conv import register_converters as _register_converters


2020-05-03 12:49:55,961	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.25357442714099304, 'true_reward_agent_0_min': -13.96676990389824, 'true_reward_agent_0_max': 14.782632521470077, 'true_reward_agent_1_mean': 0.2829695787985111, 'true_reward_agent_1_min': -14.117055165581405, 'true_reward_agent_1_max': 16.691753648221493, 'true_reward_agent_2_mean': -0.04163038854116166, 'true_reward_agent_2_min': -12.15682478249073, 'true_reward_agent_2_max': 15.184949710965157}
{'true_reward_agent_0_mean': 4.634362882149726, 'true_reward_agent_0_min': -16.960823990404606, 'true_reward_agent_0_max': 35.359932268678676, 'true_reward_agent_1_mean': 0.8706147514446638, 'true_reward_agent_1_min': -18.46067025512457, 'true_reward_agent_1_max': 34.83531603962183, 'true_reward_agent_2_mean': -0.9164166976656998, 'true_reward_agent_2_min': -22.04696974158287, 'true_reward_agent_2_max': 20.290313452482224}
{'true_reward_agent_0_mean': 3.7019351542956427, 'true_reward_agent_0_min': -19.36236497014761, 'true_reward_agent_0_max': 48.53447723388672,

2020-05-03 12:52:25,576	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': 1.9738986582279359, 'true_reward_agent_0_min': -20.832955986261368, 'true_reward_agent_0_max': 54.807140827178955, 'true_reward_agent_1_mean': -16.880362026375188, 'true_reward_agent_1_min': -29.505124360322952, 'true_reward_agent_1_max': 45.63651278987527, 'true_reward_agent_2_mean': 4.232172342714803, 'true_reward_agent_2_min': -13.792873591184616, 'true_reward_agent_2_max': 24.29930578172207}
gen	nevals	avg    	std    	min     	max    
0  	3     	3.08491	15.2275	-11.3339	24.1467
[2m[36m(pid=9363)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=9462)[0m   from ._conv import register_converters as _register_converters


2020-05-03 12:52:31,686	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.30241972306524983, 'true_reward_agent_0_min': -13.896786071360111, 'true_reward_agent_0_max': 13.630845069885254, 'true_reward_agent_1_mean': 0.6428976068622433, 'true_reward_agent_1_min': -19.827843889594078, 'true_reward_agent_1_max': 14.509237967431545, 'true_reward_agent_2_mean': -0.1772894032009208, 'true_reward_agent_2_min': -12.189038844779134, 'true_reward_agent_2_max': 23.95267380774021}
{'true_reward_agent_0_mean': 3.1929477456660242, 'true_reward_agent_0_min': -24.834673024713993, 'true_reward_agent_0_max': 81.02191178500652, 'true_reward_agent_1_mean': -1.2830997437402403, 'true_reward_agent_1_min': -24.38797217607498, 'true_reward_agent_1_max': 55.90010266751051, 'true_reward_agent_2_mean': 5.672376704898852, 'true_reward_agent_2_min': -51.32901482284069, 'true_reward_agent_2_max': 42.20939818024635}
{'true_reward_agent_0_mean': -12.879729356165917, 'true_reward_agent_0_min': -25.448423206806183, 'true_reward_agent_0_max': 84.24664422869682,

2020-05-03 12:54:56,287	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': -17.19477107252318, 'true_reward_agent_0_min': -35.03441867232323, 'true_reward_agent_0_max': 61.716988891363144, 'true_reward_agent_1_mean': -14.642290729974848, 'true_reward_agent_1_min': -34.25021241605282, 'true_reward_agent_1_max': 47.69436630606651, 'true_reward_agent_2_mean': 23.85136944721271, 'true_reward_agent_2_min': -30.557699874043465, 'true_reward_agent_2_max': 45.52760075032711}
[2m[36m(pid=9541)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=9542)[0m   from ._conv import register_converters as _register_converters


2020-05-03 12:55:02,614	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.1633284401099081, 'true_reward_agent_0_min': -17.062376998364925, 'true_reward_agent_0_max': 16.88639849051833, 'true_reward_agent_1_mean': 0.2473616630930337, 'true_reward_agent_1_min': -12.864770494401455, 'true_reward_agent_1_max': 18.931149289011955, 'true_reward_agent_2_mean': 0.1776803478381771, 'true_reward_agent_2_min': -13.266850480809808, 'true_reward_agent_2_max': 16.609469570219517}
{'true_reward_agent_0_mean': -1.0906223294431039, 'true_reward_agent_0_min': -22.010109370574355, 'true_reward_agent_0_max': 43.09563681483269, 'true_reward_agent_1_mean': 3.2763092634048006, 'true_reward_agent_1_min': -17.95288346707821, 'true_reward_agent_1_max': 24.856494599953294, 'true_reward_agent_2_mean': 2.144435331440054, 'true_reward_agent_2_min': -18.326523691415787, 'true_reward_agent_2_max': 22.407754676416516}
{'true_reward_agent_0_mean': -4.4395671349557775, 'true_reward_agent_0_min': -27.562517369784473, 'true_reward_agent_0_max': 44.62791334465146

In [2]:
# Population evaluation: run the custom evolve() loop, which scores the whole
# population together through evaluate_population.
pop = toolbox.population(n=n_agents)
hof = tools.HallOfFame(10)

# Summary statistics over the fitness values of each generation.
stats = tools.Statistics(lambda ind: ind.fitness.values)
for stat_name, stat_fn in (('avg', np.mean), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
    stats.register(stat_name, stat_fn)

pop, log = evolve(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=3,
                  stats=stats, halloffame=hof, verbose=True)

print('pop', pop)

2020-05-02 19:44:09,694	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-05-02 19:44:09,789	ERROR syncer.py:39 -- Log sync requires rsync to be installed.
2020-05-02 19:44:09,798	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=17780)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=17781)[0m   from ._conv import register_converters as _register_converters


2020-05-02 19:44:29,038	INFO trainable.py:180 -- _setup took 19.245 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-02 19:44:29,039	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.03672808718911256, 'true_reward_agent_0_min': -14.902736930991523, 'true_reward_agent_0_max': 14.684467680752277, 'true_reward_agent_1_mean': 0.10257448901487805, 'true_reward_agent_1_min': -13.657341606914997, 'true_reward_agent_1_max': 21.018747905269265, 'true_reward_agent_2_mean': 0.01720135783081787, 'true_reward_agent_2_min': -13.558223649859428, 'true_reward_agent_2_max': 16.660553514957428}
{'true_reward_agent_0_mean': -3.3104821921887524, 'true_reward_agent_0_min': -22.918064691126347, 'true_reward_agent_0_max': 37.737172320485115, 'true_reward_agent_1_mean': -0.05132808728485998, 'true_reward_agent_1_min': -16.546868479810655, 'true_reward_agent_1_max': 19.297310791909695, 'true_reward_agent_2_mean': -3.0512137598750524, 'true_reward_agent_2_min': -15.538922376930714, 'true_reward_agent_2_max': 16.56323118507862}
{'true_reward_agent_0_mean': -7.628187831962751, 'true_reward_agent_0_min': -26.3912685662508, 'true_reward_agent_0_max': 46.75625977

2020-05-02 19:46:50,013	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': -13.463509728138398, 'true_reward_agent_0_min': -31.726001374423504, 'true_reward_agent_0_max': 44.61888311430812, 'true_reward_agent_1_mean': -8.511883776946451, 'true_reward_agent_1_min': -28.87863167375326, 'true_reward_agent_1_max': 21.5758962277323, 'true_reward_agent_2_mean': -4.653170590403096, 'true_reward_agent_2_min': -22.34093351662159, 'true_reward_agent_2_max': 24.591818057000637}
gen	nevals	avg     	std    	min     	max     
0  	3     	-8.87619	3.60602	-13.4635	-4.65317
population [array('d', [-0.8179451046903832, 0.8381507555515655]), array('d', [-0.8255587441221259, 0.6138589995044808]), array('d', [0.5042140939425694, -0.3126186444144592])]
[2m[36m(pid=17782)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=17783)[0m   from ._conv import register_converters as _register_converters


2020-05-02 19:47:08,882	INFO trainable.py:180 -- _setup took 18.868 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-02 19:47:08,882	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.11330947479291353, 'true_reward_agent_0_min': -13.030011162161827, 'true_reward_agent_0_max': 17.891326010227203, 'true_reward_agent_1_mean': -0.27129522192903094, 'true_reward_agent_1_min': -12.189794339239597, 'true_reward_agent_1_max': 18.658127814531326, 'true_reward_agent_2_mean': 0.17577213245342135, 'true_reward_agent_2_min': -17.081376053392887, 'true_reward_agent_2_max': 20.274739645421505}
{'true_reward_agent_0_mean': -1.0008624063792013, 'true_reward_agent_0_min': -19.85705430060625, 'true_reward_agent_0_max': 19.455532629042864, 'true_reward_agent_1_mean': 1.0167946666359784, 'true_reward_agent_1_min': -24.610318889841437, 'true_reward_agent_1_max': 41.40404460579157, 'true_reward_agent_2_mean': 1.8600263651762907, 'true_reward_agent_2_min': -18.970519348978996, 'true_reward_agent_2_max': 21.800836976617575}
{'true_reward_agent_0_mean': -2.055206073766649, 'true_reward_agent_0_min': -25.561499568633735, 'true_reward_agent_0_max': 23.711720287

2020-05-02 19:49:28,985	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': -7.197712974591486, 'true_reward_agent_0_min': -30.207335256040096, 'true_reward_agent_0_max': 29.585197642445564, 'true_reward_agent_1_mean': 2.6962196839671377, 'true_reward_agent_1_min': -34.793314695358276, 'true_reward_agent_1_max': 52.56577454507351, 'true_reward_agent_2_mean': 0.3625911633353644, 'true_reward_agent_2_min': -27.642026364803314, 'true_reward_agent_2_max': 35.475639291107655}
1  	3     	-1.37963	4.22287	-7.19771	2.69622 
population [array('d', [-0.8255587441221259, -0.3126186444144592]), array('d', [0.5042140939425694, 0.6138589995044808]), array('d', [0.5042140939425694, -0.3126186444144592])]
[2m[36m(pid=18047)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=18048)[0m   from ._conv import register_converters as _register_converters


2020-05-02 19:49:48,195	INFO trainable.py:180 -- _setup took 19.209 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-02 19:49:48,195	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.10516482778166392, 'true_reward_agent_0_min': -13.142949096858501, 'true_reward_agent_0_max': 18.579245075583458, 'true_reward_agent_1_mean': 0.3342458797002291, 'true_reward_agent_1_min': -13.988721085712314, 'true_reward_agent_1_max': 17.159135138615966, 'true_reward_agent_2_mean': -0.14160609966904303, 'true_reward_agent_2_min': -14.808785647153854, 'true_reward_agent_2_max': 17.225516138598323}
{'true_reward_agent_0_mean': 0.3255056980250447, 'true_reward_agent_0_min': -16.705914959311485, 'true_reward_agent_0_max': 21.934297181665897, 'true_reward_agent_1_mean': -0.30527540663162656, 'true_reward_agent_1_min': -12.476791235327255, 'true_reward_agent_1_max': 14.820636610034853, 'true_reward_agent_2_mean': -0.3281357921641029, 'true_reward_agent_2_min': -11.425935875624418, 'true_reward_agent_2_max': 19.170060358941555}
{'true_reward_agent_0_mean': 5.803816202782018, 'true_reward_agent_0_min': -22.7904114946723, 'true_reward_agent_0_max': 37.749969556

2020-05-02 19:52:07,525	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': 8.915034366304807, 'true_reward_agent_0_min': -27.971521943807602, 'true_reward_agent_0_max': 50.16226092353463, 'true_reward_agent_1_mean': -2.7409492544754177, 'true_reward_agent_1_min': -17.782050577225164, 'true_reward_agent_1_max': 14.182712577283382, 'true_reward_agent_2_mean': -2.691558982191964, 'true_reward_agent_2_min': -15.104011187329888, 'true_reward_agent_2_max': 16.981375370174646}
2  	3     	1.16084 	5.48308	-2.74095	8.91503 
population [array('d', [0.5042140939425694, -0.3126186444144592]), array('d', [-0.8255587441221259, 0.6138589995044808]), array('d', [0.5042140939425694, 0.6138589995044808])]
[2m[36m(pid=18058)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=18136)[0m   from ._conv import register_converters as _register_converters


2020-05-02 19:52:26,598	INFO trainable.py:180 -- _setup took 19.072 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-02 19:52:26,599	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': 0.09126990698758164, 'true_reward_agent_0_min': -14.828576222062111, 'true_reward_agent_0_max': 18.506440553814173, 'true_reward_agent_1_mean': 0.24993146612294367, 'true_reward_agent_1_min': -14.371537514030933, 'true_reward_agent_1_max': 20.74822475016117, 'true_reward_agent_2_mean': -0.1708565825290134, 'true_reward_agent_2_min': -11.774140860885382, 'true_reward_agent_2_max': 15.316454976797104}
{'true_reward_agent_0_mean': -0.8091277288935089, 'true_reward_agent_0_min': -17.74844066798687, 'true_reward_agent_0_max': 16.605887584388256, 'true_reward_agent_1_mean': 6.517130101349685, 'true_reward_agent_1_min': -13.541281402111053, 'true_reward_agent_1_max': 38.81933831423521, 'true_reward_agent_2_mean': -1.4573186728635483, 'true_reward_agent_2_min': -16.77055600658059, 'true_reward_agent_2_max': 17.80556874105241}
{'true_reward_agent_0_mean': -2.595669973913573, 'true_reward_agent_0_min': -15.56043853238225, 'true_reward_agent_0_max': 16.878474798053503

In [2]:
# Sanity check: these reward weights should be very close to the true reward
# function, so a high fitness is expected.
# `evaluate` is not defined in this notebook (it only worked through leftover
# kernel state); the population-level evaluator is evaluate_population.
pop = [[1, -1], [1, -1], [1, -1]]
print(evaluate_population(pop))

2020-05-02 15:02:15,714	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-05-02 15:02:15,788	ERROR syncer.py:39 -- Log sync requires rsync to be installed.
2020-05-02 15:02:15,798	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=10479)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=10478)[0m   from ._conv import register_converters as _register_converters


2020-05-02 15:02:34,721	INFO trainable.py:180 -- _setup took 18.933 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-02 15:02:34,722	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': 0.06309596741950373, 'true_reward_agent_0_min': -15.793003261089325, 'true_reward_agent_0_max': 20.82217174768448, 'true_reward_agent_1_mean': -0.07088532369147288, 'true_reward_agent_1_min': -13.382614257046953, 'true_reward_agent_1_max': 17.442511551082134, 'true_reward_agent_2_mean': -0.10051303304877365, 'true_reward_agent_2_min': -13.546605411916971, 'true_reward_agent_2_max': 14.234586730599403}
{'true_reward_agent_0_mean': -1.4887058582130954, 'true_reward_agent_0_min': -17.87443380570039, 'true_reward_agent_0_max': 16.0120142782107, 'true_reward_agent_1_mean': 0.18680497107047814, 'true_reward_agent_1_min': -22.44005953706801, 'true_reward_agent_1_max': 22.56702247262001, 'true_reward_agent_2_mean': 0.589440964944854, 'true_reward_agent_2_min': -14.150779940187931, 'true_reward_agent_2_max': 23.312319873366505}
{'true_reward_agent_0_mean': -2.0609699610471353, 'true_reward_agent_0_min': -21.483412104658782, 'true_reward_agent_0_max': 18.646432980895

In [3]:
# Same sanity check with the reward weights negated.
# NOTE(review): presumably this should score worse than [1, -1] - confirm.
# Calls evaluate_population because `evaluate` is not defined in this notebook.
pop = [[-1, 1], [-1, 1], [-1, 1]]
print(evaluate_population(pop))

2020-05-02 15:04:51,773	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


[2m[36m(pid=10476)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=10477)[0m   from ._conv import register_converters as _register_converters


2020-05-02 15:05:10,393	INFO trainable.py:180 -- _setup took 18.617 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-02 15:05:10,394	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.11662918505806374, 'true_reward_agent_0_min': -15.453765712678432, 'true_reward_agent_0_max': 16.0421684384346, 'true_reward_agent_1_mean': 0.5692989542410942, 'true_reward_agent_1_min': -12.483187314122915, 'true_reward_agent_1_max': 15.38494049012661, 'true_reward_agent_2_mean': 0.14381846922395197, 'true_reward_agent_2_min': -14.652116991579533, 'true_reward_agent_2_max': 19.261050276458263}
{'true_reward_agent_0_mean': -0.3484838804654737, 'true_reward_agent_0_min': -13.308767020702362, 'true_reward_agent_0_max': 13.805523544549942, 'true_reward_agent_1_mean': -0.2733594095386252, 'true_reward_agent_1_min': -16.898636043071747, 'true_reward_agent_1_max': 19.101291661150753, 'true_reward_agent_2_mean': 0.3308985035362821, 'true_reward_agent_2_min': -12.60540136974305, 'true_reward_agent_2_max': 16.61903390288353}
{'true_reward_agent_0_mean': -1.3687531748475157, 'true_reward_agent_0_min': -15.67809831816703, 'true_reward_agent_0_max': 25.7463867664337