In [1]:
# A proof of concept showing using a genetic algorithm with our environment.
# It is similar to https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

import array
import random

import numpy as np

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym
from environments.SimpleEnv import SimpleEnv

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray

ray.init()


creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
creator.create('Individual', array.array, typecode='d',
               fitness=creator.FitnessMax)

toolbox = base.Toolbox()

toolbox.register('attr', random.uniform, -1, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr, 2)
toolbox.register('population', tools.initRepeat, list,
                 toolbox.individual)

n_agents = 3

def evaluate(pop):
    """Runs the environment. Selects random agents from pop.
  It returns the total true reward as the fitness.
  """
    
    #Select random individuals from pop and create the reward weights
    pop = np.array(pop)
    reward_weights = pop
    #print(individual, reward_weights)
    
    #env is only to get action space and observation space
    env = SimpleEnv(config={
        'n_agents': n_agents,
        'n_vars': 2,
        'reward_weights': reward_weights,
        'max_step_count': 20,
    })
    
    #Callback functions to keep track of true reward while training
    def on_episode_start(info):
        episode = info["episode"]
        episode.user_data["true_rewards"] = np.zeros(n_agents)#TODO: change to n_agents

    def on_episode_step(info):
        episode = info["episode"]
        env = info["env"]
        #print(env.env_states[0].env.last_true_reward)
        true_reward = env.env_states[0].env.last_true_reward
        episode.user_data["true_rewards"] += true_reward

    def on_episode_end(info):
        episode = info["episode"]
        true_reward = episode.user_data["true_rewards"]
        for i, r in enumerate(true_reward):
            episode.custom_metrics["true_reward_agent_" + str(i)] = r
            
    config={
        "multiagent": {
            "policies": {
            },
            "policy_mapping_fn":
                lambda agent_id:
                    agent_id
        },
        'env_config': {
            'n_agents': n_agents,
            'n_vars': 2,
            'reward_weights': reward_weights,
            'max_step_count': 20,
        },
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
    }
    for i in range(n_agents):
        config['multiagent']['policies']['agent_' + str(i)] = (None, env.observation_space, env.action_space, {})
    trainer = ppo.PPOTrainer(env=SimpleEnv, config=config)
    
    true_reward_mean = []
    for i in range(10):
        #print('TRAINING', i)
        custom_metrics = trainer.train()['custom_metrics']  # distributed training step
        print(custom_metrics)
        for i in range(n_agents):
            true_reward_mean.append((custom_metrics['true_reward_agent_' + str(i) + '_mean'], ))
        
    #print('true reward', trainer.collect_metrics()['custom_metrics']['true_reward_mean'])
    return true_reward_mean


toolbox.register('evaluate', evaluate)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

def evolve(population, toolbox, cxpb, mutpb, ngen, stats=None, 
           halloffame=None, verbose=__debug__):
    """
    Almost identical to deap.algorithms.eaSimple. 
    
    TODO: Sometimes all individuals become identical for some reason
    Runtime is O(pop size * ngen * RL iterations) with a huge constant. 
    Easily parralizable, but we can remove the pop size factor by training the
    entire population at once. Requires environments to support many agents.
    """
    
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in population]
    #print('population1', population)
    fitnesses = evaluate(invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    #print('population2', population)
    if halloffame is not None:
        halloffame.update(population)

    record = stats.compile(population) if stats else {}
    logbook.record(gen=0, nevals=len(invalid_ind), **record)
    if verbose:
        print(logbook.stream)

    # Begin the generational process
    for gen in range(1, ngen + 1):
        # Select the next generation individuals
        print('population', population)
        offspring = toolbox.select(population, len(population))

        # Vary the pool of individuals
        offspring = algorithms.varAnd(offspring, toolbox, cxpb, mutpb)

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring]
        fitnesses = evaluate(invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        # Replace the current population by the offspring
        population[:] = offspring

        # Append the current generation statistics to the logbook
        record = stats.compile(population) if stats else {}
        logbook.record(gen=gen, nevals=len(invalid_ind), **record)
        if verbose:
            print(logbook.stream)

    return population, logbook

def main():
    pop = toolbox.population(n=n_agents)
    hof = tools.HallOfFame(10)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register('avg', np.mean)
    stats.register('std', np.std)
    stats.register('min', np.min)
    stats.register('max', np.max)

    (pop, log) = evolve(
        pop,
        toolbox,
        cxpb=0.5,
        mutpb=0.2,
        ngen=3,
        stats=stats,
        halloffame=hof,
        verbose=True,
        )

    print ('pop', pop)

    return (pop, log, hof)


if __name__ == '__main__':
    main()


  from ._conv import register_converters as _register_converters
2020-05-01 22:32:59,129	INFO resource_spec.py:212 -- Starting Ray with 2.93 GiB memory available for workers and up to 1.47 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-01 22:32:59,726	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m
2020-05-01 22:33:00,000	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-05-01 22:33:00,137	ERROR syncer.py:39 -- Log sync requires rsync to be installed.
2020-05-01 22:33:00,151	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=31421)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=31423)[0m   from ._conv import register_converters as _register_converters


2020-05-01 22:33:30,705	INFO trainable.py:180 -- _setup took 30.565 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-01 22:33:30,705	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.15498154801047348, 'true_reward_agent_0_min': -12.244781985878944, 'true_reward_agent_0_max': 18.493033677339554, 'true_reward_agent_1_mean': -0.43743399197774124, 'true_reward_agent_1_min': -12.707408532500267, 'true_reward_agent_1_max': 18.821210399270058, 'true_reward_agent_2_mean': 0.20345525578581147, 'true_reward_agent_2_min': -12.735222198069096, 'true_reward_agent_2_max': 14.891411125659943}
{'true_reward_agent_0_mean': -1.0045145776376512, 'true_reward_agent_0_min': -15.839915418997407, 'true_reward_agent_0_max': 16.55339601635933, 'true_reward_agent_1_mean': -0.7546000459776314, 'true_reward_agent_1_min': -19.996425203979015, 'true_reward_agent_1_max': 15.603810021653771, 'true_reward_agent_2_mean': 2.8176268869128127, 'true_reward_agent_2_min': -22.855759439989924, 'true_reward_agent_2_max': 23.374390348792076}
{'true_reward_agent_0_mean': -1.6934921706670139, 'true_reward_agent_0_min': -15.351007277145982, 'true_reward_agent_0_max': 13.869446

2020-05-01 22:36:57,386	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': -0.6554335779178291, 'true_reward_agent_0_min': -23.78200162947178, 'true_reward_agent_0_max': 34.85212025325745, 'true_reward_agent_1_mean': 1.6161983571203018, 'true_reward_agent_1_min': -23.483320073690265, 'true_reward_agent_1_max': 29.363411754369736, 'true_reward_agent_2_mean': 2.6297184188080793, 'true_reward_agent_2_min': -44.053634223062545, 'true_reward_agent_2_max': 41.701861783862114}
gen	nevals	avg      	std     	min      	max     
0  	3     	-0.129653	0.262254	-0.437434	0.203455
population [array('d', [0.15639472994139103, 0.16183220268031784]), array('d', [0.9149542711414311, 0.15902652261354433]), array('d', [0.6502595333470527, -0.027230239015153757])]
[2m[36m(pid=31420)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=31422)[0m   from ._conv import register_converters as _register_converters


2020-05-01 22:37:21,863	INFO trainable.py:180 -- _setup took 24.476 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-01 22:37:21,864	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.33712740884613596, 'true_reward_agent_0_min': -14.500387288630009, 'true_reward_agent_0_max': 17.22421921789646, 'true_reward_agent_1_mean': 0.0629927469122049, 'true_reward_agent_1_min': -13.897988080978394, 'true_reward_agent_1_max': 15.080430198460817, 'true_reward_agent_2_mean': 0.4197854024091066, 'true_reward_agent_2_min': -11.81008056551218, 'true_reward_agent_2_max': 15.663599155843258}
{'true_reward_agent_0_mean': -1.9234273335711258, 'true_reward_agent_0_min': -14.809092968702316, 'true_reward_agent_0_max': 14.57370449602604, 'true_reward_agent_1_mean': 1.0340313626231727, 'true_reward_agent_1_min': -16.60789081454277, 'true_reward_agent_1_max': 23.63104624301195, 'true_reward_agent_2_mean': 1.6060643242339574, 'true_reward_agent_2_min': -23.326547347009182, 'true_reward_agent_2_max': 41.42121014744043}
{'true_reward_agent_0_mean': -1.729993019135991, 'true_reward_agent_0_min': -18.649232824798673, 'true_reward_agent_0_max': 16.689207445830107,

2020-05-01 22:40:27,150	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': -1.445010737082157, 'true_reward_agent_0_min': -19.428084559738636, 'true_reward_agent_0_max': 21.685091391205788, 'true_reward_agent_1_mean': -2.6868347839475426, 'true_reward_agent_1_min': -19.13734645396471, 'true_reward_agent_1_max': 14.478128192946315, 'true_reward_agent_2_mean': 6.209941062986036, 'true_reward_agent_2_min': -23.823339220136404, 'true_reward_agent_2_max': 56.70186144113541}
1  	3     	0.0485502	0.309177	-0.337127	0.419785
population [array('d', [0.6502595333470527, -0.027230239015153757]), array('d', [0.15639472994139103, 0.16183220268031784]), array('d', [0.6502595333470527, -0.027230239015153757])]
[2m[36m(pid=31733)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=31734)[0m   from ._conv import register_converters as _register_converters


2020-05-01 22:40:51,560	INFO trainable.py:180 -- _setup took 24.410 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-01 22:40:51,561	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.04478343224072887, 'true_reward_agent_0_min': -13.979901686310768, 'true_reward_agent_0_max': 18.157632388174534, 'true_reward_agent_1_mean': 0.12055373292751029, 'true_reward_agent_1_min': -11.513849252020009, 'true_reward_agent_1_max': 15.430780403316021, 'true_reward_agent_2_mean': 0.013783042635041057, 'true_reward_agent_2_min': -12.711527675390244, 'true_reward_agent_2_max': 15.627353608608246}
{'true_reward_agent_0_mean': -1.1783790772179783, 'true_reward_agent_0_min': -17.25314173847437, 'true_reward_agent_0_max': 15.37072753161192, 'true_reward_agent_1_mean': 0.15802446559478994, 'true_reward_agent_1_min': -14.299021059647202, 'true_reward_agent_1_max': 22.60475954413414, 'true_reward_agent_2_mean': 11.07883165527419, 'true_reward_agent_2_min': -15.960695691406727, 'true_reward_agent_2_max': 47.88431499898434}
{'true_reward_agent_0_mean': 1.0486686665261369, 'true_reward_agent_0_min': -17.015883177518845, 'true_reward_agent_0_max': 21.29966908693

2020-05-01 22:43:58,313	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


{'true_reward_agent_0_mean': 1.353908838241923, 'true_reward_agent_0_min': -19.712898099794984, 'true_reward_agent_0_max': 23.123037293553352, 'true_reward_agent_1_mean': 5.228581130998609, 'true_reward_agent_1_min': -17.927971802651882, 'true_reward_agent_1_max': 33.88634682446718, 'true_reward_agent_2_mean': 25.440826316599196, 'true_reward_agent_2_min': -10.491198629140854, 'true_reward_agent_2_max': 58.54375920817256}
2  	3     	0.0298511	0.0684482	-0.0447834	0.120554
population [array('d', [0.6502595333470527, -0.027230239015153757]), array('d', [0.6502595333470527, -0.027230239015153757]), array('d', [0.6502595333470527, -0.027230239015153757])]
[2m[36m(pid=31745)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=31878)[0m   from ._conv import register_converters as _register_converters


2020-05-01 22:44:21,628	INFO trainable.py:180 -- _setup took 23.314 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-05-01 22:44:21,629	INFO trainable.py:217 -- Getting current IP.


{'true_reward_agent_0_mean': -0.14712155736582644, 'true_reward_agent_0_min': -14.247595623135567, 'true_reward_agent_0_max': 15.785346211865544, 'true_reward_agent_1_mean': -0.008224893949081888, 'true_reward_agent_1_min': -12.597308412194252, 'true_reward_agent_1_max': 16.043423749506474, 'true_reward_agent_2_mean': 0.3228843667276669, 'true_reward_agent_2_min': -11.392032355070114, 'true_reward_agent_2_max': 17.137738455086946}
{'true_reward_agent_0_mean': 0.6592773099648912, 'true_reward_agent_0_min': -15.3502267729491, 'true_reward_agent_0_max': 22.882493942975998, 'true_reward_agent_1_mean': 1.5463325516456097, 'true_reward_agent_1_min': -17.01872193813324, 'true_reward_agent_1_max': 23.533475756645203, 'true_reward_agent_2_mean': -0.1336842079469352, 'true_reward_agent_2_min': -19.63633221387863, 'true_reward_agent_2_max': 32.15635349415243}
{'true_reward_agent_0_mean': -2.9070146001645116, 'true_reward_agent_0_min': -16.819582130759954, 'true_reward_agent_0_max': 13.73189226910

In [None]:
#I had problems with rllib running out of memory after running many tests. Installing the ray nightly build fixed it
ray.shutdown()