In [1]:
# A proof of concept showing using a genetic algorithm with our environment.
# It is similar to https://github.com/DEAP/deap/blob/a0b78956e28387785e3bb6e2b4b1f1b32c2b3883/examples/ga/onemax_short.py

import array
import random

import numpy as np

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
import gym
from environments.SimpleEnv import SimpleEnv

from ray.rllib.agents import ppo
from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
import ray

ray.init()


creator.create('FitnessMax', base.Fitness, weights=(1.0, ))
creator.create('Individual', array.array, typecode='d',
               fitness=creator.FitnessMax)

toolbox = base.Toolbox()

max_action = 5

toolbox.register('attr', random.uniform, -1, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr, 2)
toolbox.register('population', tools.initRepeat, list,
                 toolbox.individual)


def evalOneMax(individual, pop):
    """Runs the environment. It always takes the same action determined by the individual's genes.
  It returns the total reward as the fitness."""
    
    #print(individual, pop)
    individual = np.asarray(individual)
    reward_weights = [individual, np.random.default_rng().choice(pop, 1, replace=False)[0]]
    reward_weights = np.asarray(reward_weights)
    print(individual, reward_weights)
    env = SimpleEnv(config={
        'n_agents': 2,
        'n_vars': 2,
        'reward_weights': reward_weights,
    })
    def on_episode_start(info):
        episode = info["episode"]
        episode.user_data["true_rewards"] = []

    def on_episode_step(info):
        episode = info["episode"]
        env = info["env"]
        true_reward = env.env_states[0].env.last_true_reward
        episode.user_data["true_rewards"].append(true_reward)

    def on_episode_end(info):
        episode = info["episode"]
        true_reward = np.sum(episode.user_data["true_rewards"])
        episode.custom_metrics["true_reward"] = true_reward
        
    trainer = ppo.PPOTrainer(env=SimpleEnv, config={
        "multiagent": {
            "policies": {
                # the first tuple value is None -> uses default policy
                "agent_0": (None, env.observation_space, env.action_space, {}),
                "agent_1": (None, env.observation_space, env.action_space, {}),
            },
            "policy_mapping_fn":
                lambda agent_id:
                    agent_id
        },
        'env_config': {
            'n_agents': 2,
            'n_vars': 2,
            'reward_weights': reward_weights,
        },
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
    })
    for i in range(1):
        print('TRAINING')
        trainer.train()  # distributed training step
    print('true reward', trainer.collect_metrics()['custom_metrics']['true_reward_mean'])
    return (trainer.collect_metrics()['custom_metrics']['true_reward_mean'], )


toolbox.register('evaluate', evalOneMax)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

def evolve(population, toolbox, cxpb, mutpb, ngen, stats=None, 
           halloffame=None, verbose=__debug__):
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind, [invalid_ind] * len(invalid_ind))
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    if halloffame is not None:
        halloffame.update(population)

    record = stats.compile(population) if stats else {}
    logbook.record(gen=0, nevals=len(invalid_ind), **record)
    if verbose:
        print(logbook.stream)

    # Begin the generational process
    for gen in range(1, ngen + 1):
        # Select the next generation individuals
        offspring = toolbox.select(population, len(population))

        # Vary the pool of individuals
        offspring = algorithms.varAnd(offspring, toolbox, cxpb, mutpb)

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind, [invalid_ind] * len(invalid_ind))
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        # Replace the current population by the offspring
        population[:] = offspring

        # Append the current generation statistics to the logbook
        record = stats.compile(population) if stats else {}
        logbook.record(gen=gen, nevals=len(invalid_ind), **record)
        if verbose:
            print(logbook.stream)

    return population, logbook

def main():
    pop = toolbox.population(n=3)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register('avg', np.mean)
    stats.register('std', np.std)
    stats.register('min', np.min)
    stats.register('max', np.max)

    (pop, log) = evolve(
        pop,
        toolbox,
        cxpb=0.5,
        mutpb=0.2,
        ngen=1,
        stats=stats,
        halloffame=hof,
        verbose=True,
        )

    print ('pop', pop)

    return (pop, log, hof)


if __name__ == '__main__':
    main()


  from ._conv import register_converters as _register_converters
2020-04-30 12:02:01,289	INFO resource_spec.py:212 -- Starting Ray with 2.98 GiB memory available for workers and up to 1.5 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-04-30 12:02:01,905	INFO services.py:1148 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
2020-04-30 12:02:02,671	INFO trainer.py:428 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-04-30 12:02:02,820	ERROR syncer.py:39 -- Log sync requires rsync to be installed.
2020-04-30 12:02:02,828	INFO trainer.py:585 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[0.80638477 0.88196682] [[ 0.80638477  0.88196682]
 [ 0.61255572 -0.91280453]]




[2m[36m(pid=5993)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=5994)[0m   from ._conv import register_converters as _register_converters


2020-04-30 12:02:18,630	INFO trainable.py:180 -- _setup took 15.804 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-04-30 12:02:18,631	INFO trainable.py:217 -- Getting current IP.


TRAINING


2020-04-30 12:02:44,969	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


true reward 0.2011278227063567
[ 0.61255572 -0.91280453] [[ 0.61255572 -0.91280453]
 [ 0.80638477  0.88196682]]




[2m[36m(pid=5992)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=5995)[0m   from ._conv import register_converters as _register_converters


2020-04-30 12:03:02,060	INFO trainable.py:180 -- _setup took 17.089 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-04-30 12:03:02,062	INFO trainable.py:217 -- Getting current IP.


TRAINING


2020-04-30 12:03:38,141	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


true reward 0.5502692859196727
[0.23415128 0.63128753] [[0.23415128 0.63128753]
 [0.80638477 0.88196682]]




[2m[36m(pid=6240)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=6241)[0m   from ._conv import register_converters as _register_converters


2020-04-30 12:03:58,217	INFO trainable.py:180 -- _setup took 20.070 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-04-30 12:03:58,222	INFO trainable.py:217 -- Getting current IP.


TRAINING


2020-04-30 12:04:31,543	ERROR syncer.py:39 -- Log sync requires rsync to be installed.


true reward 0.14298566424677875
gen	nevals	avg     	std     	min     	max     
0  	3     	0.298128	0.179864	0.142986	0.550269
[ 0.61255572 -0.91280453] [[ 0.61255572 -0.91280453]
 [ 0.61255572 -0.91280453]]




[2m[36m(pid=6252)[0m   from ._conv import register_converters as _register_converters
[2m[36m(pid=6341)[0m   from ._conv import register_converters as _register_converters


2020-04-30 12:04:50,470	INFO trainable.py:180 -- _setup took 18.920 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-04-30 12:04:50,472	INFO trainable.py:217 -- Getting current IP.


TRAINING
true reward 0.14197522020988607
1  	1     	0.297791	0.180155	0.141975	0.550269
pop [array('d', [0.806384765928241, 0.8819668225125128]), array('d', [0.6125557167934648, -0.9128045263817832]), array('d', [0.6125557167934648, -0.9128045263817832])]


In [3]:
ray.shutdown()

NameError: name 'ray' is not defined