From 56795efe14cdf12c5aef02ac8582e545feb722d9 Mon Sep 17 00:00:00 2001
From: pekka
Date: Fri, 17 Nov 2017 11:54:51 +0700
Subject: [PATCH] update README and add main-method

---
 README.md    |  28 ++++++--
 run_agent.py | 183 ++++++++++++++++++++++++++-------------------------
 2 files changed, 117 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
index e143281..554529b 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,12 @@
 This project implements the FullyConv reinforcement learning agent for [pysc2](https://github.com/deepmind/pysc2/)
 as specified in https://deepmind.com/documents/110/sc2le.pdf.
 
+It's possible to use
+- [A2C](https://blog.openai.com/baselines-acktr-a2c/), which is a synchronous version of the [A3C](https://arxiv.org/abs/1602.01783) used in the deepmind paper
+- [PPO](https://arxiv.org/pdf/1707.06347.pdf) (Proximal Policy Optimization)
+
 Differences to the deepmind spec:
-- Use [A2C](https://blog.openai.com/baselines-acktr-a2c/) instead of [A3C](https://arxiv.org/abs/1602.01783)
+- Use A2C or PPO instead of A3C
 - The non-spatial feature-vector is discarded here. (Probably because of this can't learn CollectMineralsAndGas)
 - There are some other minor simplifaction to the observation space
 - Use different hyper-parameters
@@ -22,6 +26,7 @@ Differences to the deepmind spec:
 - And maybe others that I don't know of
 
 ### Results
+The results here were obtained with A2C. See the A2C vs PPO comparison below.
 
@@ -80,16 +85,31 @@
 Might be that at least hyperparameters here are off (and possibly other things).
 Other environments seem more stable.
+The training was done using one core of a Tesla K80 GPU per environment.
 
 ### How to run
-`python run_a2c.py --map_name MoveToBeacon --model_name my_beacon_model --n_envs 32`
+`python run_agent.py --map_name MoveToBeacon --model_name my_beacon_model --n_envs 32`
 
 This will save
 - tf summaries to `_files/summaries/my_beacon_model/`
 - model to `_files/models/my_beacon_model`
 
-relative to the project path. See `run_a2c.py` for more arguments.
+relative to the project path.
+A2C is used by default. To run PPO, specify `--agent_mode ppo`.
+
+See `run_agent.py` for more arguments.
+
+### A2C vs PPO
+The PPO implementation here gives very similar scores to A2C.
+With PPO, training seems to take considerably longer but is more stable.
+However, I don't have enough computational power to verify this rigorously.
+It's also possible that the PPO parameters here are totally off.
+
+
+Example of a training graph using PPO.
+The typical sigmoid shape seen in A2C training doesn't appear.
+Similar behaviour is observed in other environments.
 
 ### Requirements
 - Python 3 (will NOT work with python 2)
@@ -102,5 +122,5 @@ Let me know if there are issues.
 
 ### References
 I have borrowed some ideas from https://github.com/xhujoy/pysc2-agents (FullyConv-network etc.)
-and [Open AI's baselines](https://github.com/openai/baselines/) but the implementation here is different from those.
+and [OpenAI's baselines](https://github.com/openai/baselines/) (A2C and PPO), but the implementation here is different from those.
 For parallel environments using the code from baselines adapted for sc2.
\ No newline at end of file
diff --git a/run_agent.py b/run_agent.py
index f40bedf..9016c66 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -44,8 +44,7 @@
 FLAGS(sys.argv)
 
-# TODO below it gets little messy with the folders, maybe do something more clever
-
+# TODO: this runner is maybe too long and too messy.
 full_chekcpoint_path = os.path.join(FLAGS.checkpoint_path, FLAGS.model_name)
 
 if FLAGS.training:
@@ -63,106 +62,110 @@ def check_and_handle_existing_folder(f):
             raise Exception("folder %s already exists" % f)
 
 
-if FLAGS.training:
-    check_and_handle_existing_folder(full_chekcpoint_path)
-    check_and_handle_existing_folder(full_summary_path)
-
-env_args = dict(
-    map_name=FLAGS.map_name,
-    step_mul=FLAGS.step_mul,
-    game_steps_per_episode=0,
-    screen_size_px=(FLAGS.resolution,) * 2,
-    minimap_size_px=(FLAGS.resolution,) * 2,
-    visualize=FLAGS.visualize
-)
-
-envs = SubprocVecEnv((partial(make_sc2env, **env_args),) * FLAGS.n_envs)
-# envs = SingleEnv(make_sc2env(**env_args))
-
-tf.reset_default_graph()
-sess = tf.Session()
-
-agent = ActorCriticAgent(
-    mode=FLAGS.agent_mode,
-    sess=sess,
-    spatial_dim=FLAGS.resolution,
-    unit_type_emb_dim=5,
-    loss_value_weight=FLAGS.loss_value_weight,
-    entropy_weight_action_id=FLAGS.entropy_weight_action,
-    entropy_weight_spatial=FLAGS.entropy_weight_spatial,
-    scalar_summary_freq=FLAGS.scalar_summary_freq,
-    all_summary_freq=FLAGS.all_summary_freq,
-    summary_path=full_summary_path,
-    max_gradient_norm=FLAGS.max_gradient_norm
-)
-
-agent.build_model()
-if os.path.exists(full_chekcpoint_path):
-    agent.load(full_chekcpoint_path)
-else:
-    agent.init()
-
-if FLAGS.n_steps_per_batch is None:
-    n_steps_per_batch = 128 if FLAGS.agent_mode == ACMode.PPO else 8
-else:
-    n_steps_per_batch = FLAGS.n_steps_per_batch
-
-if FLAGS.agent_mode == ACMode.PPO:
-    ppo_par = PPORunParams(
-        FLAGS.ppo_lambda,
-        batch_size=FLAGS.ppo_batch_size or n_steps_per_batch,
-        n_epochs=FLAGS.ppo_epochs
-    )
-else:
-    ppo_par = None
-
-runner = Runner(
-    envs=envs,
-    agent=agent,
-    discount=FLAGS.discount,
-    n_steps=n_steps_per_batch,
-    do_training=FLAGS.training,
-    ppo_par=ppo_par
-)
-
-runner.reset()
-
-if FLAGS.K_batches >= 0:
-    n_batches = FLAGS.K_batches * 1000
-else:
-    n_batches = -1
-
-
 def _print(i):
     print(datetime.now())
     print("# batch %d" % i)
     sys.stdout.flush()
 
 
-def _save_if_training():
+def _save_if_training(agent):
     if FLAGS.training:
         agent.save(full_chekcpoint_path)
         agent.flush_summaries()
         sys.stdout.flush()
 
 
-i = 0
+def main():
+    if FLAGS.training:
+        check_and_handle_existing_folder(full_chekcpoint_path)
+        check_and_handle_existing_folder(full_summary_path)
+
+    env_args = dict(
+        map_name=FLAGS.map_name,
+        step_mul=FLAGS.step_mul,
+        game_steps_per_episode=0,
+        screen_size_px=(FLAGS.resolution,) * 2,
+        minimap_size_px=(FLAGS.resolution,) * 2,
+        visualize=FLAGS.visualize
+    )
+
+    envs = SubprocVecEnv((partial(make_sc2env, **env_args),) * FLAGS.n_envs)
+    # envs = SingleEnv(make_sc2env(**env_args))
+
+    tf.reset_default_graph()
+    sess = tf.Session()
+
+    agent = ActorCriticAgent(
+        mode=FLAGS.agent_mode,
+        sess=sess,
+        spatial_dim=FLAGS.resolution,
+        unit_type_emb_dim=5,
+        loss_value_weight=FLAGS.loss_value_weight,
+        entropy_weight_action_id=FLAGS.entropy_weight_action,
+        entropy_weight_spatial=FLAGS.entropy_weight_spatial,
+        scalar_summary_freq=FLAGS.scalar_summary_freq,
+        all_summary_freq=FLAGS.all_summary_freq,
+        summary_path=full_summary_path,
+        max_gradient_norm=FLAGS.max_gradient_norm
+    )
+
+    agent.build_model()
+    if os.path.exists(full_chekcpoint_path):
+        agent.load(full_chekcpoint_path)
+    else:
+        agent.init()
+
+    if FLAGS.n_steps_per_batch is None:
+        n_steps_per_batch = 128 if FLAGS.agent_mode == ACMode.PPO else 8
+    else:
+        n_steps_per_batch = FLAGS.n_steps_per_batch
+
+    if FLAGS.agent_mode == ACMode.PPO:
+        ppo_par = PPORunParams(
+            FLAGS.ppo_lambda,
+            batch_size=FLAGS.ppo_batch_size or n_steps_per_batch,
+            n_epochs=FLAGS.ppo_epochs
+        )
+    else:
+        ppo_par = None
+
+    runner = Runner(
+        envs=envs,
+        agent=agent,
+        discount=FLAGS.discount,
+        n_steps=n_steps_per_batch,
+        do_training=FLAGS.training,
+        ppo_par=ppo_par
+    )
+
+    runner.reset()
+
+    if FLAGS.K_batches >= 0:
+        n_batches = FLAGS.K_batches * 1000
+    else:
+        n_batches = -1
+
+    i = 0
+
+    try:
+        while True:
+            if i % 500 == 0:
+                _print(i)
+            if i % 4000 == 0:
+                _save_if_training(agent)
+            runner.run_batch()
+            i += 1
+            if 0 <= n_batches <= i:
+                break
+    except KeyboardInterrupt:
+        pass
+
+    print("Okay. Work is done")
+    _print(i)
+    _save_if_training(agent)
 
-try:
-    while True:
-        if i % 500 == 0:
-            _print(i)
-        if i % 4000 == 0:
-            _save_if_training()
-        runner.run_batch()
-        i += 1
-        if 0 <= n_batches <= i:
-            break
-except KeyboardInterrupt:
-    pass
+    envs.close()
 
-print("Okay. Work is done")
-_print(i)
-_save_if_training()
-envs.close()
+if __name__ == "__main__":
+    main()
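
Reviewer note (not part of the patch): the flags documented in the updated README compose directly, so a PPO run on the same map would be `python run_agent.py --map_name MoveToBeacon --model_name my_beacon_ppo --agent_mode ppo --n_envs 32`, where `my_beacon_ppo` is just an example model name; summaries and the checkpoint then land under `_files/summaries/my_beacon_ppo/` and `_files/models/my_beacon_ppo` as described in the README.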
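
The patch also wraps the module-level training code in a `main()` function guarded by `if __name__ == "__main__"`, so the runner can be launched programmatically as well as from the shell. Below is a minimal sketch of that, assuming the project root is on the Python path and pysc2/TensorFlow are installed; the model name and flag values are illustrative only, not part of the patch.

```python
# Sketch: drive the new main() from another Python process.
# run_agent.py calls FLAGS(sys.argv) at import time, so argv must be
# prepared before the module is imported.
import sys

sys.argv = [
    "run_agent.py",
    "--map_name", "MoveToBeacon",
    "--model_name", "my_beacon_ppo",  # hypothetical model name
    "--agent_mode", "ppo",
    "--n_envs", "32",
]

import run_agent  # flags are parsed here, at import time

run_agent.main()  # builds the envs, the agent, and runs the training loop
```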