From 56795efe14cdf12c5aef02ac8582e545feb722d9 Mon Sep 17 00:00:00 2001
From: pekka
Date: Fri, 17 Nov 2017 11:54:51 +0700
Subject: [PATCH] update README and add main-method

---
 README.md    |  28 ++++++--
 run_agent.py | 183 ++++++++++++++++++++++++++-------------------------
 2 files changed, 117 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
index e143281..554529b 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,12 @@
 This project implements the FullyConv reinforcement learning agent for [pysc2](https://github.com/deepmind/pysc2/)
 as specified in https://deepmind.com/documents/110/sc2le.pdf.
 
+It's possible to use
+- [A2C](https://blog.openai.com/baselines-acktr-a2c/), which is a synchronous version of the [A3C](https://arxiv.org/abs/1602.01783) used in the deepmind paper
+- [PPO](https://arxiv.org/pdf/1707.06347.pdf) (Proximal Policy Optimization)
+
 Differences to the deepmind spec:
-- Use [A2C](https://blog.openai.com/baselines-acktr-a2c/) instead of [A3C](https://arxiv.org/abs/1602.01783)
+- Use A2C or PPO instead of A3C
 - The non-spatial feature-vector is discarded here. (Probably because of this can't learn CollectMineralsAndGas)
 - There are some other minor simplifaction to the observation space
 - Use different hyper-parameters
@@ -22,6 +26,7 @@ Differences to the deepmind spec:
 - And maybe others that I don't know of
 
 ### Results
+The results here were obtained with A2C. See the A2C vs PPO comparison below.
 
@@ -80,16 +85,31 @@
 Might be that at least hyperparameters here are off (and possibly other things).
 Other environments seem more stable.
+The training was done using one core of a Tesla K80 GPU per environment.
 
 ### How to run
-`python run_a2c.py --map_name MoveToBeacon --model_name my_beacon_model --n_envs 32`
+`python run_agent.py --map_name MoveToBeacon --model_name my_beacon_model --n_envs 32`
 
 This will save
 - tf summaries to `_files/summaries/my_beacon_model/`
 - model to `_files/models/my_beacon_model`
 
-relative to the project path. See `run_a2c.py` for more arguments.
+relative to the project path.
+A2C is used by default. To run PPO, specify `--agent_mode ppo`.
+
+See `run_agent.py` for more arguments.
+
+### A2C vs PPO
+The PPO implementation here gives very similar scores to A2C.
+With PPO, training seems to take considerably longer but is more stable.
+However, I don't have enough computational power to verify this rigorously.
+It's also possible that the PPO parameters here are totally off.
+
+
+Example of a training graph using PPO.
+The typical sigmoid shape seen in A2C training doesn't appear.
+Similar behaviour is observed in other environments.
 
 ### Requirements
 - Python 3 (will NOT work with python 2)
@@ -102,5 +122,5 @@ Let me know if there are issues.
 
 ### References
 I have borrowed some ideas from https://github.com/xhujoy/pysc2-agents (FullyConv-network etc.)
-and [Open AI's baselines](https://github.com/openai/baselines/) but the implementation here is different from those.
+and [OpenAI's baselines](https://github.com/openai/baselines/) (A2C and PPO), but the implementation here is different from those.
 For parallel environments using the code from baselines adapted for sc2.
\ No newline at end of file
diff --git a/run_agent.py b/run_agent.py
index f40bedf..9016c66 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -44,8 +44,7 @@
 FLAGS(sys.argv)
 
-# TODO below it gets little messy with the folders, maybe do something more clever
-
+# TODO: this runner is maybe too long and too messy.
 full_chekcpoint_path = os.path.join(FLAGS.checkpoint_path, FLAGS.model_name)
 
 if FLAGS.training:
@@ -63,106 +62,110 @@ def check_and_handle_existing_folder(f):
             raise Exception("folder %s already exists" % f)
 
 
-if FLAGS.training:
-    check_and_handle_existing_folder(full_chekcpoint_path)
-    check_and_handle_existing_folder(full_summary_path)
-
-env_args = dict(
-    map_name=FLAGS.map_name,
-    step_mul=FLAGS.step_mul,
-    game_steps_per_episode=0,
-    screen_size_px=(FLAGS.resolution,) * 2,
-    minimap_size_px=(FLAGS.resolution,) * 2,
-    visualize=FLAGS.visualize
-)
-
-envs = SubprocVecEnv((partial(make_sc2env, **env_args),) * FLAGS.n_envs)
-# envs = SingleEnv(make_sc2env(**env_args))
-
-tf.reset_default_graph()
-sess = tf.Session()
-
-agent = ActorCriticAgent(
-    mode=FLAGS.agent_mode,
-    sess=sess,
-    spatial_dim=FLAGS.resolution,
-    unit_type_emb_dim=5,
-    loss_value_weight=FLAGS.loss_value_weight,
-    entropy_weight_action_id=FLAGS.entropy_weight_action,
-    entropy_weight_spatial=FLAGS.entropy_weight_spatial,
-    scalar_summary_freq=FLAGS.scalar_summary_freq,
-    all_summary_freq=FLAGS.all_summary_freq,
-    summary_path=full_summary_path,
-    max_gradient_norm=FLAGS.max_gradient_norm
-)
-
-agent.build_model()
-if os.path.exists(full_chekcpoint_path):
-    agent.load(full_chekcpoint_path)
-else:
-    agent.init()
-
-if FLAGS.n_steps_per_batch is None:
-    n_steps_per_batch = 128 if FLAGS.agent_mode == ACMode.PPO else 8
-else:
-    n_steps_per_batch = FLAGS.n_steps_per_batch
-
-if FLAGS.agent_mode == ACMode.PPO:
-    ppo_par = PPORunParams(
-        FLAGS.ppo_lambda,
-        batch_size=FLAGS.ppo_batch_size or n_steps_per_batch,
-        n_epochs=FLAGS.ppo_epochs
-    )
-else:
-    ppo_par = None
-
-runner = Runner(
-    envs=envs,
-    agent=agent,
-    discount=FLAGS.discount,
-    n_steps=n_steps_per_batch,
-    do_training=FLAGS.training,
-    ppo_par=ppo_par
-)
-
-runner.reset()
-
-if FLAGS.K_batches >= 0:
-    n_batches = FLAGS.K_batches * 1000
-else:
-    n_batches = -1
-
-
 def _print(i):
     print(datetime.now())
     print("# batch %d" % i)
     sys.stdout.flush()
 
 
-def _save_if_training():
+def _save_if_training(agent):
     if FLAGS.training:
         agent.save(full_chekcpoint_path)
         agent.flush_summaries()
         sys.stdout.flush()
 
 
-i = 0
+def main():
+    if FLAGS.training:
+        check_and_handle_existing_folder(full_chekcpoint_path)
+        check_and_handle_existing_folder(full_summary_path)
+
+    env_args = dict(
+        map_name=FLAGS.map_name,
+        step_mul=FLAGS.step_mul,
+        game_steps_per_episode=0,
+        screen_size_px=(FLAGS.resolution,) * 2,
+        minimap_size_px=(FLAGS.resolution,) * 2,
+        visualize=FLAGS.visualize
+    )
+
+    envs = SubprocVecEnv((partial(make_sc2env, **env_args),) * FLAGS.n_envs)
+    # envs = SingleEnv(make_sc2env(**env_args))
+
+    tf.reset_default_graph()
+    sess = tf.Session()
+
+    agent = ActorCriticAgent(
+        mode=FLAGS.agent_mode,
+        sess=sess,
+        spatial_dim=FLAGS.resolution,
+        unit_type_emb_dim=5,
+        loss_value_weight=FLAGS.loss_value_weight,
+        entropy_weight_action_id=FLAGS.entropy_weight_action,
+        entropy_weight_spatial=FLAGS.entropy_weight_spatial,
+        scalar_summary_freq=FLAGS.scalar_summary_freq,
+        all_summary_freq=FLAGS.all_summary_freq,
+        summary_path=full_summary_path,
+        max_gradient_norm=FLAGS.max_gradient_norm
+    )
+
+    agent.build_model()
+    if os.path.exists(full_chekcpoint_path):
+        agent.load(full_chekcpoint_path)
+    else:
+        agent.init()
+
+    if FLAGS.n_steps_per_batch is None:
+        n_steps_per_batch = 128 if FLAGS.agent_mode == ACMode.PPO else 8
+    else:
+        n_steps_per_batch = FLAGS.n_steps_per_batch
+
+    if FLAGS.agent_mode == ACMode.PPO:
+        ppo_par = PPORunParams(
+            FLAGS.ppo_lambda,
+            batch_size=FLAGS.ppo_batch_size or n_steps_per_batch,
+            n_epochs=FLAGS.ppo_epochs
+        )
+    else:
+        ppo_par = None
+
+    runner = Runner(
+        envs=envs,
+        agent=agent,
+        discount=FLAGS.discount,
+        n_steps=n_steps_per_batch,
+        do_training=FLAGS.training,
+        ppo_par=ppo_par
+    )
+
+    runner.reset()
+
+    if FLAGS.K_batches >= 0:
+        n_batches = FLAGS.K_batches * 1000
+    else:
+        n_batches = -1
+
+    i = 0
+
+    try:
+        while True:
+            if i % 500 == 0:
+                _print(i)
+            if i % 4000 == 0:
+                _save_if_training(agent)
+            runner.run_batch()
+            i += 1
+            if 0 <= n_batches <= i:
+                break
+    except KeyboardInterrupt:
+        pass
+
+    print("Okay. Work is done")
+    _print(i)
+    _save_if_training(agent)
 
-try:
-    while True:
-        if i % 500 == 0:
-            _print(i)
-        if i % 4000 == 0:
-            _save_if_training()
-        runner.run_batch()
-        i += 1
-        if 0 <= n_batches <= i:
-            break
-except KeyboardInterrupt:
-    pass
+    envs.close()
 
-print("Okay. Work is done")
-_print(i)
-_save_if_training()
-envs.close()
+if __name__ == "__main__":
+    main()
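
Reviewer note (not part of the patch): the flags documented in the updated README compose directly, so a PPO run on the same map would be `python run_agent.py --map_name MoveToBeacon --model_name my_beacon_ppo --agent_mode ppo --n_envs 32`, where `my_beacon_ppo` is just an example model name; summaries and the checkpoint then land under `_files/summaries/my_beacon_ppo/` and `_files/models/my_beacon_ppo` as described in the README.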
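
The patch also wraps the module-level training code in a `main()` function guarded by `if __name__ == "__main__"`, so the runner can be launched programmatically as well as from the shell. Below is a minimal sketch of that, assuming the project root is on the Python path and pysc2/TensorFlow are installed; the model name and flag values are illustrative only, not part of the patch.

```python
# Sketch: drive the new main() from another Python process.
# run_agent.py calls FLAGS(sys.argv) at import time, so argv must be
# prepared before the module is imported.
import sys

sys.argv = [
    "run_agent.py",
    "--map_name", "MoveToBeacon",
    "--model_name", "my_beacon_ppo",  # hypothetical model name
    "--agent_mode", "ppo",
    "--n_envs", "32",
]

import run_agent  # flags are parsed here, at import time

run_agent.main()  # builds the envs, the agent, and runs the training loop
```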