diff --git a/README.md b/README.md
index e143281..554529b 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,12 @@ This project implements the FullyConv reinforcement learning agent
for [pysc2](https://github.com/deepmind/pysc2/)
as specified in https://deepmind.com/documents/110/sc2le.pdf.
+It's possible to use either of the following algorithms (see the policy-loss sketch below):
+- [A2C](https://blog.openai.com/baselines-acktr-a2c/), a synchronous version of the [A3C](https://arxiv.org/abs/1602.01783) algorithm used in the deepmind paper
+- [PPO](https://arxiv.org/pdf/1707.06347.pdf) (Proximal Policy Optimization)
+
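+A minimal sketch (not code from this repo) of how the two policy losses differ,
+assuming per-step advantages `adv`, log-probabilities of the taken actions under
+the current policy `logp` and, for PPO, under the old policy `logp_old`:
+
+```python
+import numpy as np
+
+def a2c_policy_loss(logp, adv):
+    # Plain policy gradient: push up log-probs of actions weighted by advantage.
+    return -np.mean(logp * adv)
+
+def ppo_policy_loss(logp, logp_old, adv, clip_eps=0.2):
+    # PPO clipped surrogate objective (https://arxiv.org/pdf/1707.06347.pdf).
+    ratio = np.exp(logp - logp_old)
+    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
+    return -np.mean(np.minimum(ratio * adv, clipped * adv))
+```
+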
Differences to the deepmind spec:
-- Use [A2C](https://blog.openai.com/baselines-acktr-a2c/) instead of [A3C](https://arxiv.org/abs/1602.01783)
+- Use A2C or PPO instead of A3C
- The non-spatial feature-vector is discarded here. (Probably because of this it can't learn CollectMineralsAndGas)
- There are some other minor simplifications to the observation space
- Use different hyper-parameters
@@ -22,6 +26,7 @@ Differences to the deepmind spec:
- And maybe others that I don't know of
### Results
+The results here were obtained with A2C. See the A2C vs PPO comparison below.
@@ -80,16 +85,31 @@ Might be that at least hyperparameters here are off (and possibly other things).
Other environments seem more stable.
+The training was done using one core of a Tesla K80 GPU per environment.
### How to run
-`python run_a2c.py --map_name MoveToBeacon --model_name my_beacon_model --n_envs 32`
+`python run_agent.py --map_name MoveToBeacon --model_name my_beacon_model --n_envs 32`
This will save
- tf summaries to `_files/summaries/my_beacon_model/`
- model to `_files/models/my_beacon_model`
-relative to the project path. See `run_a2c.py` for more arguments.
+relative to the project path.
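+The tf summaries can be viewed with TensorBoard, e.g. `tensorboard --logdir _files/summaries/`.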
+A2C is used by default. To run PPO, specify `--agent_mode ppo`.
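+For example: `python run_agent.py --map_name MoveToBeacon --model_name my_beacon_ppo --agent_mode ppo --n_envs 32`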
+
+See `run_agent.py` for more arguments.
+
+### A2C vs PPO
+The PPO implementation here gives very similar scores to A2C.
+With PPO, training seems to take considerably longer but is more stable.
+However, I don't have enough computation power to verify this rigorously.
+It's also possible that the PPO parameters here are totally off.
+
+
+Example of a training graph using PPO.
+The typical sigmoid shape in A2C training doesn't appear.
+Similar behaviour is observed in other environments.
### Requirements
- Python 3 (will NOT work with python 2)
@@ -102,5 +122,5 @@ Let me know if there are issues.
### References
I have borrowed some ideas from https://github.com/xhujoy/pysc2-agents (FullyConv-network etc.)
-and [Open AI's baselines](https://github.com/openai/baselines/) but the implementation here is different from those.
+and [OpenAI's baselines](https://github.com/openai/baselines/) (A2C and PPO), but the implementation here is different from those.
For parallel environments, the code from baselines is used, adapted for sc2.
\ No newline at end of file
diff --git a/run_agent.py b/run_agent.py
index f40bedf..9016c66 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -44,8 +44,7 @@
FLAGS(sys.argv)
-# TODO below it gets little messy with the folders, maybe do something more clever
-
+# TODO: this runner is maybe too long and too messy
full_chekcpoint_path = os.path.join(FLAGS.checkpoint_path, FLAGS.model_name)
if FLAGS.training:
@@ -63,106 +62,110 @@ def check_and_handle_existing_folder(f):
raise Exception("folder %s already exists" % f)
-if FLAGS.training:
- check_and_handle_existing_folder(full_chekcpoint_path)
- check_and_handle_existing_folder(full_summary_path)
-
-env_args = dict(
- map_name=FLAGS.map_name,
- step_mul=FLAGS.step_mul,
- game_steps_per_episode=0,
- screen_size_px=(FLAGS.resolution,) * 2,
- minimap_size_px=(FLAGS.resolution,) * 2,
- visualize=FLAGS.visualize
-)
-
-envs = SubprocVecEnv((partial(make_sc2env, **env_args),) * FLAGS.n_envs)
-# envs = SingleEnv(make_sc2env(**env_args))
-
-tf.reset_default_graph()
-sess = tf.Session()
-
-agent = ActorCriticAgent(
- mode=FLAGS.agent_mode,
- sess=sess,
- spatial_dim=FLAGS.resolution,
- unit_type_emb_dim=5,
- loss_value_weight=FLAGS.loss_value_weight,
- entropy_weight_action_id=FLAGS.entropy_weight_action,
- entropy_weight_spatial=FLAGS.entropy_weight_spatial,
- scalar_summary_freq=FLAGS.scalar_summary_freq,
- all_summary_freq=FLAGS.all_summary_freq,
- summary_path=full_summary_path,
- max_gradient_norm=FLAGS.max_gradient_norm
-)
-
-agent.build_model()
-if os.path.exists(full_chekcpoint_path):
- agent.load(full_chekcpoint_path)
-else:
- agent.init()
-
-if FLAGS.n_steps_per_batch is None:
- n_steps_per_batch = 128 if FLAGS.agent_mode == ACMode.PPO else 8
-else:
- n_steps_per_batch = FLAGS.n_steps_per_batch
-
-if FLAGS.agent_mode == ACMode.PPO:
- ppo_par = PPORunParams(
- FLAGS.ppo_lambda,
- batch_size=FLAGS.ppo_batch_size or n_steps_per_batch,
- n_epochs=FLAGS.ppo_epochs
- )
-else:
- ppo_par = None
-
-runner = Runner(
- envs=envs,
- agent=agent,
- discount=FLAGS.discount,
- n_steps=n_steps_per_batch,
- do_training=FLAGS.training,
- ppo_par=ppo_par
-)
-
-runner.reset()
-
-if FLAGS.K_batches >= 0:
- n_batches = FLAGS.K_batches * 1000
-else:
- n_batches = -1
-
-
def _print(i):
print(datetime.now())
print("# batch %d" % i)
sys.stdout.flush()
-def _save_if_training():
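+# Saves the model and flushes summaries, but only when FLAGS.training is set.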
+def _save_if_training(agent):
if FLAGS.training:
agent.save(full_chekcpoint_path)
agent.flush_summaries()
sys.stdout.flush()
-i = 0
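+# Builds the environments, the agent and the runner, then runs batches until
+# n_batches is reached or the run is interrupted with Ctrl-C (KeyboardInterrupt).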
+def main():
+    if FLAGS.training:
+        check_and_handle_existing_folder(full_chekcpoint_path)
+        check_and_handle_existing_folder(full_summary_path)
+
+    env_args = dict(
+        map_name=FLAGS.map_name,
+        step_mul=FLAGS.step_mul,
+        game_steps_per_episode=0,
+        screen_size_px=(FLAGS.resolution,) * 2,
+        minimap_size_px=(FLAGS.resolution,) * 2,
+        visualize=FLAGS.visualize
+    )
+
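+    # One SC2 environment per subprocess; FLAGS.n_envs of them run in parallel.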
+    envs = SubprocVecEnv((partial(make_sc2env, **env_args),) * FLAGS.n_envs)
+    # envs = SingleEnv(make_sc2env(**env_args))
+
+    tf.reset_default_graph()
+    sess = tf.Session()
+
+    agent = ActorCriticAgent(
+        mode=FLAGS.agent_mode,
+        sess=sess,
+        spatial_dim=FLAGS.resolution,
+        unit_type_emb_dim=5,
+        loss_value_weight=FLAGS.loss_value_weight,
+        entropy_weight_action_id=FLAGS.entropy_weight_action,
+        entropy_weight_spatial=FLAGS.entropy_weight_spatial,
+        scalar_summary_freq=FLAGS.scalar_summary_freq,
+        all_summary_freq=FLAGS.all_summary_freq,
+        summary_path=full_summary_path,
+        max_gradient_norm=FLAGS.max_gradient_norm
+    )
+
+    agent.build_model()
+    if os.path.exists(full_chekcpoint_path):
+        agent.load(full_chekcpoint_path)
+    else:
+        agent.init()
+
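+    # Rollout length per update: defaults to 128 steps for PPO and 8 for A2C.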
+    if FLAGS.n_steps_per_batch is None:
+        n_steps_per_batch = 128 if FLAGS.agent_mode == ACMode.PPO else 8
+    else:
+        n_steps_per_batch = FLAGS.n_steps_per_batch
+
+    if FLAGS.agent_mode == ACMode.PPO:
+        ppo_par = PPORunParams(
+            FLAGS.ppo_lambda,
+            batch_size=FLAGS.ppo_batch_size or n_steps_per_batch,
+            n_epochs=FLAGS.ppo_epochs
+        )
+    else:
+        ppo_par = None
+
+    runner = Runner(
+        envs=envs,
+        agent=agent,
+        discount=FLAGS.discount,
+        n_steps=n_steps_per_batch,
+        do_training=FLAGS.training,
+        ppo_par=ppo_par
+    )
+
+    runner.reset()
+
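+    # FLAGS.K_batches is given in thousands of batches; a negative value means run until interrupted.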
+    if FLAGS.K_batches >= 0:
+        n_batches = FLAGS.K_batches * 1000
+    else:
+        n_batches = -1
+
+    i = 0
+
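+    # Log progress every 500 batches and checkpoint every 4000 batches.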
+    try:
+        while True:
+            if i % 500 == 0:
+                _print(i)
+            if i % 4000 == 0:
+                _save_if_training(agent)
+            runner.run_batch()
+            i += 1
+            if 0 <= n_batches <= i:
+                break
+    except KeyboardInterrupt:
+        pass
+
+    print("Okay. Work is done")
+    _print(i)
+    _save_if_training(agent)
-try:
- while True:
- if i % 500 == 0:
- _print(i)
- if i % 4000 == 0:
- _save_if_training()
- runner.run_batch()
- i += 1
- if 0 <= n_batches <= i:
- break
-except KeyboardInterrupt:
- pass
+    envs.close()
-print("Okay. Work is done")
-_print(i)
-_save_if_training()
-envs.close()
+if __name__ == "__main__":
+    main()