This repository has been archived by the owner on Apr 7, 2018. It is now read-only.

Commit

Added option to toggle visualisation of the worker
Bharat Kunwar authored and tlbtlbtlb committed Jan 31, 2017
1 parent 3f85a63 commit a3fdfba
Showing 4 changed files with 26 additions and 9 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -61,6 +61,10 @@ Using 16 workers, the agent should be able to solve `PongDeterministic-v3` (not
Using 32 workers, the agent is able to solve the same environment in 10 minutes on an `m4.16xlarge` instance.
If you run this experiment on a high-end MacBook Pro, the above job will take just under 2 hours to solve Pong.

Add the '--visualise' toggle if you want to visualise the worker using env.render(), as follows:

`python train.py --num-workers 2 --env-id PongDeterministic-v3 --log-dir /tmp/pong --visualise`

![pong](https://github.com/openai/universe-starter-agent/raw/master/imgs/tb_pong.png "Pong")

For best performance, it is recommended that the number of workers not exceed the number of available CPU cores.
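
If you want to pick the worker count programmatically, a snippet along these lines (illustrative only, not part of this repository) caps it at the number of cores Python reports:

```python
import multiprocessing

# Illustrative only: cap the worker count at the machine's CPU core count,
# as recommended above, and print the matching launch command.
num_workers = min(16, multiprocessing.cpu_count())
print("python train.py --num-workers {} --env-id PongDeterministic-v3 "
      "--log-dir /tmp/pong --visualise".format(num_workers))
```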
11 changes: 6 additions & 5 deletions a3c.py
@@ -71,7 +71,7 @@ class RunnerThread(threading.Thread):
is that a universe environment is _real time_. This means that there should be a thread
that would constantly interact with the environment and tell it what to do. This thread is here.
"""
def __init__(self, env, policy, num_local_steps):
def __init__(self, env, policy, num_local_steps, visualise):
threading.Thread.__init__(self)
self.queue = queue.Queue(5)
self.num_local_steps = num_local_steps
@@ -81,6 +81,7 @@ def __init__(self, env, policy, num_local_steps):
self.daemon = True
self.sess = None
self.summary_writer = None
self.visualise = visualise

def start_runner(self, sess, summary_writer):
self.sess = sess
@@ -92,7 +93,7 @@ def run(self):
self._run()

def _run(self):
rollout_provider = env_runner(self.env, self.policy, self.num_local_steps, self.summary_writer)
rollout_provider = env_runner(self.env, self.policy, self.num_local_steps, self.summary_writer, self.visualise)
while True:
# the timeout variable exists because apparently, if one worker dies, the other workers
# won't die with it, unless the timeout is set to some large number. This is an empirical
@@ -102,7 +103,7 @@ def _run(self):



def env_runner(env, policy, num_local_steps, summary_writer, render=True):
def env_runner(env, policy, num_local_steps, summary_writer, render):
"""
The logic of the thread runner. In brief, it constantly keeps on running
the policy, and as long as the rollout exceeds a certain length, the thread
@@ -158,7 +159,7 @@ def env_runner(env, policy, num_local_steps, summary_writer, render=True):
yield rollout

class A3C(object):
def __init__(self, env, task):
def __init__(self, env, task, visualise):
"""
An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
@@ -205,7 +206,7 @@ def __init__(self, env, task):
# on the one hand; but on the other hand, we get less frequent parameter updates, which
# slows down learning. In this code, we found that making local steps be much
# smaller than 20 makes the algorithm more difficult to tune and to get to work.
self.runner = RunnerThread(env, pi, 20)
self.runner = RunnerThread(env, pi, 20, visualise)


grads = tf.gradients(self.loss, pi.var_list)
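
The hunks above only thread the new `visualise` flag through to `env_runner`'s `render` parameter; the `env.render()` call itself sits in the collapsed body of the function. Below is a minimal sketch of how that loop is expected to consult the flag, using plain gym and a random action as a stand-in for the policy (a paraphrase, not the repository's exact code):

```python
import gym

# Paraphrased sketch of env_runner's use of `render`; the real rollout logic
# (policy interaction, PartialRollout bookkeeping) is collapsed out of this
# diff and omitted here.
def run_episode(env, render):
    state = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()   # stand-in for the policy's action
        state, reward, done, info = env.step(action)
        if render:
            env.render()                      # draws a window only with --visualise

if __name__ == "__main__":
    run_episode(gym.make("PongDeterministic-v3"), render=False)
```

Note the previous default (`render=True`, with the old call site not overriding it) meant rendering was always on; after this change it is opt-in via `--visualise`.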
14 changes: 11 additions & 3 deletions train.py
@@ -18,6 +18,10 @@
parser.add_argument('-m', '--mode', type=str, default='tmux',
help="tmux: run workers in a tmux session. nohup: run workers with nohup. child: run workers as child processes")

# Add visualise tag
parser.add_argument('--visualise', action='store_true',
help="Visualise the gym environment by running env.render() between each timestep")


def new_cmd(session, name, cmd, mode, logdir, shell):
if isinstance(cmd, (list, tuple)):
@@ -30,14 +34,18 @@ def new_cmd(session, name, cmd, mode, logdir, shell):
return name, "nohup {} -c {} >{}/{}.{}.out 2>&1 & echo kill $! >>{}/kill.sh".format(shell, shlex_quote(cmd), logdir, session, name, logdir)


def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash', mode='tmux'):
def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash', mode='tmux', visualise=False):
# for launching the TF workers and for launching tensorboard
base_cmd = [
'CUDA_VISIBLE_DEVICES=',
sys.executable, 'worker.py',
'--log-dir', logdir, '--env-id', env_id,
'--log-dir', logdir,
'--env-id', env_id,
'--num-workers', str(num_workers)]

if visualise:
base_cmd += ['--visualise']

if remotes is None:
remotes = ["1"] * num_workers
else:
@@ -86,7 +94,7 @@ def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash',

def run():
args = parser.parse_args()
cmds, notes = create_commands("a3c", args.num_workers, args.remotes, args.env_id, args.log_dir, mode=args.mode)
cmds, notes = create_commands("a3c", args.num_workers, args.remotes, args.env_id, args.log_dir, mode=args.mode, visualise=args.visualise)
if args.dry_run:
print("Dry-run mode due to -n flag, otherwise the following commands would be executed:")
else:
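
With these changes, `create_commands` appends `--visualise` to every worker command whenever the flag is given to train.py. An illustrative reconstruction of the resulting `base_cmd` for the README's example invocation is shown below (worker-specific arguments, for example the task index, are appended later in the collapsed part of the function):

```python
import sys

# Illustrative reconstruction of base_cmd for
#   python train.py --num-workers 2 --env-id PongDeterministic-v3 \
#                   --log-dir /tmp/pong --visualise
base_cmd = [
    'CUDA_VISIBLE_DEVICES=',
    sys.executable, 'worker.py',
    '--log-dir', '/tmp/pong',
    '--env-id', 'PongDeterministic-v3',
    '--num-workers', '2',
    '--visualise']
```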
6 changes: 5 additions & 1 deletion worker.py
@@ -24,7 +24,7 @@ def save(self, sess, save_path, global_step=None, latest_filename=None,

def run(args, server):
env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
trainer = A3C(env, args.task)
trainer = A3C(env, args.task, args.visualise)

# Variable names that start with "local" are not saved in checkpoints.
if use_tf12_api:
@@ -122,6 +122,10 @@ def main(_):
'or the address of pre-existing VNC servers and '
'rewarders to use (e.g. -r vnc://localhost:5900+15900,vnc://localhost:5901+15901)')

# Add visualisation argument
parser.add_argument('--visualise', action='store_true',
help="Visualise the gym environment by running env.render() between each timestep")

args = parser.parse_args()
spec = cluster_spec(args.num_workers, 1)
cluster = tf.train.ClusterSpec(spec).as_cluster_def()
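
Taken together, the commit threads a single boolean from the command line down to the render call. A condensed, commented trace of that path, quoting the call sites from the hunks above (function bodies are paraphrased or collapsed):

```python
# worker.py: the new argparse flag lands on args.visualise and is passed to A3C.
#     trainer = A3C(env, args.task, args.visualise)
#
# a3c.py: A3C.__init__ forwards it to the runner thread.
#     self.runner = RunnerThread(env, pi, 20, visualise)
#
# a3c.py: RunnerThread._run forwards it to the rollout generator.
#     rollout_provider = env_runner(self.env, self.policy, self.num_local_steps,
#                                   self.summary_writer, self.visualise)
#
# a3c.py: env_runner receives it as `render` and, in its collapsed body, calls
# env.render() between timesteps only when it is True.
```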
