[WIP] add support for seeding environments (#135)
* Make environments seedable

* Fix monitor bugs

- Set monitor_id before setting the infix; the previous ordering was a bug that yielded incorrect results with multiple monitors.
- Remove the extra pid from the stats recorder filename. This should be purely cosmetic.

* Start uploading seeds in episode_batch

* Fix _bigint_from_bytes for python3

* Set seed explicitly in random_agent

* Pass through seed argument

* Also pass through random state to spaces

* Pass random state into the observation/action spaces

* Make all _seed methods return the list of used seeds

* Switch over to np.random where possible

* Start hashing seeds, and also seed doom engine

* Fixup seeding determinism in many cases

* Seed before loading the ROM

* Make seeding more Python3 friendly

* Make the MuJoCo skipping a bit more forgiving

* Remove debugging PDB calls

* Make setInt argument into raw bytes

* Validate and upload seeds

* Skip box2d

* Make seeds smaller, and change representation of seeds in upload

* Handle long seeds

* Fix RandomAgent example to be deterministic

* Handle integer types correctly in Python2 and Python3

* Try caching pip

* Try adding swap

* Add df and free calls

* Bump swap

* Bump swap size

* Try setting overcommit

* Try other sysctls

* Try fixing overcommit

* Try just setting overcommit_memory=1

* Add explanatory comment

* Add what's new section to readme

* BUG: Mark ElevatorAction-ram-v0 as non-deterministic for now

* Document seed

* Move nondeterministic check into spec
gdb committed May 29, 2016
1 parent 2e26518 commit 58e6aa9
Showing 61 changed files with 711 additions and 285 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -23,8 +23,6 @@ ghostdriver.log

junk
MUJOCO_LOG.txt
mujoco-bundle


rllab_mujoco

@@ -36,3 +34,4 @@ tutorial/*.html

# PyCharm project files
.idea
vizdoom.ini
7 changes: 6 additions & 1 deletion .travis.yml
@@ -2,7 +2,7 @@ dist: trusty
sudo: required
cache:
apt: true
pip: false
pip: true
language: python
addons:
apt:
@@ -30,6 +30,11 @@ before_install:
# In a pull request, there are no secrets, and hence no MuJoCo:
# https://docs.travis-ci.com/user/pull-requests#Security-Restrictions-when-testing-Pull-Requests.
- '[ -z ${MUJOCO_KEY_BUNDLE+x} ] || ( curl https://openai-public.s3-us-west-2.amazonaws.com/mujoco/$MUJOCO_KEY_BUNDLE.tar.gz | tar xz -C ~/.mujoco )'
# Without this line, Travis has fork()s fail with an out of memory
# error. (These fork()s are for spawning the subprocess for video
# recording.) We should debug the memory usage at some stage, but
# simply setting overcommit is a good starting point.
- sudo sysctl -w vm.overcommit_memory=1
env:
- DISPLAY=:12
install: pip install tox-travis
7 changes: 7 additions & 0 deletions README.rst
@@ -251,3 +251,10 @@ We are using `nose2 <https://github.com/nose-devs/nose2>`_ for tests. You can ru
nose2
You can also run tests in a specific directory by using the ``-s`` option, or by passing in the specific name of the test. See the `nose2 docs <http://nose2.readthedocs.org/en/latest/usage.html#naming-tests>`_ for more details.

What's new
----------

- 2016-05-28: For controlled reproducibility, envs now support seeding
(cf #91 and #135). The monitor records which seeds are used. We will
soon add seed information to the display on the scoreboard.
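As a rough sketch of how the seeding described above might be used (illustrative only; not part of this diff, and the exact call pattern is an assumption based on the changes below):

import gym

env = gym.make('CartPole-v0')
# Seed the env's generator(s); the returned list records every seed actually
# used, with the "main" seed first (see the Env.seed docstring added in gym/core.py).
seeds = env.seed(0)

observation = env.reset()
for _ in range(100):
    # Per the commit messages, sample() is meant to draw from the env's
    # seeded generator rather than the global numpy RNG after this change.
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        observation = env.reset()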
8 changes: 6 additions & 2 deletions examples/agents/random_agent.py
@@ -19,14 +19,18 @@ def act(self, observation, reward, done):
logger.setLevel(logging.INFO)

env = gym.make('CartPole-v0' if len(sys.argv)<2 else sys.argv[1])
agent = RandomAgent(env.action_space)

# You provide the directory to write to (can be an existing
# directory, including one with existing data -- all monitor files
# will be namespaced). You can also dump to a tempdir if you'd
# like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env.monitor.start(outdir, force=True)
env.monitor.start(outdir, force=True, seed=0)

# This declaration must go *after* the monitor call, since the
# monitor's seeding creates a new action_space instance with the
# appropriate pseudorandom number generator.
agent = RandomAgent(env.action_space)

episode_count = 100
max_steps = 200
50 changes: 0 additions & 50 deletions gym/configuration.py
@@ -1,9 +1,4 @@
import hashlib
import numpy as np
import logging
import os
import random
import struct
import sys

import gym
@@ -40,48 +35,3 @@ def undo_logger_setup():
root_logger.removeHandler(handler)
gym.logger.setLevel(logging.NOTSET)
requests_logger.setLevel(logging.NOTSET)

def seed(a=None):
"""Seeds the 'random' and 'numpy.random' generators. By default,
Python seeds these with the system time. Call this if you are
using multiple processes.
Notes:
SECURITY SENSITIVE: a bug here would allow people to generate fake results. Please let us know if you find one :).
Args:
a (Optional[int, str]): None or no argument seeds from an operating-system-specific randomness source. If an int or str is passed, then all of its bits are used.
"""
# Adapted from https://svn.python.org/projects/python/tags/r32/Lib/random.py
if a is None:
a = bigint_from_bytes(os.urandom(32))

if isinstance(a, str):
a = a.encode('utf8')
a += hashlib.sha512(a).digest()
a = bigint_from_bytes(a)

# Actually seed the generators
random.seed(a)
np.random.seed(int_list_from_bigint(a))

return a

# TODO: don't hardcode sizeof_int here
def bigint_from_bytes(bytes):
sizeof_int = 4
padding = sizeof_int - len(bytes) % sizeof_int
bytes += '\0' * padding
int_count = len(bytes) / sizeof_int
unpacked = struct.unpack("{}I".format(int_count), bytes)
accum = 0
for i, val in enumerate(unpacked):
accum += 2 ** (sizeof_int * 8 * i) * val
return accum

def int_list_from_bigint(bigint):
ints = []
while bigint > 0:
bigint, mod = divmod(bigint, 2 ** 32)
ints.append(mod)
return ints
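These global-RNG helpers are deleted here; per the commit messages, seeding moves to per-env generators (gym.utils.seeding). The '\0' string padding and the float division len(bytes) / sizeof_int above are what broke under Python 3. A Python-3-friendly version of the removed helper could look roughly like this sketch (not necessarily the exact code this commit adds elsewhere):

import struct

def bigint_from_bytes(bt):
    # Pad with byte-string zeros (b'\0'), not str, so concatenation works on Python 3.
    sizeof_int = 4
    padding = sizeof_int - len(bt) % sizeof_int
    bt += b'\0' * padding
    # Use integer division; len(bt) / 4 would be a float on Python 3.
    int_count = len(bt) // sizeof_int
    unpacked = struct.unpack("{}I".format(int_count), bt)
    accum = 0
    for i, val in enumerate(unpacked):
        accum += 2 ** (sizeof_int * 8 * i) * val
    return accum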
27 changes: 25 additions & 2 deletions gym/core.py
@@ -17,17 +17,20 @@ class Env(object):
The main API methods that users of this class need to know are:
reset
step
reset
render
close
seed
When implementing an environment, override the following methods
in your subclass:
_step
_reset
_render
_close
_seed
And set the following attributes:
@@ -70,6 +73,7 @@ def _render(self, mode='human', close=False):
if close:
return
raise NotImplementedError
def _seed(self, seed=None): return []

@property
def monitor(self):
@@ -172,7 +176,9 @@ def close(self):
Environments will automatically close() themselves when
garbage collected or when the program exits.
"""
if self._closed:
# _closed will be missing if this instance is still
# initializing.
if not hasattr(self, '_closed') or self._closed:
return

self._close()
@@ -181,6 +187,23 @@ def close(self):
# end up with double close.
self._closed = True

def seed(self, seed=None):
"""Sets the seed for this env's random number generator(s).
Note:
Some environments use multiple pseudorandom number generators.
We want to capture all such seeds used in order to ensure that
there aren't accidental correlations between multiple generators.
Returns:
list<bigint>: Returns the list of seeds used in this env's random
number generators. The first value in the list should be the
"main" seed, or the value which a reproducer should pass to
'seed'. Often, the main seed equals the provided 'seed', but
this won't be true if seed=None, for example.
"""
return self._seed(seed)

def __del__(self):
self.close()

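The contract is that subclasses override _seed and return the seeds they actually used. A hypothetical environment following this contract (and the seeding.np_random pattern that algorithmic_env.py adopts later in this diff) might look like the sketch below; the class is illustrative and not part of this commit:

import numpy as np
from gym import Env
from gym.utils import seeding

class NoisyWalkEnv(Env):
    """Hypothetical env illustrating the _seed contract introduced in this commit."""

    def __init__(self):
        self._seed()  # ensure np_random exists even if the user never calls seed()

    def _seed(self, seed=None):
        # seeding.np_random returns (a numpy RandomState, the seed actually used).
        self.np_random, seed = seeding.np_random(seed)
        return [seed]  # first entry is the "main" seed a reproducer should pass back in

    def _reset(self):
        self.state = self.np_random.uniform(low=-1.0, high=1.0)
        return self.state

    def _step(self, action):
        # All in-episode randomness comes from self.np_random, never the global RNG.
        self.state += action + 0.01 * self.np_random.randn()
        done = abs(self.state) > 1.0
        return self.state, -abs(self.state), done, {}

env = NoisyWalkEnv()
used_seeds = env.seed(42)   # record these to reproduce the run later
obs = env.reset()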
16 changes: 16 additions & 0 deletions gym/envs/__init__.py
@@ -228,11 +228,21 @@
name = ''.join([g.capitalize() for g in game.split('_')])
if obs_type == 'ram':
name = '{}-ram'.format(name)

nondeterministic = False
if game == 'elevator_action' and obs_type == 'ram':
# ElevatorAction-ram-v0 seems to yield slightly
# non-deterministic observations about 10% of the time. We
# should track this down eventually, but for now we just
# mark it as nondeterministic.
nondeterministic = True

register(
id='{}-v0'.format(name),
entry_point='gym.envs.atari:AtariEnv',
kwargs={'game': game, 'obs_type': obs_type},
timestep_limit=10000,
nondeterministic=nondeterministic,
)

# Board games
@@ -248,6 +258,11 @@
'illegal_move_mode': 'lose',
'board_size': 9,
},
# The pachi player seems not to be deterministic given a fixed seed.
# (Reproduce by running 'import gym; h = gym.make('Go9x9-v0'); h.seed(1); h.reset(); h.step(15); h.step(16); h.step(17)' a few times.)
#
# This is probably due to a computation time limit.
nondeterministic=True,
)

register(
@@ -260,6 +275,7 @@
'illegal_move_mode': 'lose',
'board_size': 19,
},
nondeterministic=True,
)

register(
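Per the "Move nondeterministic check into spec" commit above, the flag registered here ends up on the EnvSpec. A quick way to inspect it might be the sketch below; the registry.spec lookup path and the attribute name are assumptions based on the register(...) calls, not shown in this diff:

from gym.envs import registry

spec = registry.spec('ElevatorAction-ram-v0')
# Attribute name assumed from the register(..., nondeterministic=True) calls above.
print(spec.nondeterministic)  # expected: True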
17 changes: 12 additions & 5 deletions gym/envs/algorithmic/algorithmic_env.py
@@ -1,8 +1,7 @@
from gym import Env
from gym.spaces import Discrete, Tuple
from gym.utils import colorize
from gym.utils import colorize, seeding
import numpy as np
import random
from six import StringIO
import sys
import math
@@ -17,6 +16,7 @@ class AlgorithmicEnv(Env):

def __init__(self, inp_dim=1, base=10, chars=False):
global hash_base

hash_base = 50 ** np.arange(inp_dim)
self.base = base
self.last = 10
@@ -27,10 +27,17 @@ def __init__(self, inp_dim=1, base=10, chars=False):
self.inp_dim = inp_dim
AlgorithmicEnv.current_length = 2
tape_control = []
self.action_space = Tuple(([Discrete(2 * inp_dim), Discrete(2), Discrete(self.base)]))
self.observation_space = Discrete(self.base + 1)

self._seed()
self.reset()

def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)

self.action_space = Tuple(([Discrete(2 * self.inp_dim, np_random=self.np_random), Discrete(2, np_random=self.np_random), Discrete(self.base, np_random=self.np_random)]))

Inline review comments on the line above:

jietang (Contributor), May 30, 2016:
It's non-intuitive to me that action/observation spaces would require a random seed initializer. Looking closer, the reason it's required is sample(), which IMO is an API for agents rather than part of envs.
What do you think about making sample() take a seed / numpy.RandomState directly? It doesn't solve the problem of allowing fully deterministic reproductions (because we don't have a way to record seeds used for other sources of agent randomness), but it does a better job of separating agent/env randomness and still allows fully deterministic environments (in the sense that if you supply the same sequence of actions you'll get the same trajectory).

gdb (Author, Collaborator), May 30, 2016:
I can definitely see the argument. I leaned towards drawing the boundary at Gym code vs. user code, since randomness is very hard to get right, and it seems worth going as far out of our way as possible to help the user. (And also to be deterministic if the user has no randomness in their code.)
However, bringing the action/observation spaces into the fold definitely complicated the code, and as you say, it might be cleaner to think about env code vs. agent code.
@joschu what's your take?

jietang (Contributor), May 30, 2016:
That's a good point about user code vs. Gym code.
Another (half-baked) idea: add a gym.sample_action(env) which takes care of initializing and passing a seed to action_space.sample(), as well as recording the seed in the monitor.
edit: I'm starting to be more convinced that separating env/agent randomness is the right way to go -- with the current approach, even if you initialize the env with the same seed and supply the same actions, you will get different environment observations depending on how many calls to action_space.sample() you make.

gdb (Author, Collaborator), May 30, 2016:
What use case do you have in mind? The only one I really know is having a fully reproducible run. Would you ever expect the user to make varied calls to action_space.sample() but still make the exact same sequence of actions? Maybe for some model-based exploration?

jietang (Contributor), May 30, 2016:
One use case is debugging. I'm writing an agent and I see it make a bad action in a particular state, so I'd like to force the env back into that state so I can tweak the algorithm to get better performance. I record the seed and actions and replay them. As I'm tweaking, I add some extra calls to action_space.sample(); suddenly I can't get back to the state I'm debugging, even though it seems like the initialization and state changes are all the same. (Calling action_space.sample() essentially re-seeds the environment under the hood by drawing from the RNG.)

joschu (Contributor), May 30, 2016:
It's great that the initial state sampling is now deterministic. But it seems like there's a fair amount of clutter in the code (setting the observation/action spaces in _seed) for a very uncommon use case -- needing deterministic numbers to come out of action_space.sample(). How about we just use a global RandomState object, say, spaces.randomstate? (It's a step up from using the global numpy randomstate, which is what we were doing before.)

gdb (Author, Collaborator), May 30, 2016, replied via email (comment minimized; body not shown).

joschu (Contributor), May 30, 2016:
Using a global RNG doesn't rule out determinism. My TRPO and CEM implementations were fully deterministic, as long as you call np.random.seed, and as long as the environment isn't using some other source of randomness.
Using action_space.sample() isn't that common a use case, but I can imagine using it for epsilon-greedy exploration. However, it's rare that someone would use this feature and need determinism.

self.observation_space = Discrete(self.base + 1, np_random=self.np_random)
return [seed]

def _get_obs(self, pos=None):
if pos is None:
pos = self.x
@@ -198,6 +205,6 @@ def _reset(self):
AlgorithmicEnv.sum_rewards = []
self.sum_reward = 0.0
self.time = 0
self.total_len = random.randrange(3) + AlgorithmicEnv.current_length
self.total_len = self.np_random.randint(3) + AlgorithmicEnv.current_length
self.set_data()
return self._get_obs()
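To make the trade-off discussed in the review thread above concrete, jietang's alternative -- sample() taking the random state instead of the space owning one -- might look roughly like this. It is purely hypothetical and not what this commit implements:

import numpy as np

class AgentSeededDiscrete(object):
    """Hypothetical Discrete-like space where the caller supplies the RNG."""

    def __init__(self, n):
        self.n = n

    def sample(self, np_random=None):
        # Agent-side randomness is passed in by the caller, keeping it
        # separate from the env's own seeded generator.
        np_random = np_random if np_random is not None else np.random
        return np_random.randint(self.n)

# Agent code would then own its RandomState:
agent_rng = np.random.RandomState(0)
space = AgentSeededDiscrete(4)
action = space.sample(agent_rng)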
6 changes: 2 additions & 4 deletions gym/envs/algorithmic/copy.py
@@ -2,7 +2,6 @@
Task is to copy content from the input tape to
the output tape. http://arxiv.org/abs/1511.07275
"""
import random
import numpy as np
from gym.envs.algorithmic import algorithmic_env
from gym.envs.algorithmic.algorithmic_env import ha
@@ -17,8 +16,7 @@ def set_data(self):
self.content = {}
self.target = {}
for i in range(self.total_len):
val = random.randrange(self.base)
val = self.np_random.randint(self.base)
self.content[ha(np.array([i]))] = val
self.target[i] = val
self.total_reward = self.total_len

self.total_reward = self.total_len
3 changes: 1 addition & 2 deletions gym/envs/algorithmic/duplicated_input.py
@@ -3,7 +3,6 @@
http://arxiv.org/abs/1511.07275
"""

import random
import numpy as np
from gym.envs.algorithmic import algorithmic_env
from gym.envs.algorithmic.algorithmic_env import ha
@@ -20,7 +19,7 @@ def set_data(self):
self.target = {}
copies = int(self.total_len / self.duplication)
for i in range(copies):
val = random.randrange(self.base)
val = self.np_random.randint(self.base)
self.target[i] = val
for d in range(self.duplication):
self.content[ha(np.array([i * self.duplication + d]))] = val
@@ -2,7 +2,6 @@
Expand Up @@ -2,7 +2,6 @@
Task is to copy content multiple-times from the input tape to
the output tape. http://arxiv.org/abs/1511.07275
"""
import random
import numpy as np
from gym.envs.algorithmic import algorithmic_env
from gym.envs.algorithmic.algorithmic_env import ha
@@ -20,10 +19,9 @@ def set_data(self):
self.target = {}
unique = set()
for i in range(self.total_len):
val = random.randrange(self.base)
val = self.np_random.randint(self.base)
self.content[ha(np.array([i]))] = val
self.target[i] = val
self.target[2 * self.total_len - i - 1] = val
self.target[2 * self.total_len + i] = val
self.total_reward = 3.0 * self.total_len + 0.9

3 changes: 1 addition & 2 deletions gym/envs/algorithmic/reverse.py
@@ -3,7 +3,6 @@
http://arxiv.org/abs/1511.07275
"""

import random
import numpy as np
from gym.envs.algorithmic import algorithmic_env
from gym.envs.algorithmic.algorithmic_env import ha
@@ -21,7 +20,7 @@ def set_data(self):
self.content = {}
self.target = {}
for i in range(self.total_len):
val = random.randrange(self.base)
val = self.np_random.randint(self.base)
self.content[ha(np.array([i]))] = val
self.target[self.total_len - i - 1] = val
self.total_reward = self.total_len + 0.9
7 changes: 2 additions & 5 deletions gym/envs/algorithmic/reversed_addition.py
@@ -1,4 +1,3 @@
import random
import numpy as np
from gym.envs.algorithmic import algorithmic_env
from gym.envs.algorithmic.algorithmic_env import ha
@@ -17,14 +16,12 @@ def set_data(self):
for i in range(self.total_len):
vals = []
for k in range(self.rows):
val = random.randrange(self.base)
val = self.np_random.randint(self.base)
self.content[ha(np.array([i, k]))] = val
vals.append(val)
total = sum(vals) + curry
self.target[i] = total % self.base
curry = total / self.base
if curry > 0:
self.target[self.total_len] = curry
self.total_reward = self.total_len


self.total_reward = self.total_len
