# openai/gym

Switch branches/tags
Nothing to show
75f28fa Aug 24, 2018
3 contributors

### Users who have contributed to this file

114 lines (92 sloc) 3.79 KB
 import numpy as np import sys from gym.envs.toy_text import discrete UP = 0 RIGHT = 1 DOWN = 2 LEFT = 3 class CliffWalkingEnv(discrete.DiscreteEnv): """ This is a simple implementation of the Gridworld Cliff reinforcement learning task. Adapted from Example 6.6 (page 132) from Reinforcement Learning: An Introduction by Sutton and Barto: http://incompleteideas.net/book/the-book-2nd.html With inspiration from: https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py The board is a 4x12 matrix, with (using Numpy matrix indexing): [3, 0] as the start at bottom-left [3, 11] as the goal at bottom-right [3, 1..10] as the cliff at bottom-center Each time step incurs -1 reward, and stepping into the cliff incurs -100 reward and a reset to the start. An episode terminates when the agent reaches the goal. """ metadata = {'render.modes': ['human', 'ansi']} def __init__(self): self.shape = (4, 12) self.start_state_index = np.ravel_multi_index((3, 0), self.shape) nS = np.prod(self.shape) nA = 4 # Cliff Location self._cliff = np.zeros(self.shape, dtype=np.bool) self._cliff[3, 1:-1] = True # Calculate transition probabilities and rewards P = {} for s in range(nS): position = np.unravel_index(s, self.shape) P[s] = {a: [] for a in range(nA)} P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) P[s][LEFT] = self._calculate_transition_prob(position, [0, -1]) # Calculate initial state distribution # We always start in state (3, 0) isd = np.zeros(nS) isd[self.start_state_index] = 1.0 super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) def _limit_coordinates(self, coord): """ Prevent the agent from falling out of the grid world :param coord: :return: """ coord[0] = min(coord[0], self.shape[0] - 1) coord[0] = max(coord[0], 0) coord[1] = min(coord[1], self.shape[1] - 1) coord[1] = max(coord[1], 0) return coord def _calculate_transition_prob(self, current, delta): """ Determine the outcome for an action. Transition Prob is always 1.0. :param current: Current position on the grid as (row, col) :param delta: Change in position for transition :return: (1.0, new_state, reward, done) """ new_position = np.array(current) + np.array(delta) new_position = self._limit_coordinates(new_position).astype(int) new_state = np.ravel_multi_index(tuple(new_position), self.shape) if self._cliff[tuple(new_position)]: return [(1.0, self.start_state_index, -100, False)] terminal_state = (self.shape[0] - 1, self.shape[1] - 1) is_done = tuple(new_position) == terminal_state return [(1.0, new_state, -1, is_done)] def render(self, mode='human'): outfile = sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) if self.s == s: output = " x " # Print terminal state elif position == (3, 11): output = " T " elif self._cliff[position]: output = " C " else: output = " o " if position[1] == 0: output = output.lstrip() if position[1] == self.shape[1] - 1: output = output.rstrip() output += '\n' outfile.write(output) outfile.write('\n')