# OpenAI Gym

## Agent Anatomy

In [1]:
import random
from typing import List


class Environment:
    """
    Dummy environment that returns random rewards.

    An episode lasts a fixed number of steps. Every applied action consumes
    one step and yields a reward produced by ``random.random()``.

    :param n_steps: number of steps in one episode (defaults to 10)
    :raises ValueError: if ``n_steps`` is negative
    """

    def __init__(self, n_steps: int = 10) -> None:
        # Reject a negative step budget up front instead of starting an
        # episode that is already over for a nonsensical reason
        if n_steps < 0:
            raise ValueError(f"n_steps must be non-negative, got {n_steps}")

        # Number of steps remaining in the current episode
        self.steps_left = n_steps

    def get_observation(self) -> List[float]:
        """Get current observation from this environment"""
        return [0.0] * 3

    def get_actions(self) -> List[int]:
        """Return currently available actions"""
        return [0, 1]

    def is_done(self) -> bool:
        """Returns true when the game is done"""
        # `<=` instead of `==` so the episode stays finished even if
        # `steps_left` ever ends up below zero
        return self.steps_left <= 0

    def action(self, action: int) -> float:
        """
        Apply given action in this environment.
        Returns random reward.

        :param action: action to apply (its value does not influence the reward)
        :returns: reward drawn with ``random.random()``
        :raises RuntimeError: if called after the game is over
        """

        # Panic if this method is called when the game is over
        #  - RuntimeError is a subclass of Exception, so existing
        #    `except Exception` handlers keep working
        if self.is_done():
            raise RuntimeError("Game is over")

        # Record that a step has been made and return random reward
        self.steps_left -= 1
        return random.random()


class Agent:
    """
    Minimal agent that follows a random policy.

    The only state it keeps is ``total_reward``, the sum of all rewards
    collected so far.
    """

    def __init__(self) -> None:
        self.total_reward = 0.0

    def step(self, env: Environment) -> None:
        """
        Perform a single interaction with the given environment.

        The observation is fetched but not used, one of the available
        actions is picked at random, and the reward obtained for it is
        added to ``total_reward``.
        """

        # Query the observation for completeness only
        #  - Note: The random policy never looks at it.
        env.get_observation()

        # Pick one of the currently available actions at random
        available = env.get_actions()
        chosen = random.choice(available)

        # Apply the selected action and add its reward to the running total
        gained = env.action(chosen)
        self.total_reward += gained


def run_episode(i: int) -> None:
    """
    Play one complete episode with a fresh environment and a fresh agent.

    :param i: zero-based episode index (printed one-based)
    """
    # Every episode starts from a brand new environment and agent
    env, agent = Environment(), Agent()

    # Keep stepping until the environment reports that the game is over
    #  - i.e. this loop is exactly one episode
    while True:
        if env.is_done():
            break
        agent.step(env)

    # Report the reward accumulated over the whole episode
    print(f"Episode: {i + 1}\tTotal reward: {agent.total_reward:.4f}")


# Run a few episodes
#  - each call builds its own environment and agent and prints one summary line
for i in range(5):
    run_episode(i)

Episode: 1	Total reward: 5.6444
Episode: 2	Total reward: 4.3383
Episode: 3	Total reward: 5.6232
Episode: 4	Total reward: 3.4625
Episode: 5	Total reward: 4.9504


## Random CartPole Agent

In [2]:
import gym  # noqa

# Create the CartPole environment by its registered id
env = gym.make("CartPole-v0")

In [3]:
# Inspect the observation space (displayed below as a 4-element float32 Box)
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [4]:
# Inspect the action space (displayed below as Discrete(2))
env.action_space

Discrete(2)

In [5]:
# Reset the environment; the displayed value is the initial observation
env.reset()

array([0.00777905, 0.01561849, 0.02360984, 0.02817653])

In [6]:
import numpy as np  # noqa


def run_cartpole_episode(env: gym.Env) -> float:
    """
    Play a single episode with a random policy.

    :param env: environment to play in; it is reset before the first step
    :returns: total reward
    """
    # Begin a new episode
    #  - Note: The observation is stored but a random policy never uses it.
    obs = env.reset()

    total_reward = 0.0
    while True:
        # Draw an action from the action space
        sampled = env.action_space.sample()

        # Let the environment advance by one step using that action
        obs, reward, done, _ = env.step(sampled)

        # Add the step reward to the running total
        total_reward += reward

        # Leave the loop once the environment signals the end of the episode
        if done:
            break

    return total_reward


n_episodes = 100
reward_boundary = 195

# Play `n_episodes` episodes and average their total rewards
rewards = np.array([run_cartpole_episode(env) for _ in range(n_episodes)])
reward_mean = np.mean(rewards)

# Compare the mean reward of the random policy with the reward boundary
print(
    f"Agent is good enough: {reward_mean:.2f}/{reward_boundary:.1f}"
    if reward_mean >= reward_boundary
    else f"Agent is not good enough: {reward_mean:.2f}/{reward_boundary:.1f}"
)

Agent is not good enough: 20.86/195.0


## Environment Wrappers
Let's create an example of an environment wrapper that, with some probability, plays a random action instead of the given one.

In [7]:
from typing import TypeVar  # noqa

# Generic type variable used below to annotate actions
A = TypeVar("A")


class RandomActionWrapper(gym.ActionWrapper):
    """
    Action wrapper that occasionally overrides the requested action.

    Whenever ``random.random()`` falls below ``epsilon``, the action passed
    in is replaced by one sampled from the wrapped environment's action
    space; otherwise it is returned unchanged. The number of replacements
    is counted in ``n_random_actions``.
    """

    def __init__(self, env: gym.Env, epsilon: float = 0.1) -> None:
        super().__init__(env)
        # Replacement counter and replacement threshold
        self.n_random_actions = 0
        self.epsilon = epsilon

    def action(self, action: A) -> A:
        """
        Return either the given action or a randomly sampled replacement.

        :param action: action requested by the caller
        :returns: ``action`` itself, or a sample from the action space
        """
        # Most of the time the requested action is kept as is
        if not (random.random() < self.epsilon):
            return action

        # Otherwise count the override and sample a replacement action
        self.n_random_actions += 1
        return self.env.action_space.sample()


# Set up the wrapped CartPole environment
#   - Note: epsilon is left at the wrapper's default value (0.1)
env = RandomActionWrapper(gym.make("CartPole-v0"))


def run_cartpole_episode(env: RandomActionWrapper) -> float:
    """
    Play a single episode, always requesting action 0.

    :param env: environment to play in; it is reset before the first step
    :returns: total reward
    """
    # Begin a new episode (the observation is stored but never used)
    obs = env.reset()

    total_reward = 0.0
    while True:
        # Always request the same action; the env. wrapper may swap it
        # for a randomly sampled one
        obs, reward, done, _ = env.step(0)

        # Add the step reward to the running total
        total_reward += reward

        # Leave the loop once the environment signals the end of the episode
        if done:
            break

    return total_reward


# Play `n_episodes` episodes in the wrapped environment and average their total rewards
rewards = np.array([run_cartpole_episode(env) for _ in range(n_episodes)])
reward_mean = np.mean(rewards)

# Compare the mean reward with the reward boundary
print(
    f"Agent is good enough: {reward_mean:.2f}/{reward_boundary:.1f}"
    if reward_mean >= reward_boundary
    else f"Agent is not good enough: {reward_mean:.2f}/{reward_boundary:.1f}"
)

# Average number of actions replaced by the wrapper per episode
print("Mean exploration:", env.n_random_actions / n_episodes)

Agent is not good enough: 9.89/195.0
Mean exploration: 1.14


## Environment Monitor
There's a special environment wrapper called `Monitor` which can record the environment as a video and store it in a directory. Note that it requires access to an X11 display; to make this work, one can run the Python program with
```bash
xvfb-run -s "-screen 0 640x480x24" python run.py
```
or connect via SSH with the `-X` option (`ssh -X`).

In [8]:
# Create CartPole environment and wrap it into a Monitor
#   - Note: Video will be stored in 'recording/' per episode
#   - `video_callable` returns True for every episode id, so each episode is recorded
env = gym.wrappers.Monitor(
    env=gym.make("CartPole-v0"),
    directory="recording",
    video_callable=lambda episode_id: True,
    force=True,
)


def run_cartpole_episode(env: gym.wrappers.Monitor) -> None:
    """
    Play a single episode with a random policy.

    Observations and rewards are discarded; nothing is returned.

    :param env: environment to play in; it is reset before the first step
    """
    # Begin a new episode; the initial observation is not needed
    env.reset()

    while True:
        # Sample an action and apply it, keeping only the termination flag
        sampled = env.action_space.sample()
        _, _, finished, _ = env.step(sampled)

        # Leave the loop once the environment signals the end of the episode
        if finished:
            break


# Run 5 episodes in our monitored environment
#   - Note: the Monitor was configured to record a video for every episode
for _ in range(5):
    run_cartpole_episode(env)

# Terminate the monitor and environment
#   - `env` is the Monitor wrapper, `env.env` is the environment it was given
env.close()
env.env.close()

In [9]:
# List the contents of the directory the Monitor was told to write to
!tree recording/

recording/
├── openaigym.episode_batch.0.1180449.stats.json
├── openaigym.manifest.0.1180449.manifest.json
├── openaigym.video.0.1180449.video000000.meta.json
├── openaigym.video.0.1180449.video000000.mp4
├── openaigym.video.0.1180449.video000001.meta.json
├── openaigym.video.0.1180449.video000001.mp4
├── openaigym.video.0.1180449.video000002.meta.json
├── openaigym.video.0.1180449.video000002.mp4
├── openaigym.video.0.1180449.video000003.meta.json
├── openaigym.video.0.1180449.video000003.mp4
├── openaigym.video.0.1180449.video000004.meta.json
└── openaigym.video.0.1180449.video000004.mp4

0 directories, 12 files
