# OpenAI Gym

## Agent Anatomy

In [1]:
import random
from typing import List


class Environment:
    """
    Dummy environment that returns random rewards.

    An episode lasts a fixed number of steps. Every applied action consumes
    one step and yields a reward from ``random.random()``, no matter which
    action was chosen.
    """

    def __init__(self, steps: int = 10) -> None:
        """
        Create an environment whose episode lasts ``steps`` actions.

        :param steps: number of actions allowed before the game is over
        :raises ValueError: if ``steps`` is negative
        """
        if steps < 0:
            raise ValueError(f"steps must be non-negative, got {steps}")
        self.steps_left = steps

    def get_observation(self) -> List[float]:
        """Get current observation from this environment"""
        # The dummy observation never changes: a list of three zeros
        return [0.0] * 3

    def get_actions(self) -> List[int]:
        """Return currently available actions"""
        return [0, 1]

    def is_done(self) -> bool:
        """Returns true when the game is done"""
        # `<=` instead of `==`: if `steps_left` is ever pushed below zero
        # from outside, the game still counts as finished and loops that
        # wait on this method terminate
        return self.steps_left <= 0

    def action(self, action: int) -> float:
        """
        Apply given action in this environment.
        Returns random reward.

        :param action: chosen action; it has no influence on the reward
        :returns: reward in the half-open interval [0.0, 1.0)
        :raises RuntimeError: if the game is already over
        """

        # Panic if this method is called when the game is over
        if self.is_done():
            raise RuntimeError("Game is over")

        # Record that a step has been made and return random reward
        self.steps_left -= 1
        return random.random()


class Agent:
    """
    Dummy agent that picks its actions at random and keeps a running sum of
    the rewards it has received.
    """

    def __init__(self) -> None:
        # Sum of all rewards collected so far
        self.total_reward = 0.0

    def step(self, env: Environment) -> None:
        """
        Perform a single interaction with the given environment: observe,
        pick an action, apply it and add the resulting reward to the total.
        """

        # The observation is requested but deliberately thrown away,
        # because a random policy has no use for it
        env.get_observation()

        # Pick one of the actions the environment currently offers
        candidates = env.get_actions()
        chosen = random.choice(candidates)

        # Apply the selected action and account for the reward it earned
        gained = env.action(chosen)
        self.total_reward = self.total_reward + gained


def run_episode(i: int) -> None:
    """
    Play one full episode with a fresh environment and a fresh agent, then
    print the total reward the agent collected.

    :param i: zero-based episode index; it is printed as ``i + 1``
    """
    # Both objects are created anew, so nothing carries over between episodes
    environment = Environment()
    agent = Agent()

    # Keep stepping the agent until the environment reports the game is over
    while True:
        if environment.is_done():
            break
        agent.step(environment)

    # Report the reward accumulated over the episode
    print(f"Episode: {i + 1}\tTotal reward: {agent.total_reward:.4f}")


# Run a few episodes
#  - Each call builds its own environment and agent and prints one line
#    with the episode number and its total reward
for i in range(5):
    run_episode(i)

Episode: 1	Total reward: 4.9421
Episode: 2	Total reward: 5.1507
Episode: 3	Total reward: 5.9124
Episode: 4	Total reward: 3.5335
Episode: 5	Total reward: 4.2677


## Random CartPole Agent

In [2]:
import gym  # noqa

# Create the environment registered under the id "CartPole-v0"
env = gym.make("CartPole-v0")

In [3]:
# Display the observation space of the environment
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [4]:
# Display the action space of the environment
env.action_space

Discrete(2)

In [5]:
# Reset the environment and display the initial observation it returns
env.reset()

array([-0.04776663,  0.00067543, -0.01365968,  0.02324247])

In [6]:
import numpy as np  # noqa

# Set RNG state
#  - `env.seed` seeds the environment itself
#  - Actions are drawn with `env.action_space.sample()`; the action space
#    keeps its own random generator, so it has to be seeded as well for the
#    sampled actions (and therefore the episode rewards) to be repeatable
env.seed(42)
env.action_space.seed(42)


def run_cartpole_episode(env):
    """
    Play a single episode in the given environment with a random policy.

    The environment is reset first; afterwards actions sampled from
    ``env.action_space`` are applied until the environment signals that the
    episode is finished.

    :returns: sum of the rewards received during the episode
    """
    # Start a fresh episode; the random policy never looks at observations,
    # so the initial one is not kept
    env.reset()

    episode_return = 0.0
    while True:
        # Draw an action and advance the environment by one step
        sampled = env.action_space.sample()
        _, step_reward, finished, _ = env.step(sampled)

        # Add the reward of this step to the running total
        episode_return += step_reward

        # Stop as soon as the environment reports the end of the episode
        if finished:
            break

    return episode_return


# Evaluation settings: how many episodes to average over, and the mean
# reward the agent has to reach to be reported as good enough
n_episodes = 100
reward_boundary = 195

# Play `n_episodes` episodes one after another and average their rewards
rewards = np.array([run_cartpole_episode(env) for _ in range(n_episodes)])
reward_mean = np.mean(rewards)

# Report how the random policy compares against the boundary
if not reward_mean >= reward_boundary:
    print(f"Agent is not good enough: {reward_mean:.2f}/{reward_boundary:.1f}")
else:
    print(f"Agent is good enough: {reward_mean:.2f}/{reward_boundary:.1f}")

Agent is not good enough: 21.72/195.0
