Tutorial from https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

In [None]:
import gym

from IPython.display import clear_output
from gym.envs.toy_text.taxi import TaxiEnv
from time import sleep

# Build the Taxi-v3 environment and keep the object exposed as `.env`.
# NOTE(review): `.env` presumably strips the wrapper that gym.make() adds
# around the raw TaxiEnv — confirm against the installed gym version.
env: TaxiEnv = gym.make("Taxi-v3").env

# Rendering smoke test: reset the environment, then draw its current state.
env.reset()
env.render()

# Report the action and observation spaces, one per line.
print(
    f"Action space: {env.action_space}",
    f"State space: {env.observation_space}",
    sep="\n",
)

# Set a specific state based on an encoding
# (taxi row, taxi column, passenger index, destination index)

#### Example of encoding a specific state:

In [None]:
# Pack (taxi row, taxi column, passenger index, destination index)
# into the single integer ID used to identify that state.
state: int = env.encode(3, 1, 2, 0)

# Put the environment into that state, then report the ID and draw the grid.
env.s = state
print(f"State ID: {state}")
env.render()

#### Examine reward table P

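The reward table `P` is essentially a states × actions matrix, but it is implemented as a dictionary keyed by state, where each entry is itself keyed by the actions available in that state.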

In [None]:
# Summarise the reward table: its size in states, the number of actions
# listed for state 0, and the full entry for the state encoded earlier.
print("\n".join([
    f"Number of states: {len(env.P)}",
    f"Number of actions in state 0: {len(env.P[0])}",
    f"Actions for state {state}: {env.P[state]}",
]))

#### Take a random action at each step:

In [None]:
# Start the episode from a fixed state ID.
env.s = 328

epochs: int = 0     # number of environment steps taken
penalties: int = 0  # number of steps that returned a reward of -10
frames: list = []   # per-step records, kept so the episode can be replayed

done: bool = False

# Keep sampling actions from the action space until the environment
# reports that the episode is done.
while not done:
    action: int = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is what this walkthrough counts as a penalty.
    if reward == -10:
        penalties += 1

    # Capture the rendered grid together with the details of this step.
    frames.append({
        "frame": env.render(mode="ansi"),
        "state": state,
        "action": action,
        "reward": reward,
    })

    epochs += 1

# Despite its name, `epochs` counts the individual timesteps of this one
# episode, so the summary is labelled in timesteps (not "timestamps").
print(f"Timesteps taken: {epochs}")
print(f"Number of penalties: {penalties}")

In [None]:
def print_frames(frames, *, delay: float = 0.1) -> None:
    """Replay recorded frames one at a time as a simple text animation.

    Before each frame the previous cell output is cleared, then the rendered
    frame is printed followed by its 1-based timestep, state, action and
    reward.

    Args:
        frames: Iterable of dicts, each providing the keys ``"frame"``,
            ``"state"``, ``"action"`` and ``"reward"``.
        delay: Seconds to pause after printing each frame. Defaults to 0.1,
            the pause this function has always used.
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True defers the clear until new output is ready to replace it.
        clear_output(wait=True)
        print(frame["frame"])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

# Replay the frames recorded during the random-action episode.
print_frames(frames)