The code in this notebook is adapted from the "Tutorial: An Introduction to Reinforcement Learning Using OpenAI Gym" post found here: https://www.gocoder.one/blog/rl-tutorial-with-openai-gym/

In [1]:
import gym
import numpy as np
import random

env = gym.make("Taxi-v3", render_mode='ansi').env

In [2]:
# Reset environment and print initial state
env.reset()

print(env.render())

+---------+
|R:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+




In [3]:
# Get a random action from environment and print current state
print("Step 1")

action = env.action_space.sample()

env.step(action)

print(env.render())

Step 1
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)



In [4]:
max_steps = 20
for i in range(2, max_steps + 1):
    print(f"Step {i}")

    action = env.action_space.sample()

    env.step(action)
    
    print(env.render())

Step 2
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

Step 3
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)

Step 4
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)

Step 5
+---------+
|R: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)

Step 6
+---------+
|R: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)

Step 7
+---------+
|R: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Dropoff)

Step 8
+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (East)

Step 9
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;

In [5]:
steps = 0

info = env.reset()
state = info[0]

done = False
max_steps = 5000
for i in range(1, max_steps + 1):
    action = env.action_space.sample()
    state, reward, done, _, _ = env.step(action)
    steps += 1

    if done:
        break

print(f"Steps taken: {steps}")

Steps taken: 969


In [6]:
# Initialize q-table
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros((state_size, action_size))

# Hyperparameters
learning_rate = 0.5
discount_rate = 0.8
epsilon = 1.0
decay_rate= 0.01

episodes = 2000
max_steps = 300
steps_taken = []

for episode in range(episodes):
    # Reset the environment
    info = env.reset()
    state = info[0]

    for i in range(1, max_steps + 1):
        if random.uniform(0,1) < epsilon:
            # Explore
            action = env.action_space.sample()
        else:
            # Exploit
            action = np.argmax(qtable[state,:])

        # Get new state after action
        new_state, reward, done, _, _ = env.step(action)

        # Update q-table
        qtable[state,action] = qtable[state,action] + learning_rate * (reward + discount_rate * np.max(qtable[new_state,:])-qtable[state,action])

        # Set current state
        state = new_state

        if done:
            steps_taken.append(i)
            break
        elif i == max_steps:
            steps_taken.append(i)

    # Decrease probability of exploration
    epsilon = np.exp(-decay_rate*episode)

print(steps_taken)

[300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 241, 300, 300, 300, 300, 300, 300, 300, 300, 144, 300, 300, 300, 300, 209, 300, 300, 300, 300, 79, 249, 281, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 231, 300, 268, 228, 191, 300, 300, 300, 225, 300, 165, 173, 300, 300, 300, 240, 50, 213, 82, 119, 156, 300, 134, 37, 216, 69, 103, 300, 110, 152, 112, 220, 102, 118, 300, 151, 84, 82, 65, 17, 272, 150, 135, 34, 80, 165, 17, 137, 142, 103, 75, 30, 74, 109, 48, 99, 73, 40, 133, 106, 107, 14, 300, 300, 72, 300, 78, 112, 42, 64, 17, 32, 197, 87, 65, 24, 44, 203, 188, 107, 292, 146, 28, 95, 187, 90, 16, 148, 119, 87, 131, 69, 151, 29, 53, 80, 13, 49, 136, 12, 29, 145, 40, 54, 125, 104, 137, 46, 56, 24, 12, 48, 36, 17, 29, 32, 95, 43, 57, 17, 9, 52, 94, 137, 90, 7, 7, 16, 15, 51, 128, 21, 36, 34, 24, 16, 154, 32, 18, 14, 74, 48, 32, 57, 14, 65, 30, 17, 137, 89, 33, 20, 35, 33, 12, 167, 73, 25, 22, 101, 6, 57, 48, 18, 34, 13, 117, 43, 81, 84, 10, 110, 38, 68, 26, 189, 132

In [7]:
steps = 0

info = env.reset()
state = info[0]

done = False
max_steps = 100
for i in range(1, max_steps + 1):
    action = np.argmax(qtable[state,:])
    state, reward, done, _, _ = env.step(action)
    print(f"Step {i}")
    print(env.render())
    steps += 1

    if done:
        break

print(f"Steps taken: {steps}")
env.close()

Step 1
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Step 2
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Step 3
+---------+
|[34;1mR[0m: | : :[35mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)

Step 4
+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Step 5
+---------+
|[42mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)

Step 6
+---------+
|R: | : :[35mG[0m|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)

Step 7
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)

Step 8
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (South