In [1]:
#https://gymnasium.farama.org/environments/toy_text/taxi/
import gymnasium as gym
import random
import numpy as np

In [2]:
env = gym.make('Taxi-v3')

In [3]:
alpha = 0.9
gamma = 0.95
epsilon = 0.995
epsilon_decay = 0.9995
min_epsilon = 0.01 
num_episodes = 10000
max_steps = 100

In [4]:
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [5]:
q_table.shape

(500, 6)

In [6]:
def select_action(state):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(q_table[state])

In [7]:
for epsilon in range(num_episodes):
    state = env.reset()
    state_index = state[0] 
    done = False
    t_reward = 0

    print(f"Episode {epsilon}", end="\r")
    for i in range(max_steps):
        action = select_action(state_index)
        next_state, reward, done, truncated, info = env.step(action)

        old_value = q_table[state_index, action]

        q_table[state_index, action] = (1 - alpha) * old_value + alpha * (reward + gamma * np.max(q_table[next_state]))

        t_reward += reward
        state_index = next_state

        if done or truncated:
            print(f"Episode {epsilon} finished after {i} timesteps with reward {t_reward}")
            break

    epsilon = max(min_epsilon, epsilon * epsilon_decay)

Episode 25 finished after 70 timesteps with reward -239
Episode 29 finished after 40 timesteps with reward -128
Episode 61 finished after 95 timesteps with reward -309
Episode 179 finished after 95 timesteps with reward -354
Episode 396 finished after 26 timesteps with reward -60
Episode 429 finished after 57 timesteps with reward -190
Episode 434 finished after 64 timesteps with reward -188
Episode 439 finished after 42 timesteps with reward -139
Episode 523 finished after 94 timesteps with reward -299
Episode 557 finished after 95 timesteps with reward -408
Episode 649 finished after 85 timesteps with reward -245
Episode 670 finished after 98 timesteps with reward -339
Episode 679 finished after 86 timesteps with reward -354
Episode 807 finished after 78 timesteps with reward -292
Episode 838 finished after 94 timesteps with reward -290
Episode 862 finished after 79 timesteps with reward -248
Episode 978 finished after 89 timesteps with reward -249
Episode 1019 finished after 19 time

In [9]:
env = gym.make('Taxi-v3', render_mode='human')
for episode in range(3):
    state = env.reset()
    state_index = state[0]
    done = False 
    for i in range(max_steps):
        env.render()
        action = np.argmax(q_table[state_index])
        next_state, reward, done, truncated, info = env.step(action)
        state_index = next_state
        if done or truncated:
            print(f"Episode {episode} finished after {i} timesteps with reward {reward}")
            break

env.close()

 

Episode 0 finished after 15 timesteps with reward 20
Episode 1 finished after 15 timesteps with reward 20
Episode 2 finished after 9 timesteps with reward 20
