In [None]:
import gymnasium as gym
import numpy as np
import random
from tqdm import tqdm

# Create the Taxi environment with text ("ansi") rendering, reset it once,
# and print the rendered starting grid.
env = gym.make("Taxi-v3", render_mode="ansi")
env.reset()
print(env.render())

# Action legend (translated from the author's original notes):
#   0: south
#   1: north
#   2: east
#   3: west
#   4: pick up the passenger
#   5: drop off the passenger

# Sizes of the discrete state and action sets, as reported by the environment.
state_space = env.observation_space.n
action_space = env.action_space.n

# Q-table: one row per state, one column per action, initialised to zero.
q_table = np.zeros(shape=(state_space, action_space))

# Hyperparameters used by the training loop.
epsilon = 0.1  # probability of taking a random (exploratory) action
gamma = 0.6  # discount rate
alpha = 0.1  # learning rate

# Training: tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads `env`, `q_table`, `alpha`, `gamma` and `epsilon` from module scope and
# updates `q_table` in place.
for i in tqdm(range(1, 100001)):
    state, _ = env.reset()

    done = False

    while not done:

        if random.uniform(0, 1) < epsilon:  # explore with probability epsilon
            action = env.action_space.sample()
        else:  # exploit the current estimate
            action = np.argmax(q_table[state])

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode is over when EITHER flag is set; looking only at
        # `terminated` would keep stepping an environment that has already
        # signalled truncation.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # A terminal state has no future return, so only bootstrap from the
        # next state when the episode did not terminate. A truncated episode
        # still bootstraps, because truncation is not a real terminal state.
        future = 0.0 if terminated else gamma * np.max(q_table[next_state])
        q_table[state, action] += alpha * (reward + future - q_table[state, action])

        state = next_state

print("Training finished")

# Testing: run the greedy policy derived from `q_table` and report the average
# episode length and the average number of penalised steps.
total_epoch, total_penalties = 0, 0
episodes = 100

for i in tqdm(range(episodes)):
    state, _ = env.reset()

    epochs, penalties = 0, 0

    done = False

    while not done:
        action = np.argmax(q_table[state])

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # Ending on `truncated` as well as `terminated` guarantees the loop
        # stops even if the greedy policy never reaches the goal.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        state = next_state

        # A reward of -10 is counted as a penalty (per the Taxi-v3 docs this
        # is the reward for an illegal pick-up/drop-off).
        if reward == -10:
            penalties += 1

        epochs += 1

    total_epoch += epochs
    total_penalties += penalties

print("Result after {} episode".format(episodes))
print("Average timesteps per episode :", total_epoch/episodes)
print("Average penalties per episode :", total_penalties/episodes)



+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+




100%|██████████| 100000/100000 [00:30<00:00, 3269.80it/s]


Training finished


100%|██████████| 100/100 [00:00<00:00, 6048.46it/s]

Result after 100 episode
Average timesteps per episode : 13.11
Average penalties per episode : 0.0



