In [1]:
import gymnasium as gym
import numpy as np
from gymnasium.wrappers import TimeLimit # Add this line
import matplotlib.pyplot as plt
from IPython.display import  display, clear_output

In [2]:
# Build the Taxi-v3 environment used for training, requesting the "rgb_array"
# render mode (presumably so frames are returned as arrays instead of being
# drawn to a window — confirm against the Gymnasium docs).
env = gym.make("Taxi-v3",render_mode="rgb_array")

In [3]:
# Tabular action-value estimates, all starting at zero: the first axis is
# sized by the observation space, the second by the action space.
q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)
q_table.shape

(500, 6)

In [7]:
# Q-learning hyperparameters read by the training cells below.
learning_rate = 0.1  # step size: scales the TD error added to a Q-value
discount_factor = 0.99  # weight on the next state's best Q-value in the TD target
exploration_start = 1.0  # initial epsilon for epsilon-greedy action selection
exploration_decay = 0.99995  # epsilon is multiplied by this once per episode
min_exploration_rate = 0.1  # floor below which epsilon is never decayed

def epsilon_greedy_action(state, q_table, epsilon):
    """Choose an action for `state` using an epsilon-greedy rule.

    One uniform draw from `np.random.rand()` is compared against `epsilon`.
    If the draw is below `epsilon`, a random action is sampled from the
    action space of the module-level `env`; otherwise the index of the
    largest entry of `q_table[state]` is returned (first index on ties,
    as given by `np.argmax`).
    """
    should_explore = np.random.rand() < epsilon
    if not should_explore:
        return np.argmax(q_table[state])  # greedy: best known action
    return env.action_space.sample()  # random exploratory action

In [8]:
episodes = 100000  # number of training episodes
# (sic) Current exploration rate. The misspelled name is kept because the
# training loop reads and updates this exact identifier.
epislon = exploration_start

# Cap every episode at 200 steps. The guard keeps this cell idempotent:
# re-running it no longer stacks another TimeLimit wrapper around an env
# that is already time-limited.
# NOTE(review): gym.make may already apply the registered step limit for
# Taxi-v3, in which case this is a no-op — confirm the registered limit is
# 200 before changing max_episode_steps here, since the guard would skip it.
if not isinstance(env, TimeLimit):
    env = TimeLimit(env, max_episode_steps=200)


In [9]:
from tqdm import tqdm

# Tabular Q-learning: run `episodes` episodes with epsilon-greedy behaviour,
# updating `q_table` in place after every environment step.
for episode in tqdm(range(episodes)):
    state, info = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = epsilon_greedy_action(state, q_table, epislon)
        next_state, reward, terminated, truncated, info = env.step(action)

        # Q-learning update. A terminal transition has no future return, so
        # bootstrap from the next state only when the episode did not
        # terminate. A time-limit truncation is not a true terminal state and
        # therefore still bootstraps.
        next_value = 0.0 if terminated else np.max(q_table[next_state])
        td_target = reward + discount_factor * next_value
        td_error = td_target - q_table[state, action]
        q_table[state, action] += learning_rate * td_error

        state = next_state
        total_reward += reward
        done = terminated or truncated

    # Decay exploration once per episode, never dropping below the floor.
    epislon = max(min_exploration_rate, epislon * exploration_decay)
    if (episode + 1) % 1000 == 0:
        # Reports the return of this single episode only (not an average).
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")  

print("Training completed!")
print("Final Q-table:")
print(q_table)
env.close()

  1%|          | 1019/100000 [00:08<12:06, 136.17it/s]

Episode 1000/100000, Total Reward: -178


  2%|▏         | 2017/100000 [00:15<09:50, 165.82it/s]

Episode 2000/100000, Total Reward: -682


  3%|▎         | 3032/100000 [00:20<07:53, 204.85it/s]

Episode 3000/100000, Total Reward: -257


  4%|▍         | 4033/100000 [00:25<06:20, 252.19it/s]

Episode 4000/100000, Total Reward: -85


  5%|▌         | 5037/100000 [00:28<04:35, 344.58it/s]

Episode 5000/100000, Total Reward: -355


  6%|▌         | 6049/100000 [00:31<04:02, 388.13it/s]

Episode 6000/100000, Total Reward: -222


  7%|▋         | 7070/100000 [00:33<03:31, 438.74it/s]

Episode 7000/100000, Total Reward: -214


  8%|▊         | 8075/100000 [00:36<03:07, 490.84it/s]

Episode 8000/100000, Total Reward: -332


  9%|▉         | 9064/100000 [00:37<02:27, 615.83it/s]

Episode 9000/100000, Total Reward: -82


 10%|█         | 10123/100000 [00:39<02:23, 624.84it/s]

Episode 10000/100000, Total Reward: -90


 11%|█         | 11037/100000 [00:41<02:33, 580.31it/s]

Episode 11000/100000, Total Reward: -71


 12%|█▏        | 12142/100000 [00:42<01:59, 737.24it/s]

Episode 12000/100000, Total Reward: -49


 13%|█▎        | 13087/100000 [00:44<01:42, 847.86it/s]

Episode 13000/100000, Total Reward: -80


 14%|█▍        | 14125/100000 [00:45<01:31, 942.61it/s]

Episode 14000/100000, Total Reward: -38


 15%|█▌        | 15111/100000 [00:46<01:26, 983.22it/s]

Episode 15000/100000, Total Reward: -6


 16%|█▌        | 16177/100000 [00:47<01:22, 1011.29it/s]

Episode 16000/100000, Total Reward: 0


 17%|█▋        | 17117/100000 [00:48<01:20, 1023.55it/s]

Episode 17000/100000, Total Reward: -16


 18%|█▊        | 18169/100000 [00:49<01:10, 1153.38it/s]

Episode 18000/100000, Total Reward: -34


 19%|█▉        | 19121/100000 [00:50<01:10, 1152.85it/s]

Episode 19000/100000, Total Reward: -42


 20%|██        | 20132/100000 [00:50<01:13, 1086.62it/s]

Episode 20000/100000, Total Reward: -32


 21%|██        | 21174/100000 [00:51<01:07, 1159.43it/s]

Episode 21000/100000, Total Reward: -16


 22%|██▏       | 22099/100000 [00:52<01:08, 1136.71it/s]

Episode 22000/100000, Total Reward: -14


 23%|██▎       | 23285/100000 [00:53<00:59, 1292.07it/s]

Episode 23000/100000, Total Reward: -16


 24%|██▍       | 24211/100000 [00:54<01:00, 1261.01it/s]

Episode 24000/100000, Total Reward: -2


 25%|██▌       | 25187/100000 [00:55<00:55, 1353.11it/s]

Episode 25000/100000, Total Reward: -18


 26%|██▌       | 26193/100000 [00:55<00:54, 1363.50it/s]

Episode 26000/100000, Total Reward: 3


 27%|██▋       | 27174/100000 [00:56<00:57, 1262.17it/s]

Episode 27000/100000, Total Reward: -14


 28%|██▊       | 28207/100000 [00:57<00:48, 1468.20it/s]

Episode 28000/100000, Total Reward: -43


 29%|██▉       | 29203/100000 [00:58<00:52, 1345.41it/s]

Episode 29000/100000, Total Reward: 5


 30%|███       | 30184/100000 [00:58<00:50, 1373.49it/s]

Episode 30000/100000, Total Reward: -2


 31%|███       | 31227/100000 [00:59<00:49, 1382.89it/s]

Episode 31000/100000, Total Reward: -3


 32%|███▏      | 32204/100000 [01:00<00:42, 1594.90it/s]

Episode 32000/100000, Total Reward: -12


 33%|███▎      | 33207/100000 [01:00<00:41, 1626.52it/s]

Episode 33000/100000, Total Reward: -16


 34%|███▍      | 34142/100000 [01:01<00:51, 1282.96it/s]

Episode 34000/100000, Total Reward: 5


 35%|███▌      | 35249/100000 [01:02<00:41, 1574.56it/s]

Episode 35000/100000, Total Reward: 5


 36%|███▌      | 36222/100000 [01:02<00:44, 1442.51it/s]

Episode 36000/100000, Total Reward: 1


 37%|███▋      | 37238/100000 [01:03<00:38, 1643.21it/s]

Episode 37000/100000, Total Reward: -5


 38%|███▊      | 38288/100000 [01:04<00:37, 1640.51it/s]

Episode 38000/100000, Total Reward: 12


 39%|███▉      | 39171/100000 [01:04<00:35, 1724.08it/s]

Episode 39000/100000, Total Reward: 10


 40%|████      | 40032/100000 [01:05<00:38, 1574.71it/s]

Episode 40000/100000, Total Reward: -3


 41%|████▏     | 41253/100000 [01:06<00:33, 1736.67it/s]

Episode 41000/100000, Total Reward: 10


 42%|████▏     | 42330/100000 [01:06<00:33, 1723.90it/s]

Episode 42000/100000, Total Reward: 6


 43%|████▎     | 43234/100000 [01:07<00:33, 1703.27it/s]

Episode 43000/100000, Total Reward: -3


 44%|████▍     | 44313/100000 [01:07<00:32, 1709.93it/s]

Episode 44000/100000, Total Reward: -6


 45%|████▌     | 45226/100000 [01:08<00:30, 1768.39it/s]

Episode 45000/100000, Total Reward: 6


 46%|████▋     | 46358/100000 [01:09<00:29, 1842.57it/s]

Episode 46000/100000, Total Reward: 4


 47%|████▋     | 47283/100000 [01:09<00:30, 1709.62it/s]

Episode 47000/100000, Total Reward: -8


 48%|████▊     | 48211/100000 [01:10<00:30, 1676.89it/s]

Episode 48000/100000, Total Reward: 11


 49%|████▉     | 49233/100000 [01:10<00:36, 1383.98it/s]

Episode 49000/100000, Total Reward: 2


 50%|█████     | 50185/100000 [01:11<00:27, 1783.34it/s]

Episode 50000/100000, Total Reward: 10


 51%|█████     | 51109/100000 [01:12<00:27, 1754.41it/s]

Episode 51000/100000, Total Reward: 8


 52%|█████▏    | 52352/100000 [01:12<00:27, 1755.80it/s]

Episode 52000/100000, Total Reward: -2


 53%|█████▎    | 53305/100000 [01:13<00:25, 1836.45it/s]

Episode 53000/100000, Total Reward: 5


 54%|█████▍    | 54244/100000 [01:13<00:24, 1864.59it/s]

Episode 54000/100000, Total Reward: 3


 55%|█████▌    | 55333/100000 [01:14<00:25, 1770.93it/s]

Episode 55000/100000, Total Reward: 1


 56%|█████▌    | 56248/100000 [01:15<00:24, 1751.98it/s]

Episode 56000/100000, Total Reward: 7


 57%|█████▋    | 57197/100000 [01:15<00:23, 1818.85it/s]

Episode 57000/100000, Total Reward: 9


 58%|█████▊    | 58321/100000 [01:16<00:22, 1855.68it/s]

Episode 58000/100000, Total Reward: 0


 59%|█████▉    | 59273/100000 [01:16<00:24, 1671.19it/s]

Episode 59000/100000, Total Reward: 10


 60%|██████    | 60234/100000 [01:17<00:21, 1807.87it/s]

Episode 60000/100000, Total Reward: -8


 61%|██████    | 61181/100000 [01:17<00:22, 1732.20it/s]

Episode 61000/100000, Total Reward: -3


 62%|██████▏   | 62251/100000 [01:18<00:23, 1625.82it/s]

Episode 62000/100000, Total Reward: 12


 63%|██████▎   | 63250/100000 [01:19<00:22, 1616.44it/s]

Episode 63000/100000, Total Reward: -28


 64%|██████▍   | 64323/100000 [01:19<00:21, 1664.39it/s]

Episode 64000/100000, Total Reward: 7


 65%|██████▌   | 65206/100000 [01:20<00:22, 1561.86it/s]

Episode 65000/100000, Total Reward: 7


 66%|██████▋   | 66293/100000 [01:21<00:19, 1728.64it/s]

Episode 66000/100000, Total Reward: -5


 67%|██████▋   | 67227/100000 [01:21<00:17, 1838.14it/s]

Episode 67000/100000, Total Reward: 8


 68%|██████▊   | 68361/100000 [01:22<00:17, 1790.56it/s]

Episode 68000/100000, Total Reward: 1


 69%|██████▉   | 69318/100000 [01:22<00:16, 1851.97it/s]

Episode 69000/100000, Total Reward: 9


 70%|███████   | 70240/100000 [01:23<00:16, 1761.43it/s]

Episode 70000/100000, Total Reward: 9


 71%|███████   | 71176/100000 [01:23<00:15, 1823.75it/s]

Episode 71000/100000, Total Reward: 9


 72%|███████▏  | 72111/100000 [01:24<00:16, 1723.63it/s]

Episode 72000/100000, Total Reward: 8


 73%|███████▎  | 73336/100000 [01:25<00:15, 1708.41it/s]

Episode 73000/100000, Total Reward: 13


 74%|███████▍  | 74224/100000 [01:25<00:15, 1717.75it/s]

Episode 74000/100000, Total Reward: 7


 75%|███████▌  | 75308/100000 [01:26<00:14, 1759.59it/s]

Episode 75000/100000, Total Reward: -40


 76%|███████▌  | 76189/100000 [01:26<00:15, 1555.04it/s]

Episode 76000/100000, Total Reward: 10


 77%|███████▋  | 77202/100000 [01:27<00:14, 1608.38it/s]

Episode 77000/100000, Total Reward: -6


 78%|███████▊  | 78255/100000 [01:28<00:12, 1728.42it/s]

Episode 78000/100000, Total Reward: 8


 79%|███████▉  | 79328/100000 [01:28<00:11, 1766.04it/s]

Episode 79000/100000, Total Reward: 9


 80%|████████  | 80393/100000 [01:29<00:11, 1730.44it/s]

Episode 80000/100000, Total Reward: 2


 81%|████████▏ | 81250/100000 [01:30<00:11, 1572.27it/s]

Episode 81000/100000, Total Reward: -13


 82%|████████▏ | 82154/100000 [01:30<00:11, 1594.09it/s]

Episode 82000/100000, Total Reward: -2


 83%|████████▎ | 83252/100000 [01:31<00:11, 1444.75it/s]

Episode 83000/100000, Total Reward: -20


 84%|████████▍ | 84298/100000 [01:32<00:09, 1728.19it/s]

Episode 84000/100000, Total Reward: 1


 85%|████████▌ | 85241/100000 [01:32<00:08, 1842.38it/s]

Episode 85000/100000, Total Reward: 4


 86%|████████▋ | 86365/100000 [01:33<00:07, 1841.02it/s]

Episode 86000/100000, Total Reward: 3


 87%|████████▋ | 87309/100000 [01:33<00:06, 1850.34it/s]

Episode 87000/100000, Total Reward: 8


 88%|████████▊ | 88184/100000 [01:34<00:08, 1344.34it/s]

Episode 88000/100000, Total Reward: 7


 89%|████████▉ | 89242/100000 [01:35<00:06, 1719.72it/s]

Episode 89000/100000, Total Reward: 3


 90%|█████████ | 90357/100000 [01:35<00:05, 1820.83it/s]

Episode 90000/100000, Total Reward: -2


 91%|█████████ | 91207/100000 [01:36<00:05, 1484.59it/s]

Episode 91000/100000, Total Reward: -3


 92%|█████████▏| 92220/100000 [01:36<00:05, 1546.08it/s]

Episode 92000/100000, Total Reward: -2


 93%|█████████▎| 93309/100000 [01:37<00:03, 1781.12it/s]

Episode 93000/100000, Total Reward: 5


 94%|█████████▍| 94243/100000 [01:38<00:03, 1839.34it/s]

Episode 94000/100000, Total Reward: 1


 95%|█████████▌| 95172/100000 [01:38<00:03, 1577.68it/s]

Episode 95000/100000, Total Reward: 7


 96%|█████████▋| 96301/100000 [01:39<00:02, 1825.51it/s]

Episode 96000/100000, Total Reward: -2


 97%|█████████▋| 97251/100000 [01:39<00:01, 1839.70it/s]

Episode 97000/100000, Total Reward: 0


 98%|█████████▊| 98385/100000 [01:40<00:00, 1838.12it/s]

Episode 98000/100000, Total Reward: -28


 99%|█████████▉| 99289/100000 [01:41<00:00, 1675.05it/s]

Episode 99000/100000, Total Reward: 9


100%|██████████| 100000/100000 [01:41<00:00, 985.86it/s]


Episode 100000/100000, Total Reward: -10
Training completed!
Final Q-table:
[[ 0.          0.          0.          0.          0.          0.        ]
 [ 7.44059051  8.525849    7.44059051  8.525849    9.6220697  -0.474151  ]
 [11.84784175 12.97761793 11.84784175 12.97761793 14.11880599  3.97761793]
 ...
 [14.11880599 15.2715212  14.11880599 12.97761793  5.11880599  5.11880599]
 [ 9.6220697  10.72936333  9.6220697  10.72936333  0.6220697   0.6220697 ]
 [17.612      16.43588    17.612      18.8         8.612       8.612     ]]


In [14]:
# Re-create the environment with render_mode="human" to watch the trained
# policy (presumably this renders to a window — confirm in the Gymnasium docs).
env = gym.make("Taxi-v3", render_mode="human")

# Start a new episode; `state` and `done` are consumed by the rollout cell
# that follows, so this cell must be run before it.
state, info = env.reset()
done = False

In [None]:
# Greedy rollout of the learned policy. Relies on `env`, `state` and `done`
# having been initialised by the preceding cell.
total_reward = 0
try:
    while not done:
        # Always take the highest-valued action for the current state.
        action = np.argmax(q_table[state])
        next_state, reward, terminated, truncated, info = env.step(action)

        state = next_state
        total_reward += reward
        # Stop on either a real terminal state or a time-limit truncation.
        done = terminated or truncated
    print(f"Total Reward: {total_reward}")
finally:
    # Release the environment even if stepping fails or the kernel is
    # interrupted mid-episode.
    env.close()

Total Reward: 9


In [17]:
def run_greedy_episode(env, q_table):
    """Play one episode acting greedily with respect to `q_table`.

    Args:
        env: An environment exposing `reset()` and the five-tuple `step()`
            interface used elsewhere in this notebook.
        q_table: Table indexed as `q_table[state]`, giving one value per action.

    Returns:
        The sum of rewards collected until the episode terminates or is
        truncated.
    """
    state, info = env.reset()
    total_reward = 0
    while True:
        action = np.argmax(q_table[state])
        state, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        if terminated or truncated:
            return total_reward


env = gym.make("Taxi-v3", render_mode="human")
try:
    total_reward = run_greedy_episode(env, q_table)
finally:
    # Release the environment even if the rollout fails or is interrupted.
    env.close()
print(f"Total Reward: {total_reward}")

Total Reward: 11
