In [7]:
import numpy as np
import gym
from IPython.display import clear_output

# Setup
env = gym.make('Taxi-v3').env
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.4      # Learning rate
gamma = 0.6      # Discount factor
epsilon = 1.0    # Start with 100% exploration
min_epsilon = 0.01
decay_rate = 0.0005

episodes = 10000
total_epochs = 0
total_penalties = 0

# Training
for episode in range(episodes):
    state = env.reset()[0]
    done = False
    penalties = 0
    epochs = 0

    while not done:
        # ε-greedy action selection
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit

        old_value = q_table[state, action]
        next_state, reward, done, _, _ = env.step(action)
        next_max = np.max(q_table[next_state])

        # Q-learning update
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state

        if reward == -10:
            penalties += 1
           # 🔄 Print progress every 500 episodes
        if episode % 500 == 0:
            clear_output(wait=True)                 # Clear the cell output
            print(f"Episode: {episode}")
            print(f"  ⏱ Steps this episode: {epochs}")
            print(f"  ❌ Penalties this episode: {penalties}")
            print("-" * 30)
        

        epochs += 1

    # Decrease ε (less exploration over time)
    epsilon = max(min_epsilon, epsilon * np.exp(-decay_rate * episode))

    total_epochs += epochs
    total_penalties += penalties

print("✅ Training finished with ε-greedy Q-learning!")
print(f"📊 Average timesteps per episode: {total_epochs / episodes:.2f}")
print(f"📊 Average penalties per episode: {total_penalties / episodes:.2f}")

Episode: 9500
  ⏱ Steps this episode: 17
  ❌ Penalties this episode: 0
------------------------------
✅ Training finished with ε-greedy Q-learning!
📊 Average timesteps per episode: 18.87
📊 Average penalties per episode: 1.43


In [8]:
q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.39270965,  -2.3585428 ,  -2.38037848,  -2.34282765,
         -2.27325184, -11.24689344],
       [ -1.9083821 ,  -1.66530259,  -1.89266498,  -1.58167549,
         -0.7504    , -10.89789413],
       ...,
       [ -1.70487637,  -0.31632841,  -1.84650736,  -1.78739456,
        -10.4422912 , -10.64353635],
       [ -2.17630746,  -2.1614336 ,  -2.18918714,  -2.130861  ,
        -11.0822425 , -10.98509058],
       [  3.21603716,   0.50563457,   3.49564765,  10.99762099,
         -5.7029256 ,  -5.09729446]])

In [9]:
episodes = 1000                      # Number of test episodes to run
total_epochs = 0                   # Total steps taken across all episodes
total_penalties = 0                # Total number of penalties across all episodes

for _ in range(episodes):
    state = env.reset()[0]         # Get the initial state (unpack from tuple)
    done = False                   # Whether the episode is finished
    epochs = 0                     # Steps taken in this episode
    penalties = 0                  # Penalties in this episode

    while not done:
        # Select the best action based on trained Q-table
        action = np.argmax(q_table[state])

        # Apply the action in the environment
        state, reward, done, _, _ = env.step(action)

        # Check if the action caused a penalty (-10 reward)
        if reward == -10:
            penalties += 1

        epochs += 1  # Count each step taken

    # Track totals across all test episodes
    total_epochs += epochs
    total_penalties += penalties

# Show average performance of the trained agent
print(f"\n📊 Results after {episodes} test episodes:")
print(f"🕒 Average timesteps per episode: {total_epochs / episodes:.2f}")
print(f"❌ Average penalties per episode: {total_penalties / episodes:.2f}")



📊 Results after 1000 test episodes:
🕒 Average timesteps per episode: 13.07
❌ Average penalties per episode: 0.00
