In [1]:
!pip install gym



In [2]:
import gym

# Build Taxi-v3 with text rendering and drop all wrappers in one step, so the
# cells below can reach the raw environment's `encode`, `s` and `P` directly.
# `.unwrapped` already strips every wrapper; a separate `.env` hop is not needed.
env = gym.make('Taxi-v3', render_mode='ansi').unwrapped
env.reset(seed=0)  # fixed seed for a repeatable starting layout

# Draw the board as text and show it.
arr = env.render()
print(arr)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+




In [3]:
# Report the action and observation spaces of the environment.
for label, space in (
    ('Action Space ', env.action_space),
    ('State Space', env.observation_space),
):
    print(label, space)

Action Space  Discrete(6)
State Space Discrete(500)


In [4]:
# Hand-picked scenario: taxi at row 3 / column 1, passenger index 2, destination index 0.
taxi_row, taxi_col, passenger_idx, destination_idx = 3, 1, 2, 0
state = env.encode(taxi_row, taxi_col, passenger_idx, destination_idx)
print("State", state)

# Put the environment into that state and draw the resulting board.
env.s = state
arr = env.render()
print(arr)

State 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+




In [5]:
# Transition table for the current state: each action id maps to a list of
# (probability, next_state, reward, done) tuples.
env.P[env.s]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [8]:
# Random-policy baseline: start from the hand-picked state and sample actions
# uniformly until the episode reports `done`.
env.s = state = env.encode(3, 1, 2, 0)

frames = []      # one record per step, replayed later by print_frame
epochs = 0       # steps taken
penalties = 0    # steps that returned the -10 reward
done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, _, _ = env.step(action)

    frames.append({
        'state': state,
        'action': action,
        'reward': reward,
    })
    penalties += int(reward == -10)
    epochs += 1

print('Timesteps taken: ', epochs)
print('Penalities incurred: ', penalties)

Timesteps taken:  709
Penalities incurred:  237


In [9]:
from IPython.display import clear_output 
from time import sleep 

def print_frame(frames, delay=0.1):
    """Replay recorded steps as a text animation in the cell output.

    Args:
        frames: iterable of dicts with 'state', 'action' and 'reward' keys,
            one per environment step.
        delay: seconds to pause after drawing each frame.

    Uses the notebook-level `env`: each frame's state is written to `env.s`
    before `env.render()` is called, so the board shown matches that step.
    """
    for step, frame in enumerate(frames, start=1):
        env.s = frame['state']
        clear_output(wait=True)  # replace the previous frame instead of appending
        print(env.render())
        print(f"Timestep: {step}")
        # Outer double quotes: reusing the same quote inside {...} is a
        # SyntaxError on Python versions before 3.12.
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
        
# Animate the steps stored in `frames`.
print_frame(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 709
State: 0
Action: 5
Reward: 20


In [10]:
# Baseline: average performance of a purely random policy over many episodes.
episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    # reset() returns (observation, info); keep only the observation so
    # `state` is the integer state id rather than a tuple.
    state = env.reset()[0]

    epochs = 0
    penalties = 0
    done = False

    while not done:
        action = env.action_space.sample()
        state, reward, done, _, _ = env.step(action)

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

print(f'Result after {episodes} episodes')
print(f'Average timesteps per episode {total_epochs/episodes}')
print(f'Average penalties per episode {total_penalties/episodes}')

Result after 100 episodes
Average timesteps per episode 2312.44
Average penalties per episode 746.85


In [15]:
import numpy as np 

# Q-table: one row per state, one column per action, all values starting at zero.
states, actions = env.observation_space.n, env.action_space.n
q_table = np.zeros((states, actions))
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [16]:
# Sanity check: (number of states, number of actions).
q_table.shape

(500, 6)

In [22]:
# --- Q-learning settings ---
alpha, gamma = 0.4, 0.6          # learning rate, discount factor
episodes = 10000                 # training episodes
total_epochs = total_penalties = 0   # running totals across all episodes

for episode in range(episodes):
    state, _ = env.reset()       # reset() yields (observation, info)
    epochs = penalties = 0
    done = False

    while not done:
        # Always act greedily with respect to the current Q-table.
        action = np.argmax(q_table[state])
        next_state, reward, done, _, _ = env.step(action)

        # Best value currently known for the state we landed in.
        best_next = np.max(q_table[next_state])
        td_target = reward + gamma * best_next

        # Blend the old estimate with the new target.
        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * td_target

        penalties += int(reward == -10)
        epochs += 1
        state = next_state

    total_epochs += epochs
    total_penalties += penalties

    # Progress report every 500 episodes, replacing the previous report.
    if not episode % 500:
        clear_output(wait=True)
        print(f"Episode: {episode}")
        print(f"  ⏱ Steps this episode: {epochs}")
        print(f"  ❌ Penalties this episode: {penalties}")
        print("-" * 30)

# Summary over the whole training run
print("\n✅ Finished Training with Q-learning!")
print(f"📊 Average timesteps per episode: {total_epochs / episodes:.2f}")
print(f"📊 Average penalties per episode: {total_penalties / episodes:.2f}")

Episode: 9500
  ⏱ Steps this episode: 16
  ❌ Penalties this episode: 0
------------------------------

✅ Finished Training with Q-learning!
📊 Average timesteps per episode: 13.06
📊 Average penalties per episode: 0.00


In [23]:
# Evaluate the learned Q-table: follow the greedy policy, with no further updates.
episodes = 1000                       # evaluation episodes
total_epochs = total_penalties = 0    # running totals across all episodes

for _ in range(episodes):
    state, _ = env.reset()            # reset() yields (observation, info)
    epochs = penalties = 0
    done = False

    while not done:
        # Highest-valued action for the current state.
        action = np.argmax(q_table[state])
        state, reward, done, _, _ = env.step(action)

        penalties += int(reward == -10)   # a -10 reward counts as a penalty
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

# Average performance of the trained agent
print(f"\n📊 Results after {episodes} test episodes:")
print(f"🕒 Average timesteps per episode: {total_epochs / episodes:.2f}")
print(f"❌ Average penalties per episode: {total_penalties / episodes:.2f}")



📊 Results after 1000 test episodes:
🕒 Average timesteps per episode: 13.15
❌ Average penalties per episode: 0.00


In [25]:
# Run one greedy episode from the hand-picked start state and replay it.
state = env.encode(3,1,2,0)
print("State",state)
env.s = state
arr = env.render()
print(arr)

# Reset the per-episode counters here; otherwise they keep counting from the
# values an earlier cell left behind and the totals below are inflated.
epochs = 0
penalties = 0
frames = []

done = False
while not done:
    # Highest-valued action for the current state according to the Q-table.
    action = np.argmax(q_table[state])
    state, reward, done, _, _ = env.step(action)

    if reward == -10:
        penalties += 1
    frames.append({'state': state, 'action': action, 'reward': reward})

    epochs += 1

# Animate first: print_frame clears the cell output for every frame, so a
# summary printed before it would be wiped.
print_frame(frames)

print('Timesteps taken: ',epochs)
print('Penalities incurred: ',penalties)



+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 10
State: 0
Action: 5
Reward: 20
