In [5]:
# imports
import numpy as np
import gymnasium as gym
import random
import os

# Setting seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Environment
env = gym.make("Taxi-v3",render_mode=None)
action_size = env.action_space.n
state_size = env.observation_space.n

# Q-tables
qtable1 = np.zeros((state_size, action_size))
qtable2 = np.zeros((state_size, action_size))

print("Initial Q-tables:")
print("Q1:\n", qtable1)
print("Q2:\n", qtable2)

# Hyperparameters
total_episodes = 250000
max_steps = 100
learning_rate = 0.15
gamma = 0.95
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.001

# Saving data
rewards = []
success_count = 0
log_path = "taxi_training_log.txt"
if os.path.exists(log_path):
    os.remove(log_path)

# Training
for episode in range(total_episodes):
    state, info = env.reset(seed=SEED)
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            combined_q = qtable1[state] + qtable2[state]
            action = np.argmax(combined_q)

        new_state, reward, done, truncated, info = env.step(action)

        # Update Q values
        if random.random() < 0.5:
            best_action_q2 = np.argmax(qtable2[new_state])
            target = reward + gamma * qtable2[new_state, best_action_q2]
            qtable1[state, action] += learning_rate * (target - qtable1[state, action])
        else:
            best_action_q1 = np.argmax(qtable1[new_state])
            target = reward + gamma * qtable1[new_state, best_action_q1]
            qtable2[state, action] += learning_rate * (target - qtable2[state, action])

        state = new_state
        total_rewards += reward

        # Logging every 1000 episodes
        if episode % 1000 == 0 and step == 0:
            with open(log_path, "a") as f:
                f.write(f"Episode {episode + 1} start\n")

        if done or truncated:
            if reward == 20:
                success_count += 1
            with open(log_path, "a") as f:
                f.write(f"Episode {episode + 1} terminated. Steps: {step + 1}, Reward: {reward}, Done: {done}, Truncated: {truncated}\n")
            break

    # Update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Training results
print("\nTraining completed.")
print(f"Success rate: {success_count / total_episodes:.3f} ({success_count} successful episodes)")
print(f"Score over time (average reward): {sum(rewards) / total_episodes:.3f}")
print("Final Q-tables:")
print("Q1:\n", qtable1)
print("Q2:\n", qtable2)

# Save the Q-tables
np.save("taxi_qtable1.npy", qtable1)
np.save("taxi_qtable2.npy", qtable2)

# Evaluation
env.close()
env = gym.make("Taxi-v3", render_mode="human")
eval_max_steps = 100

for episode in range(25):
    state, info = env.reset(seed=SEED)
    done = False
    print("\n****************************************************")
    print(f"EVALUATION EPISODE {episode + 1}")

    for step in range(eval_max_steps):
        combined_q = qtable1[state] + qtable2[state]
        action = np.argmax(combined_q)
        new_state, reward, done, truncated, info = env.step(action)
        print(f"Step {step}: State {state}, Action {action}, New State {new_state}, Reward {reward}")

        if done or truncated:
            env.render()
            print(f"Number of steps: {step + 1}")
            print(f"Reward: {reward}")
            break
        state = new_state
    else:
        print(f"Episode did not terminate within {eval_max_steps} steps.")
        print(f"Final state: {state}")

env.close()

Initial Q-tables:
Q1:
 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Q2:
 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]

Training completed.
Success rate: 0.999 (249717 successful episodes)
Score over time (average reward): 6.787
Final Q-tables:
Q1:
 [[  0.           0.           0.           0.           0.
    0.        ]
 [  0.           0.           0.           0.           0.
    0.        ]
 [ -0.79866952  -0.64489406  -0.70892118  -0.7511781    1.27734712
   -3.97163418]
 ...
 [  0.           0.           0.           0.           0.
    0.        ]
 [ -2.76931793  -0.90262955  -2.75110258  -1.45890517 -11.94810484
  -10.35111116]
 [  0.           0.           0.           0.           0.
    0.        ]]
Q2:
 [[  0.           0.           0.           0.           0.
    0.        ]
 [  0.           0.      