<a href="https://colab.research.google.com/github/mikohuhu/q-learning-taxi-v3-MMAI845/blob/JAY/final_taxiv3_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import gym
import random
import numpy as np
import pandas as pd
from collections import defaultdict
import time

# Variables

In [2]:
alpha = 0.1
gamma = 0.6
epsilon = 0.1
NUM_EPISODES = 100000
NUM_EVAL_EPISODES = 100

# Sample

In [3]:
def sampling(env, num_episodes):
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0
    start_time = time.time()  # Start time

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        timesteps = 0
        penalties = 0
        episode_rewards = 0

        while not done:
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            timesteps += 1
            episode_rewards += reward

            if reward == -10:
                penalties += 1

        total_timesteps += timesteps
        total_penalties += penalties
        total_rewards += episode_rewards

    average_penalties = total_penalties / num_episodes
    average_timesteps = total_timesteps / num_episodes
    average_rewards_per_move = total_rewards / (total_timesteps - total_penalties)  # Average rewards per move

    end_time = time.time()  # End time
    execution_time = end_time - start_time  # Calculate execution time

    return average_penalties, average_timesteps, average_rewards_per_move, execution_time

if __name__ == "__main__":
    env = gym.make("Taxi-v3")
    NUM_EPISODES = 100000
    avg_penalties, avg_timesteps, avg_rewards_per_move, execution_time = sampling(env, NUM_EPISODES)
    print("Sampling Method:")
    print("Average timesteps per trip: {}".format(avg_timesteps))
    print("Average penalties per episode: {}".format(avg_penalties))
    print("Average rewards per move: {}".format(avg_rewards_per_move))
    print("Execution time: {:.2f} seconds".format(execution_time))

  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Sampling Method:
Average timesteps per trip: 196.64697
Average penalties per episode: 63.97952
Average rewards per move: -5.815110337916346
Execution time: 676.15 seconds


# Q-Learning

In [4]:
env = gym.make("Taxi-v3")
def epsilon_greedy_policy(Q, state, epsilon):
    if np.random.uniform(0, 1) > epsilon:
        action = np.argmax(Q[state])
    else:
        action = env.action_space.sample()
    return action

def train_q_learning(Q, env, num_episodes, alpha, gamma, epsilon):
    start_time = time.time()
    for episode in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            action = epsilon_greedy_policy(Q, state, epsilon)
            next_state, reward, done, _ = env.step(action)

            next_max_q_value = np.max(Q[next_state]) if not done else 0
            Q[state][action] += alpha * (reward + gamma * next_max_q_value - Q[state][action])

            state = next_state
    end_time = time.time()
    execution_time = end_time - start_time
    return execution_time


def evaluate_q_learning(Q, env, num_episodes):
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    for _ in range(num_episodes):
        state = env.reset()
        penalties = 0
        timesteps = 0
        rewards = 0
        done = False

        while not done:
            action = np.argmax(Q[state])
            state, reward, done, _ = env.step(action)

            rewards += reward
            timesteps += 1

            if reward == -10:
                penalties += 1

        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += rewards

    average_penalties = total_penalties / num_episodes
    average_timesteps = total_timesteps / num_episodes
    average_rewards = total_rewards / num_episodes

    print("Q-learning Method:")
    print("Average number of penalties per episode:", average_penalties)
    print("Average number of timesteps per trip:", average_timesteps)
    print("Average rewards per move:", average_rewards)

def main():
    env = gym.make("Taxi-v3")
    NUM_EPISODES = 100000
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    alpha = 0.1
    gamma = 0.6
    epsilon = 0.1

    execution_time = train_q_learning(Q, env, NUM_EPISODES, alpha, gamma, epsilon)
    evaluate_q_learning(Q, env, 100)
    print("Execution time: {:.2f} seconds".format(execution_time))

if __name__ == "__main__":
    main()

Q-learning Method:
Average number of penalties per episode: 0.0
Average number of timesteps per trip: 12.86
Average rewards per move: 8.14
Execution time: 81.18 seconds


# SARSA

In [5]:
def epsilon_greedy_policy(Q, state, epsilon, env):
    if np.random.uniform(0, 1) > epsilon:
        action = np.argmax(Q[state])
    else:
        action = env.action_space.sample()
    return action

def train_sarsa(Q, env, num_episodes, alpha, gamma, epsilon):
    start_time = time.time()
    for episode in range(num_episodes):
        state = env.reset()
        action = epsilon_greedy_policy(Q, state, epsilon, env)
        done = False

        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = epsilon_greedy_policy(Q, next_state, epsilon, env)

            next_q_value = Q[next_state][next_action] if not done else 0
            Q[state][action] += alpha * (reward + gamma * next_q_value - Q[state][action])

            state = next_state
            action = next_action

    end_time = time.time()
    execution_time = end_time - start_time
    return execution_time

def evaluate_sarsa(Q, env, num_episodes):
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    for _ in range(num_episodes):
        state = env.reset()
        penalties = 0
        timesteps = 0
        rewards = 0
        done = False

        while not done:
            action = np.argmax(Q[state])
            state, reward, done, _ = env.step(action)

            rewards += reward
            timesteps += 1

            if reward == -10:
                penalties += 1

        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += rewards

    average_penalties = total_penalties / num_episodes
    average_timesteps = total_timesteps / num_episodes
    average_rewards = total_rewards / num_episodes

    print("SARSA Method:")
    print("Average number of penalties per episode:", average_penalties)
    print("Average number of timesteps per trip:", average_timesteps)
    print("Average rewards per move:", average_rewards)

def main():
    env = gym.make("Taxi-v3")
    NUM_EPISODES = 100000
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    alpha = 0.1
    gamma = 0.6
    epsilon = 0.1

    print("\nRunning SARSA...")
    execution_time = train_sarsa(Q, env, NUM_EPISODES, 0.2, 0.8, 0.1)
    evaluate_sarsa(Q, env, 100)
    print("Execution time: {:.2f} seconds".format(execution_time))

if __name__ == "__main__":
    main()


Running SARSA...
SARSA Method:
Average number of penalties per episode: 0.0
Average number of timesteps per trip: 28.55
Average rewards per move: -9.23
Execution time: 117.97 seconds


# Deep Q-Learning

In [5]:
import numpy as np
import time
from keras.optimizers import Adam
import gym
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from collections import deque
import random
from tqdm import tqdm  # Importing tqdm for progress bar

# Initialize the environment with the new step API
env_taxi = gym.make("Taxi-v3", new_step_api=True).env

class TaxiAgent:
    def __init__(self, env, optimizer):
        self._state_size = env.observation_space.n
        self._action_size = env.action_space.n
        self._optimizer = optimizer
        self.experience_replay_memory = deque(maxlen=2000)
        self.discount = 0.6
        self.exploration = 0.1
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_both_model()

    def gather(self, state, action, reward, next_state, terminated):
        self.experience_replay_memory.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        model = Sequential()
        model.add(Embedding(self._state_size, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self._action_size, activation='linear'))
        model.compile(loss='mse', optimizer=self._optimizer)
        return model

    def align_both_model(self):
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, state):
        if np.random.rand() <= self.exploration:
            return env_taxi.action_space.sample()
        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        minibatch = random.sample(self.experience_replay_memory, batch_size)
        for state, action, reward, next_state, terminated in minibatch:
            target = self.q_network.predict(state, verbose=0)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.discount * np.amax(t)
            self.q_network.fit(state, target, epochs=1, verbose=0)  # Reduce the epochs

def deep_q_learning(env, num_training_episodes=100, num_evaluation_episodes=100, alpha=0.01, gamma=0.6, epsilon=0.1, batch_size=32, timesteps_per_episode=40, epochs=4):
    # Training phase
    start_time = time.time()  # Record start time for training
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    for e in tqdm(range(num_training_episodes), desc="Training Episodes"):
        state = env_taxi.reset()
        state = np.reshape(state, [1, 1])
        reward = 0
        terminated = False
        timesteps = 0
        penalties = 0

        while not terminated and timesteps < timesteps_per_episode:
            action = taxi_agent.act(state)
            next_state, reward, terminated, truncated, info = env_taxi.step(action)
            next_state = np.reshape(next_state, [1, 1])
            taxi_agent.gather(state, action, reward, next_state, terminated)
            state = next_state
            timesteps += 1
            if reward == -10:
                penalties += 1

            # Debugging print statements
            print(f"Episode {e + 1}, Timestep {timesteps}: action={action}, reward={reward}, terminated={terminated}, penalties={penalties}")

            if len(taxi_agent.experience_replay_memory) > batch_size and timesteps % 10 == 0:  # Retrain less frequently
                print(f"Retraining on mini-batch at episode {e + 1}")
                taxi_agent.retrain(batch_size)

        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += reward

    end_time = time.time()  # Record end time for training
    execution_time = end_time - start_time  # Calculate time to train

    # Evaluation phase
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0
    for e in tqdm(range(num_evaluation_episodes), desc="Evaluation Episodes"):
        # Evaluation episode loop
        state = env_taxi.reset()
        state = np.reshape(state, [1, 1])
        reward = 0
        terminated = False
        timesteps = 0
        penalties = 0

        while not terminated and timesteps < timesteps_per_episode:
            action = taxi_agent.act(state)
            next_state, reward, terminated, truncated, info = env_taxi.step(action)
            next_state = np.reshape(next_state, [1, 1])
            state = next_state
            timesteps += 1
            if reward == -10:
                penalties += 1

            # Debugging print statements
            print(f"Evaluation Episode {e + 1}, Timestep {timesteps}: action={action}, reward={reward}, terminated={terminated}, penalties={penalties}")

        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += reward

    # Calculate averages for evaluation metrics
    average_penalties = total_penalties / num_evaluation_episodes
    average_timesteps = total_timesteps / num_evaluation_episodes
    average_rewards_per_move = total_rewards / (total_timesteps - total_penalties)

    # Print metrics
    print("Deep Q-Learning Method:")
    print("Average number of penalties per episode:", average_penalties)
    print("Average number of timesteps per trip:", average_timesteps)
    print("Average rewards per move:", average_rewards_per_move)
    print("Time to train:", execution_time, "seconds")

# Creating the optimizer
optimizer = Adam(learning_rate=0.01)

# Creating the TaxiAgent instance
taxi_agent = TaxiAgent(env_taxi, optimizer)

# Defining parameters for the experiment
num_training_episodes = 100
num_evaluation_episodes = 100
batch_size = 32
timesteps_per_episode = 40

# Running the Deep Q-Learning experiment
deep_q_learning(env_taxi, num_training_episodes, num_evaluation_episodes, batch_size=batch_size, timesteps_per_episode=timesteps_per_episode)


# Table

In [9]:
sm_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "Sample Method": [63.97952, 196.64697, -5.815110337916346, 676.15]
}

ql_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "Q-Learning": [0.0, 12.86, 8.14, 81.18]
}

sar_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "SARSA": [0.0, 28.55, -9.23, 117.97]
}

dql_data = {
    "Metric": ["Average Penalties", "Average Timesteps", "Average Rewards per Move", "Training Execution Time (seconds)"],
    "Deep Q-Learning": [1.4, 40.0, -0.03290155440414508, 3528.5490431785583]
}
# These two are not learning the right thing to do, they are learning what not to do in that they are learning to minimize penalties vs. trying to maximize rewards

# Create DataFrames
sm_df = pd.DataFrame(sm_data)
ql_df = pd.DataFrame(ql_data)
sarsa_df = pd.DataFrame(sar_data)
dql_df = pd.DataFrame(dql_data)

# Merge DataFrames
summary_df = pd.merge(sm_df, ql_df, on="Metric")
summary_df = pd.merge(summary_df, sarsa_df, on="Metric")
summary_df = pd.merge(summary_df, dql_df, on="Metric")
summary_df = summary_df.round(2)

print("Summary DataFrame:")
summary_df

Summary DataFrame:


Unnamed: 0,Metric,Sample Method,Q-Learning,SARSA,Deep Q-Learning
0,Average Penalties,63.98,0.0,0.0,1.4
1,Average Timesteps,196.65,12.86,28.55,40.0
2,Average Rewards per Move,-5.82,8.14,-9.23,-0.03
3,Training Execution Time (seconds),676.15,81.18,117.97,3528.55
