<a href="https://colab.research.google.com/github/mikohuhu/q-learning-taxi-v3-MMAI845/blob/JAY/sarsa_optimum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip -q install pickle-mixin
! pip -q install collection
! pip -q install click

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pickle-mixin (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for collection (setup.py) ... [?25l[?25hdone


In [None]:
import gym
import numpy as np
from collections import defaultdict
import time
from itertools import product

def epsilon_greedy_policy(Q, state, epsilon, env):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    One value is drawn from ``np.random.uniform(0, 1)``. If the draw is
    greater than ``epsilon`` the index of the largest entry of ``Q[state]``
    is returned; otherwise the action comes from ``env.action_space.sample()``.

    Args:
        Q: Mapping from a state to an array-like of per-action values.
        state: Key used to look up the action values in ``Q``.
        epsilon: Threshold the uniform draw is compared against.
        env: Object exposing ``action_space.sample()``.

    Returns:
        The selected action.
    """
    draw = np.random.uniform(0, 1)
    explore = not (draw > epsilon)
    if explore:
        # Exploration branch: defer to the environment's own sampler.
        return env.action_space.sample()
    # Exploitation branch: best-valued action for this state.
    return np.argmax(Q[state])

def train_sarsa(Q, env, num_episodes, alpha, gamma, epsilon):
    """Update ``Q`` in place with on-policy SARSA for ``num_episodes`` episodes.

    Each step applies
    ``Q[s][a] += alpha * (r + gamma * Q[s'][a'] - Q[s][a])``,
    where ``a'`` is chosen by ``epsilon_greedy_policy`` and the bootstrap term
    is replaced by 0 when the environment reports ``done``.

    Args:
        Q: Mapping from state to a mutable array of per-action values.
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        num_episodes: Number of episodes to run.
        alpha: Step size of the update.
        gamma: Discount applied to the bootstrap value.
        epsilon: Exploration threshold forwarded to ``epsilon_greedy_policy``.

    Returns:
        None. ``Q`` is modified in place.
    """
    for _ in range(num_episodes):
        state = env.reset()
        action = epsilon_greedy_policy(Q, state, epsilon, env)

        while True:
            next_state, reward, done, _info = env.step(action)
            # The follow-up action is always drawn, even on the final step.
            next_action = epsilon_greedy_policy(Q, next_state, epsilon, env)

            bootstrap = 0 if done else Q[next_state][next_action]
            q_row = Q[state]
            q_row[action] += alpha * (reward + gamma * bootstrap - q_row[action])

            state, action = next_state, next_action
            if done:
                break

def evaluate_sarsa(Q, env, num_episodes):
    """Run the greedy policy derived from ``Q`` and report per-episode averages.

    In every step the action is ``np.argmax(Q[state])``. An episode ends when
    ``env.step`` reports ``done``. A step whose reward equals -10 is counted
    as a penalty.

    Args:
        Q: Mapping from state to an array-like of per-action values.
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(state, reward, done, info)``.
        num_episodes: Number of evaluation episodes (also the divisor of
            every average).

    Returns:
        Tuple ``(average_penalties, average_timesteps, average_rewards)``.
    """
    penalty_sum = 0
    step_sum = 0
    reward_sum = 0

    for _episode in range(num_episodes):
        observation = env.reset()
        ep_penalties, ep_steps, ep_reward = 0, 0, 0

        while True:
            greedy_action = np.argmax(Q[observation])
            observation, step_reward, finished, _info = env.step(greedy_action)

            ep_reward += step_reward
            ep_steps += 1
            if step_reward == -10:
                ep_penalties += 1

            if finished:
                break

        # Fold the finished episode into the running totals.
        penalty_sum += ep_penalties
        step_sum += ep_steps
        reward_sum += ep_reward

    return (
        penalty_sum / num_episodes,
        step_sum / num_episodes,
        reward_sum / num_episodes,
    )

def hyperparameter_tuning(env, alpha_values, gamma_values, epsilon_values, num_episodes, eval_episodes):
    """Grid-search SARSA hyperparameters and keep the best average reward.

    For every ``(alpha, gamma, epsilon)`` combination a fresh Q-table is
    trained with ``train_sarsa`` and scored with ``evaluate_sarsa``. The
    combination with the strictly highest average reward wins; on ties the
    earlier combination is kept.

    Args:
        env: Environment passed to training and evaluation; its
            ``action_space.n`` sizes each new Q-table row.
        alpha_values: Candidate values for ``alpha``.
        gamma_values: Candidate values for ``gamma``.
        epsilon_values: Candidate values for ``epsilon``.
        num_episodes: Training episodes per combination.
        eval_episodes: Evaluation episodes per combination.

    Returns:
        Tuple ``(best_params, results)`` where ``best_params`` is the winning
        ``(alpha, gamma, epsilon)`` (or ``None`` if nothing beat ``-inf``) and
        ``results`` is a list of ``(alpha, gamma, epsilon, avg_rewards)``.
    """
    results = []
    best_params, best_performance = None, float('-inf')

    for combo in product(alpha_values, gamma_values, epsilon_values):
        alpha, gamma, epsilon = combo

        # Start every configuration from an all-zero Q-table.
        Q = defaultdict(lambda: np.zeros(env.action_space.n))
        train_sarsa(Q, env, num_episodes, alpha, gamma, epsilon)
        _, _, avg_rewards = evaluate_sarsa(Q, env, eval_episodes)

        results.append((alpha, gamma, epsilon, avg_rewards))

        if avg_rewards > best_performance:
            best_performance, best_params = avg_rewards, (alpha, gamma, epsilon)

    return best_params, results

if __name__ == "__main__":
    # Environment used for both training and evaluation.
    env = gym.make("Taxi-v3")

    # Episode budgets per configuration.
    num_episodes, eval_episodes = 10000, 100

    # Candidate values explored by the grid search.
    alpha_values = [0.2, 0.3, 0.4]
    gamma_values = [0.6, 0.7, 0.8]
    epsilon_values = [0.1, 0.3, 0.5]

    best_params, results = hyperparameter_tuning(
        env, alpha_values, gamma_values, epsilon_values, num_episodes, eval_episodes
    )

    # Report the winning (alpha, gamma, epsilon) triple, one value per line.
    print("Best Parameters:")
    for label, value in zip(("Alpha:", "Gamma:", "Epsilon:"), best_params):
        print(label, value)

    # Report every evaluated configuration with its average reward.
    print("\nAll Results:")
    for res in results:
        print("Alpha: {}, Gamma: {}, Epsilon: {}, Average Rewards: {}".format(*res))


Best Parameters:
Alpha: 0.2
Gamma: 0.8
Epsilon: 0.1

All Results:
Alpha: 0.2, Gamma: 0.6, Epsilon: 0.1, Average Rewards: -104.31
Alpha: 0.2, Gamma: 0.6, Epsilon: 0.3, Average Rewards: -156.77
Alpha: 0.2, Gamma: 0.6, Epsilon: 0.5, Average Rewards: -195.72
Alpha: 0.2, Gamma: 0.7, Epsilon: 0.1, Average Rewards: -73.55
Alpha: 0.2, Gamma: 0.7, Epsilon: 0.3, Average Rewards: -187.35
Alpha: 0.2, Gamma: 0.7, Epsilon: 0.5, Average Rewards: -197.86
Alpha: 0.2, Gamma: 0.8, Epsilon: 0.1, Average Rewards: -50.58
Alpha: 0.2, Gamma: 0.8, Epsilon: 0.3, Average Rewards: -126.51
Alpha: 0.2, Gamma: 0.8, Epsilon: 0.5, Average Rewards: -174.82
Alpha: 0.3, Gamma: 0.6, Epsilon: 0.1, Average Rewards: -141.38
Alpha: 0.3, Gamma: 0.6, Epsilon: 0.3, Average Rewards: -170.66
Alpha: 0.3, Gamma: 0.6, Epsilon: 0.5, Average Rewards: -197.88
Alpha: 0.3, Gamma: 0.7, Epsilon: 0.1, Average Rewards: -116.29
Alpha: 0.3, Gamma: 0.7, Epsilon: 0.3, Average Rewards: -164.38
Alpha: 0.3, Gamma: 0.7, Epsilon: 0.5, Average Rewards:

Experiment 1 (SARSA grid search):

    alpha_values = [0.2, 0.3, 0.4]
    gamma_values = [0.6, 0.7, 0.8]
    epsilon_values = [0.1, 0.3, 0.5]

Best Parameters:
Alpha: 0.2
Gamma: 0.8
Epsilon: 0.1

# Deep Q-Learning

In [None]:
import numpy as np
import random
from collections import deque
import progressbar
import gym
from keras import Model, Sequential
from keras.layers import Dense, Embedding, Reshape
from keras.optimizers import Adam
import time

class TaxiAgent:
    """Q-learning agent with an online network, a target network and a replay buffer.

    Both networks are built by ``_build_compile_model``. The target network
    only changes when ``align_both_model`` copies the online weights into it.
    """

    def __init__(self, env, optimizer, state_size, action_size, discount, exploration):
        """Create the replay buffer and both networks.

        Args:
            env: Accepted for interface compatibility; not used by this class.
            optimizer: Optimizer passed to ``model.compile`` for both networks.
            state_size: Number of discrete states (input dimension of the
                embedding layer).
            action_size: Number of actions (width of the output layer).
            discount: Factor applied to the target network's best value in
                ``retrain``.
            exploration: Probability threshold for a random action in ``act``.
        """
        self._state_size = state_size
        self._action_size = action_size
        self._optimizer = optimizer
        # Bounded buffer: once 2000 transitions are stored the oldest are dropped.
        self.experience_replay_memory = deque(maxlen=2000)
        self.discount = discount
        self.exploration = exploration
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_both_model()

    def gather(self, state, action, reward, next_state, terminated):
        """Append one ``(state, action, reward, next_state, terminated)`` transition."""
        self.experience_replay_memory.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        """Build and compile the network: embedding, two hidden layers, linear output."""
        model = Sequential()
        model.add(Embedding(self._state_size, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self._action_size, activation='linear'))
        model.compile(loss='mse', optimizer=self._optimizer)
        return model

    def align_both_model(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, state):
        """Return a random action with probability ``exploration``, else the greedy one.

        Args:
            state: Network input for the current state (callers in this
                notebook pass the state reshaped to ``[1, 1]``).

        Returns:
            Index of the chosen action.
        """
        if np.random.rand() <= self.exploration:
            return np.random.choice(self._action_size)
        # verbose=0: predict() runs once per step; without it every call can
        # print a progress bar and flood the cell output.
        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        """Fit the online network on ``batch_size`` transitions sampled from memory.

        For each sampled transition the target for the taken action is the
        reward (terminal) or ``reward + discount * max(target_network(next_state))``;
        the other action values are left at the online network's prediction.

        Args:
            batch_size: Number of transitions to sample; ``random.sample``
                raises ``ValueError`` if the memory holds fewer.
        """
        minibatch = random.sample(self.experience_replay_memory, batch_size)
        for state, action, reward, next_state, terminated in minibatch:
            target = self.q_network.predict(state, verbose=0)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.discount * np.amax(t)
            self.q_network.fit(state, target, epochs=4, verbose=0)  # epoch was 1

def train_and_evaluate(env, optimizer, state_size, action_size, discount, exploration, num_episodes, timesteps_per_episode, batch_size):
    """Train a fresh ``TaxiAgent`` and report statistics gathered during training.

    Every episode runs for at most ``timesteps_per_episode`` steps. Statistics
    are accumulated on every step of every episode, including episodes that
    hit the step cap without terminating, so the averages are defined whenever
    at least one step was taken.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, terminated, info)``.
        optimizer: Optimizer forwarded to ``TaxiAgent``.
        state_size: Number of discrete states, forwarded to ``TaxiAgent``.
        action_size: Number of actions, forwarded to ``TaxiAgent``.
        discount: Discount factor, forwarded to ``TaxiAgent``.
        exploration: Exploration probability, forwarded to ``TaxiAgent``.
        num_episodes: Number of training episodes.
        timesteps_per_episode: Maximum number of steps per episode.
        batch_size: Replay batch size; retraining starts once the memory
            holds more than this many transitions.

    Returns:
        Tuple ``(average_penalties, average_timesteps, average_rewards_per_move)``.
        The first two are per-episode averages; the last is the summed reward
        divided by the number of non-penalty steps. Any value whose divisor
        is zero is returned as ``nan`` instead of raising.
    """
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0

    taxi_agent = TaxiAgent(env, optimizer, state_size, action_size, discount, exploration)

    for _ in range(num_episodes):
        state = np.reshape(env.reset(), [1, 1])

        for _ in range(timesteps_per_episode):
            action = taxi_agent.act(state)
            next_state, reward, terminated, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 1])
            taxi_agent.gather(state, action, reward, next_state, terminated)
            state = next_state

            # Count every step, not only those of episodes that terminate;
            # otherwise a run with no completed episode has no data at all.
            total_timesteps += 1
            total_rewards += reward
            if reward == -10:
                total_penalties += 1

            if terminated:
                # Sync the target network at the end of a completed episode.
                taxi_agent.align_both_model()
                break
            if len(taxi_agent.experience_replay_memory) > batch_size:
                taxi_agent.retrain(batch_size)

    undefined = float("nan")
    moves = total_timesteps - total_penalties
    average_penalties = total_penalties / num_episodes if num_episodes else undefined
    average_timesteps = total_timesteps / num_episodes if num_episodes else undefined
    average_rewards_per_move = total_rewards / moves if moves else undefined

    return average_penalties, average_timesteps, average_rewards_per_move

# Local import so this cell does not depend on another cell's imports.
from itertools import product

# Define hyperparameter grid
hyperparameters = {
    "learning_rate": [0.001, 0.01, 0.1],
    "num_episodes": [10, 50, 100],
    "timesteps_per_episode": [40, 100, 200],
    "batch_size": [16, 32, 64],
    "discount": [0.6, 0.7, 0.8],
    "exploration": [0.1, 0.2, 0.3]
}

# Environment and its parameters
env = gym.make("Taxi-v3").env
state_size = env.observation_space.n
action_size = env.action_space.n

# Perform grid search
best_score = float('-inf')
best_hyperparameters = None

# product() iterates in the same order as six nested loops would
# (learning_rate outermost, exploration innermost).
for lr, episodes, timesteps, batch_size, discount, exploration in product(
    hyperparameters["learning_rate"],
    hyperparameters["num_episodes"],
    hyperparameters["timesteps_per_episode"],
    hyperparameters["batch_size"],
    hyperparameters["discount"],
    hyperparameters["exploration"],
):
    optimizer = Adam(learning_rate=lr)
    # The results get their own names: unpacking into `timesteps` would
    # overwrite the hyperparameter and corrupt the recorded configuration.
    avg_penalties, avg_timesteps, avg_rewards = train_and_evaluate(
        env, optimizer, state_size, action_size, discount, exploration,
        episodes, timesteps, batch_size,
    )
    score = avg_rewards  # You might want to change this to a weighted combination of rewards and penalties based on your specific objectives
    if score > best_score:
        best_score = score
        best_hyperparameters = {
            "learning_rate": lr,
            "num_episodes": episodes,
            "timesteps_per_episode": timesteps,
            "batch_size": batch_size,
            "discount": discount,
            "exploration": exploration
        }

print("Best hyperparameters:", best_hyperparameters)
print("Best score:", best_score)


  deprecation(
  deprecation(




  if not isinstance(terminated, (bool, np.bool8)):


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


ZeroDivisionError: division by zero

In [None]:
import numpy as np
import time
from keras.optimizers import Adam
import gym
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from collections import deque
import random
from tqdm import tqdm  # Importing tqdm for progress bar

# Initialize the environment with the new step API.
# The code below unpacks five values from `step()`:
# (next_state, reward, terminated, truncated, info).
# NOTE(review): `.env` takes the object wrapped by whatever `gym.make` returns;
# presumably this bypasses the outer time-limit wrapper, so an episode would
# only end on `terminated` — confirm against the installed gym version.
env_taxi = gym.make("Taxi-v3", new_step_api=True).env

class TaxiAgent:
    """Q-learning agent with an online network, a target network and a replay buffer.

    The discount (0.6) and exploration probability (0.1) are fixed in
    ``__init__``. The target network only changes when ``align_both_model``
    copies the online weights into it.
    """

    def __init__(self, env, optimizer):
        """Create the replay buffer and both networks.

        Args:
            env: Environment providing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            optimizer: Optimizer passed to ``model.compile`` for both networks.
        """
        # Keep the env so exploration samples from the action space this agent
        # was built for, rather than from a module-level global.
        self._env = env
        self._state_size = env.observation_space.n
        self._action_size = env.action_space.n
        self._optimizer = optimizer
        # Bounded buffer: once 2000 transitions are stored the oldest are dropped.
        self.experience_replay_memory = deque(maxlen=2000)
        self.discount = 0.6
        self.exploration = 0.1
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_both_model()

    def gather(self, state, action, reward, next_state, terminated):
        """Append one ``(state, action, reward, next_state, terminated)`` transition."""
        self.experience_replay_memory.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        """Build and compile the network: embedding, two hidden layers, linear output."""
        model = Sequential()
        model.add(Embedding(self._state_size, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self._action_size, activation='linear'))
        model.compile(loss='mse', optimizer=self._optimizer)
        return model

    def align_both_model(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, state):
        """Return a sampled action with probability ``exploration``, else the greedy one.

        Args:
            state: Network input for the current state (callers in this
                notebook pass the state reshaped to ``[1, 1]``).

        Returns:
            The chosen action.
        """
        if np.random.rand() <= self.exploration:
            return self._env.action_space.sample()
        # verbose=0: predict() runs once per step; without it every call can
        # print a progress bar and flood the cell output.
        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        """Fit the online network on ``batch_size`` transitions sampled from memory.

        For each sampled transition the target for the taken action is the
        reward (terminal) or ``reward + discount * max(target_network(next_state))``;
        the other action values are left at the online network's prediction.

        Args:
            batch_size: Number of transitions to sample; ``random.sample``
                raises ``ValueError`` if the memory holds fewer.
        """
        minibatch = random.sample(self.experience_replay_memory, batch_size)
        for state, action, reward, next_state, terminated in minibatch:
            target = self.q_network.predict(state, verbose=0)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.discount * np.amax(t)
            self.q_network.fit(state, target, epochs=4, verbose=0)

def _run_taxi_episode(env, agent, max_steps, remember):
    """Run one episode of at most ``max_steps`` steps; return (timesteps, penalties, reward sum)."""
    state = np.reshape(env.reset(), [1, 1])
    timesteps = 0
    penalties = 0
    episode_reward = 0

    for _ in range(max_steps):
        action = agent.act(state)
        next_state, reward, terminated, truncated, _info = env.step(action)
        next_state = np.reshape(next_state, [1, 1])
        if remember:
            agent.gather(state, action, reward, next_state, terminated)
        state = next_state

        timesteps += 1
        episode_reward += reward
        if reward == -10:
            penalties += 1

        # Stop on either end-of-episode signal; the step cap covers
        # environments that never raise one.
        if terminated or truncated:
            break

    return timesteps, penalties, episode_reward


def deep_q_learning(env, num_training_episodes=100, num_evaluation_episodes=100, alpha=0.01, gamma=0.6, epsilon=0.1, batch_size=32, timesteps_per_episode=40, epochs=4, agent=None):
    """Train an agent on ``env``, evaluate it, and print the evaluation metrics.

    Training: each episode runs for at most ``timesteps_per_episode`` steps
    and stores its transitions in the agent's replay memory. After an episode,
    if the memory holds more than ``batch_size`` transitions, the agent is
    retrained once on a mini-batch.

    Evaluation: episodes are run with the same step cap and the agent's
    ``act`` method, without storing transitions. Rewards are summed over every
    step of every evaluation episode.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        num_training_episodes: Number of training episodes.
        num_evaluation_episodes: Number of evaluation episodes.
        alpha: Accepted for backward compatibility; not read by this function.
        gamma: Accepted for backward compatibility; not read by this function.
        epsilon: Accepted for backward compatibility; not read by this function.
        batch_size: Replay mini-batch size.
        timesteps_per_episode: Maximum number of steps per episode.
        epochs: Accepted for backward compatibility; not read by this function.
        agent: Object providing ``act``, ``gather``, ``retrain`` and
            ``experience_replay_memory``. Defaults to the module-level
            ``taxi_agent``.

    Returns:
        None. The metrics are printed. A metric whose divisor is zero is
        printed as ``nan``.
    """
    if agent is None:
        # Backward compatibility: existing callers rely on the global agent.
        agent = taxi_agent

    # Training phase
    start_time = time.time()  # Record start time for training
    for e in tqdm(range(num_training_episodes), desc="Training Episodes"):
        _run_taxi_episode(env, agent, timesteps_per_episode, remember=True)

        if len(agent.experience_replay_memory) > batch_size:
            print(f"Retraining on mini-batch at episode {e}")
            agent.retrain(batch_size)
    execution_time = time.time() - start_time  # Time to train

    # Evaluation phase
    total_penalties = 0
    total_timesteps = 0
    total_rewards = 0
    for _ in tqdm(range(num_evaluation_episodes), desc="Evaluation Episodes"):
        timesteps, penalties, episode_reward = _run_taxi_episode(
            env, agent, timesteps_per_episode, remember=False
        )
        total_penalties += penalties
        total_timesteps += timesteps
        total_rewards += episode_reward

    # Calculate averages for evaluation metrics (nan when the divisor is zero)
    undefined = float("nan")
    moves = total_timesteps - total_penalties
    average_penalties = total_penalties / num_evaluation_episodes if num_evaluation_episodes else undefined
    average_timesteps = total_timesteps / num_evaluation_episodes if num_evaluation_episodes else undefined
    average_rewards_per_move = total_rewards / moves if moves else undefined

    # Print metrics
    print("Deep Q-Learning Method:")
    print("Average number of penalties per episode:", average_penalties)
    print("Average number of timesteps per trip:", average_timesteps)
    print("Average rewards per move:", average_rewards_per_move)
    print("Time to train:", execution_time, "seconds")

# Adam optimizer (learning rate 0.01) shared by the agent's networks,
# followed by the agent itself.
optimizer = Adam(learning_rate=0.01)
taxi_agent = TaxiAgent(env_taxi, optimizer)

# Experiment settings.
num_training_episodes, num_evaluation_episodes = 100, 100
batch_size, timesteps_per_episode = 32, 40

# Train, evaluate and print the metrics.
deep_q_learning(
    env_taxi,
    num_training_episodes=num_training_episodes,
    num_evaluation_episodes=num_evaluation_episodes,
    batch_size=batch_size,
    timesteps_per_episode=timesteps_per_episode,
)


Training Episodes:   0%|          | 0/100 [00:00<?, ?it/s]



Training Episodes:   0%|          | 0/100 [08:54<?, ?it/s]


KeyboardInterrupt: 