In [61]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
import gymnasium as gym
import nbformat

In [63]:
# Notebook whose code cells are executed further down to bring its definitions into this kernel.
class_path= "/content/drive/MyDrive/husob/06 ReinForcement Learning/004/Drone_env.ipynb"
# Notebook files are JSON text; pass the encoding explicitly so reading does not depend on
# the platform's locale default.
with open(class_path, 'r', encoding='utf-8') as f:
    nb = nbformat.read(f, as_version=4)

In [64]:
# Execute every code cell of the loaded notebook inside this kernel's global namespace so that
# whatever those cells define becomes available here.
# NOTE(review): exec() runs the notebook's code with full privileges -- only load notebooks you trust.
for cell_index, cell in enumerate(nb.cells):
    if cell.cell_type != 'code':
        continue
    try:
        exec(cell.source, globals())
    except Exception as e:
        # Best effort: a failing cell must not stop the remaining cells from running, but report
        # which cell failed and the exception type so the failure can actually be tracked down.
        print(f"Error executing cell {cell_index}: {type(e).__name__}: {e}")

Error executing cell: not enough values to unpack (expected 5, got 4)


In [65]:
import pickle
def load_q_table_and_env(q_table_path,best_q_table_path, env_path):
    """
    Unpickle three objects from disk and return them as a tuple.

    Args:
        q_table_path: Path of the pickle file holding the Q-table.
        best_q_table_path: Path of the pickle file holding the best Q-table.
        env_path: Path of the pickle file holding the environment.

    Returns:
        (q_table, best_q_table, env), in that order.

    The files are read in argument order; any error raised by open() or pickle.load()
    propagates to the caller. Unpickling can execute arbitrary code, so only pass
    paths to files from a trusted source.
    """
    loaded = []
    for pickle_path in (q_table_path, best_q_table_path, env_path):
        with open(pickle_path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    q_table, best_q_table, env = loaded
    return q_table, best_q_table, env

In [66]:
# Locations of the pickled artefacts on the mounted Drive (working_dir ends with "/", so plain
# string concatenation yields valid paths).
working_dir = "/content/drive/MyDrive/husob/06 ReinForcement Learning/004/"
q_table_name = working_dir + "q_table1.pkl"
best_q_table_name = working_dir + "best_q_table1.pkl"
env_name = working_dir + "env1.pkl"
# Load both Q-tables and the environment object in one call.
q_table,best_q_table ,env = load_q_table_and_env(q_table_name,best_q_table_name ,env_name)

In [67]:
import numpy as np
import time
from tqdm import trange
from IPython.display import clear_output

# --- Q-Learning Agent ---
class QLearningAgent:
    """
    A tabular Q-Learning agent for discrete observation and action spaces.

    Attributes:
        q_table: Array of shape (observation_space_size, action_space_size), initialised to zeros.
        learning_rate: Step size (alpha) of the Q-value update.
        discount_factor: Discount (gamma) applied to the next state's best Q-value.
        epsilon: Probability that choose_action() returns a uniformly random action.
    """
    def __init__(self, observation_space_size, action_space_size, learning_rate=0.1, discount_factor=0.99, epsilon=1.0):
        self.q_table = np.zeros((observation_space_size, action_space_size))
        self.learning_rate = learning_rate  # Alpha
        self.discount_factor = discount_factor  # Gamma
        self.epsilon = epsilon  # Exploration factor

    def choose_action(self, state):
        """
        Chooses an action using an epsilon-greedy policy.

        With probability epsilon a uniformly random action index is returned; otherwise the
        index of the largest Q-value in the row for `state` (np.argmax returns the first such
        index when several are equal). Randomness comes from NumPy's global random state.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.q_table.shape[1])  # Explore
        return np.argmax(self.q_table[state])  # Exploit

    def learn(self, state, action, reward, next_state, terminated=False):
        """
        Apply one Q-learning update to the entry for (state, action):

            Q(s,a) = Q(s,a) + alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Index of the resulting state.
            terminated: Set to True when the transition ended the episode in a terminal state.
                The bootstrap term is then dropped (target = reward), so values stored in the
                row for `next_state` cannot leak into the update. Defaults to False, which
                keeps the original always-bootstrap behaviour.
        """
        old_value = self.q_table[state, action]
        # No future return exists after a terminal transition.
        next_max = 0.0 if terminated else np.max(self.q_table[next_state])
        target = reward + self.discount_factor * next_max
        self.q_table[state, action] = old_value + self.learning_rate * (target - old_value)

In [74]:
from IPython.display import clear_output
import time

# --- Evaluation Function ---
def evaluate_agent(env, agent, goal_position, num_episodes=5):
    """
    Runs the agent in exploitation mode (no exploration) and prints each episode's total reward.

    Args:
        env: Environment providing reset(), step(action), render() and an `unwrapped`
            object that has a set_goal() method.
        agent: Object with an `epsilon` attribute and a choose_action(state) method.
        goal_position: Passed unchanged to env.unwrapped.set_goal() at the start of every episode.
        num_episodes: Number of episodes to run.

    The agent's epsilon is set to 0 for the duration of the call and is always restored to its
    previous value afterwards, including when the environment or the agent raises.
    """
    original_epsilon = agent.epsilon
    agent.epsilon = 0  # Turn off exploration

    try:
        print(f"\nEvaluating agent for {num_episodes} episodes...")

        for episode in range(num_episodes):
            # Reset the environment and take the returned observation as the starting state.
            state, _ = env.reset()

            # Set the goal on the unwrapped environment.
            # NOTE(review): this happens after reset(), so `state` was produced before the goal
            # was applied -- confirm the observation does not depend on the goal.
            raw_env = env.unwrapped
            raw_env.set_goal(goal_position)

            done = False
            total_reward = 0

            print(f"\n--- Episode {episode + 1} ---")
            env.render()  # Render initial state

            while not done:
                action = agent.choose_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                state = next_state  # Update state for the next iteration
                done = terminated or truncated
                if done:
                    # Only the final frame of the episode is rendered; intermediate steps are not.
                    clear_output(wait=True)
                    env.render()
                    time.sleep(0.1)

            print(f"Episode {episode + 1} reward: {total_reward:.2f}")
    finally:
        # Restore the caller's exploration rate even if the episode loop raised.
        agent.epsilon = original_epsilon

In [75]:
# Build an agent sized to the environment's observation/action spaces, then replace its
# zero-initialised Q-table with the loaded `q_table`.
agent = QLearningAgent(observation_space_size=env.observation_space.n, action_space_size=env.action_space.n)
agent.q_table = q_table
goal_position = [9,0,8]

# Run one greedy evaluation episode towards this goal.
evaluate_agent(env, agent,goal_position, num_episodes=1)


Episode 1 reward: 976.00


In [77]:
# Same evaluation as above, but using the loaded `best_q_table` and a different goal.
agent = QLearningAgent(observation_space_size=env.observation_space.n, action_space_size=env.action_space.n)
agent.q_table = best_q_table
goal_position = [9,8,8]

# Run three greedy evaluation episodes towards this goal.
evaluate_agent(env, agent,goal_position, num_episodes=3)


Episode 3 reward: 984.00
