In [None]:
# Version 1: place packages and delivery points so that no grid cell holds more than one element

In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates to pick up packages and deliver them to designated points.

    In this version ``packages`` and ``delivery_points`` are 1-D arrays of
    length ``grid_size`` indexed by grid row: a 1 at index ``r`` marks row ``r``
    as holding a package / delivery point. ``inventory`` has one slot per
    package; a 1 marks a slot that is currently carried by the robot.
    """

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.

        Raises:
        - ValueError: If the delivery points cannot all be placed on rows
          that are not already taken by the package.
        """
        super(WarehouseEnv, self).__init__()
        # One row is occupied by the package, so only grid_size - 1 rows are
        # left for delivery points; more than that could never be placed.
        if num_delivery_points > grid_size - 1:
            raise ValueError(
                "num_delivery_points must be at most grid_size - 1 so that "
                "delivery points never overlap the package row"
            )
        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        # NOTE(review): _get_observation returns a flat tuple of ints whose
        # layout does not match this declared space.
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        self.reset()

    def reset(self):
        """
        Resets the environment to its initial state.

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        self.robot_pos = [0, 0]
        self.packages = np.zeros(self.grid_size)
        self.delivery_points = np.zeros(self.grid_size)
        self.inventory = np.zeros(self.num_packages)

        # Place a single package on a random row.
        self.packages[np.random.randint(self.grid_size)] = 1

        # Sample distinct rows for the delivery points among the rows that
        # do not hold the package, so the two kinds of marker never overlap.
        if self.num_delivery_points > 0:
            free_rows = np.flatnonzero(self.packages == 0)
            chosen_rows = np.random.choice(
                free_rows, size=self.num_delivery_points, replace=False
            )
            self.delivery_points[chosen_rows] = 1

        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Parameters:
        - action (int): Action to be taken by the agent.

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        reward = 0
        done = False
        row, col = self.robot_pos
        new_pos = [row, col]

        if action == 0:  # Move Up
            new_pos[0] -= 1
        elif action == 1:  # Move Down
            new_pos[0] += 1
        elif action == 2:  # Move Left
            new_pos[1] -= 1
        elif action == 3:  # Move Right
            new_pos[1] += 1
        elif action == 4:  # Pick up
            free_slots = np.flatnonzero(self.inventory == 0)
            # A pick-up succeeds only on a package row and with a free slot.
            if self.packages[row] == 1 and free_slots.size > 0:
                self.inventory[free_slots[0]] = 1
                self.packages[row] = 0
                reward += 10
            else:
                reward -= 1
        elif action == 5:  # Drop off
            carried_slots = np.flatnonzero(self.inventory == 1)
            if self.delivery_points[row] == 1 and carried_slots.size > 0:
                self.inventory[carried_slots[0]] = 0
                reward += 100
                # The episode ends only once nothing is carried AND nothing
                # is left on the grid, i.e. every package has been delivered.
                if not self.inventory.any() and not self.packages.any():
                    done = True
            else:
                reward -= 10

        # Moves that would leave the grid are ignored (robot stays in place).
        if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
            self.robot_pos = new_pos  # Update the robot's position

        observation = self._get_observation()
        return observation, reward, done, {}

    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Robot row and column, followed by the package
          row flags, the delivery row flags and the inventory flags, all as ints.
        """
        return tuple(self.robot_pos + [int(x) for x in self.packages] + [int(x) for x in self.delivery_points] + [int(x) for x in self.inventory])

def draw_env(screen, env):
    """
    Renders one frame of the warehouse: grid lines, the robot, then the
    package and delivery markers, and finally flips the display.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    white = (255, 255, 255)
    black = (0, 0, 0)
    screen.fill(white)

    cell_size = 50
    cells = env.grid_size
    extent = cells * cell_size

    robot_img = pygame.image.load("robot.png").convert_alpha()
    package_img = pygame.image.load("package.png").convert_alpha()
    delivery_img = pygame.image.load("delivery.png").convert_alpha()

    # One vertical and one horizontal grid line per cell boundary.
    for offset in range(0, extent, cell_size):
        pygame.draw.line(screen, black, (offset, 0), (offset, extent))
        pygame.draw.line(screen, black, (0, offset), (extent, offset))

    # robot_pos is [row, col]; blit takes (x, y), hence the swap.
    robot_row, robot_col = env.robot_pos[0], env.robot_pos[1]
    screen.blit(robot_img, (robot_col * cell_size, robot_row * cell_size))

    # The markers are per row here, so a flagged row is drawn in every column.
    for row in range(cells):
        show_package = env.packages[row] == 1
        show_delivery = env.delivery_points[row] == 1
        if not (show_package or show_delivery):
            continue
        y = row * cell_size
        for col in range(cells):
            x = col * cell_size
            if show_package:
                screen.blit(package_img, (x, y))
            if show_delivery:
                screen.blit(delivery_img, (x, y))

    pygame.display.flip()

# Q-learning parameters
gamma = 0.9  # discount factor applied to the best next-state value
alpha = 0.1  # learning rate (step size of each Q-value update)


def q_learning(env, num_episodes=1000, max_steps_per_episode=3, epsilon=0.1):
    """
    Performs tabular Q-learning to train the agent.

    The Q-table is a dictionary keyed by the observation tuple, so it works
    with the tuple observations returned by the environment and only stores
    states that were actually visited.

    Parameters:
    - env (WarehouseEnv): Instance of the Warehouse environment.
    - num_episodes (int): Number of episodes for training.
    - max_steps_per_episode (int): Maximum number of steps per episode.
    - epsilon (float): Probability of taking a uniformly random action
      instead of the greedy one (0 disables exploration).

    Returns:
    - q_table (dict): Maps each visited state tuple to an array holding one
      Q-value per action.
    """
    from collections import defaultdict

    n_actions = env.action_space.n
    q_table = defaultdict(lambda: np.zeros(n_actions))
    for episode in range(num_episodes):
        state = tuple(env.reset())
        total_reward = 0
        steps = 0
        done = False
        while not done and steps < max_steps_per_episode:
            # Epsilon-greedy: explore occasionally, otherwise exploit.
            if np.random.random() < epsilon:
                action = int(np.random.randint(n_actions))
            else:
                action = int(np.argmax(q_table[state]))
            next_state, reward, done, _ = env.step(action)
            next_state = tuple(next_state)
            # A terminal state has no future value, so do not bootstrap from it.
            max_next_q_value = 0.0 if done else np.max(q_table[next_state])
            q_value = q_table[state][action]
            q_table[state][action] = q_value + alpha * (reward + gamma * max_next_q_value - q_value)
            total_reward += reward
            state = next_state
            steps += 1
        print(f"Episode: {episode+1}, Total Reward: {total_reward}")
    return q_table

def main():
    """
    Runs the pygame visualisation with a uniformly random policy.

    Each episode resets the environment and steps it until it reports done
    or the step limit is reached. Closing the window stops the whole run,
    and pygame is shut down even if an error occurs.
    """
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=2)

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 1000
    max_steps_per_episode = 100  # Define maximum steps per episode
    quit_requested = False  # Set once the user closes the window
    try:
        for episode in range(num_episodes):
            print(f"Current Episode: {episode+1}")  # Print current episode
            env.reset()
            total_reward = 0
            done = False
            steps = 0  # Initialize step count
            while not done and steps < max_steps_per_episode:  # Check maximum steps condition
                if any(event.type == pygame.QUIT for event in pygame.event.get()):
                    quit_requested = True
                    break  # Leave the episode loop
                draw_env(screen, env)
                action = env.action_space.sample()
                _, reward, done, _ = env.step(action)
                total_reward += reward
                clock.tick(10)  # Cap the loop at 10 frames per second
                steps += 1  # Increment step count
            print(f"Episode: {episode+1}, Total Reward: {total_reward}")
            if quit_requested:
                break  # Stop running further episodes after a window close
    finally:
        pygame.quit()

if __name__ == "__main__":
    main()


In [None]:
# this one is good

In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates to pick up packages and deliver them to designated points.

    State kept on the instance:
    - robot_pos: [row, col] of the robot.
    - packages / delivery_points: (grid_size, grid_size) arrays in which a 1
      marks an occupied cell.
    - inventory: array with one slot per package; a 1 marks a carried package.
    - package_to_delivery: maps each package cell to its nearest delivery
      cell by Manhattan distance (None when there are no delivery points).
    """

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.

        Raises:
        - ValueError: If more delivery points are requested than grid cells exist.
        """
        super(WarehouseEnv, self).__init__()
        # Placement looks for unused cells; with more points than cells it
        # would never finish.
        if num_delivery_points > grid_size * grid_size:
            raise ValueError("num_delivery_points cannot exceed the number of grid cells")
        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        # NOTE(review): _get_observation returns a flat tuple of ints (robot
        # position, full package grid, full delivery grid, inventory), which
        # does not match this declared space.
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        self.reset()

    def reset(self):
        """
        Resets the environment to its initial state.

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        self.robot_pos = [0, 0]
        self.packages = np.zeros((self.grid_size, self.grid_size))
        self.delivery_points = np.zeros((self.grid_size, self.grid_size))
        self.inventory = np.zeros(self.num_packages)
        self.package_to_delivery = {}  # Dictionary to store package to delivery point associations

        # Randomly place delivery points
        for _ in range(self.num_delivery_points):
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            while self.delivery_points[rand_x][rand_y] == 1:  # Ensure unique location for each delivery point
                rand_x = np.random.randint(self.grid_size)
                rand_y = np.random.randint(self.grid_size)
            self.delivery_points[rand_x][rand_y] = 1

        # np.argwhere lists cells in row-major order.
        delivery_cells = [
            (int(r), int(c)) for r, c in np.argwhere(self.delivery_points == 1)
        ]

        # Packages fill the first cells (row-major) that hold no delivery point.
        free_cells = np.argwhere(self.delivery_points == 0)[:max(self.num_packages, 0)]
        for cell in free_cells:
            i, j = int(cell[0]), int(cell[1])
            self.packages[i][j] = 1
            # Associate the package with the nearest delivery point; ties go
            # to the first one in row-major order.
            self.package_to_delivery[(i, j)] = min(
                delivery_cells,
                key=lambda d: abs(i - d[0]) + abs(j - d[1]),
                default=None,
            )

        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Parameters:
        - action (int): Action to be taken by the agent.

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        reward = 0
        done = False
        row, col = self.robot_pos
        new_pos = [row, col]

        if action == 0:  # Move Up
            new_pos[0] -= 1
        elif action == 1:  # Move Down
            new_pos[0] += 1
        elif action == 2:  # Move Left
            new_pos[1] -= 1
        elif action == 3:  # Move Right
            new_pos[1] += 1
        elif action == 4:  # Pick up
            free_slots = np.flatnonzero(self.inventory == 0)
            # A pick-up succeeds only on a package cell and with a free slot.
            if self.packages[row][col] == 1 and free_slots.size > 0:
                self.inventory[free_slots[0]] = 1
                self.packages[row][col] = 0
                reward += 10  # Add a positive reward for picking up a package
            else:
                reward -= 1
        elif action == 5:  # Drop off
            carried_slots = np.flatnonzero(self.inventory == 1)
            if self.delivery_points[row][col] == 1 and carried_slots.size > 0:
                self.inventory[carried_slots[0]] = 0
                reward += 100
                # The episode ends only once nothing is carried AND nothing
                # is left on the grid, i.e. every package has been delivered.
                if not self.inventory.any() and not self.packages.any():
                    done = True
            else:
                reward -= 10

        # Only pick-up/drop-off change the reward above, so every plain move
        # (including one blocked by the grid edge) earns +1 here.
        if reward == 0:
            reward += 1

        # Moves that would leave the grid are ignored (robot stays in place).
        if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
            self.robot_pos = new_pos  # Update the robot's position

        observation = self._get_observation()
        return observation, reward, done, {}

    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Robot row and column, followed by the
          row-major package grid, the row-major delivery grid and the
          inventory flags, all as ints.
        """
        package_locations = [int(x) for row in self.packages for x in row]
        delivery_locations = [int(x) for row in self.delivery_points for x in row]
        inventory = [int(x) for x in self.inventory]
        return tuple(self.robot_pos + package_locations + delivery_locations + inventory)

def draw_env(screen, env):
    """
    Renders one frame of the warehouse: every grid cell with its package
    and delivery markers, then the robot on top, and flips the display.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    white = (255, 255, 255)
    black = (0, 0, 0)
    screen.fill(white)

    cell_size = 50
    cells = env.grid_size

    robot_img = pygame.image.load("robot.png").convert_alpha()
    package_img = pygame.image.load("package.png").convert_alpha()
    delivery_img = pygame.image.load("delivery.png").convert_alpha()

    # Visit the cells in row-major order.
    for row, col in ((r, c) for r in range(cells) for c in range(cells)):
        top_left = (col * cell_size, row * cell_size)
        cell_rect = top_left + (cell_size, cell_size)

        # Filled white background, then a 1-pixel black outline.
        pygame.draw.rect(screen, white, cell_rect)
        pygame.draw.rect(screen, black, cell_rect, 1)

        if env.packages[row][col] == 1:
            screen.blit(package_img, top_left)
        if env.delivery_points[row][col] == 1:
            screen.blit(delivery_img, top_left)

    # robot_pos is [row, col]; blit takes (x, y), hence the swap.
    robot_row, robot_col = env.robot_pos[0], env.robot_pos[1]
    screen.blit(robot_img, (robot_col * cell_size, robot_row * cell_size))

    pygame.display.flip()


def main():
    """
    Runs the pygame visualisation with an epsilon-greedy placeholder policy.

    With probability epsilon a random action is sampled; otherwise the
    placeholder greedy action 0 is used (no Q-values are learned here).
    Closing the window stops the whole run, and pygame is shut down even
    if an error occurs.
    """
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=3)

    # Print the number of delivery points and packages
    print(f"Number of delivery points: {env.num_delivery_points}")
    print(f"Number of packages: {env.num_packages}")

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 1000
    max_steps_per_episode = 1000  # Define maximum steps per episode
    epsilon = 0.1  # Define epsilon value for epsilon-greedy strategy
    quit_requested = False  # Set once the user closes the window
    try:
        for episode in range(num_episodes):
            print(f"Current Episode: {episode+1}")  # Print current episode
            env.reset()
            total_reward = 0
            done = False
            steps = 0  # Initialize step count
            while not done and steps < max_steps_per_episode:  # Check maximum steps condition
                if any(event.type == pygame.QUIT for event in pygame.event.get()):
                    quit_requested = True
                    break  # Leave the episode loop
                draw_env(screen, env)

                # Epsilon-greedy action selection
                if random.random() < epsilon:
                    action = env.action_space.sample()  # Choose a random action
                else:
                    # Choose the greedy action based on Q-values (not implemented in this code)
                    action = 0  # Replace this with your Q-learning or DQN action selection logic

                _, reward, done, _ = env.step(action)
                total_reward += reward
                clock.tick(10)  # Cap the loop at 10 frames per second
                steps += 1  # Increment step count
            print(f"Episode: {episode+1}, Total Reward: {total_reward}")
            if quit_requested:
                break  # Stop running further episodes after a window close
    finally:
        pygame.quit()

if __name__ == "__main__":
    main()


In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates to pick up packages and deliver them to designated points.

    State kept on the instance:
    - robot_pos: [row, col] of the robot.
    - packages / delivery_points: (grid_size, grid_size) arrays in which a 1
      marks an occupied cell.
    - inventory: array with one slot per package; a 1 marks a carried package.
    - package_to_delivery: maps each package cell to its nearest delivery
      cell by Manhattan distance (None when there are no delivery points).
    - q_table: Q-values indexed by [hashed state, action]; created once in
      __init__ so that it persists across episodes.
    """

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.

        Raises:
        - ValueError: If more delivery points are requested than grid cells exist.
        """
        super(WarehouseEnv, self).__init__()
        # Placement looks for unused cells; with more points than cells it
        # would never finish.
        if num_delivery_points > grid_size * grid_size:
            raise ValueError("num_delivery_points cannot exceed the number of grid cells")
        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        # NOTE(review): _get_observation returns a flat tuple of ints (robot
        # position, full package grid, full delivery grid, inventory), which
        # does not match this declared space.
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        # Built here rather than in reset() so that learned values are not
        # wiped at the start of every episode.
        self.q_table = np.zeros((2 ** (self.grid_size * 2 + self.num_packages * 2), self.action_space.n))
        self.reset()

    def reset(self):
        """
        Resets the environment to its initial state (the Q-table is kept).

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        self.robot_pos = [0, 0]
        self.packages = np.zeros((self.grid_size, self.grid_size))
        self.delivery_points = np.zeros((self.grid_size, self.grid_size))
        self.inventory = np.zeros(self.num_packages)
        self.package_to_delivery = {}  # Dictionary to store package to delivery point associations

        # Randomly place delivery points
        for _ in range(self.num_delivery_points):
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            while self.delivery_points[rand_x][rand_y] == 1:  # Ensure unique location for each delivery point
                rand_x = np.random.randint(self.grid_size)
                rand_y = np.random.randint(self.grid_size)
            self.delivery_points[rand_x][rand_y] = 1

        # np.argwhere lists cells in row-major order.
        delivery_cells = [
            (int(r), int(c)) for r, c in np.argwhere(self.delivery_points == 1)
        ]

        # Packages fill the first cells (row-major) that hold no delivery point.
        free_cells = np.argwhere(self.delivery_points == 0)[:max(self.num_packages, 0)]
        for cell in free_cells:
            i, j = int(cell[0]), int(cell[1])
            self.packages[i][j] = 1
            # Associate the package with the nearest delivery point; ties go
            # to the first one in row-major order.
            self.package_to_delivery[(i, j)] = min(
                delivery_cells,
                key=lambda d: abs(i - d[0]) + abs(j - d[1]),
                default=None,
            )

        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Parameters:
        - action (int): Action to be taken by the agent.

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        reward = 0
        done = False
        row, col = self.robot_pos
        new_pos = [row, col]

        if action == 0:  # Move Up
            new_pos[0] -= 1
        elif action == 1:  # Move Down
            new_pos[0] += 1
        elif action == 2:  # Move Left
            new_pos[1] -= 1
        elif action == 3:  # Move Right
            new_pos[1] += 1
        elif action == 4:  # Pick up
            free_slots = np.flatnonzero(self.inventory == 0)
            # A pick-up succeeds only on a package cell and with a free slot.
            if self.packages[row][col] == 1 and free_slots.size > 0:
                self.inventory[free_slots[0]] = 1
                self.packages[row][col] = 0
                reward += 10  # Add a positive reward for picking up a package
            else:
                reward -= 1
        elif action == 5:  # Drop off
            carried_slots = np.flatnonzero(self.inventory == 1)
            if self.delivery_points[row][col] == 1 and carried_slots.size > 0:
                self.inventory[carried_slots[0]] = 0
                reward += 100
                # The episode ends only once nothing is carried AND nothing
                # is left on the grid, i.e. every package has been delivered.
                if not self.inventory.any() and not self.packages.any():
                    done = True
            else:
                reward -= 10

        # Only pick-up/drop-off change the reward above, so every plain move
        # (including one blocked by the grid edge) earns +1 here.
        if reward == 0:
            reward += 1

        # Moves that would leave the grid are ignored (robot stays in place).
        if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
            self.robot_pos = new_pos  # Update the robot's position

        observation = self._get_observation()
        return observation, reward, done, {}

    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Robot row and column, followed by the
          row-major package grid, the row-major delivery grid and the
          inventory flags, all as ints.
        """
        package_locations = [int(x) for row in self.packages for x in row]
        delivery_locations = [int(x) for row in self.delivery_points for x in row]
        inventory = [int(x) for x in self.inventory]
        return tuple(self.robot_pos + package_locations + delivery_locations + inventory)

def hash_state(env, state):
    """
    Hashes the state tuple into a single integer for indexing.

    Uses a polynomial rolling hash reduced modulo the Q-table size at every
    step, so every entry of the state contributes to the result. Because the
    multiplier is odd, for a power-of-two table size two states that differ
    in a single entry (by less than the table size) never share an index.
    Distinct states can still collide in general, since there are more
    possible states than table rows.

    Parameters:
    - env (WarehouseEnv): Instance of the Warehouse environment.
    - state (tuple): State tuple to be hashed.

    Returns:
    - hash_value (int): Hashed value of the state, in [0, number of Q-table rows).
    """
    table_size = env.q_table.shape[0]
    hash_value = 0
    for component in state:
        # Reducing at each step keeps the value small and within Q-table bounds.
        hash_value = (hash_value * 31 + int(component)) % table_size
    return hash_value


def draw_env(screen, env):
    """
    Renders one frame of the warehouse: every grid cell with its package
    and delivery markers, then the robot on top, and flips the display.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    white = (255, 255, 255)
    black = (0, 0, 0)
    screen.fill(white)

    cell_size = 50
    cells = env.grid_size

    robot_img = pygame.image.load("robot.png").convert_alpha()
    package_img = pygame.image.load("package.png").convert_alpha()
    delivery_img = pygame.image.load("delivery.png").convert_alpha()

    # Visit the cells in row-major order.
    for row, col in ((r, c) for r in range(cells) for c in range(cells)):
        top_left = (col * cell_size, row * cell_size)
        cell_rect = top_left + (cell_size, cell_size)

        # Filled white background, then a 1-pixel black outline.
        pygame.draw.rect(screen, white, cell_rect)
        pygame.draw.rect(screen, black, cell_rect, 1)

        if env.packages[row][col] == 1:
            screen.blit(package_img, top_left)
        if env.delivery_points[row][col] == 1:
            screen.blit(delivery_img, top_left)

    # robot_pos is [row, col]; blit takes (x, y), hence the swap.
    robot_row, robot_col = env.robot_pos[0], env.robot_pos[1]
    screen.blit(robot_img, (robot_col * cell_size, robot_row * cell_size))

    pygame.display.flip()


def main():
    """
    Trains the agent with tabular Q-learning while rendering each step.

    Actions are chosen epsilon-greedily from env.q_table, indexed by the
    hashed observation, and the table is updated after every step. Closing
    the window stops the whole run, and pygame is shut down even if an
    error occurs.
    """
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=3)

    # Print the number of delivery points and packages
    print(f"Number of delivery points: {env.num_delivery_points}")
    print(f"Number of packages: {env.num_packages}")

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 500
    max_steps_per_episode = 1000  # Define maximum steps per episode
    epsilon = 0.1  # Define epsilon value for epsilon-greedy strategy
    learning_rate = 0.1  # Define the learning rate
    discount_factor = 0.99  # Define the discount factor
    quit_requested = False  # Set once the user closes the window

    try:
        for episode in range(num_episodes):
            print(f"Current Episode: {episode+1}")  # Print current episode
            env.reset()
            total_reward = 0
            done = False
            steps = 0  # Initialize step count
            while not done and steps < max_steps_per_episode:  # Check maximum steps condition
                if any(event.type == pygame.QUIT for event in pygame.event.get()):
                    quit_requested = True
                    break  # Leave the episode loop
                draw_env(screen, env)

                hashed_state = hash_state(env, env._get_observation())
                if random.random() < epsilon:
                    action = env.action_space.sample()  # Choose a random action
                else:
                    action = np.argmax(env.q_table[hashed_state])

                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                next_hashed_state = hash_state(env, next_state)
                # A terminal state has no future value, so do not bootstrap from it.
                future_value = 0.0 if done else np.max(env.q_table[next_hashed_state])

                # Update Q-table
                td_error = reward + discount_factor * future_value - env.q_table[hashed_state][action]
                env.q_table[hashed_state][action] += learning_rate * td_error

                clock.tick(10)  # Cap the loop at 10 frames per second
                steps += 1  # Increment step count
            print(f"Episode: {episode+1}, Total Reward: {total_reward}")
            if quit_requested:
                break  # Stop running further episodes after a window close
    finally:
        pygame.quit()

if __name__ == "__main__":
    main()


In [None]:
# Variant of the environment with obstacles on the grid

In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates a square grid to pick up
    packages and deliver them to designated points while avoiding obstacles.

    Grid state is kept in three (grid_size, grid_size) arrays of 0/1 flags
    (``packages``, ``delivery_points``, ``obstacles``); ``inventory`` is a 0/1 vector with
    one slot per package the robot can carry. ``q_table`` holds the tabular Q-values used
    by the training loop and is deliberately kept across ``reset()`` calls.
    """

    # Number of obstacle cells placed on the grid at every reset.
    NUM_OBSTACLES = 2

    # (row, column) offsets of the four movement actions: Up, Down, Left, Right.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.

        Raises:
        - ValueError: If the grid is empty or cannot hold the requested delivery points.
        """
        super().__init__()
        if grid_size < 1:
            raise ValueError("grid_size must be at least 1")
        if num_delivery_points > grid_size ** 2:
            raise ValueError("num_delivery_points cannot exceed the number of grid cells")

        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        # NOTE(review): this declared space does not match the flat tuple returned by
        # _get_observation(); it is left unchanged because callers may inspect it.
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        # The Q-table is the agent's learned knowledge, not episode state, so it is
        # allocated once here instead of being zeroed again on every reset().
        self.q_table = np.zeros((2 ** (self.grid_size * 2 + self.num_packages * 2), self.action_space.n))
        self.reset()

    def reset(self):
        """
        Resets the environment to a fresh random layout (the Q-table is preserved).

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        shape = (self.grid_size, self.grid_size)
        self.robot_pos = [0, 0]
        self.packages = np.zeros(shape)
        self.delivery_points = np.zeros(shape)
        self.obstacles = np.zeros(shape)
        self.inventory = np.zeros(self.num_packages)
        self.package_to_delivery = {}  # Maps package cell -> nearest delivery cell

        # Randomly place delivery points on distinct cells. Sampling without replacement
        # always terminates, unlike retrying random cells until a free one is hit.
        chosen = np.random.choice(self.grid_size ** 2, size=self.num_delivery_points, replace=False)
        rows, cols = np.unravel_index(chosen, shape)
        self.delivery_points[rows, cols] = 1
        delivery_cells = [(int(r), int(c)) for r, c in np.argwhere(self.delivery_points == 1)]

        # Assign packages to the first cells (row-major order) that are not delivery points
        # and associate each one with its nearest delivery point (Manhattan distance,
        # first one in row-major order on ties; None if there are no delivery points).
        placed_packages = 0
        for i, j in np.ndindex(*shape):
            if placed_packages >= self.num_packages:
                break
            if self.delivery_points[i, j] == 1:
                continue
            self.packages[i, j] = 1
            self.package_to_delivery[(i, j)] = min(
                delivery_cells,
                key=lambda cell: abs(i - cell[0]) + abs(j - cell[1]),
                default=None,
            )
            placed_packages += 1

        # Randomly place obstacles on cells that hold nothing else. The robot's start cell
        # is excluded so the robot never begins an episode standing on an obstacle, and the
        # count is capped by the number of free cells so placement cannot loop forever.
        occupied = (self.packages == 1) | (self.delivery_points == 1)
        occupied[self.robot_pos[0], self.robot_pos[1]] = True
        free_cells = np.argwhere(~occupied)
        num_obstacles = min(self.NUM_OBSTACLES, len(free_cells))
        if num_obstacles:
            picks = np.random.choice(len(free_cells), size=num_obstacles, replace=False)
            for r, c in free_cells[picks]:
                self.obstacles[r, c] = 1

        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Rewards: +10 for picking up a package, -1 for a failed pick-up, +100 for a
        delivery, -10 for a failed drop-off, -5 for running into an obstacle, and +1 for
        any step that earned no other reward.

        Parameters:
        - action (int): Action to be taken by the agent
          (0 Up, 1 Down, 2 Left, 3 Right, 4 Pick up, 5 Drop off).

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        action = int(action)
        reward = 0
        done = False
        row, col = self.robot_pos

        if action in self._MOVES:
            d_row, d_col = self._MOVES[action]
            new_row, new_col = row + d_row, col + d_col
            # Moves that would leave the grid are ignored (the robot stays in place).
            if 0 <= new_row < self.grid_size and 0 <= new_col < self.grid_size:
                if self.obstacles[new_row, new_col] == 1:
                    reward -= 5  # Negative reward for collision with obstacles
                    done = True  # Episode ends if collision occurs
                else:
                    self.robot_pos = [new_row, new_col]
        elif action == 4:  # Pick up
            # Index the grid cell directly: `tuple in ndarray` is an element-wise
            # comparison, not a cell lookup.
            free_slots = np.flatnonzero(self.inventory == 0)
            if self.packages[row, col] == 1 and free_slots.size:
                self.inventory[free_slots[0]] = 1
                self.packages[row, col] = 0
                reward += 10  # Add a positive reward for picking up a package
            else:
                reward -= 1
        elif action == 5:  # Drop off
            carried = np.flatnonzero(self.inventory == 1)
            if self.delivery_points[row, col] == 1 and carried.size:
                self.inventory[carried[0]] = 0
                reward += 100
                # The episode is complete only when nothing is left on the grid
                # and nothing is still being carried.
                if not self.packages.any() and not self.inventory.any():
                    done = True
            else:
                reward -= 10

        # Small positive reward for any step that earned no other reward
        if reward == 0:
            reward += 1

        observation = self._get_observation()
        return observation, reward, done, {}

    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Robot row and column, followed by the flattened package,
          delivery-point and obstacle grids (row-major) and the inventory, all as ints.
        """
        grids = (self.packages, self.delivery_points, self.obstacles)
        grid_flags = [int(value) for grid in grids for value in grid.ravel()]
        inventory = [int(value) for value in self.inventory]
        return tuple(list(self.robot_pos) + grid_flags + inventory)

def hash_state(env, state):
    """
    Hashes the state tuple into a single integer for indexing the Q-table.

    A multiplicative rolling hash (base 31) is used rather than positional weights
    ``2 ** i``: when the table size is a power of two (as WarehouseEnv allocates it),
    every weight ``2 ** i`` with ``i >= log2(size)`` is a multiple of the modulus, so all
    state entries past that index would have no influence on the bucket. With an odd base
    every entry of the state affects the result.

    Parameters:
    - env (WarehouseEnv): Instance of the Warehouse environment (only ``env.q_table`` is read).
    - state (tuple): State tuple of integers to be hashed.

    Returns:
    - hash_value (int): Hashed value of the state, in ``range(env.q_table.shape[0])``.
    """
    table_size = env.q_table.shape[0]
    hash_value = 0
    for s in state:
        # Reduce at every step so the intermediate value stays small.
        hash_value = (hash_value * 31 + int(s)) % table_size
    return hash_value  # Always within Q-table bounds


# Image file for each kind of sprite drawn by draw_env().
_SPRITE_FILES = {
    "robot": "robot.png",
    "package": "package.png",
    "delivery": "delivery.png",
    "obstacle": "warning.png",
}

# Sprites loaded for the most recently used display surface.
_sprite_cache = {"screen": None, "sprites": None}


def _load_sprites(screen):
    """Returns the sprite surfaces, reading them from disk only once per display surface."""
    if _sprite_cache["screen"] is not screen or _sprite_cache["sprites"] is None:
        # Build the full set first so a failed load never leaves a partial cache behind.
        _sprite_cache["sprites"] = {
            name: pygame.image.load(filename).convert_alpha()
            for name, filename in _SPRITE_FILES.items()
        }
        _sprite_cache["screen"] = screen
    return _sprite_cache["sprites"]


def draw_env(screen, env):
    """
    Draws the current state of the environment on the screen.

    Sprites are cached between calls, so the image files are not re-read on every frame.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    sprites = _load_sprites(screen)
    cell_size = 50
    grid_size = env.grid_size

    # The whole surface is cleared to white here, so cells only need their border drawn.
    screen.fill((255, 255, 255))

    for i in range(grid_size):
        for j in range(grid_size):
            # Grid row i maps to screen y, grid column j maps to screen x.
            top_left = (j * cell_size, i * cell_size)
            pygame.draw.rect(screen, (0, 0, 0), (*top_left, cell_size, cell_size), 1)

            if env.packages[i][j] == 1:
                screen.blit(sprites["package"], top_left)
            if env.delivery_points[i][j] == 1:
                screen.blit(sprites["delivery"], top_left)
            if env.obstacles[i][j] == 1:  # Draw obstacle if present
                screen.blit(sprites["obstacle"], top_left)

    # The robot is drawn last so it appears on top of whatever shares its cell.
    robot_pos = env.robot_pos
    screen.blit(sprites["robot"], (robot_pos[1] * cell_size, robot_pos[0] * cell_size))

    pygame.display.flip()


def main():
    """
    Trains the warehouse robot with tabular Q-learning while rendering each step in pygame.

    Actions are chosen epsilon-greedily from ``env.q_table``, which is updated after every
    step. Closing the pygame window stops training altogether, and pygame is shut down
    even if an error interrupts the run.
    """
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=3)

    # Print the number of delivery points and packages
    print(f"Number of delivery points: {env.num_delivery_points}")
    print(f"Number of packages: {env.num_packages}")

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 500
    max_steps_per_episode = 1000  # Define maximum steps per episode
    epsilon = 0.1  # Define epsilon value for epsilon-greedy strategy
    learning_rate = 0.1  # Define the learning rate
    discount_factor = 0.99  # Define the discount factor

    quit_requested = False  # Set once the user closes the window
    try:
        for episode in range(num_episodes):
            print(f"Current Episode: {episode+1}")  # Print current episode
            env.reset()
            total_reward = 0
            done = False
            steps = 0  # Initialize step count
            while not done and steps < max_steps_per_episode:  # Check maximum steps condition
                if any(event.type == pygame.QUIT for event in pygame.event.get()):
                    quit_requested = True
                    break
                draw_env(screen, env)

                hashed_state = hash_state(env, env._get_observation())
                if random.random() < epsilon:
                    action = env.action_space.sample()  # Choose a random action
                else:
                    action = np.argmax(env.q_table[hashed_state])

                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # A terminal state has no future return, so do not bootstrap from it.
                if done:
                    future_value = 0.0
                else:
                    future_value = np.max(env.q_table[hash_state(env, next_state)])

                # Update Q-table
                env.q_table[hashed_state][action] += learning_rate * (
                    reward + discount_factor * future_value - env.q_table[hashed_state][action])

                clock.tick(10)  # Cap rendering/training at 10 steps per second
                steps += 1  # Increment step count
            print(f"Episode: {episode+1}, Total Reward: {total_reward}")

            # Leave the episode loop too; otherwise training would simply carry on
            # with the next episode after the window was closed.
            if quit_requested:
                break
    finally:
        pygame.quit()

# Entry point: start training only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Number of delivery points: 3
Number of packages: 3
Current Episode: 1


  if tuple(self.robot_pos) in self.packages:
  if tuple(self.robot_pos) in self.delivery_points:


Episode: 1, Total Reward: 241
Current Episode: 2
