In [None]:
# avoid more than 1 element per cell

In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates to pick up packages and deliver them to designated points.
    """

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.
        """
        super(WarehouseEnv, self).__init__()
        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        self.reset()

    def reset(self):
        """
        Resets the environment to its initial state.

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        self.robot_pos = [0, 0]
        self.packages = np.zeros(self.grid_size)
        self.delivery_points = np.zeros(self.grid_size)
        self.inventory = np.zeros(self.num_packages)

        # Randomly place at least one package without overlapping with delivery points
        placed_package = False
        while not placed_package:
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            while self.delivery_points[rand_x] == 1:
                rand_x = np.random.randint(self.grid_size)
                rand_y = np.random.randint(self.grid_size)
            self.packages[rand_x] = 1
            placed_package = True

        # Ensure delivery points are placed without overlapping with packages
        placed_delivery = 0
        while placed_delivery < self.num_delivery_points:
            rand_x = np.random.randint(self.grid_size)
            if self.delivery_points[rand_x] == 0 and self.packages[rand_x] == 0:
                self.delivery_points[rand_x] = 1
                placed_delivery += 1

        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Parameters:
        - action (int): Action to be taken by the agent.

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        reward = 0
        done = False
        new_pos = self.robot_pos[:]

        if action == 0:  # Move Up
            new_pos[0] -= 1
        elif action == 1:  # Move Down
            new_pos[0] += 1
        elif action == 2:  # Move Left
            new_pos[1] -= 1
        elif action == 3:  # Move Right
            new_pos[1] += 1
        elif action == 4:  # Pick up
            if tuple(self.robot_pos) in self.packages:
                package_index = list(self.robot_pos).index(1)
                self.inventory[package_index] = 1
                self.packages[package_index] = 0
                reward += 10
            else:
                reward -= 1
        elif action == 5:  # Drop off
            if tuple(self.robot_pos) in self.delivery_points:
                if 1 in self.inventory:
                    package_index = list(self.inventory).index(1)
                    self.inventory[package_index] = 0
                    reward += 100
                    # Check if all packages have been delivered
                    if np.sum(self.inventory) == 0:
                        done = True  # Episode ends if all packages are delivered
                else:
                    reward -= 10
            else:
                reward -= 10

        if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
            self.robot_pos = new_pos  # Update the robot's position

        observation = self._get_observation()
        return observation, reward, done, {}

    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Current observation of the environment.
        """
        return tuple(self.robot_pos + [int(x) for x in self.packages] + [int(x) for x in self.delivery_points] + [int(x) for x in self.inventory])

def draw_env(screen, env):
    """
    Draws the current state of the environment on the screen.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    screen.fill((255, 255, 255))
    cell_size = 50
    grid_size = env.grid_size
    robot_img = pygame.image.load("robot.png").convert_alpha()
    package_img = pygame.image.load("package.png").convert_alpha()
    delivery_img = pygame.image.load("delivery.png").convert_alpha()

    for i in range(grid_size):
        pygame.draw.line(screen, (0, 0, 0), (i * cell_size, 0), (i * cell_size, grid_size * cell_size))
        pygame.draw.line(screen, (0, 0, 0), (0, i * cell_size), (grid_size * cell_size, i * cell_size))

    robot_pos = env.robot_pos
    screen.blit(robot_img, (robot_pos[1] * cell_size, robot_pos[0] * cell_size))

    for i in range(grid_size):
        for j in range(grid_size):
            if env.packages[i] == 1:
                screen.blit(package_img, (j * cell_size, i * cell_size))
            if env.delivery_points[i] == 1:
                screen.blit(delivery_img, (j * cell_size, i * cell_size))

    pygame.display.flip()

# Q-learning parameters
gamma = 0.9
alpha = 0.1

def q_learning(env, num_episodes=1000, max_steps_per_episode=3):
    """
    Performs Q-learning to train the agent.

    Parameters:
    - env (WarehouseEnv): Instance of the Warehouse environment.
    - num_episodes (int): Number of episodes for training.
    - max_steps_per_episode (int): Maximum number of steps per episode.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        steps = 0
        done = False
        while not done and steps < max_steps_per_episode:
            action = np.argmax(q_table[state])
            next_state, reward, done, _ = env.step(action)
            q_value = q_table[state, action]
            max_next_q_value = np.max(q_table[next_state])
            new_q_value = q_value + alpha * (reward + gamma * max_next_q_value - q_value)
            q_table[state, action] = new_q_value
            total_reward += reward
            state = next_state
            steps += 1
        print(f"Episode: {episode+1}, Total Reward: {total_reward}")

def main():
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=2)

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 1000
    max_steps_per_episode = 100  # Define maximum steps per episode
    for episode in range(num_episodes):
        print(f"Current Episode: {episode+1}")  # Print current episode
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0  # Initialize step count
        exit_flag = False  # Flag to exit event handling loop
        while not done and steps < max_steps_per_episode:  # Check maximum steps condition
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    exit_flag = True  # Set the exit flag to True
                    break  # Exit the event handling loop
            if exit_flag:
                break  # Exit the episode loop if the exit flag is True
            draw_env(screen, env)
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            clock.tick(10)
            steps += 1  # Increment step count
        print(f"Episode: {episode+1}, Total Reward: {total_reward}")

    pygame.quit()

if __name__ == "__main__":
    main()


# TASK IV

In [None]:
# α=0.5, γ= 0.5 for 2 hyper parameters. In addition, about the policy, εvalue starts at 0.9,
#and it is reduced by multiplied by 0.9999 until εvalue =0.5; then it decreases again by multiplied by 0.999 until εvalue = 0.01

In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates to pick up packages and deliver them to designated points.
    """

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.
        """
        super(WarehouseEnv, self).__init__()
        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        self.reset()

    def reset(self):
        """
        Resets the environment to its initial state.

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        self.robot_pos = [0, 0]
        self.packages = np.zeros((self.grid_size, self.grid_size))
        self.delivery_points = np.zeros((self.grid_size, self.grid_size))
        self.obstacles = np.zeros((self.grid_size, self.grid_size))
        self.inventory = np.zeros(self.num_packages)
        self.package_to_delivery = {}  # Dictionary to store package to delivery point associations

        # Randomly place delivery points
        for _ in range(self.num_delivery_points):
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            while self.delivery_points[rand_x][rand_y] == 1:  # Ensure unique location for each delivery point
                rand_x = np.random.randint(self.grid_size)
                rand_y = np.random.randint(self.grid_size)
            self.delivery_points[rand_x][rand_y] = 1

        # Assign packages to remaining empty locations
        placed_packages = 0
        for i in range(self.grid_size):
            for j in range(self.grid_size):
                if placed_packages < self.num_packages and self.delivery_points[i][j] == 0:
                    self.packages[i][j] = 1
                    # Find the nearest delivery point and associate the package with it
                    dist_to_delivery = np.inf
                    nearest_delivery = None
                    for di in range(self.grid_size):
                        for dj in range(self.grid_size):
                            if self.delivery_points[di][dj] == 1:
                                dist = abs(i - di) + abs(j - dj)
                                if dist < dist_to_delivery:
                                    dist_to_delivery = dist
                                    nearest_delivery = (di, dj)
                    self.package_to_delivery[(i, j)] = nearest_delivery
                    placed_packages += 1

        # Randomly place obstacles
        num_obstacles = 0
        while num_obstacles < 2:  # Add 2 obstacles per episode
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            if (
                self.packages[rand_x][rand_y] == 0
                and self.delivery_points[rand_x][rand_y] == 0
                and self.obstacles[rand_x][rand_y] == 0
            ):
                self.obstacles[rand_x][rand_y] = 1
                num_obstacles += 1

        self.q_table = np.zeros((2 ** (self.grid_size * 2 + self.num_packages * 2), self.action_space.n))
        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Parameters:
        - action (int): Action to be taken by the agent.

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        reward = 0
        done = False
        new_pos = self.robot_pos[:]

        if action == 0:  # Move Up
            new_pos[0] -= 1
        elif action == 1:  # Move Down
            new_pos[0] += 1
        elif action == 2:  # Move Left
            new_pos[1] -= 1
        elif action == 3:  # Move Right
            new_pos[1] += 1
        elif action == 4:  # Pick up
            if tuple(self.robot_pos) in self.packages:
                package_index = list(self.robot_pos).index(1)
                self.inventory[package_index] = 1
                self.packages[self.robot_pos[0]][self.robot_pos[1]] = 0
                reward += 10  # Add a positive reward for picking up a package
            else:
                reward -= 1
        elif action == 5:  # Drop off
            if tuple(self.robot_pos) in self.delivery_points:
                if 1 in self.inventory:
                    package_index = list(self.inventory).index(1)
                    self.inventory[package_index] = 0
                    reward += 100
                    # Check if all packages have been delivered
                    if np.sum(self.inventory) == 0:
                        done = True  # Episode ends if all packages are delivered
                else:
                    reward -= 10
            else:
                reward -= 10

        # Check if new position is within grid boundaries
        if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
            # Check for collision with obstacles
            if self.obstacles[new_pos[0]][new_pos[1]] == 1:
                reward -= 5  # Negative reward for collision with obstacles
                done = True  # Episode ends if collision occurs
            else:
                # Update robot's position
                self.robot_pos = new_pos

        # Positive reward for reaching intermediate steps (packages or delivery points)
        if reward == 0:
            reward += 1

        observation = self._get_observation()
        return observation, reward, done, {}


    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Current observation of the environment.
        """
        package_locations = [int(x) for row in self.packages for x in row]
        delivery_locations = [int(x) for row in self.delivery_points for x in row]
        obstacles = [int(x) for row in self.obstacles for x in row]
        inventory = [int(x) for x in self.inventory]
        return tuple(self.robot_pos + package_locations + delivery_locations + obstacles + inventory)

def hash_state(env, state):
    """
    Hashes the state tuple into a single integer for indexing.

    Parameters:
    - env (WarehouseEnv): Instance of the Warehouse environment.
    - state (tuple): State tuple to be hashed.

    Returns:
    - hash_value (int): Hashed value of the state.
    """
    hash_value = 0
    for i, s in enumerate(state):
        hash_value += s * (2 ** i)
    return hash_value % env.q_table.shape[0]  # Ensure the hash value is within Q-table bounds


def draw_env(screen, env):
    """
    Draws the current state of the environment on the screen.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    screen.fill((255, 255, 255))
    cell_size = 50
    grid_size = env.grid_size
    robot_img = pygame.image.load("robot.png").convert_alpha()
    package_img = pygame.image.load("package.png").convert_alpha()
    delivery_img = pygame.image.load("delivery.png").convert_alpha()
    obstacle_img = pygame.image.load("warning.png").convert_alpha()  # Load obstacle image

    for i in range(grid_size):
        for j in range(grid_size):
            pygame.draw.rect(screen, (255, 255, 255), (j * cell_size, i * cell_size, cell_size, cell_size))
            pygame.draw.rect(screen, (0, 0, 0), (j * cell_size, i * cell_size, cell_size, cell_size), 1)

            if env.packages[i][j] == 1:
                screen.blit(package_img, (j * cell_size, i * cell_size))
            if env.delivery_points[i][j] == 1:
                screen.blit(delivery_img, (j * cell_size, i * cell_size))
            if env.obstacles[i][j] == 1:  # Draw obstacle if present
                screen.blit(obstacle_img, (j * cell_size, i * cell_size))

    robot_pos = env.robot_pos
    screen.blit(robot_img, (robot_pos[1] * cell_size, robot_pos[0] * cell_size))

    pygame.display.flip()


def main():
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=3)

    # Print the number of delivery points and packages
    print(f"Number of delivery points: {env.num_delivery_points}")
    print(f"Number of packages: {env.num_packages}")

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 1000
    max_steps_per_episode = 1000  # Define maximum steps per episode
    alpha = 0.5  # Learning rate
    gamma = 0.5  # Discount factor
    epsilon = 0.9  # Initial epsilon value
    min_epsilon = 0.01  # Minimum epsilon value
    decay_rate = 0.9999  # Epsilon decay rate

    all_rewards = []

    for episode in range(num_episodes):
        print(f"Current Episode: {episode + 1}")  # Print current episode
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0  # Initialize step count
        exit_flag = False  # Flag to exit event handling loop

        # Update epsilon based on decay policy
        if epsilon > min_epsilon:
            epsilon *= decay_rate
            epsilon = max(epsilon, min_epsilon)

        while not done and steps < max_steps_per_episode:  # Check maximum steps condition
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    exit_flag = True  # Set the exit flag to True
                    break  # Exit the event handling loop
            if exit_flag:
                break  # Exit the episode loop if the exit flag is True
            draw_env(screen, env)

            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()  # Choose a random action
            else:
                # Choose the greedy action based on Q-values
                hashed_state = hash_state(env, env._get_observation())
                action = np.argmax(env.q_table[hashed_state])

            next_state, reward, done, _ = env.step(action)

            # Update Q-table using Q-learning equation with specified alpha and gamma
            hashed_state = hash_state(env, state)
            next_hashed_state = hash_state(env, next_state)
            best_next_action = np.argmax(env.q_table[next_hashed_state])
            env.q_table[hashed_state][action] += alpha * (
                reward + gamma * env.q_table[next_hashed_state][best_next_action] - env.q_table[hashed_state][action]
            )

            total_reward += reward
            clock.tick(10)
            steps += 1  # Increment step count
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}")
        all_rewards.append(total_reward)

    pygame.quit()

    # Print the Q matrix after training
    print("Q matrix after training:")
    print(env.q_table)

    # Calculate average rewards for each episode
    avg_rewards = np.mean(all_rewards, axis=0)

    # Plotting
    episodes = np.arange(1, num_episodes + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(episodes, avg_rewards)
    plt.title("Average Cumulative Reward per Episode")
    plt.xlabel("Episode Number")
    plt.ylabel("Average Cumulative Reward")
    plt.show()


if __name__ == "__main__":
    main()



In [None]:
# TASK V

In [None]:
# with obstacles

# TASK 1 TO 6

# IV

In [None]:
import numpy as np
import random
import gym
from gym import spaces
import pygame
import seaborn as sns
import matplotlib.pyplot as plt

class WarehouseEnv(gym.Env):
    """
    Environment representing a warehouse where a robot navigates to pick up packages and deliver them to designated points.
    """

    def __init__(self, grid_size, num_packages, num_delivery_points):
        """
        Initializes the Warehouse environment.

        Parameters:
        - grid_size (int): Size of the grid layout.
        - num_packages (int): Number of packages in the warehouse.
        - num_delivery_points (int): Number of delivery points in the warehouse.
        """
        super(WarehouseEnv, self).__init__()
        self.grid_size = grid_size
        self.num_packages = num_packages
        self.num_delivery_points = num_delivery_points
        self.action_space = spaces.Discrete(6)  # Up, Down, Left, Right, Pick up, Drop off
        self.observation_space = spaces.Tuple((
            spaces.Discrete(grid_size),  # Robot X position
            spaces.Discrete(grid_size),  # Robot Y position
            spaces.MultiBinary(num_packages),  # Package locations
            spaces.MultiBinary(num_delivery_points),  # Delivery locations
            spaces.MultiBinary(num_packages)  # Inventory
        ))
        self.reset()

    def reset(self):
        """
        Resets the environment to its initial state.

        Returns:
        - observation (tuple): Initial observation of the environment.
        """
        self.robot_pos = [0, 0]
        self.packages = np.zeros((self.grid_size, self.grid_size))
        self.delivery_points = np.zeros((self.grid_size, self.grid_size))
        self.obstacles = np.zeros((self.grid_size, self.grid_size))
        self.inventory = np.zeros(self.num_packages)
        self.package_to_delivery = {}  # Dictionary to store package to delivery point associations

        # Randomly place delivery points
        for _ in range(self.num_delivery_points):
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            while self.delivery_points[rand_x][rand_y] == 1:  # Ensure unique location for each delivery point
                rand_x = np.random.randint(self.grid_size)
                rand_y = np.random.randint(self.grid_size)
            self.delivery_points[rand_x][rand_y] = 1

        # Assign packages to remaining empty locations
        placed_packages = 0
        for i in range(self.grid_size):
            for j in range(self.grid_size):
                if placed_packages < self.num_packages and self.delivery_points[i][j] == 0:
                    self.packages[i][j] = 1
                    # Find the nearest delivery point and associate the package with it
                    dist_to_delivery = np.inf
                    nearest_delivery = None
                    for di in range(self.grid_size):
                        for dj in range(self.grid_size):
                            if self.delivery_points[di][dj] == 1:
                                dist = abs(i - di) + abs(j - dj)
                                if dist < dist_to_delivery:
                                    dist_to_delivery = dist
                                    nearest_delivery = (di, dj)
                    self.package_to_delivery[(i, j)] = nearest_delivery
                    placed_packages += 1

        # Randomly place obstacles
        num_obstacles = 0
        while num_obstacles < 2:  # Add 2 obstacles per episode
            rand_x = np.random.randint(self.grid_size)
            rand_y = np.random.randint(self.grid_size)
            if (
                self.packages[rand_x][rand_y] == 0
                and self.delivery_points[rand_x][rand_y] == 0
                and self.obstacles[rand_x][rand_y] == 0
            ):
                self.obstacles[rand_x][rand_y] = 1
                num_obstacles += 1

        self.q_table = np.zeros((2 ** (self.grid_size * 2 + self.num_packages * 2), self.action_space.n))
        return self._get_observation()

    def step(self, action):
        """
        Executes one time step in the environment.

        Parameters:
        - action (int): Action to be taken by the agent.

        Returns:
        - observation (tuple): New observation of the environment.
        - reward (float): Reward received from the environment.
        - done (bool): Whether the episode is done or not.
        - info (dict): Additional information about the environment.
        """
        reward = 0
        done = False
        new_pos = self.robot_pos[:]

        if action == 0:  # Move Up
            new_pos[0] -= 1
        elif action == 1:  # Move Down
            new_pos[0] += 1
        elif action == 2:  # Move Left
            new_pos[1] -= 1
        elif action == 3:  # Move Right
            new_pos[1] += 1
        elif action == 4:  # Pick up
            if tuple(self.robot_pos) in self.packages:
                package_index = list(self.robot_pos).index(1)
                self.inventory[package_index] = 1
                self.packages[self.robot_pos[0]][self.robot_pos[1]] = 0
                reward += 10  # Add a positive reward for picking up a package
            else:
                reward -= 1
        elif action == 5:  # Drop off
            if tuple(self.robot_pos) in self.delivery_points:
                if 1 in self.inventory:
                    package_index = list(self.inventory).index(1)
                    self.inventory[package_index] = 0
                    reward += 100
                    # Check if all packages have been delivered
                    if np.sum(self.inventory) == 0:
                        done = True  # Episode ends if all packages are delivered
                else:
                    reward -= 10
            else:
                reward -= 10

        # Check if new position is within grid boundaries
        if 0 <= new_pos[0] < self.grid_size and 0 <= new_pos[1] < self.grid_size:
            # Check for collision with obstacles
            if self.obstacles[new_pos[0]][new_pos[1]] == 1:
                reward -= 5  # Negative reward for collision with obstacles
                done = True  # Episode ends if collision occurs
            else:
                # Update robot's position
                self.robot_pos = new_pos

        # Positive reward for reaching intermediate steps (packages or delivery points)
        if reward == 0:
            reward += 1

        observation = self._get_observation()
        return observation, reward, done, {}


    def _get_observation(self):
        """
        Returns the current observation of the environment.

        Returns:
        - observation (tuple): Current observation of the environment.
        """
        package_locations = [int(x) for row in self.packages for x in row]
        delivery_locations = [int(x) for row in self.delivery_points for x in row]
        obstacles = [int(x) for row in self.obstacles for x in row]
        inventory = [int(x) for x in self.inventory]
        return tuple(self.robot_pos + package_locations + delivery_locations + obstacles + inventory)

def hash_state(env, state):
    """
    Hashes the state tuple into a single integer for indexing.

    Parameters:
    - env (WarehouseEnv): Instance of the Warehouse environment.
    - state (tuple): State tuple to be hashed.

    Returns:
    - hash_value (int): Hashed value of the state.
    """
    hash_value = 0
    for i, s in enumerate(state):
        hash_value += s * (2 ** i)
    return hash_value % env.q_table.shape[0]  # Ensure the hash value is within Q-table bounds


def draw_env(screen, env):
    """
    Draws the current state of the environment on the screen.

    Parameters:
    - screen (pygame.Surface): Pygame surface representing the screen.
    - env (WarehouseEnv): Instance of the Warehouse environment.
    """
    screen.fill((255, 255, 255))
    cell_size = 50
    grid_size = env.grid_size
    robot_img = pygame.image.load("robot.png").convert_alpha()
    package_img = pygame.image.load("package.png").convert_alpha()
    delivery_img = pygame.image.load("delivery.png").convert_alpha()
    obstacle_img = pygame.image.load("warning.png").convert_alpha()  # Load obstacle image

    for i in range(grid_size):
        for j in range(grid_size):
            pygame.draw.rect(screen, (255, 255, 255), (j * cell_size, i * cell_size, cell_size, cell_size))
            pygame.draw.rect(screen, (0, 0, 0), (j * cell_size, i * cell_size, cell_size, cell_size), 1)

            if env.packages[i][j] == 1:
                screen.blit(package_img, (j * cell_size, i * cell_size))
            if env.delivery_points[i][j] == 1:
                screen.blit(delivery_img, (j * cell_size, i * cell_size))
            if env.obstacles[i][j] == 1:  # Draw obstacle if present
                screen.blit(obstacle_img, (j * cell_size, i * cell_size))

    robot_pos = env.robot_pos
    screen.blit(robot_img, (robot_pos[1] * cell_size, robot_pos[0] * cell_size))

    pygame.display.flip()
    
def main():
    env = WarehouseEnv(grid_size=5, num_packages=3, num_delivery_points=3)

    # Print the number of delivery points and packages
    print(f"Number of delivery points: {env.num_delivery_points}")
    print(f"Number of packages: {env.num_packages}")

    pygame.init()
    screen = pygame.display.set_mode((env.grid_size * 50, env.grid_size * 50))
    clock = pygame.time.Clock()

    num_episodes = 1000
    max_steps_per_episode = 1000  # Define maximum steps per episode
    alpha = 0.5  # Learning rate
    gamma = 0.5  # Discount factor
    epsilon = 0.9  # Initial epsilon value
    min_epsilon = 0.01  # Minimum epsilon value
    decay_rate = 0.9999  # Epsilon decay rate

    all_rewards = []

    for episode in range(num_episodes):
        print(f"Current Episode: {episode + 1}")  # Print current episode
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0  # Initialize step count

        while not done and steps < max_steps_per_episode:  # Check maximum steps condition
            draw_env(screen, env)

            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()  # Choose a random action
            else:
                # Choose the greedy action based on Q-values
                hashed_state = hash_state(env, env._get_observation())
                action = np.argmax(env.q_table[hashed_state])

            next_state, reward, done, _ = env.step(action)

            # Update Q-table using Q-learning equation with specified alpha and gamma
            hashed_state = hash_state(env, state)
            next_hashed_state = hash_state(env, next_state)
            best_next_action = np.argmax(env.q_table[next_hashed_state])
            env.q_table[hashed_state][action] += alpha * (
                reward + gamma * env.q_table[next_hashed_state][best_next_action] - env.q_table[hashed_state][action]
            )

            total_reward += reward
            clock.tick(10)
            steps += 1  # Increment step count
            
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    return
            
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}")
        all_rewards.append(total_reward)

    pygame.quit()

if __name__ == "__main__":
    main()

In [None]:
import matplotlib.pyplot as plt

# Provided cumulative rewards data
cumulative_rewards = [
    -11, -1, -68, -90, -29, -12, -76, -83, -17, -57, -156, -23, -3, -20, -53, -5, -73, -6, -103, -53,
    -117, -79, 0, -272, -58, -10, -137, -14, -12, -2, -13, -21, -15, -76, -74, -273, -35, 0, -6, -42,
    -5, -268, -11, -12, -23, -24, -50, -5, -13, -243, -51, -3, -85, -14, -59, -61, -227, -316, -69,
    -51, -12, -43, -76, -12, -130, -127, 1, -44, -19, -8, -70, -119, -36, -16, -1, -29, -180, -25,
    -602, -27, -64, -36, -50, -108, -77, -38, -3, -165, -99, -65, -50, -13, -32, -22, -338, -20,
    -24, -57, -5, 3, -81, -95, -9, -48, -114, -185, -101, -114, -36, -69, -21, -39, -20, -11, -117,
    -5, -13, -96, -75, 8, -61, -53, -16, -62, -19, -280, -49, -64, -227, -192, 1, -98, -13, -31,
    -81, -226, -11, -98, -226, -79, -108, 17, -5, 15, -94, -2, -86, -9, -111, -382,-52, -26, -22, 3, -5, -47, -120, -183, -27, -38, -1, -623, -80, -56, -183, -70, -2, -94, -53,
    -6, -26, -77, -2, -367, -154, -35, -177, -73, -29, -41, -102, -25, -62, -20, -113, -42, -159,
    -27, -60, -136, -11, 3, -5, -26, -186, -36, -3, -101, -157, -18, -146, -32, -17, -17, -48,
    -122, -4, -80, -158, -19, -39, -10, -40, 9, -4, -160, -8, -92, -11, -65, -305, -868, -17,
    -11, 1, -156, -165, -12, -7, -34, -2, -42, -15, -74, -29, -104, -30, 1, -12, -22, -6, -11,
    -88, -88, -2, -35, -34, -3, -89, -13, -19, -67, -38, -8, -37, -392, -349, -90, -112, -36,
    -9, -317, -63, 2, -5, 12, -28, -367, -211, -63, -28, -14, -3, -4, -240, -38, -10, -25, -153,
    -15, -31, -162, -30, -8, -75, -44, -20, -44, -22, -19, -683, -27, -92, -50, -54, -159, -35,
    -15, -15, -88,-4, -171, -225, -28, -108, -1, -167, -32, -181, -364, -49, -270, -176, -56, -48, -81, -78, -12, -110,
    -84, -893, -21, -29, -8, -85, -152, -19, -56, -28, -1, -54, -14, -17, -36, -15, -13, -13, -7, -55, -56,
    -86, -152, 11, -4, -76, -4, -32, -7, -15, -20, -82, -58, -10, -349, -4, -18, -1, -223, -76, -266, -15,
    -98, -31, -12, -149, -83, -39, -19, -82, -4, -64, -28, -472, -4, -8, -14, -77, -49, -4, -114, -22, -59,
    -98, -24, -5, 1, -3, -69, -6, -716, -63, -406, -10, -18, -31, -101, -18, -21, -43, -139, -185, -15, -2,
    -18, -8, -166, -66, -158, -211, -95, -155, -3, -392, -61, -16, -28, -97, -60, -23, -120, -4, -97, -23,
    -47, -15, -7, 4, 1, -97, -43, -23, -10, -115, -38, -58, -14, -229, -26, -273, -210, -72, -36, -157, -96,
    -79, 0, -14, -15, -165, -47, -52, -6, -45, -17, -112, -56, -1, -149, -51, -23, 7, -38, -68, -4, -43,
    -178, 1, -48, -16, -2, -3, -58, -68, -10, 1, -12, -47, -63, 6, -3, -176, -129, -3, -16, -80, -24,
    -57, -40, -22, -141, 4, -102, -33, -175, -15, -122, -1, -24, -5, -13,-66, 8, -31, -33, -236, -300, -4, 
    -230, -22, -2, -1, -133, -56, -59, -157, -8, -4, -14, -5, -88, -3, -133, -93, -10, -614, -164, -11, 
    -234, -5, -84, 1, -9, -61, -102, -49, -19, -46, -14, -195, -48, -62, -132, -53, -3, -45, -34, -24, 
    -28, -3, -18, -41, -106, -74, -43, -108, -48, -4, -108, -9, -293, -121, -31, 12, -99, -73, -37, 5, 0, 
    -63, -9, -96, -24, -132, -50, -13, -25, -86, -12, -18, -13, -28, -83, -14, -52, -54, -225, -119, -4, 
    -29, -54, -14, -5, -20, -57, -10, -282, -20, -11, -64, -71, -35, -16, 5, -14, -14, -21, 5, -83, -57, 
    -14, -61, -94, -12, -61, -14, -49, -133, -5, -30, -24, -72, -36, -70, -42, -30, -22, -1, -33, -191, 
    -180, -62, -149, -37, -20, -31, -96, -58, -7, -38, -42, -2, -4, -16, -26, -271, -340, -20, 5, -40, -52, 
    -287, -7, -322, -4, -58, -51, -43, -36, -29, -57, -11, -17, -489, -10, -153, -7, -14, -68, -19, -105, 
    -72, -143, -60, -41, -44, -5, -5, -29, -49, -32, -83, -7, -6, -749, -68, -18, -68, -117, -52, -11, -39, 
    -8, -82, -38, -5, -143, -74, -158, -67, -14, -157, -1, -2, -8, -112, -47, -115, -113, -149, -153, -61,
    -67, -128, -142, -39, -20, -2, -4, -292, -24, -37, -96, -17, -39, -3, -121, -123, -430, -9, -12, -264, 
    -58, -20, -93, -25, -142, -174, -49, -45, -2, -384, -79, -113, -27, -1, -72, -43, -44, -123, -84,-21, 0, -78, -13, 5, -117, -83, -51, -6, -131, -217, -123, -10, -12, -301, 
    -113, -631, -379, -11, -118, -23, -297, -171, -240, -22, -323, -44, -61, -68, 
    5, -14, -34, -17, -5, -5, -96, -15, -11, -31, 0, -73, -648, -855, -128, -42, 
    -78, -127, -33, -3, -90, -28, -73, 6, -204, -3, -21, -111, -99, -5, -42, -85, 
    -10, 2, -137, -9, -9, -34, -33, -90, -162, -14, -62, -5, -46, -19, -36, -99, 
    -118, -10, -48, 3, -161, -66, -35, -194, -12, -86, -67, -382, -34, -118, -22, 
    -25, -29, -19, -41, -76, -99, -48, -74, -35, -159, -81, -308, -120, -18, -34, 
    -38, -21, -100, -51, -421, -8, -67, -50, 8, -32, -79, -88, -226, -45, -3, -50, 
    -10, -132, -175, -12, -213, -32, -82, -4, -40, -15, -41, -90, -23, -24, -45, 
    -53, -121, -28, -90, -5, -97, -54, -229, -2, -83, -181, -35, -5, -541, -17, 
    -28, -124, -6, -52, -53, -131, -41, -18, -21, -38, -27, -24, -13, -34, -134, 
    -245, -25, -15, -38, -45, -81, -68, -47, -5, -167, -59, -21, -157, -72, -286, 
    -43, -70, -415, -34, -127, -13, -68, -20, -30, -36, -22, -275, -271, -246, -32, 
    -155, -144, -6, -24, -17, -69, -3, -101, -191, -39, -6, -10, 9, -110, -4, -109, 
    -16, 0, -26, -39, -86, -29, -743, -33, -808, -43, -134, -68, -9, -9, -53, -121, 
    -22, -7, -66, -29, -2, -96, -111, -47, -5, 4, -287, -5, -34, -230, -95, -15, -319, 
    -79, -15, -43]


# Plotting
episodes = range(1, len(cumulative_rewards) + 1)
plt.figure(figsize=(10, 6))
plt.plot(episodes, cumulative_rewards)
plt.title("Cumulative Reward per Episode")
plt.xlabel("Episode Number")
plt.ylabel("Cumulative Reward")
plt.grid(True)
plt.show()


# V

In [None]:
import numpy as np
import matplotlib.pyplot as plt

class WarehouseEnvironment:
    def __init__(self, grid_size):
        self.grid_size = grid_size
        self.state = (0, 0)  # Initial state
        self.goal_state = (3, 4)  # Goal state
        self.actions = [(1, 0), (0, 1), (-1, 0), (0, -1)]  # Possible actions (down, right, up, left)
        self.obstacles = [(2, 1), (3, 3)]  # Obstacle positions
        self.package_location = (1, 1)  # Package location

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
      if action in self.actions:
          next_state = (self.state[0] + action[0], self.state[1] + action[1])
          next_state = (max(0, min(next_state[0], self.grid_size - 1)),
                        max(0, min(next_state[1], self.grid_size - 1)))  # Ensure state is within bounds

          # Check if the agent is trying to pick up the package
          if next_state == self.package_location and self.package_location != (-1, -1):
              # Agent picks up the package
              self.package_location = (-1, -1)  # Package is picked up
          elif next_state == self.goal_state and self.package_location == (-1, -1):
              # Agent drops off the package
              self.package_location = self.goal_state  # Package is dropped off
          elif next_state == self.goal_state and self.package_location != (-1, -1):
              # Agent tries to drop off the package in the wrong location
              reward = -10
              return self.state, reward, False  # Return the current state with the penalty and mark episode as not done

          if next_state not in self.obstacles:
              self.state = next_state
          else:
              reward = -5  # Apply penalty if the agent crosses an obstacle
      else:
          reward = -5  # Apply penalty if the action is invalid

      done = self.state == self.goal_state and self.package_location == self.goal_state
      reward = 50 if done else (reward if 'reward' in locals() else -1)  # Reward is 50 for reaching the goal with the package, -1 otherwise

      return self.state, reward, done




    def visualize_grid(self):
        grid = [['-' for _ in range(self.grid_size)] for _ in range(self.grid_size)]
        grid[self.state[0]][self.state[1]] = 'A'  # Agent
        grid[self.goal_state[0]][self.goal_state[1]] = 'G'  # Goal

        grid[self.package_location[0]][self.package_location[1]] = 'P'  # Package

        for obstacle in self.obstacles:
            grid[obstacle[0]][obstacle[1]] = 'X'  # Obstacle

        for row in grid:
            print(' '.join(row))
        print()


def q_learning(env, num_episodes, alpha, gamma, epsilon):
    q_table = np.zeros((env.grid_size, env.grid_size, len(env.actions)))
    total_rewards = []

    state = env.reset()

    for episode in range(num_episodes):
        done = False
        total_reward = 0

        while not done:
            if np.random.uniform(0, 1) < epsilon:  # Epsilon-greedy policy
                action = np.random.randint(0, len(env.actions))
            else:
                action = np.argmax(q_table[state[0], state[1]])

            next_state, reward, done = env.step(env.actions[action])
            total_reward += reward
            q_table[state[0], state[1], action] += alpha * (reward + gamma * np.max(q_table[next_state[0], next_state[1]]) - q_table[state[0], state[1], action])
            state = next_state

        total_rewards.append(total_reward)

    return total_rewards

# Define environment parameters
grid_size = 5
num_episodes = 1000
alphas = [0.9, 0.1, 0.5]  # Different alpha values to try
gammas = [0.9, 0.5, 0.2]  # Different gamma values to try
epsilon = 0.2  # Epsilon for epsilon-greedy policy

# Print initial state of the grid before trying different combinations
env = WarehouseEnvironment(grid_size)
env.visualize_grid()

# Train Q-learning agent for different combinations of alpha and gamma
for alpha in alphas:
    for gamma in gammas:
        total_rewards = q_learning(env, num_episodes, alpha, gamma, epsilon)

        # Calculate average reward for each episode
        average_rewards = [np.mean(total_rewards[:i+1]) for i in range(num_episodes)]

        # Plot
        plt.plot(range(1, num_episodes+1), average_rewards, label=f'alpha={alpha}, gamma={gamma}')

plt.xlabel('Episode Number')
plt.ylabel('Average Reward')
plt.title('Average Reward per Episode')
plt.legend()
plt.grid(True)
plt.show()


# TASK 7 AND 8

In [None]:
#Motivation: Target Networks: Target Networks are used to stabilize training in Deep Q-Learning. By maintaining a separate target network with fixed parameters for estimating Q-values, we reduce the risk of divergence during training. This improvement helps in achieving more stable and reliable learning.
#Double DQN: Double DQN addresses the overestimation bias in traditional Q-learning algorithms. By decoupling the action selection and evaluation steps, Double DQN leads to more accurate Q-value estimations, resulting in improved learning performance.

In [None]:
# without improvement

In [None]:
import numpy as np
import pygame
import time
import gym
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear


class DeepQNetwork:


    def __init__(self, action_space, state_space, learning_rate=0.8):

        self.epsilon = 1.0
        self.gamma = .95
        self.batch_size = 64
        self.epsilon_min = .01
        self.learning_rate = learning_rate
        self.epsilon_decay = .90
        self.memory = deque(maxlen=100000)
        self.action_space_size = action_space
        self.state_space_shape = state_space
        self.model = self.build_model()

    def build_model(self):

        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space_shape, activation=relu))
        model.add(Dense(25, activation=relu))
        model.add(Dense(self.action_space_size, activation=linear))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def rememberFunction(self, state, action, reward, nextState, done):
        self.memory.append((state, action, reward, nextState, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space_size)
        state = np.expand_dims(state, axis=0)
        actValues = self.model.predict(state)
        return np.argmax(actValues[0])

    def replayFunction(self):

        if len(self.memory) < self.batch_size:
            return

        miniBatchVar = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in miniBatchVar])
        actions = np.array([i[1] for i in miniBatchVar])
        rewards = np.array([i[2] for i in miniBatchVar])
        nextStates = np.array([i[3] for i in miniBatchVar])
        dones = np.array([i[4] for i in miniBatchVar])

        states = np.squeeze(states)
        nextStates = np.squeeze(nextStates)

        targets = rewards + self.gamma * (np.amax(self.model.predict_on_batch(nextStates), axis=1)) * (1 - dones)
        targetsFull = self.model.predict_on_batch(states)

        indexes = np.array([i for i in range(self.batch_size)])
        targetsFull[[indexes], [actions]] = targets

        self.model.fit(states, targetsFull, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


def rewardFunction(nextStateInfo):
    nextState = nextStateInfo[0]  # Extracting the nextState array
    if nextState[0] >= 0.5:
        print("Reached Goal")
        return 10
    if nextState[0] > -0.4:
        return (1 + nextState[0]) ** 2
    return 0


def trainDQNetwork(environment, agent, episode):
    episodeScores = []
    for e in range(episode):
        state = environment.reset()[0]  # Extracting the state array
        score = 0
        maxSteps = 1000
        for i in range(maxSteps):
            environment.render()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    quit()
            state = np.array(state)  # Convert state tuple to numpy array
            action = agent.act(state)
            stepResult = environment.step(action)  # get all return values
            nextState, reward, done = stepResult[:3]  # get the first three elements
            reward = rewardFunction(stepResult)  # pass values to rewardFunction function
            score += reward
            nextState = np.array(nextState)  
            agent.rememberFunction(state, action, reward, nextState, done)
            state = nextState
            agent.replayFunction()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        episodeScores.append(score)
    return episodeScores

def main():
    pygame.init()  # initialize Pygame
    environment = gym.make('MountainCar-v0', render_mode="human") # render as human
    np.random.seed(10)  # numpy random seed

    print(environment.observation_space)
    print(environment.action_space)
    agent = DeepQNetwork(environment.action_space.n, environment.observation_space.shape[0], learning_rate=0.001)
    episodes = 100
    episodeScores = trainDQNetwork(environment, agent, episodes)
    plt.plot([i+1 for i in range(episodes)], episodeScores)
    plt.show()

if __name__ == '__main__':
    main()


In [None]:
# with double DQN

In [None]:
import numpy as np
import pygame
import gym
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear

class DoubleDeepQNetwork:
    def __init__(self, action_space, state_space, learning_rate=0.001):
        self.epsilon = 1.0
        self.gamma = 0.99
        self.batch_size = 64
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = deque(maxlen=100000)
        self.action_space_size = action_space
        self.state_space_shape = state_space
        self.model = self.build_model(learning_rate)
        self.target_model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space_shape, activation=relu))
        model.add(Dense(25, activation=relu))
        model.add(Dense(self.action_space_size, activation=linear))
        model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space_size)
        state = np.expand_dims(state, axis=0)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in minibatch])
        actions = np.array([i[1] for i in minibatch])
        rewards = np.array([i[2] for i in minibatch])
        next_states = np.array([i[3] for i in minibatch])
        dones = np.array([i[4] for i in minibatch])

        states = np.squeeze(states)
        next_states = np.squeeze(next_states)

        target_actions = np.argmax(self.model.predict_on_batch(next_states), axis=1)
        target_q_values = self.target_model.predict_on_batch(next_states)
        targets = rewards + self.gamma * target_q_values[np.arange(self.batch_size), target_actions] * (1 - dones)

        targets_full = self.model.predict_on_batch(states)
        targets_full[np.arange(self.batch_size), actions] = targets

        self.model.fit(states, targets_full, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        if len(self.memory) % 1000 == 0:
            self.update_target_model()

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

def get_reward(next_state_info):
    next_state = next_state_info[0]
    if next_state[0] >= 0.5:
        print("Car has reached the goal")
        return 10
    if next_state[0] > -0.4:
        return (1 + next_state[0]) ** 2
    return 0

def train_ddqn(environment, agent, episode):
    episode_scores = []
    for e in range(episode):
        state = environment.reset()[0]
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            state = np.array(state)
            action = agent.act(state)
            step_result = environment.step(action)
            next_state, reward, done = step_result[:3]
            reward = get_reward(step_result)
            score += reward
            next_state = np.array(next_state)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        episode_scores.append(score)
    return episode_scores

def main():
    pygame.init()
    environment = gym.make('MountainCar-v0', render_mode="human")
    np.random.seed(10)

    print(environment.observation_space)
    print(environment.action_space)
    agent = DoubleDeepQNetwork(environment.action_space.n, environment.observation_space.shape[0], learning_rate=0.001)
    episodes = 60
    episode_scores = train_ddqn(environment, agent, episodes)
    plt.plot([i+1 for i in range(episodes)], episode_scores)
    plt.show()

if __name__ == '__main__':
    main()


In [None]:
#target networks

In [None]:
import numpy as np
import pygame
import time
import gym
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear


class DeepQNetwork:
    def __init__(self, action_space, state_space, learning_rate=0.001):
        self.epsilon = 1.0
        self.gamma = 0.95
        self.batch_size = 64
        self.epsilon_min = 0.01
        self.learning_rate = learning_rate
        self.epsilon_decay = 0.995
        self.memory = deque(maxlen=100000)
        self.action_space_size = action_space
        self.state_space_shape = state_space
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space_shape, activation=relu))
        model.add(Dense(25, activation=relu))
        model.add(Dense(self.action_space_size, activation=linear))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def rememberFunction(self, state, action, reward, nextState, done):
        self.memory.append((state, action, reward, nextState, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space_size)
        state = np.expand_dims(state, axis=0)
        actValues = self.model.predict(state)
        return np.argmax(actValues[0])

    def replayFunction(self):
        if len(self.memory) < self.batch_size:
            return
        miniBatchVar = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in miniBatchVar])
        actions = np.array([i[1] for i in miniBatchVar])
        rewards = np.array([i[2] for i in miniBatchVar])
        nextStates = np.array([i[3] for i in miniBatchVar])
        dones = np.array([i[4] for i in miniBatchVar])

        states = np.squeeze(states)
        nextStates = np.squeeze(nextStates)

        targets = rewards + self.gamma * (np.amax(self.target_model.predict_on_batch(nextStates), axis=1)) * (1 - dones)
        targetsFull = self.model.predict_on_batch(states)

        indexes = np.array([i for i in range(self.batch_size)])
        targetsFull[[indexes], [actions]] = targets

        self.model.fit(states, targetsFull, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def soft_update_target_network(self, tau):
        model_weights = self.model.get_weights()
        target_model_weights = self.target_model.get_weights()

        new_weights = []
        for model_weight, target_model_weight in zip(model_weights, target_model_weights):
            new_weight = tau * model_weight + (1 - tau) * target_model_weight
            new_weights.append(new_weight)

        self.target_model.set_weights(new_weights)



def rewardFunction(nextStateInfo):
    nextState = nextStateInfo[0]  # Extracting the nextState array
    if nextState[0] >= 0.5:
        print("Reached Goal")
        return 10
    if nextState[0] > -0.4:
        return (1 + nextState[0]) ** 2
    return 0


def trainDQNetwork(environment, agent, episode):
    episodeScores = []
    for e in range(episode):
        state = environment.reset()[0]  # Extracting the state array
        score = 0
        maxSteps = 1000
        for i in range(maxSteps):
            environment.render()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    quit()
            state = np.array(state)  # Convert state tuple to numpy array
            action = agent.act(state)
            stepResult = environment.step(action)  # get all return values
            nextState, reward, done = stepResult[:3]  # get the first three elements
            reward = rewardFunction(stepResult)  # pass values to rewardFunction function
            score += reward
            nextState = np.array(nextState)
            agent.rememberFunction(state, action, reward, nextState, done)
            state = nextState
            agent.replayFunction()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        episodeScores.append(score)
        agent.soft_update_target_network(0.01)  # Soft update the target network
    return episodeScores


def main():
    pygame.init()  # initialize Pygame
    environment = gym.make('MountainCar-v0', render_mode="human")  # render as human
    np.random.seed(10)  # numpy random seed

    print(environment.observation_space)
    print(environment.action_space)
    agent = DeepQNetwork(environment.action_space.n, environment.observation_space.shape[0], learning_rate=0.001)
    episodes = 100
    episodeScores = trainDQNetwork(environment, agent, episodes)
    plt.plot([i + 1 for i in range(episodes)], episodeScores)
    plt.show()


if __name__ == '__main__':
    main()


In [3]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
import os

env = gym.make('MountainCar-v0', render_mode="human")

state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
n_episodes = 70000
output_dir = 'model/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=200000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.85
        self.epsilon_min = 0.00001
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        model = tf.keras.models.Sequential()
        state_shape = self.env.observation_space.shape
        model.add(tf.keras.layers.Dense(24, input_shape=state_shape, activation='relu'))
        model.add(tf.keras.layers.Dense(48, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = []
        targets = []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.target_model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            states.append(state[0])
            targets.append(target_f[0])
        self.model.fit(np.array(states), np.array(targets), epochs=1, verbose=0)

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


agent = DQNAgent(state_size, action_size)

done = False
counter = 0
scores_memory = deque(maxlen=100)
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(7000):
        action = agent.act(state)
        next_state, reward, done, halp = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        state = next_state
        if done:
            scores_memory.append(time)
            scores_avg = np.mean(scores_memory) * -1
            print('episode: {}/{}, score: {}, e {:.2}, help: {}, reward: {}, 100score avg: {}'.format(e, n_episodes, time, agent.epsilon, state, reward, scores_avg))
            break
    agent.update_target_model()
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay
    if e % 50 == 0:
        agent.save(output_dir + 'weights_final' + '{:04d}'.format(e) + ".hdf5")


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.