In [None]:
import pygame
from pygame.locals import QUIT
import gym
from gym import spaces
import numpy as np

class SimpleGridEnvironment(gym.Env):
    def __init__(self, grid_size=(20, 20), cell_size=30, wall_positions=None, point_positions=None):
        super(SimpleGridEnvironment, self).__init__()

        self.grid_size = grid_size
        self.cell_size = cell_size
        self.wall_positions = set(wall_positions) if wall_positions is not None else set()
        self.point_positions = set(point_positions) if point_positions is not None else set()
        self.action_space = spaces.Discrete(4)  # 4 possible actions: 0=up, 1=down, 2=left, 3=right
        self.observation_space = spaces.Discrete(np.prod(grid_size))

        self.agent_pos = np.array([0, 0])
        self.goal_pos = np.array([grid_size[0] - 1, 0])  # Updated goal position
        self.max_steps = np.prod(grid_size) * 2

        self.current_step = 0

        self.q_table = np.zeros((np.prod(grid_size), self.action_space.n))

        # Q-learning parameters
        self.alpha = 0.1  # Learning rate
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.5  # Exploration-exploitation trade-off

        # Initialize Pygame
        pygame.init()
        self.screen = pygame.display.set_mode((grid_size[1] * cell_size, grid_size[0] * cell_size))

    def reset(self):
        self.agent_pos = np.array([0, 0])
        self.current_step = 0
        self.goal_pos = np.array([self.grid_size[0] - 1, 0])  # Reset goal position to left-down corner
        self.wall_positions = set(self.generate_walls())  # Optionally reset walls
        self.point_positions = set(self.generate_points())  # Optionally reset points
        return self._get_observation()

    def generate_walls(self):
        # Set predefined wall positions
        return {
            (0, 5), (1, 5), (2, 5), (5, 5), (5, 4), (5, 3), (5, 2), (5, 1), (5, 0),
            (12, 0), (12, 1), (12, 2), (12, 3), (12, 4), (12, 5), (12, 6), (12, 7), (0, 11), (1, 11),
            (2, 11), (3, 11), (4, 11), (5, 11), (6, 11), (7, 11), (10, 11), (11, 11),
            (12, 11), (13, 11), (14, 11), (15, 11), (16, 11), (17, 11), (18, 11), (19, 11), (20, 11),
            (12, 12), (12, 13), (12, 14), (12, 10), (12, 17), (12, 15), (12, 16), (12, 19), (12, 20),
            (12, 18), (4, 16), (4, 15), (3, 16), (3, 15), (9, 15), (9, 16), (16, 4), (16, 3), (16, 5),
            (17, 4), (17, 3), (17, 5), (9, 15), (9, 16)
        }

    def generate_points(self):
        # Set predefined point positions
        return {
            (2, 2), (2, 3), (2, 4), (9, 2), (9, 3), (9, 4), (8, 2), (8, 3), (8, 4),
            (7, 13), (7, 14), (7, 15), (6, 14), (6, 13), (6, 15), (6, 15), (10, 13),
            (15, 1), (15, 2), (14, 1), (14, 2)
        }

    def step(self, action):
        self.current_step += 1

        if np.random.uniform() < self.epsilon:
            action = np.random.choice(self.action_space.n)
        else:
            action = np.argmax(self.q_table[self._get_observation()])

        if action == 0:  # move up
            new_pos = (max(0, self.agent_pos[0] - 1), self.agent_pos[1])
        elif action == 1:  # move down
            new_pos = (min(self.grid_size[0] - 1, self.agent_pos[0] + 1), self.agent_pos[1])
        elif action == 2:  # move left
            new_pos = (self.agent_pos[0], max(0, self.agent_pos[1] - 1))
        elif action == 3:  # move right
            new_pos = (self.agent_pos[0], min(self.grid_size[1] - 1, self.agent_pos[1] + 1))

        # Check if the new position is a valid move (not a wall)
        if new_pos not in self.wall_positions:
            self.agent_pos = np.array(new_pos)

        # Check if the new position corresponds to a point
        if tuple(self.agent_pos) in self.point_positions:  # Convert NumPy array to tuple
            self.point_positions.remove(tuple(self.agent_pos))
            reward = 10  # Reward for collecting a point
        else:
            reward = 0

        done = np.array_equal(self.agent_pos, self.goal_pos) or self.current_step >= self.max_steps

        if done:
            reward += 1 if np.array_equal(self.agent_pos, self.goal_pos) else 0

        # Q-value update
        current_q = self.q_table[self._get_observation(), action]
        max_future_q = np.max(self.q_table[self._get_observation()])
        new_q = (1 - self.alpha) * current_q + self.alpha * (reward + self.gamma * max_future_q)
        self.q_table[self._get_observation(), action] = new_q

        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        return np.ravel_multi_index(self.agent_pos, self.grid_size)

    def render(self):
        for event in pygame.event.get():
            if event.type == QUIT:
                pygame.quit()
                return

        self.screen.fill((255, 255, 255))

        agent_rect = pygame.Rect(self.agent_pos[1] * self.cell_size, self.agent_pos[0] * self.cell_size,
                                 self.cell_size, self.cell_size)
        pygame.draw.rect(self.screen, (0, 0, 255), agent_rect)

        goal_rect = pygame.Rect(self.goal_pos[1] * self.cell_size, self.goal_pos[0] * self.cell_size,
                                self.cell_size, self.cell_size)
        pygame.draw.rect(self.screen, (0, 255, 0), goal_rect)

        # Convert the set to a list before iterating over it
        wall_positions_list = list(self.wall_positions)
        for wall_pos in wall_positions_list:
            wall_rect = pygame.Rect(wall_pos[1] * self.cell_size, wall_pos[0] * self.cell_size,
                                    self.cell_size, self.cell_size)
            pygame.draw.rect(self.screen, (150, 150, 150), wall_rect)

        # Convert the set to a list before iterating over it
        point_positions_list = list(self.point_positions)
        for point_pos in point_positions_list:
            point_rect = pygame.Rect(point_pos[1] * self.cell_size, point_pos[0] * self.cell_size,
                                     self.cell_size, self.cell_size)
            pygame.draw.rect(self.screen, (255, 0, 0), point_rect)

        pygame.display.flip()

# Example usage:
try:
    best_q_table = np.load('best_q_table.npy')
except FileNotFoundError:
    best_q_table = None

# Create the environment
env = SimpleGridEnvironment(grid_size=(20, 20), cell_size=30)

# Initialize Q-table with the loaded best Q-table or zeros if not available
if best_q_table is not None:
    env.q_table = best_q_table
else:
    env.q_table = np.zeros((np.prod(env.grid_size), env.action_space.n))

# Training loop
for episode in range(10000):
    observation = env.reset()
    total_reward = 0

    while True:
        #env.render()

        if np.random.uniform() < env.epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(env.q_table[observation])

        next_observation, reward, done, _ = env.step(action)
        total_reward += reward

        current_q = env.q_table[observation, action]
        max_future_q = np.max(env.q_table[next_observation])
        new_q = (1 - env.alpha) * current_q + env.alpha * (reward + env.gamma * max_future_q)
        env.q_table[observation, action] = new_q

        observation = next_observation

        if done:
            if best_q_table is None or total_reward > np.sum(best_q_table):
                best_q_table = np.copy(env.q_table)
                np.save('best_q_table.npy', best_q_table)

            if (episode + 1) % 100 == 0:
                print(f"Episode {episode + 1}, Total Reward: {total_reward}")
            break
            

# Display the best run
print(f"\nBest Run - Total Reward: {np.sum(best_q_table)}")
env.q_table = best_q_table  # Use the best Q-table for rendering the best run
for _ in range(100):  # Run the environment with the best Q-table for 100 steps
    env.render()
    action = np.argmax(env.q_table[observation])
    observation, _, done, _ = env.step(action)
    if done:
        break

pygame.quit()   # Close the Pygame window after displaying the best run


In [1]:
import pygame
from pygame.locals import QUIT
import gym
from gym import spaces
import numpy as np

class SimpleGridEnvironment(gym.Env):
    def __init__(self, grid_size=(20, 20), cell_size=30, wall_positions=None, point_positions=None):
        super(SimpleGridEnvironment, self).__init__()

        self.grid_size = grid_size
        self.cell_size = cell_size
        self.wall_positions = set(wall_positions) if wall_positions is not None else set()
        self.point_positions = set(point_positions) if point_positions is not None else set()
        self.action_space = spaces.Discrete(4)  # 4 possible actions: 0=up, 1=down, 2=left, 3=right
        self.observation_space = spaces.Discrete(np.prod(grid_size))

        self.agent_pos = np.array([0, 0])
        self.goal_pos = np.array([grid_size[0] - 1, 0])  # Updated goal position
        self.max_steps = np.prod(grid_size) * 2

        self.current_step = 0

        self.q_table = np.zeros((np.prod(grid_size), self.action_space.n))

        # Q-learning parameters
        self.alpha = 0.1  # Learning rate
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.5  # Exploration-exploitation trade-off

        # Initialize Pygame
        pygame.init()
        self.screen = pygame.display.set_mode((grid_size[1] * cell_size, grid_size[0] * cell_size))

    def reset(self):
        self.agent_pos = np.array([0, 0])
        self.current_step = 0
        self.goal_pos = np.array([self.grid_size[0] - 1, 0])  # Reset goal position to left-down corner
        self.wall_positions = set(self.generate_walls())  # Optionally reset walls
        self.point_positions = set(self.generate_points())  # Optionally reset points
        return self._get_observation()

    def generate_walls(self):
        # Set predefined wall positions
        return {
            (0, 5), (1, 5), (2, 5), (5, 5), (5, 4), (5, 3), (5, 2), (5, 1), (5, 0),
            (12, 0), (12, 1), (12, 2), (12, 3), (12, 4), (12, 5), (12, 6), (12, 7), (0, 11), (1, 11),
            (2, 11), (3, 11), (4, 11), (5, 11), (6, 11), (7, 11), (10, 11), (11, 11),
            (12, 11), (13, 11), (14, 11), (15, 11), (16, 11), (17, 11), (18, 11), (19, 11), (20, 11),
            (12, 12), (12, 13), (12, 14), (12, 10), (12, 17), (12, 15), (12, 16), (12, 19), (12, 20),
            (12, 18), (4, 16), (4, 15), (3, 16), (3, 15), (9, 15), (9, 16), (16, 4), (16, 3), (16, 5),
            (17, 4), (17, 3), (17, 5), (9, 15), (9, 16)
        }

    def generate_points(self):
        # Set predefined point positions
        return {
            (2, 2), (2, 3), (2, 4), (9, 2), (9, 3), (9, 4), (8, 2), (8, 3), (8, 4),
            (7, 13), (7, 14), (7, 15), (6, 14), (6, 13), (6, 15), (6, 15), (10, 13),
            (15, 1), (15, 2), (14, 1), (14, 2)
        }

    def step(self, action):
        self.current_step += 1

        if np.random.uniform() < self.epsilon:
            action = np.random.choice(self.action_space.n)
        else:
            action = np.argmax(self.q_table[self._get_observation()])

        if action == 0:  # move up
            new_pos = (max(0, self.agent_pos[0] - 1), self.agent_pos[1])
        elif action == 1:  # move down
            new_pos = (min(self.grid_size[0] - 1, self.agent_pos[0] + 1), self.agent_pos[1])
        elif action == 2:  # move left
            new_pos = (self.agent_pos[0], max(0, self.agent_pos[1] - 1))
        elif action == 3:  # move right
            new_pos = (self.agent_pos[0], min(self.grid_size[1] - 1, self.agent_pos[1] + 1))

        # Check if the new position is a valid move (not a wall)
        if new_pos not in self.wall_positions:
            self.agent_pos = np.array(new_pos)

        # Check if the new position corresponds to a point
        if tuple(self.agent_pos) in self.point_positions:  # Convert NumPy array to tuple
            self.point_positions.remove(tuple(self.agent_pos))
            reward = 10  # Reward for collecting a point
        else:
            reward = 0

        done = np.array_equal(self.agent_pos, self.goal_pos) or self.current_step >= self.max_steps

        if done:
            reward += 1 if np.array_equal(self.agent_pos, self.goal_pos) else 0

        # Q-value update
        current_q = self.q_table[self._get_observation(), action]
        max_future_q = np.max(self.q_table[self._get_observation()])
        new_q = (1 - self.alpha) * current_q + self.alpha * (reward + self.gamma * max_future_q)
        self.q_table[self._get_observation(), action] = new_q

        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        return np.ravel_multi_index(self.agent_pos, self.grid_size)

    def render(self):
        for event in pygame.event.get():
            if event.type == QUIT:
                pygame.quit()
                return

        self.screen.fill((255, 255, 255))

        agent_rect = pygame.Rect(self.agent_pos[1] * self.cell_size, self.agent_pos[0] * self.cell_size,
                                 self.cell_size, self.cell_size)
        pygame.draw.rect(self.screen, (0, 0, 255), agent_rect)

        goal_rect = pygame.Rect(self.goal_pos[1] * self.cell_size, self.goal_pos[0] * self.cell_size,
                                self.cell_size, self.cell_size)
        pygame.draw.rect(self.screen, (0, 255, 0), goal_rect)

        # Convert the set to a list before iterating over it
        wall_positions_list = list(self.wall_positions)
        for wall_pos in wall_positions_list:
            wall_rect = pygame.Rect(wall_pos[1] * self.cell_size, wall_pos[0] * self.cell_size,
                                    self.cell_size, self.cell_size)
            pygame.draw.rect(self.screen, (150, 150, 150), wall_rect)

        # Convert the set to a list before iterating over it
        point_positions_list = list(self.point_positions)
        for point_pos in point_positions_list:
            point_rect = pygame.Rect(point_pos[1] * self.cell_size, point_pos[0] * self.cell_size,
                                     self.cell_size, self.cell_size)
            pygame.draw.rect(self.screen, (255, 0, 0), point_rect)

        pygame.display.flip()

# Example usage:
try:
    best_q_table = np.load('best_q_table.npy')
except FileNotFoundError:
    best_q_table = None

# Create the environment
env = SimpleGridEnvironment(grid_size=(20, 20), cell_size=30)

# Initialize Q-table with the loaded best Q-table or zeros if not available
if best_q_table is not None:
    env.q_table = best_q_table
else:
    env.q_table = np.zeros((np.prod(env.grid_size), env.action_space.n))

# Training loop
for episode in range(10):
    observation = env.reset()
    total_reward = 0

    while True:
        env.render()

        if np.random.uniform() < env.epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(env.q_table[observation])

        next_observation, reward, done, _ = env.step(action)
        total_reward += reward

        current_q = env.q_table[observation, action]
        max_future_q = np.max(env.q_table[next_observation])
        new_q = (1 - env.alpha) * current_q + env.alpha * (reward + env.gamma * max_future_q)
        env.q_table[observation, action] = new_q

        observation = next_observation

        if done:
            if best_q_table is None or total_reward > np.sum(best_q_table):
                best_q_table = np.copy(env.q_table)
                np.save('best_q_table.npy', best_q_table)

            if (episode + 1) % 100 == 0:
                print(f"Episode {episode + 1}, Total Reward: {total_reward}")
            break
            

# Display the best run
print(f"\nBest Run - Total Reward: {np.sum(best_q_table)}")
env.q_table = best_q_table  # Use the best Q-table for rendering the best run
for _ in range(100):  # Run the environment with the best Q-table for 100 steps
    env.render()
    action = np.argmax(env.q_table[observation])
    observation, _, done, _ = env.step(action)
    if done:
        break

pygame.quit()   # Close the Pygame window after displaying the best run


pygame 2.3.0 (SDL 2.24.2, Python 3.9.18)
Hello from the pygame community. https://www.pygame.org/contribute.html

Best Run - Total Reward: 517.4003301776922
