## Initializing stuff

In [None]:
import pygame
import numpy as np
from gymnasium import Env
from gymnasium import spaces
import matplotlib.pyplot as plt

## Build Deterministic Env

In [92]:
class GridEnvironment(Env):
    """Square grid world in which an agent collects per-cell rewards on its way to a goal.

    Positions are ``(row, col)`` integer arrays. Every cell carries a reward
    drawn once at construction time from ``[-0.75, 0.75)`` (rounded to two
    decimals); the goal cell is worth ``1`` and the agent's start cell ``0``.
    A cell's reward is consumed (set to ``0``) the first time the agent steps
    onto it. An episode terminates when the agent reaches the goal and is
    truncated after ``max_time_steps`` steps.

    The reward layout and the goal position are fixed for the lifetime of the
    instance; only the agent's start position changes on ``reset``.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 4,
        "goal_img": "./goal.png",
        "agent_img": "./agent.png",
        "reward_img": "./reward.png",
        "neg_reward_img": "./neg_reward.png",
    }

    def __init__(
        self,
        size: int = 4,
        max_time_steps: int = 20,
        render_mode=None,
    ):
        """Create the grid, its reward layout, the goal and an initial agent position.

        Args:
            size: Side length of the square grid; must be at least 2 so that
                the agent and the goal can occupy different cells.
            max_time_steps: Number of steps after which an episode is truncated.
            render_mode: ``None``, ``"human"`` or ``"rgb_array"``.

        Raises:
            ValueError: If ``size`` is smaller than 2.
        """
        # Check for render mode legality
        assert render_mode is None or render_mode in self.metadata["render_modes"]
        if size < 2:
            # With a single cell the goal could never differ from the agent
            # and the sampling loop below would never finish.
            raise ValueError(f"size must be at least 2, got {size}")

        self.render_mode = render_mode
        self.size = size
        self.observation_space = spaces.Dict(
            {
                "agent": spaces.Box(0, size - 1, shape=(2, ), dtype=int),
                "goal": spaces.Box(0, size - 1, shape=(2, ), dtype=int),
            }
        )

        self.action_space = spaces.Discrete(4)
        self.max_timesteps = max_time_steps

        # Randomly distribute rewards in [-0.75, 0.75), rounded to 2 decimals
        self._base_state = self.np_random.uniform(
            -.75, .75, size=(self.size, self.size)
        )
        np.around(self._base_state, 2, out=self._base_state)

        # Maps abstract actions from `self.action_space` to the (row, col)
        # offset applied to the agent when that action is taken.
        self._action_to_direction = {
            0: np.array([1, 0]),  # down
            1: np.array([-1, 0]),  # up
            2: np.array([0, 1]),  # right
            3: np.array([0, -1]),  # left
        }

        self.timestep = 0
        self._agent_pos = self._sample_cell()
        # Sample the goal's location until it does not coincide with the
        # agent's location.
        self._goal_pos = self._sample_cell(avoid=self._agent_pos)

        self.state = self._base_state.copy()
        self.state[tuple(self._goal_pos)] = 1.  # Max Reward
        self.state[tuple(self._agent_pos)] = 0.  # 0 reward at start

        self.window = None
        self.clock = None
        self.window_size = 744  # The size of the PyGame window

        # Sprites keyed by their metadata entry; filled lazily on first render
        # so the image files are read from disk only once.
        self._image_cache = {}

    def _sample_cell(self, avoid=None):
        """Return a random ``(row, col)`` cell drawn from the environment's RNG.

        Using ``self.np_random`` (rather than the observation space's own RNG)
        makes the draw reproducible after ``reset(seed=...)``.

        Args:
            avoid: Optional cell that must not be returned; the draw is
                repeated until a different cell comes up.
        """
        dtype = self.observation_space["agent"].dtype
        while True:
            cell = self.np_random.integers(0, self.size, size=2).astype(dtype)
            if avoid is None or not np.array_equal(cell, avoid):
                return cell

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        The reward grid is restored to its initial layout, the step counter
        is cleared, and the agent is placed on a random cell other than the
        goal. ``options`` is accepted for API compatibility and ignored.
        """
        super().reset(seed=seed)

        # Reset env
        self.timestep = 0
        self.state = self._base_state.copy()
        # Agent start pos changes every time, but never lands on the goal
        # (that would wipe the goal reward below).
        self._agent_pos = self._sample_cell(avoid=self._goal_pos)
        self.state[tuple(self._goal_pos)] = 1.  # Max Reward
        self.state[tuple(self._agent_pos)] = 0.  # 0 reward at start

        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, info

    def step(self, action):
        """Move the agent one cell and return the Gymnasium 5-tuple.

        Args:
            action: Key of ``self._action_to_direction`` (0 down, 1 up,
                2 right, 3 left).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``reward`` is the value of the cell the agent ends up on.
        """
        direction = self._action_to_direction[int(action)]

        # Restricting the agent to the grid.
        self._agent_pos = np.clip(
            self._agent_pos + direction, 0, self.size - 1
        )
        cell = tuple(self._agent_pos)

        reward = float(self.state[cell])
        # Consume reward
        self.state[cell] = 0

        self.timestep += 1
        # An episode is done iff the agent has reached the goal
        terminated = np.array_equal(self._agent_pos, self._goal_pos)
        # Truncate once the step budget has been used up.
        truncated = self.timestep >= self.max_timesteps

        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, reward, terminated, truncated, info

    def _get_obs(self):
        """Return the agent and goal positions as copies of the internal arrays."""
        return {"agent": self._agent_pos.copy(), "goal": self._goal_pos.copy()}

    def _get_info(self):
        """Return the city block (L1) distance between goal and agent."""
        return {
            "distance": np.linalg.norm(
                self._agent_pos - self._goal_pos, ord=1
            )
        }

    def render(self):
        """Return an RGB frame in ``"rgb_array"`` mode; otherwise return ``None``."""
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _load_image(self, key):
        """Return the sprite whose path is ``metadata[key]``, loading it at most once."""
        if key not in self._image_cache:
            self._image_cache[key] = pygame.image.load(self.metadata[key])
        return self._image_cache[key]

    def _render_frame(self):
        """Draw the current state.

        In ``"human"`` mode the frame is shown in the PyGame window and the
        call is paced to ``metadata["render_fps"]``; otherwise the frame is
        returned as a NumPy array with the first two axes of PyGame's pixel
        array swapped.
        """
        if self.window is None and self.render_mode == "human":
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode(
                (self.window_size, self.window_size)
            )
        if self.clock is None and self.render_mode == "human":
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        pix_square_size = (
            self.window_size / self.size
        )  # The size of a single grid square in pixels
        cell_size = (pix_square_size, pix_square_size)

        # Positions are (row, col); pixel coordinates are (x, y), hence [::-1].
        goal_img = pygame.transform.scale(self._load_image("goal_img"), cell_size)
        canvas.blit(goal_img, pix_square_size * self._goal_pos[::-1])

        agent_img = pygame.transform.scale(self._load_image("agent_img"), cell_size)
        canvas.blit(agent_img, pix_square_size * self._agent_pos[::-1])

        reward_img = self._load_image("reward_img")
        neg_reward_img = self._load_image("neg_reward_img")

        # Add the reward and neg reward icons, scaled by reward magnitude
        for row in range(self.size):
            for col in range(self.size):
                value = self.state[row, col]
                if 0 < value < 1:
                    icon = reward_img
                elif value < 0:
                    icon = neg_reward_img
                else:
                    # Empty cells and the goal cell (value 1) get no icon.
                    continue

                side = pix_square_size * abs(value)
                scaled = pygame.transform.scale(icon, (side, side))
                icon_size = np.array(scaled.get_size())  # (width, height)
                # Centre the icon within its grid square.
                top_left = pix_square_size * (np.array([col, row]) + 0.5)
                top_left -= icon_size / 2
                canvas.blit(scaled, top_left)

        # Finally, add some gridlines
        for i in range(self.size + 1):
            offset = pix_square_size * i
            pygame.draw.line(
                canvas,
                0,
                (0, offset),
                (self.window_size, offset),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (offset, 0),
                (offset, self.window_size),
                width=3,
            )

        if self.render_mode == "human":
            # The following line copies our drawings from `canvas` to the visible window
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()

            # We need to ensure that human-rendering occurs at the predefined framerate.
            # The following line will automatically add a delay to keep the framerate stable.
            self.clock.tick(self.metadata["render_fps"])
        else:  # rgb_array
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )

    def close(self):
        """Shut down PyGame if a window was opened and forget render resources."""
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
            # Clear the handles so a later render re-creates the window
            # instead of drawing to a destroyed one.
            self.window = None
            self.clock = None
        self._image_cache.clear()

In [96]:
class RandomAgent:
    """Agent that ignores observations and samples actions from the env's action space."""

    def __init__(self, env):
        """Keep a handle on ``env`` and on its observation and action spaces."""
        self.env = env
        self.observation_space, self.action_space = (
            env.observation_space,
            env.action_space,
        )

    def step(self, obs):
        """Return an action sampled from ``self.env.action_space``; ``obs`` is unused."""
        sampler = self.env.action_space
        return sampler.sample()

## Take the env for a spin

In [99]:
import time

# Build a 6x6 world with an on-screen window and a random policy to drive it.
env = GridEnvironment(render_mode="human", size=6)
agent = RandomAgent(env)

# Start an episode and show where the agent and the goal are.
obs, info = env.reset()
print(obs)

terminated = False
truncated = False

env.render()
# Keep acting until the episode either terminates or is truncated.
while not terminated and not truncated:
    action = agent.step(obs)
    step_result = env.step(action)
    obs, reward, terminated, truncated, info = step_result

    env.render()
    print(obs, reward)
    # Pause between steps so each move can be followed on screen.
    time.sleep(1)


{'agent': array([1, 5]), 'goal': array([4, 2])}
{'agent': array([2, 5]), 'goal': array([4, 2])} -0.38
{'agent': array([2, 4]), 'goal': array([4, 2])} -0.08
{'agent': array([2, 3]), 'goal': array([4, 2])} -0.67
{'agent': array([1, 3]), 'goal': array([4, 2])} -0.41
{'agent': array([0, 3]), 'goal': array([4, 2])} -0.44
{'agent': array([0, 3]), 'goal': array([4, 2])} 0.0
{'agent': array([1, 3]), 'goal': array([4, 2])} 0.0
{'agent': array([2, 3]), 'goal': array([4, 2])} 0.0
{'agent': array([1, 3]), 'goal': array([4, 2])} 0.0
{'agent': array([0, 3]), 'goal': array([4, 2])} 0.0
{'agent': array([0, 3]), 'goal': array([4, 2])} 0.0
{'agent': array([0, 2]), 'goal': array([4, 2])} 0.33
{'agent': array([1, 2]), 'goal': array([4, 2])} 0.49
{'agent': array([1, 1]), 'goal': array([4, 2])} -0.57
{'agent': array([1, 2]), 'goal': array([4, 2])} 0.0
{'agent': array([2, 2]), 'goal': array([4, 2])} -0.52
{'agent': array([3, 2]), 'goal': array([4, 2])} -0.61
{'agent': array([2, 2]), 'goal': array([4, 2])} 0.

In [100]:
# End the simulation: close the environment, which shuts down the PyGame
# display if a window was opened for rendering.
env.close()