In [1]:
import numpy as np

In [7]:
# Step 1: Define the action and state space of the environment
class FrozenLake:
    """A minimal grid-world "frozen lake" environment.

    The lake is a ``grid_size`` x ``grid_size`` board. Cells are addressed by a
    single integer state in row-major order: ``state = row * grid_size + col``.

    Cell markers:
        F: frozen block
        S: start block
        G: goal block
        H: hole block
    """

    def __init__(self, grid_size=4):
        """Create the environment.

        Args:
            grid_size (int): number of rows (and columns) of the square grid
        """
        self.grid_size = grid_size
        self.state_space = np.arange(grid_size * grid_size)
        self.action_space = np.arange(4) # 0: left, 1: down, 2: right, 3: up

    # Step 2: Initialize/reset the env
    def reset(self):
        """Reset the environment to the initial state

        (0,0) is a start block, (-1,-1) is a goal block, and all others are frozen blocks
        with holes scattered randomly.

        States are represented as an integer because it is simpler. (0,0) is implicitly
        mapped to 0, (0,1) mapped to 1, etc.

        Hole placement makes ``grid_size`` random draws; a draw only creates a hole
        when it lands on a frozen block, so the grid ends up with at most
        ``grid_size`` holes. No check is made that the goal stays reachable.
        """
        self.state = 0
        self.grid = np.full((self.grid_size, self.grid_size), 'F') # Fill the grid with frozen blocks
        self.grid[0, 0] = 'S' # Place the start block
        self.grid[-1, -1] = 'G' # Place the goal block

        # Place the hole blocks randomly
        for _ in range(self.grid_size):
            i, j = np.random.randint(self.grid_size, size=2)
            # Only overwrite frozen blocks so the start/goal blocks are never replaced
            if self.grid[i, j] == 'F':
                self.grid[i, j] = 'H'

    # Step 3: Take an action and return the next state and reward
    def step(self, action):
        """Take an action and return the next state and reward

        Moves that would leave the grid are clamped, so the agent stays in place
        when it walks into a wall. An action outside 0-3 leaves the position
        unchanged.

        Args:
            action (int): action to take (0: left, 1: down, 2: right, 3: up)

        Returns:
            state (int): next state
            reward (int): reward for the cell the agent landed on
        """
        i, j = self.state // self.grid_size, self.state % self.grid_size
        if action == 0: # left
            j = max(j - 1, 0)
        elif action == 1: # down
            i = min(i + 1, self.grid_size - 1)
        elif action == 2: # right
            j = min(j + 1, self.grid_size - 1)
        elif action == 3: # up
            i = max(i - 1, 0)

        # Commit the move before computing the reward: the reward depends on
        # the cell the agent has just entered, not the one it left.
        self.state = i * self.grid_size + j

        reward = self._get_reward()

        return self.state, reward

    def _get_reward(self):
        """Return the reward based on the current state.

        Returns:
            reward (int): -1 for falling into a hole, 1 for reaching the goal, and 0 otherwise
        """
        i, j = self.state // self.grid_size, self.state % self.grid_size

        if self.grid[i, j] == 'H':  # If the agent falls into a hole
            return -1
        elif self.grid[i, j] == 'G':  # If the agent reaches the goal
            return 1
        else:  # If the agent is on a frozen (or start) block
            return 0


# Backward-compatible alias: this helper used to be defined at module level.
_get_reward = FrozenLake._get_reward

# TODO: Define the agent and the training loop. (This is a custom tutorial being built out with GPT-4.)

# Quick demo: build a lake with the default grid size, generate a layout, and
# print it. Holes are placed randomly in reset(), so the printed grid differs
# from run to run.
env = FrozenLake()
env.reset()
print(env.grid)

[['S' 'F' 'H' 'F']
 ['F' 'F' 'F' 'H']
 ['F' 'F' 'F' 'F']
 ['F' 'H' 'H' 'G']]
