In [1]:
import numpy as np

In [7]:
# Step 1: Define the action and state space of the environment
class FrozenLake:
    """Minimal grid-world version of the Frozen Lake environment.

    The agent starts in the top-left cell and tries to reach the bottom-right
    cell without stepping on a hole. States are flat integers in row-major
    order: cell (i, j) maps to ``i * grid_size + j``.
    """

    def __init__(self, grid_size=4):
        """Init the environment

        state_space: array of all possible states (0 .. grid_size**2 - 1)
        action_space: array of all possible actions (0 .. 3)

        Note that the grid itself is only created by ``reset()``; call it
        before the first ``step()``.

        Args:
            grid_size (int, optional): size of the grid. Defaults to 4.
        """
        self.grid_size = grid_size
        self.state_space = np.arange(grid_size * grid_size)
        self.action_space = np.arange(4)  # 0: left, 1: down, 2: right, 3: up

    # Step 2: Initialize/reset the env
    def reset(self):
        """Reset the environment to the initial state

        (0,0) is a start block, (-1,-1) is a goal block, and all others are
        frozen blocks with holes scattered randomly. A new random hole layout
        is drawn on every call.

        States are represented as an integer because it is simpler. (0,0) is
        implicitly mapped to 0, (0,1) mapped to 1, etc.

        F: frozen block
        S: start block
        G: goal block
        H: hole block

        Returns:
            state (int): the initial state (always 0)
        """
        self.state = 0
        # Fill the grid with frozen blocks, then mark start and goal
        self.grid = np.full((self.grid_size, self.grid_size), 'F')
        self.grid[0, 0] = 'S'
        self.grid[-1, -1] = 'G'

        # Make grid_size placement attempts; an attempt that lands on the
        # start, the goal, or an existing hole is skipped, so the final
        # number of holes is at most grid_size.
        for _ in range(self.grid_size):
            i, j = np.random.randint(self.grid_size, size=2)
            if self.grid[i, j] == 'F':
                self.grid[i, j] = 'H'

        return self.state

    # Step 3: Take an action and return the next state and reward
    def step(self, action):
        """Take an action and return the next state and reward

        Moves are clamped at the grid border, so walking into a wall leaves
        the agent where it is. An action outside 0-3 also leaves the position
        unchanged.

        Args:
            action (int): action to take (0: left, 1: down, 2: right, 3: up)

        Returns:
            state (int): next state
            reward (int): reward for landing on the next state
        """
        i, j = self.state // self.grid_size, self.state % self.grid_size
        last = self.grid_size - 1

        if action == 0:  # left
            j = max(j - 1, 0)
        elif action == 1:  # down
            i = min(i + 1, last)
        elif action == 2:  # right
            j = min(j + 1, last)
        elif action == 3:  # up
            i = max(i - 1, 0)

        # Commit the move before computing the reward: _get_reward() reads
        # self.state, so the reward must reflect the cell just entered.
        self.state = i * self.grid_size + j
        reward = self._get_reward()

        return self.state, reward

    def _get_reward(self):
        """Return the reward based on the current state.

        Returns:
            reward (int): -1 for falling into a hole, 1 for reaching the goal, and 0 otherwise
        """
        i, j = self.state // self.grid_size, self.state % self.grid_size
        cell = self.grid[i, j]

        if cell == 'H':  # If the agent falls into a hole
            return -1
        if cell == 'G':  # If the agent reaches the goal
            return 1
        return 0  # Start block or frozen block

# Smoke test: build a default (4x4) environment, generate a random layout, and display it
env = FrozenLake()
env.reset()
print(env.grid)

[['S' 'F' 'H' 'F']
 ['F' 'F' 'F' 'H']
 ['F' 'F' 'F' 'F']
 ['F' 'H' 'H' 'G']]


In [None]:
# Step 4: Define a simple agent
class SimpleAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy."""

    def __init__(self, num_states, num_actions, alpha=0.5, gamma=0.95, epsilon=0.1):
        """Init the agent

        Args:
            num_states (int): number of states
            num_actions (int): number of actions
            alpha (float, optional): learning rate. Defaults to 0.5.
            gamma (float, optional): discount factor. Defaults to 0.95.
            epsilon (float, optional): exploration rate. Defaults to 0.1.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        # Q table: Expected return (not reward) for each state-action pair
        self.Q = np.zeros((num_states, num_actions))

    def get_action(self, state):
        """Pick a random action with probability epsilon, otherwise pick the best action

        Ties between equally valued actions resolve to the lowest action
        index, because that is how np.argmax breaks ties.

        Args:
            state (int): current state

        Returns:
            action (int): index of the chosen action
        """
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.num_actions)  # explore
        return np.argmax(self.Q[state])  # exploit

    def update_Q(self, state, action, reward, next_state):
        """Apply one Q-learning update for an observed transition

        Moves Q[state, action] a fraction alpha of the way toward the TD
        target ``reward + gamma * max_a Q[next_state, a]``.

        Args:
            state (int): state the action was taken from
            action (int): action that was taken
            reward (float): reward received for the transition
            next_state (int): state reached after the action
        """
        # Value of the greedy action in the next state (off-policy target)
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        td_error = td_target - self.Q[state, action]
        self.Q[state, action] += self.alpha * td_error

In [None]:
# Step 5: Training loop

# Initialize environment and agent.
# The environment is created first so the agent's Q table is sized from this
# instance rather than from whatever `env` an earlier cell left behind.
env = FrozenLake()
agent = SimpleAgent(env.state_space.shape[0], env.action_space.shape[0])

# Training parameters
num_episodes = 5000
max_steps_per_episode = 100

for episode in range(num_episodes):
    # Start a fresh episode. The start state is read from the environment
    # itself so the loop never feeds a None state to the agent.
    env.reset()
    state = env.state

    for step in range(max_steps_per_episode):
        action = agent.get_action(state)
        next_state, reward = env.step(action)
        agent.update_Q(state, action, reward, next_state)

        state = next_state

        if reward == -1 or reward == 1:  # agent fell in a hole or reached the goal
            break

    # Print out progress
    if (episode + 1) % 1000 == 0:
        print(f"Episode {episode + 1}/{num_episodes} completed")

print("Training finished.")
