# Frozen Lake

The Frozen Lake environment is a stochastic grid-world environment with episodes of finite length. You can find the Gymnasium documentation [here](https://gymnasium.farama.org/environments/toy_text/frozen_lake/).

### Tasks:

Write your own frozen lake agent or complete the following code:
- Calculate the q-values using the Bellman Optimality Equation
- Determine the optimal policy

Interpretation:
- Print your learning progress (q-values, v-values, learned policy)
- How fast does your agent learn the optimal policy?

Test your code and experiment with different hyperparameters, e.g.
- Discount factor
- Exploration strategy (e.g., epsilon-greedy with different epsilon values)
- Map size (e.g., 4x4, 8x8). How does the map size affect learning and performance?
- Is the training performance comparable to the test performance? Why (not)?
...


In [4]:
import numpy as np
import gymnasium as gym

# Create the environment, with slippery cells and a 4x4 map
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True)
# work on the base environment, which exposes the transition table P
env = env.unwrapped
# get environment details
num_states = env.observation_space.n  # 16 cells
num_actions = env.action_space.n  # 4 actions
print(env)
# transition model: {state: {action: [(trans. prob., next state, reward, done), ...]}}
P = env.P
print(P)
# hyperparameters
gamma = 0.99  # discount factor
theta = 1e-3  # convergence threshold


def q_values(P, V, state, gamma):
    """Return the q-value of every action in `state` via a one-step look-ahead.

    Implements the Bellman optimality backup
    q(s, a) = sum_s' p(s' | s, a) * (r + gamma * V(s')).

    Args:
        P: transition model, {state: {action: [(prob, next_state, reward, done), ...]}}
        V: array with the current value estimate of every state
        state: index of the state to evaluate
        gamma: discount factor

    Returns:
        1-D array holding one q-value per action of `state`.
    """
    q = np.full(len(P[state]), -np.inf)
    for action, outcomes in P[state].items():
        # A transition flagged `done` ends the episode, so nothing is
        # bootstrapped from the value of the state it leads to.
        q[action] = sum(
            prob * (reward + (0.0 if done else gamma * V[next_state]))
            for prob, next_state, reward, done in outcomes
        )
    return q


# initialise the value function with all zeros
V = np.zeros(num_states)

# Value Iteration Algorithm (in-place sweeps: updated values are reused
# immediately within the same sweep)
while True:
    delta = 0  # largest value change seen in this sweep

    # loop over all states
    for state in range(num_states):
        v = V[state]
        V[state] = q_values(P, V, state, gamma).max()
        # update the delta
        delta = max(delta, abs(v - V[state]))

    # check for convergence
    if delta < theta:
        break
print(V)

### determine the optimal policy ###
# the greedy policy picks, in every state, the action with the highest q-value
policy = np.zeros(num_states, dtype=int)
for state in range(num_states):
    policy[state] = np.argmax(q_values(P, V, state, gamma))

print(policy)





<FrozenLakeEnv<FrozenLake-v1>>
{0: {0: [(0.33333333333333337, 0, 0, False), (0.3333333333333333, 0, 0, False), (0.33333333333333337, 4, 0, False)], 1: [(0.33333333333333337, 0, 0, False), (0.3333333333333333, 4, 0, False), (0.33333333333333337, 1, 0, False)], 2: [(0.33333333333333337, 4, 0, False), (0.3333333333333333, 1, 0, False), (0.33333333333333337, 0, 0, False)], 3: [(0.33333333333333337, 1, 0, False), (0.3333333333333333, 0, 0, False), (0.33333333333333337, 0, 0, False)]}, 1: {0: [(0.33333333333333337, 1, 0, False), (0.3333333333333333, 0, 0, False), (0.33333333333333337, 5, 0, True)], 1: [(0.33333333333333337, 0, 0, False), (0.3333333333333333, 5, 0, True), (0.33333333333333337, 2, 0, False)], 2: [(0.33333333333333337, 5, 0, True), (0.3333333333333333, 2, 0, False), (0.33333333333333337, 1, 0, False)], 3: [(0.33333333333333337, 2, 0, False), (0.3333333333333333, 1, 0, False), (0.33333333333333337, 0, 0, False)]}, 2: {0: [(0.33333333333333337, 2, 0, False), (0.3333333333333333, 

In [None]:
# evaluate the policy over a number of episodes
def evaluate_policy(env, policy, episodes=100):
    """Roll out `policy` in `env` and return the fraction of successful episodes.

    An episode counts as a success when it ends with `terminated` set and the
    rewards collected during the episode sum to exactly 1.

    Args:
        env: environment offering `reset()` -> (observation, info) and
            `step(action)` -> (observation, reward, terminated, truncated, info)
        policy: indexable mapping from observation to action
        episodes: number of episodes to run

    Returns:
        Number of successful episodes divided by `episodes`.
    """
    wins = 0
    for _ in range(episodes):
        observation, info = env.reset()
        episode_return = 0

        # follow the policy until the environment signals the end of the episode
        while True:
            observation, reward, terminated, truncated, info = env.step(policy[observation])
            episode_return += reward
            if terminated or truncated:
                break

        # a truncated-only episode never counts, whatever reward it collected
        if terminated and episode_return == 1:
            wins += 1
    return wins / episodes


def pretty_print_map(map):
    """Print the lake map, one row per line with cells separated by spaces.

    Args:
        map: array of byte strings (e.g. b'S'), one entry per cell
    """
    print('\nmap: ')
    # decode the byte strings into regular strings before printing
    decoded_rows = np.char.decode(map, 'utf-8')
    for cells in decoded_rows:
        line = ' '.join(cells)  # plain text, without brackets or quotes
        print(line)
    # finish with an empty line
    print()

def pretty_print_policy(policy):
    """Print the action of every state as a direction name, all on one line.

    Args:
        policy: iterable of action indices (0=left, 1=down, 2=right, 3=up)
    """
    action_names = {0: 'left', 1: 'down', 2: 'right', 3: 'up'}
    print('directions: ')
    # every name is followed by a single space, then the line is closed
    # and followed by one empty line
    print(''.join(f'{action_names[action]} ' for action in policy), end='\n\n')

# roll out the computed policy, then report the map, the policy and the success rate
success_rate = evaluate_policy(env, policy, episodes=100)
# env.unwrapped.desc gives the map as byte strings
pretty_print_map(map=env.unwrapped.desc)
pretty_print_policy(policy=policy)
print(f'Success Rate: {success_rate * 100:.2f}%')




map: 
S F F F
F H F H
F F F H
H F F G

directions: 
left up up up left left left left up down left left left right down left 

Success Rate: 84.00%


In [85]:
class Frozen_lake:
    """Value-iteration agent for the slippery FrozenLake-v1 environment.

    Constructing an instance creates the environment, solves it with value
    iteration on the environment's transition table and stores the resulting
    state values (`V`) and greedy policy (`policy`). `run` then rolls the
    policy out and prints a short report.

    Args:
        gamma: discount factor of the Bellman optimality backup
        theta: convergence threshold; sweeps stop once the largest value
            change within a sweep is below it
        episodes: number of evaluation episodes
        epsilon: probability of replacing the policy's action with a uniformly
            random one at each evaluation step
        seed: seed for the exploration generator and for the environment
        map_name: name of a built-in FrozenLake map, e.g. "4x4" or "8x8"
    """

    def __init__(self, gamma=0.99, theta=1e-3, episodes=100, epsilon=0, seed=1,
                 map_name="4x4"):
        self.gamma = gamma
        self.theta = theta
        self.epsilon = epsilon
        self.episodes = episodes
        self.map_name = map_name
        # Instance-local generator rather than seeding the global np.random
        # state: creating an agent no longer resets the random stream of other
        # agents or of unrelated code. RandomState(seed) yields the same legacy
        # stream that np.random.seed(seed) would.
        self.rng = np.random.RandomState(seed)
        env = gym.make('FrozenLake-v1', map_name=map_name, is_slippery=True)
        # work on the base environment, which exposes the transition table P
        self.env = env.unwrapped
        # Seed the environment's own generator once so that the slips are
        # reproducible as well; later resets continue that stream.
        self.env.reset(seed=seed)
        # get environment details
        self.num_states = self.env.observation_space.n
        self.num_actions = self.env.action_space.n
        # transition model: {state: {action: [(prob, next state, reward, done), ...]}}
        self.V, self.policy = self._value_iteration(
            self.env.P, self.num_states, gamma, theta
        )

    @staticmethod
    def _action_values(P, V, state, gamma):
        """Return the q-value of every action in `state` (one-step look-ahead)."""
        q = np.full(len(P[state]), -np.inf)
        for action, outcomes in P[state].items():
            # A transition flagged `done` ends the episode, so nothing is
            # bootstrapped from the value of the state it leads to.
            q[action] = sum(
                prob * (reward + (0.0 if done else gamma * V[next_state]))
                for prob, next_state, reward, done in outcomes
            )
        return q

    @classmethod
    def _value_iteration(cls, P, num_states, gamma, theta):
        """Run in-place value iteration and return (state values, greedy policy)."""
        V = np.zeros(num_states)
        while True:
            delta = 0  # largest value change seen in this sweep
            for state in range(num_states):
                previous = V[state]
                V[state] = cls._action_values(P, V, state, gamma).max()
                delta = max(delta, abs(previous - V[state]))
            # check for convergence
            if delta < theta:
                break

        # the greedy policy picks the action with the highest q-value per state
        policy = np.zeros(num_states, dtype=int)
        for state in range(num_states):
            policy[state] = np.argmax(cls._action_values(P, V, state, gamma))
        return V, policy

    def pretty_print_map(self):
        """Print the lake map, one row per line with cells separated by spaces."""
        print('\nmap: ')
        # the map is a numpy array of byte strings (i.e. b'S'); decode it first
        desc = np.char.decode(self.env.unwrapped.desc, 'utf-8')
        for row in desc:
            print(' '.join(row))  # removes brackets etc.
        print(end='\n')

    def pretty_print_policy(self):
        """Print the policy's action for every state as a direction name."""
        policy_directions = {0: 'left', 1: 'down', 2: 'right', 3: 'up'}
        print('directions: ')
        for direction in self.policy:
            print(policy_directions[direction], end=' ')
        print(end='\n\n')

    def evaluate_policy(self):
        """Roll out the policy epsilon-greedily and return the success rate.

        An episode counts as a success when it ends with `terminated` set and
        the rewards collected during the episode sum to exactly 1.

        Returns:
            Number of successful episodes divided by `self.episodes`.
        """
        success = 0
        for _ in range(self.episodes):
            observation, info = self.env.reset()
            total_reward = 0
            while True:
                action = self.policy[observation]
                # epsilon-greedy: occasionally act uniformly at random instead
                if self.rng.random_sample() < self.epsilon:
                    action = self.rng.randint(0, self.num_actions)
                observation, reward, terminated, truncated, info = self.env.step(action)
                total_reward += reward
                if terminated or truncated:
                    break
            # a truncated-only episode never counts as a success
            if terminated and total_reward == 1:
                success += 1
        return success / self.episodes

    def run(self):
        """Evaluate the policy, then print the map, the policy and the result."""
        success_rate = self.evaluate_policy()
        self.pretty_print_map()
        self.pretty_print_policy()
        print("-------")
        print(f"Hyperparameters: gamma={self.gamma}, theta={self.theta}, episodes={self.episodes}, epsilon={self.epsilon}")
        print(f'Success Rate: {success_rate * 100:.2f}%')




In [86]:
# Baseline: default hyperparameters (gamma=0.99, theta=1e-3, 100 episodes, epsilon=0)
test1 = Frozen_lake()
print(test1.policy)
test1.run()
# Play around with the hyperparameters, one experiment per instance.
# Random actions during evaluation (epsilon > 0), with 100 and 1000 episodes
test2 = Frozen_lake(epsilon=0.1)
test2.run()
test3 = Frozen_lake(episodes=1000,epsilon=0.1)
test3.run()
test4 = Frozen_lake(episodes=1000,epsilon=0.2)
test4.run()
# Tighter convergence threshold
test5=Frozen_lake(episodes=100, theta = 1e-5)
test5.run()
# Smaller discount factor, with the default and with a tighter threshold
test6 = Frozen_lake(episodes=100, gamma = 0.9)
test6.run()
test7 = Frozen_lake(episodes=100, gamma = 0.9, theta = 1e-5)
test7.run()
# Very tight convergence threshold, evaluated over 1000 episodes
test8=Frozen_lake(episodes=1000, theta = 1e-7)
test8.run()

[0 3 3 3 0 0 0 0 3 1 0 0 0 2 1 0]

map: 
S F F F
F H F H
F F F H
H F F G

directions: 
left up up up left left left left up down left left left right down left 

-------
Hyperparameters: gamma=0.99, theta=0.001, episodes=100, epsilon=0
Success Rate: 84.00%

map: 
S F F F
F H F H
F F F H
H F F G

directions: 
left up up up left left left left up down left left left right down left 

-------
Hyperparameters: gamma=0.99, theta=0.001, episodes=100, epsilon=0.1
Success Rate: 49.00%

map: 
S F F F
F H F H
F F F H
H F F G

directions: 
left up up up left left left left up down left left left right down left 

-------
Hyperparameters: gamma=0.99, theta=0.001, episodes=1000, epsilon=0.1
Success Rate: 41.90%

map: 
S F F F
F H F H
F F F H
H F F G

directions: 
left up up up left left left left up down left left left right down left 

-------
Hyperparameters: gamma=0.99, theta=0.001, episodes=1000, epsilon=0.2
Success Rate: 24.40%

map: 
S F F F
F H F H
F F F H
H F F G

directions: 
left up up up