In [23]:
import gym
import numpy as np

In [45]:
# Create the FrozenLake environment with OpenAI's Gym.
env = gym.make("FrozenLake-v1") # or the latest available version

In [85]:
# Inspect the environment: number of discrete states and number of actions.
observation_space, action_space = env.observation_space.n, env.action_space.n

# One value per line: states first, then actions.
print(observation_space, action_space, sep="\n")

16
4


In [204]:
def value_iteration(env, gamma = 1.0):
    """
    Run value iteration and return the state values and the Q table.

    Inputs:
        - env: the frozen lake environment. It must expose
          ``observation_space.n``, ``action_space.n`` and a transition model
          ``P`` where ``P[state][action]`` is an iterable of
          ``(transition_probability, next_state, reward, done)`` tuples.
        - gamma: discount factor

    Returns:
        - value_table: state value function, array of shape (n_states,).
        - Q_value: state-action value function (Q function), array of shape
          (n_states, n_actions), as computed during the final sweep.
    """
    # Take both sizes from the environment argument instead of relying on
    # notebook-level globals, so the function works on a fresh kernel.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Read the transition model from the base environment when the object
    # exposes `unwrapped`; otherwise use the object itself.
    transitions = getattr(env, "unwrapped", env).P

    value_table = np.zeros(n_states)
    q_table = np.zeros((n_states, n_actions))

    threshold = 1e-10   # stop once the summed absolute change is this small
    max_iterations = 10000

    for i in range(max_iterations):
        # Every backup in this sweep reads the values of the previous sweep.
        prev_value_table = np.copy(value_table)

        for state in range(n_states):
            for action in range(n_actions):
                expected_return = 0.0
                # The `done` flag is unpacked but not used in the backup.
                for transition_probability, next_state, reward, done in transitions[state][action]:
                    expected_return += transition_probability * (
                        reward + gamma * prev_value_table[next_state]
                    )
                q_table[state, action] = expected_return

            value_table[state] = q_table[state].max()

        # check for convergence
        if np.sum(np.fabs(prev_value_table - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d.' % (i + 1))
            break

    return value_table, q_table

In [205]:
def extract_policy(value_table, gamma = 1.0, env = None):
    """
    Build the greedy policy for a given state value function.

    Inputs:
        - value_table: state value function, indexable by state.
        - gamma: discount factor
        - env: environment exposing ``observation_space.n``,
          ``action_space.n`` and a transition model ``P`` where
          ``P[state][action]`` is an iterable of
          ``(transition_probability, next_state, reward, done)`` tuples.
          When omitted, the notebook-level ``env`` variable is used, which
          keeps existing two-argument calls working.

    Returns:
        - policy: integer array holding, for each state, the action with
          the largest expected return (first one wins on ties).

    Raises:
        - NameError: if ``env`` is not passed and no notebook-level ``env``
          exists.
    """
    if env is None:
        # Backward-compatible fallback for callers that rely on the global.
        env = globals().get("env")
        if env is None:
            raise NameError("extract_policy needs an `env` argument or a global `env`")

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Read the transition model from the base environment when the object
    # exposes `unwrapped`; otherwise use the object itself.
    transitions = getattr(env, "unwrapped", env).P

    # Actions are indices, so store them as integers rather than floats.
    policy = np.zeros(n_states, dtype=int)

    for state in range(n_states):
        q_table = np.zeros(n_actions)

        for action in range(n_actions):
            # The `done` flag is unpacked but not used in the backup.
            for transition_probability, next_state, reward, done in transitions[state][action]:
                q_table[action] += transition_probability * (
                    reward + gamma * value_table[next_state]
                )

        # getting argmax
        policy[state] = np.argmax(q_table)

    return policy

In [230]:
# Run value iteration on the environment without discounting (gamma = 1.0).
optimal_value_function, q_value = value_iteration(env=env, gamma=1.0)

Value-iteration converged at iteration# 877.


In [231]:
# Derive the greedy policy from the computed state values, using the same
# gamma as the value-iteration call.
optimal_policy = extract_policy(optimal_value_function, gamma=1.0)

In [232]:
# Show the action index chosen for each state.
print(optimal_policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [235]:
# Evaluate the extracted policy: play full episodes and average the return.
n_episodes = 1000
all_rewards = []
for _episode in range(n_episodes):
    # reset() returns (observation, info); only the observation is needed.
    obs = env.reset()[0]
    total_reward = 0.0
    while True:
        # The policy array may hold floats; the environment expects an
        # integer action index.
        action = int(optimal_policy[obs])
        # With the 5-value step API the third and fourth items are
        # `terminated` and `truncated`; the info dict comes last.
        obs, reward, terminated, truncated, _info = env.step(action)
        total_reward += reward
        # End the episode on a terminal state OR when the time limit hits,
        # so a truncated episode is not stepped further.
        if terminated or truncated:
            all_rewards.append(total_reward)
            break

print("Average Reward: ", np.mean(all_rewards))

Average Reward:  0.838
