In [1]:
import gym
import numpy as np

In [2]:
# Build the FrozenLake environment used throughout the notebook.
env = gym.make('FrozenLake-v0')

[2019-02-08 16:59:18,656] Making new env: FrozenLake-v0


In [3]:
# Render the grid map of the environment as text.
env.render()

[41mS[0mFFF
FHFH
FFFH
HFFG



<ipykernel.iostream.OutStream at 0x7efeeb4f0198>

In [4]:
# Number of discrete states in the environment.
print(env.observation_space.n)

16


In [5]:
# Number of discrete actions available in each state.
print(env.action_space.n)

4


The four actions are encoded as integers:

- LEFT = 0
- DOWN = 1
- RIGHT = 2
- UP = 3

In [6]:
# Transition model for state 0: a dict mapping each action to a list of
# (probability, next_state, reward, terminal_flag) tuples.
env.P[0]

{0: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False)],
 1: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 1, 0.0, False)],
 2: [(0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)],
 3: [(0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)]}

### Value Iteration algorithm

In [7]:
def value_iteration(env, gamma=1.0, num_iterations=100000, threshold=1e-20):
    """Estimate the optimal state-value function with value iteration.

    Every sweep applies the Bellman optimality backup to each state,

        V(s) <- max_a  sum_{s'} p(s'|s,a) * (r(s,a,s') + gamma * V(s')),

    where the V(s') on the right-hand side is read from a snapshot taken at
    the start of the sweep (synchronous update).

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n``, ``action_space.n`` and a
        transition model ``P`` such that ``P[state][action]`` is an iterable
        of ``(probability, next_state, reward, _)`` tuples. The fourth
        element of each tuple is ignored.
    gamma : float, optional
        Discount factor applied to the value of the next state.
    num_iterations : int, optional
        Maximum number of sweeps over the state space.
    threshold : float, optional
        Iteration stops as soon as the summed absolute change of the value
        table over one sweep is less than or equal to this number.

    Returns
    -------
    numpy.ndarray
        Value estimate for each state, indexed by state number. If the
        threshold is not reached within ``num_iterations`` sweeps, the
        latest estimate is returned and no message is printed.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Start from an all-zero value estimate.
    value_table = np.zeros(n_states)

    for i in range(num_iterations):
        # Snapshot of the previous sweep; all backups below read from it so
        # that updates made during this sweep do not influence each other.
        previous_values = np.copy(value_table)

        for state in range(n_states):
            q_values = []
            for action in range(n_actions):
                # One term per possible transition: p * (r + gamma * V(s')).
                backup_terms = [
                    trans_prob * (reward + gamma * previous_values[next_state])
                    for trans_prob, next_state, reward, _ in env.P[state][action]
                ]
                q_values.append(np.sum(backup_terms))

            # V(state) <- max over actions of Q(state, action)
            value_table[state] = max(q_values)

        # Converged once a full sweep changes the table by at most `threshold`.
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            print('Value Iteration converging at iteration:{}'.format(i+1))
            break

    return value_table

In [21]:
def extract_policy(env, value_table, gamma=1.0):
    """Derive the greedy policy implied by a state-value table.

    For every state the action value

        Q(s, a) = sum_{s'} p(s'|s,a) * (r(s,a,s') + gamma * V(s'))

    is accumulated from ``env.P[s][a]`` and the action with the largest
    Q-value is selected (ties resolve to the lowest action index, as
    ``np.argmax`` returns the first maximum).

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n``, ``action_space.n`` and ``P``,
        where ``P[state][action]`` yields ``(probability, next_state,
        reward, _)`` tuples. The fourth element is ignored.
    value_table : indexable of float
        Value of each state, indexed by state number.
    gamma : float, optional
        Discount factor applied to the value of the next state.

    Returns
    -------
    numpy.ndarray
        Chosen action per state. The array is created with ``np.zeros`` and
        is therefore float-typed; cast entries to ``int`` before using them
        as action indices.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.zeros(n_states)

    for s in range(n_states):
        # Fresh accumulator of Q(s, a) for every action a.
        action_values = np.zeros(n_actions)

        for a in range(n_actions):
            for prob, s_next, reward, _ in env.P[s][a]:
                # Add this successor's contribution to Q(s, a).
                action_values[a] += prob * (reward + gamma * value_table[s_next])

        # pi(s) <- argmax over actions of Q(s, a)
        policy[s] = np.argmax(action_values)

    return policy

In [9]:
# Run value iteration on the FrozenLake environment with discount 0.9.
optimal_value_function = value_iteration(env=env, gamma=0.9)

Value Iteration converging at iteration:267


In [10]:
# Inspect the resulting value estimate, one entry per state.
print(optimal_value_function)

[ 0.0688909   0.06141457  0.07440976  0.05580732  0.09185454  0.
  0.11220821  0.          0.14543635  0.24749695  0.29961759  0.          0.
  0.3799359   0.63902015  0.        ]


In [11]:
# extract_policy expects the environment as its first argument; without it
# the call raises a TypeError when the notebook is run on a fresh kernel.
optimal_policy = extract_policy(env, optimal_value_function, gamma=0.9)

In [12]:
# Greedy action chosen for each state (0=LEFT, 1=DOWN, 2=RIGHT, 3=UP).
print(optimal_policy)

[ 0.  3.  0.  3.  0.  0.  0.  0.  3.  1.  0.  0.  0.  2.  1.  0.]


In [13]:
from gym.envs.registration import register

In [14]:
# Register a FrozenLake variant on the 4x4 map with slipping disabled
# (is_slippery=False) under a new environment id.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False}
)

In [15]:
# Instantiate the non-slippery variant registered under this id.
env_d = gym.make('FrozenLakeNotSlippery-v0')

[2019-02-08 16:59:18,882] Making new env: FrozenLakeNotSlippery-v0


In [16]:
# Run value iteration on the non-slippery environment with discount 0.9.
optimal_value_function_d = value_iteration(env=env_d, gamma=0.9)

Value Iteration converging at iteration:7


In [22]:
# Greedy policy for the non-slippery environment, from its value table.
optimal_policy_d = extract_policy(env_d, optimal_value_function_d, gamma=0.9)

In [23]:
# Greedy action chosen for each state (0=LEFT, 1=DOWN, 2=RIGHT, 3=UP).
print(optimal_policy_d)

[ 1.  2.  1.  0.  1.  0.  1.  0.  2.  1.  1.  0.  0.  2.  2.  0.]


In [19]:
# Transitions for action 0 (LEFT) taken in state 0 of the non-slippery env.
env_d.P[0][0]

[(1.0, 0, 0.0, False)]