# Reinforcement Learning : Value Iteration

Python Reinforcement Learning 

Chapter 03

# Frozen Lake

![title](../images/fig_frozenlake_statespace.png)

#### Actions :

[0,1,2,3] = [Left, Down, Right, Up]

In [2]:
"""import libraries"""
import gym
import numpy as np

In [3]:
"""create a simulation instance using make function"""
# Build the environment by its registered id; `is_slippery=True` is forwarded to
# the environment constructor (presumably makes movement stochastic, which would
# match the per-transition probabilities read from `env.P` later in this notebook
# -- confirm against the gym documentation).
env = gym.make('FrozenLake-v0', is_slippery=True)

In [4]:
"""initialize the environemnt using reset method"""
# reset() is the last expression of the cell, so its return value is what the
# notebook displays as the cell output below.
env.reset()

0

In [5]:
"""create the enviroment using render method"""
"""returns a popup window display of the environment"""
# NOTE(review): the recorded output below is a text grid with ANSI colour codes
# written to the cell output, so for this environment render() does not appear
# to open a popup window as the string above states.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
# Summarize the observation (state) space: the space object itself,
# the number of discrete states, and one randomly drawn state.
observation_space = env.observation_space
print('observation space:', observation_space)
print('state space  :', observation_space.n)
print('state sample :', observation_space.sample())

observation space: Discrete(16)
state space  : 16
state sample : 8


In [7]:
# Summarize the action space: the space object itself,
# the number of discrete actions, and one randomly drawn action.
action_space = env.action_space
print('action_space:', action_space)
print('action space  :', action_space.n)
print('action sample :', action_space.sample())

action_space: Discrete(4)
action space  : 4
action sample : 3


## Value Iteration

#### Method 1: compute the state values first, then extract the greedy policy in a separate step

In [8]:
def value_iteration(env, gamma=1.0, number_of_iter=10000, epsilon=1.0E-06):
    """Compute state values with synchronous value iteration.

    Every sweep applies the Bellman optimality backup to each state, reading
    only from a snapshot of the values produced by the previous sweep:

        V(s) <- max_a  sum_{s'} p(s' | s, a) * (r + gamma * V_prev(s'))

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        transition table ``P`` such that ``P[state][action]`` is an iterable of
        ``(probability, next_state, reward, done)`` tuples.
    gamma : float, default 1.0
        Discount factor applied to the value of the next state.
    number_of_iter : int, default 10000
        Maximum number of sweeps over the state space.
    epsilon : float, default 1.0E-06
        Convergence threshold: iteration stops as soon as the summed absolute
        change of all state values between two sweeps is ``<= epsilon``.

    Returns
    -------
    numpy.ndarray
        One value per state, indexed by state number.

    Notes
    -----
    The ``done`` flag of each transition is not used. If the sweep budget is
    exhausted before convergence, the latest values are returned without any
    message.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    state_values = np.zeros(n_states)

    for sweep in range(number_of_iter):
        # Snapshot of the previous sweep: all backups below read from it, so
        # the update is synchronous rather than in-place.
        previous_values = np.copy(state_values)

        for state in range(n_states):
            # Expected return of each action under the previous value estimate.
            action_values = [
                np.sum([
                    prob * (reward + gamma * previous_values[next_state])
                    for prob, next_state, reward, _done in env.P[state][action]
                ])
                for action in range(n_actions)
            ]
            state_values[state] = max(action_values)

        # Total absolute change across all states during this sweep.
        delta = np.sum(np.abs(previous_values - state_values))
        if delta <= epsilon:
            print("Value Iteration Converged at Step :", sweep + 1)
            print("Breaking out of the Loop ... ... ..")
            break

    return state_values

In [9]:
# Run Method 1 on the environment and show the resulting state values,
# rounded to four decimals for readability.
value_table = value_iteration(env, gamma=1.0)
rounded_values = [round(value, 4) for value in value_table]
print(rounded_values)

Value Iteration Converged at Step : 502
Breaking out of the Loop ... ... ..
[0.8235, 0.8235, 0.8235, 0.8235, 0.8235, 0.0, 0.5294, 0.0, 0.8235, 0.8235, 0.7647, 0.0, 0.0, 0.8824, 0.9412, 0.0]


In [10]:
def extract_policy(value_table, gamma=1.0, env=None):
    """Derive the greedy policy implied by a table of state values.

    For every state the expected return of each action is computed as
    ``sum_{s'} p(s' | s, a) * (r + gamma * value_table[s'])`` and the action
    with the largest expected return is selected (``np.argmax`` picks the
    lowest action index on ties).

    Parameters
    ----------
    value_table : sequence of float
        State values indexed by state number.
    gamma : float, default 1.0
        Discount factor applied to the value of the next state.
    env : object, optional
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        transition table ``P`` where ``P[state][action]`` is an iterable of
        ``(probability, next_state, reward, done)`` tuples. When omitted, the
        module-level variable named ``env`` is used, which keeps the earlier
        two-argument call style working.

    Returns
    -------
    numpy.ndarray
        Float array with one action index per state.
    """
    if env is None:
        # Backward compatibility: earlier callers relied on the notebook-level
        # `env` instead of passing the environment explicitly.
        env = globals()["env"]

    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # Kept as a float array (np.zeros default) so the returned dtype and its
    # printed form stay the same as before.
    policy_table = np.zeros(n_states)

    for state in range(n_states):
        action_values = np.zeros(n_actions)

        for action in range(n_actions):
            for prob, next_state, reward, _done in env.P[state][action]:
                action_values[action] += prob * (reward + gamma * value_table[next_state])

        policy_table[state] = np.argmax(action_values)

    return policy_table

In [10]:
# Derive the greedy policy from the state values computed by Method 1 and
# display it (one action index per state, stored as floats).
optimal_policy = extract_policy(value_table, gamma=1.0)
print(optimal_policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


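#### Method 2: track the greedy policy inside the value-iteration loop and return it together with the state values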

In [11]:
def value_iteration(env, gamma=1.0, number_of_iter=10000, epsilon=1.0E-06):
    """Run synchronous value iteration and return both values and greedy policy.

    Every sweep applies the Bellman optimality backup to each state, reading
    only from a snapshot of the values produced by the previous sweep, and
    records the maximizing action of each state as it goes:

        V(s) <- max_a  sum_{s'} p(s' | s, a) * (r + gamma * V_prev(s'))

    NOTE: this definition reuses the name ``value_iteration``; once this cell
    has run it replaces any earlier definition of the same name, and it returns
    a ``(values, policy)`` pair.

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        transition table ``P`` such that ``P[state][action]`` is an iterable of
        ``(probability, next_state, reward, done)`` tuples.
    gamma : float, default 1.0
        Discount factor applied to the value of the next state.
    number_of_iter : int, default 10000
        Maximum number of sweeps over the state space.
    epsilon : float, default 1.0E-06
        Convergence threshold: iteration stops as soon as the summed absolute
        change of all state values between two sweeps is ``<= epsilon``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``value_table`` with one value per state, and ``policy_table`` with one
        action index per state (float array; ties resolve to the lowest action
        index because ``np.argmax`` returns the first maximum).

    Notes
    -----
    The ``done`` flag of each transition is not used. If the sweep budget is
    exhausted before convergence, the latest tables are returned without any
    message.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    value_table = np.zeros(n_states)
    policy_table = np.zeros(n_states)

    for sweep in range(number_of_iter):
        # Snapshot of the previous sweep: all backups below read from it, so
        # the update is synchronous rather than in-place.
        previous_values = np.copy(value_table)

        for state in range(n_states):
            action_values = np.zeros(n_actions)

            for action in range(n_actions):
                for prob, next_state, reward, _done in env.P[state][action]:
                    action_values[action] += prob * (reward + gamma * previous_values[next_state])

            # Locate the best action once and reuse it for both tables.
            best_action = np.argmax(action_values)
            policy_table[state] = best_action
            value_table[state] = action_values[best_action]

        # Total absolute change across all states during this sweep.
        delta = np.sum(np.abs(previous_values - value_table))
        if delta <= epsilon:
            print("Value Iteration Converged at Step :", sweep + 1)
            print("Breaking out of the Loop ... ... ..")
            break

    return value_table, policy_table

In [12]:
# Method 2 returns the state values and the greedy policy together.
value_table, policy_table = value_iteration(env, gamma=1.0)
print( [round(x,4) for x in value_table ] )
# Print the policy returned by this call, not a variable left over from an
# earlier cell, so the cell reports its own result and runs on a fresh kernel.
print(policy_table)

Value Iteration Converged at Step : 502
Breaking out of the Loop ... ... ..
[0.8235, 0.8235, 0.8235, 0.8235, 0.8235, 0.0, 0.5294, 0.0, 0.8235, 0.8235, 0.7647, 0.0, 0.0, 0.8824, 0.9412, 0.0]
[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [13]:
# Close the environment at the end of the notebook.
env.close()