In [1]:
import gym
import numpy as np

In [11]:
# Build the 8x8 FrozenLake environment that the later cells operate on and
# print its grid. (Set env_id = 'FrozenLake-v1' to try the other map instead.)
env_id = 'FrozenLake8x8-v1'
env = gym.make(env_id)
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


### policy iteration
1. choose a random policy
2. evaluate this policy by calculating state value V(s) using Bellman expectation equation
3. using V, extract best policy as new_policy
4. Repeat steps 2–3 with new_policy; stop when the policy no longer changes ( new_policy = old_policy )

$$
V_\pi(s) = \sum_a \pi(a|s) \sum_{s'} p(s'|s,a) \big[ r + \gamma V_\pi(s') \big]
$$

### Equations

$$ q(s,a) = \sum_{s'} p(s'|s,a) \big[ r + \gamma V(s') \big] $$

$$V^*(s)=\max_{a} q(s,a) $$

$$V_\pi(s)=\sum_a \pi(a|s) q(s,a) $$

$$\pi^*(s)=\underset{a}{\operatorname{argmax}} \; q(s,a) $$


In [12]:
def calc_qvalues(env, state, V, gamma):
    """Return the action values q(state, a) for every action a.

    Args:
        env: object exposing ``nA`` (number of actions) and ``P``, where
            ``P[state][a]`` iterates over 4-tuples
            ``(probability, next_state, reward, done)``.
        state: index of the state whose action values are computed.
        V: current state-value estimates, indexable by ``next_state``.
        gamma: discount factor applied to the successor state's value.

    Returns:
        A numpy array of length ``env.nA``; entry ``a`` is the
        probability-weighted sum of ``reward + gamma * V[next_state]``
        over the transitions listed for ``(state, a)``.
    """
    q_values = np.zeros(env.nA)
    for action in range(env.nA):
        # The `done` flag of each transition is unpacked but not used here.
        q_values[action] = sum(
            prob * (reward + gamma * V[successor])
            for prob, successor, reward, _done in env.P[state][action]
        )
    return q_values

In [13]:
def evaluate_policy(env, policy, gamma, theta=1e-9, max_iterations=1e9, verbose=False):
    """Iteratively estimate the state values V_pi(s) of ``policy``.

    Each sweep visits every state in index order and overwrites ``V[state]``
    in place with the policy-weighted sum of its action values, so later
    states in the same sweep already see the updated entries.

    Args:
        env: object exposing ``nS`` (number of states) plus whatever
            ``calc_qvalues`` reads from it.
        policy: per-state action weights; ``policy[state]`` is multiplied
            element-wise with the action values of that state.
        gamma: discount factor forwarded to ``calc_qvalues``.
        theta: a sweep whose largest absolute value change is below this
            threshold ends the evaluation.
        max_iterations: upper bound on the number of sweeps (cast to int).
        verbose: when true, print the largest change every 100th sweep.

    Returns:
        A numpy array of length ``env.nS`` with the estimated state values.
    """
    V = np.zeros(env.nS)
    for sweep in range(int(max_iterations)):
        largest_change = 0
        for s in range(env.nS):
            updated = np.sum(policy[s] * calc_qvalues(env, s, V, gamma))
            largest_change = max(largest_change, np.abs(V[s] - updated))
            V[s] = updated

        if largest_change < theta:
            # The convergence message is printed regardless of `verbose`.
            print(f'converged at {sweep}th iteration')
            break
        if verbose and sweep % 100 == 0:
            print(f'i={sweep} delta={largest_change}')
    return V

### policy iteration algorithm

In [14]:
# Discount factor handed to evaluate_policy / calc_qvalues (1.0 = no discounting).
gamma = 1.0

In [15]:
# Policy iteration: start from the uniform policy, then alternate between
# evaluating the current policy and making it greedy w.r.t. the resulting V.
policy = np.full((env.nS, env.nA), 1.0 / env.nA)
max_itr = 1e9
for i in range(int(max_itr)):
    V = evaluate_policy(env, policy, gamma)

    # Improvement step: the policy is stable only if no state's greedy action
    # differs from the action the current policy already prefers.
    stable = True
    for state in range(env.nS):
        policy_action = np.argmax(policy[state])
        action_values = calc_qvalues(env, state, V, gamma)
        best_action = np.argmax(action_values)
        if best_action != policy_action:
            stable = False
        # Replace the row with a one-hot distribution on the greedy action.
        policy[state] = np.eye(env.nA)[best_action]

    if stable:
        print(f'policy got stable at {i}')
        break
    if i % 100 == 0:
        print(f'i={i}')

converged at 201th iteration
i=0
converged at 860th iteration
converged at 905th iteration
converged at 956th iteration
converged at 1063th iteration
converged at 934th iteration
policy got stable at 5


In [16]:
def play_episodes(policy, episodes=1000):
    """Run ``policy`` greedily for a number of episodes and tally the results.

    Uses the module-level ``env``. The code expects ``env.reset()`` to return
    the start state and ``env.step(action)`` to return a 4-tuple
    ``(next_state, reward, done, info)``.

    Args:
        policy: per-state action weights; the action taken in a state is the
            argmax of ``policy[state]``.
        episodes: number of episodes to play.

    Returns:
        ``(wins, rewards)`` where ``wins`` counts the episodes whose final
        step had reward 1.0 and ``rewards`` is the sum of all step rewards.
    """
    total_reward = 0
    win_count = 0
    for _ in range(episodes):
        state = env.reset()
        while True:
            state, reward, done, _info = env.step(np.argmax(policy[state]))
            total_reward += reward
            if done:
                # Only a terminal step with reward exactly 1.0 counts as a win.
                if reward == 1.0:
                    win_count += 1
                break
    return win_count, total_reward

In [18]:
# Roll out the learned policy and report the win count and success rate.
episodes = 1000
wins, rewards = play_episodes(policy, episodes=episodes)
print(f'total play {episodes} total wins {wins} total rewards {rewards}')
success_pct = (wins / episodes) * 100
print(f'success rate {success_pct}%')

total play 1000 total wins 881 total rewards 881.0
success rate 88.1%
