In [4]:
import gym
import numpy as np
import sys
import os
np.random.seed(44)

In [5]:
def value_iteration(env, number_of_states, number_of_actions,
                    discount=0.9, tolerance=1e-12):
    """Solve a tabular MDP with synchronous value iteration.

    Every sweep applies the Bellman optimality backup to each state, reading
    successor values from the *previous* sweep's estimates. Sweeping stops as
    soon as the largest per-state change is no longer greater than
    ``tolerance``. At least one sweep is always performed.

    Parameters
    ----------
    env : object
        Anything exposing ``env.P[s][a]`` as an iterable of
        ``(prob, new_state, reward, done)`` tuples.
    number_of_states : int
        States are assumed to be indexed ``0 .. number_of_states - 1``.
    number_of_actions : int
        Actions are assumed to be indexed ``0 .. number_of_actions - 1``.
    discount : float, optional
        Discount factor applied to the successor state's value. Defaults to
        0.9, the value that used to be hard-coded.
    tolerance : float, optional
        Convergence threshold on the largest value change in a sweep.
        Defaults to 1e-12, the value that used to be hard-coded.

    Returns
    -------
    value_list : numpy.ndarray, shape ``(1, number_of_states)``
        Estimated optimal state values.
    policy : numpy.ndarray of int, shape ``(1, number_of_states)``
        Greedy action index for every state (lowest index wins ties).

    Raises
    ------
    ValueError
        If ``tolerance`` is negative (the stopping test could never pass).
    """
    if tolerance < 0:
        raise ValueError("tolerance must be non-negative")

    def expected_return(state, action, values):
        # Probability-weighted one-step lookahead. Transitions flagged `done`
        # contribute only their immediate reward (no bootstrapping).
        total = 0
        for prob, new_state, reward, done in env.P[state][action]:
            if done:
                outcome = reward
            else:
                outcome = reward + discount * values[0][new_state]
            total += outcome * prob
        return total

    # Actions are indices, so keep them as integers rather than floats.
    policy = np.zeros((1, number_of_states), dtype=int)
    value_list = np.zeros((1, number_of_states))
    old_value_list = value_list.copy()
    episode = 0

    while True:
        episode += 1
        for s in range(number_of_states):
            best_value = -np.inf
            for a in range(number_of_actions):
                candidate = expected_return(s, a, old_value_list)
                # Strict comparison: the first action reaching the maximum
                # is the one that is kept.
                if candidate > best_value:
                    best_value = candidate
                    policy[0][s] = a
                    value_list[0][s] = best_value

        changes = np.abs(value_list - old_value_list)
        # Guard the empty case: np.max raises on a zero-size array.
        max_change = changes.max() if changes.size else 0.0
        old_value_list = value_list.copy()
        # Written as "not >" so that a NaN change also terminates the loop.
        if not max_change > tolerance:
            break

    print("Solved in: ", episode, " episodes")
    return value_list, policy

In [64]:
def policy_iteration(env, number_of_states, number_of_actions,
                     discount=0.9, tolerance=1e-12):
    """Solve a tabular MDP with policy iteration.

    Starting from a random policy (drawn with ``np.random.randint``, so it
    follows NumPy's global seed), the function alternates between

    * policy evaluation - in-place sweeps over all states until the largest
      value change in a sweep drops below ``tolerance``; and
    * policy improvement - making the policy greedy with respect to the
      evaluated values,

    until an improvement pass leaves every state's action unchanged.

    Parameters
    ----------
    env : object
        Anything exposing ``env.P[s][a]`` as an iterable of
        ``(prob, new_state, reward, done)`` tuples.
    number_of_states : int
        States are assumed to be indexed ``0 .. number_of_states - 1``.
    number_of_actions : int
        Actions are assumed to be indexed ``0 .. number_of_actions - 1``.
    discount : float, optional
        Discount factor applied to the successor state's value. Defaults to
        0.9, the value that used to be hard-coded.
    tolerance : float, optional
        Policy-evaluation convergence threshold. Defaults to 1e-12, the value
        that used to be hard-coded.

    Returns
    -------
    value_list : numpy.ndarray, shape ``(1, number_of_states)``
        State values of the final policy.
    policy : numpy.ndarray of int, shape ``(1, number_of_states)``
        Action index chosen for every state (lowest index wins ties).

    Raises
    ------
    ValueError
        If ``tolerance`` is not strictly positive; evaluation stops on
        ``change < tolerance``, which could then never be satisfied.
    """
    if tolerance <= 0:
        raise ValueError("tolerance must be strictly positive")

    # 1. Initialisation: arbitrary policy, all-zero values.
    policy = np.random.randint(number_of_actions, size=(1, number_of_states))
    value_list = np.zeros((1, number_of_states))
    episode = 0

    def expected_return(state, action):
        # Probability-weighted one-step lookahead on the current value table.
        # Transitions flagged `done` contribute only their immediate reward.
        total = 0
        for prob, new_state, reward, done in env.P[state][action]:
            if done:
                outcome = reward
            else:
                outcome = reward + discount * value_list[0][new_state]
            total += outcome * prob
        return total

    policy_stable = False
    while not policy_stable:
        episode += 1

        # 2. Policy evaluation. Values are overwritten in place, so states
        #    later in a sweep already see the updates made earlier in it.
        while True:
            eps = 0
            for s in range(number_of_states):
                previous = value_list[0][s]
                value_list[0][s] = expected_return(s, policy[0][s])
                eps = max(eps, np.abs(previous - value_list[0][s]))
            if eps < tolerance:
                break

        # 3. Policy improvement: act greedily w.r.t. the evaluated values.
        policy_stable = True
        for s in range(number_of_states):
            old_action = policy[0][s]
            max_value = -np.inf
            for a in range(number_of_actions):
                candidate = expected_return(s, a)
                # Strict comparison: the first action reaching the maximum
                # is the one that is kept.
                if candidate > max_value:
                    max_value = candidate
                    policy[0][s] = a
            if old_action != policy[0][s]:
                policy_stable = False

    print("Solved in: ", episode, " episodes")

    return value_list, policy
        

In [65]:
# Solve Taxi-v3 with policy iteration and show the initial state.
# Alternative environment: gym.make('FrozenLake-v0', map_name="8x8").env
env = gym.make("Taxi-v3")
current_state = env.reset()

value_list, policy = policy_iteration(
    env,
    number_of_states=env.observation_space.n,
    number_of_actions=env.action_space.n,
)

rewards = []  # filled in by the cells that step through the environment
env.render()

Solved in:  17  episodes
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [10]:
# Solve the 8x8 FrozenLake map with value iteration and show the start state.
# `.env` takes the environment wrapped by gym.make's outer wrapper
# (presumably the TimeLimit wrapper - confirm for the installed gym version).
# Other environments tried here: "Taxi-v3", "HotterColder-v0".
env = gym.make('FrozenLake-v0', map_name="8x8").env
current_state = env.reset()

value_list, policy = value_iteration(
    env,
    number_of_states=env.observation_space.n,
    number_of_actions=env.action_space.n,
)

rewards = []  # filled in by the cells that step through the environment
env.render()

Solved in:  193  episodes

[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [56]:

# Take one step with the action the solved policy prescribes for the
# current state, record its reward, and redraw the environment.
act = int(policy[0, current_state])
print(act)

new_state, reward, finished, _ = env.step(act)
rewards.append(reward)
current_state = new_state

print(finished, reward)
env.render()

2
False 0.0
  (Right)
SFFFFFFF
FFFFFF[41mF[0mF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [57]:
# Follow the solved policy from the current state until the episode ends.
#
# Two fixes compared with a bare `while not done` loop:
#   * every step's reward is appended to `rewards`, like the single-step cell
#     does, so the list actually reflects the whole episode;
#   * the loop is capped at `max_steps`. The FrozenLake environment above is
#     created with `.env`, which presumably removes gym's step limit, so a
#     rollout that never reaches a terminal state would otherwise hang the
#     kernel.
max_steps = 10000
done = False
ep = 0
while not done and ep < max_steps:
    ep += 1
    act = int(policy[0][current_state])
    new_state, reward, done, _ = env.step(act)
    rewards.append(reward)
    current_state = new_state
    print(done, reward, new_state)
if not done:
    print("Stopped at the step cap before the episode finished")
print("Finished at step: ", ep)
env.render()

False 0.0 15
False 0.0 14
False 0.0 22
False 0.0 14
False 0.0 22
False 0.0 14
False 0.0 22
False 0.0 14
False 0.0 6
False 0.0 6
False 0.0 7
False 0.0 7
False 0.0 7
False 0.0 7
False 0.0 15
False 0.0 23
False 0.0 22
False 0.0 23
False 0.0 23
False 0.0 31
False 0.0 39
False 0.0 31
False 0.0 31
False 0.0 31
False 0.0 30
False 0.0 38
False 0.0 37
False 0.0 38
False 0.0 30
False 0.0 38
False 0.0 37
False 0.0 38
False 0.0 37
False 0.0 36
False 0.0 37
False 0.0 36
False 0.0 28
False 0.0 20
False 0.0 28
False 0.0 36
False 0.0 44
False 0.0 45
False 0.0 53
False 0.0 45
False 0.0 44
False 0.0 43
False 0.0 51
False 0.0 43
False 0.0 44
False 0.0 45
False 0.0 44
False 0.0 45
False 0.0 53
True 0.0 52
Finished at step:  54
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFF[41mH[0mFHF
FFFHFFFG
