In [18]:
import gym
import numpy as np

In [19]:
def learn_terminal_states(env):
    """Collect the states that the transition table marks as episode-ending.

    Walks ``env.P[s][a]`` for every state index in ``range(env.nS)`` and
    records each successor state reached by a transition whose ``done`` flag
    is true.

    Args:
        env: Object exposing ``nS`` (number of states) and ``P``, where
            ``P[s][a]`` is an iterable of ``(prob, next_state, reward, done)``
            tuples.

    Returns:
        List of terminal states in the order they were first discovered,
        without duplicates.
    """
    found = []

    for state in range(env.nS):
        # A state already known to be terminal is not expanded any further.
        if state in found:
            continue
        for action in env.P[state]:
            for _prob, next_state, _reward, is_done in env.P[state][action]:
                if is_done and next_state not in found:
                    found.append(next_state)

    return found

In [20]:
def calculate_sums(env, s, V, gamma):
    """Return the expected one-step backup of every action in state ``s``.

    For each action ``a`` the result holds the sum over the transitions in
    ``env.P[s][a]`` of ``prob * (reward + gamma * V[next_state])``.

    Args:
        env: Object exposing ``nA`` (number of actions) and ``P``, where
            ``P[s][a]`` is an iterable of ``(prob, next_state, reward, done)``
            tuples.
        s: State whose actions are evaluated.
        V: Indexable collection of current state-value estimates.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        numpy array of length ``env.nA`` with one backed-up value per action.
    """
    n_actions = env.nA
    action_values = np.zeros(n_actions)

    for action in range(n_actions):
        # Accumulate left to right so the floating-point result is reproducible.
        expected = 0.0
        for prob, next_state, reward, _done in env.P[s][action]:
            expected += prob * (reward + gamma * V[next_state])
        action_values[action] = expected

    return action_values

In [21]:
def value_iteration(env, gamma, theta=0.00001):
    """Compute a greedy policy for a tabular environment via value iteration.

    State values are updated in place, so an update within a sweep already
    sees values refreshed earlier in that same sweep. Sweeps repeat until the
    largest change observed in one sweep drops below ``theta``; the greedy
    policy is then read off the converged values.

    Args:
        env: Object exposing ``nS`` (number of states), ``nA`` (number of
            actions) and ``P``, where ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to successor-state values.
        theta: Convergence threshold on the largest per-sweep value change.
            Defaults to the previously hard-coded ``0.00001``.

    Returns:
        Integer numpy array of length ``env.nS`` holding the greedy action
        index for each state.

    Side effects:
        Prints the total number of single-state value updates performed.
    """
    V = np.zeros(env.nS)
    # Actions are indices, so keep them integral rather than the float64
    # default of np.zeros; callers hand these values straight to env.step.
    policy = np.zeros(env.nS, dtype=int)

    # Membership is tested for every state on every sweep; a set keeps that O(1).
    terminal_states = set(learn_terminal_states(env))
    loop_counter = 0
    while True:
        delta = 0
        for s in range(env.nS):
            if s in terminal_states:
                # Terminal states carry no future return.
                V[s] = 0
                continue

            v = V[s]
            V[s] = np.max(calculate_sums(env, s, V, gamma))

            delta = max(delta, abs(v - V[s]))
            loop_counter += 1
        if delta < theta:
            break

    for s in range(env.nS):
        policy[s] = np.argmax(calculate_sums(env, s, V, gamma))

    print('Number of loops:', loop_counter)
    return policy

In [22]:
# Solve CliffWalking with value iteration, then roll out the greedy policy.
env = gym.make('CliffWalking-v0')
env.reset()

policy = value_iteration(env, 0.9)

# One place for the episode count, so the loop and the means cannot drift apart.
N_EPISODES = 1000

steps = 0
total_reward = 0

for c in range(N_EPISODES):
    done = False
    state = env.reset()

    while not done:
        # The policy array may hold floats; discrete environments expect an
        # integer action, so cast explicitly.
        state, reward, done, info = env.step(int(policy[state]))
        total_reward += reward
        steps += 1

env.close()
print('Mean reward: ', total_reward / N_EPISODES, 'Mean steps:', steps / N_EPISODES)


Number of loops: 705
Mean reward:  -13.0 Mean steps: 13.0


In [23]:
# Solve FrozenLake with value iteration, then roll out the greedy policy.
env = gym.make('FrozenLake-v0')
env.reset()

policy = value_iteration(env, 0.9)

# One place for the episode count, so the loop and the means cannot drift apart.
N_EPISODES = 1000

steps = 0
total_reward = 0

for c in range(N_EPISODES):
    done = False
    state = env.reset()

    while not done:
        # The policy array may hold floats; discrete environments expect an
        # integer action, so cast explicitly.
        state, reward, done, info = env.step(int(policy[state]))
        total_reward += reward
        steps += 1

env.close()
print('Mean reward: ', total_reward / N_EPISODES, 'Mean steps:', steps / N_EPISODES)

Number of loops: 528
Mean reward:  0.729 Mean steps: 41.038


In [24]:
# Solve Taxi with value iteration, then roll out the greedy policy.
env = gym.make('Taxi-v3')
env.reset()

policy = value_iteration(env, 0.9)

# One place for the episode count, so the loop and the means cannot drift apart.
N_EPISODES = 1000

steps = 0
total_reward = 0

for c in range(N_EPISODES):
    done = False
    state = env.reset()

    while not done:
        # The policy array may hold floats; discrete environments expect an
        # integer action, so cast explicitly.
        state, reward, done, info = env.step(int(policy[state]))
        total_reward += reward
        steps += 1

env.close()
print('Mean reward: ', total_reward / N_EPISODES, 'Mean steps:', steps / N_EPISODES)

Number of loops: 6448
Mean reward:  7.856 Mean steps: 13.144
