In [2]:
import gym
import numpy as np

In [3]:
def learn_terminal_states(env):
    """Collect the states that some transition in ``env.P`` enters with ``done`` set.

    ``env.P[s][a]`` is read as an iterable of ``(prob, s_prim, reward, done)``
    tuples for every state ``s`` in ``range(env.nS)`` and every action key
    ``a`` of ``env.P[s]``.  A successor ``s_prim`` is recorded whenever its
    transition has a truthy ``done`` flag.

    States that have already been recorded as terminal by the time the scan
    reaches them are skipped, so their outgoing transitions are not inspected.

    Returns:
        list: the terminal states, without duplicates, in discovery order.
    """
    # A dict keeps insertion order while giving O(1) membership tests, unlike
    # repeatedly scanning a list inside the nested loops.
    terminal_states = {}

    for s in range(env.nS):
        if s in terminal_states:
            continue
        for a in env.P[s]:
            for prob, s_prim, reward, done in env.P[s][a]:
                if done and s_prim not in terminal_states:
                    terminal_states[s_prim] = None

    return list(terminal_states)

In [4]:
def calculate_sums(env, s, V, gamma):
    """Return the one-step lookahead value of each action taken from state ``s``.

    For every action ``a`` in ``range(env.nA)`` the transition tuples
    ``(prob, s_prim, reward, done)`` in ``env.P[s][a]`` are combined as
    ``sum(prob * (reward + gamma * V[s_prim]))``.  The ``done`` flag is not
    consulted here.

    Returns:
        A float NumPy array of length ``env.nA`` indexed by action.
    """
    q_values = np.zeros(env.nA)
    for action in range(env.nA):
        # Accumulate from 0.0 in transition order, one term per outcome.
        q_values[action] = sum(
            (p * (r + gamma * V[nxt]) for p, nxt, r, _done in env.P[s][action]),
            0.0,
        )
    return q_values

In [5]:
def value_iteration(env, gamma, theta=0.00001):
    """Derive a greedy policy for a tabular environment using value iteration.

    Args:
        env: environment exposing ``nS`` (number of states), ``nA`` (number
            of actions) and ``P[s][a]``, an iterable of
            ``(prob, s_prim, reward, done)`` transition tuples.
        gamma: discount factor applied to the value of the successor state.
        theta: convergence threshold.  Sweeps over the state space stop once
            the largest change to any state value within a sweep is below
            this number.  The default matches the previously hard-coded
            ``0.00001``.

    Returns:
        An integer NumPy array of length ``env.nS`` whose entry ``s`` is the
        index of the action maximising the one-step lookahead in state ``s``.
    """
    V = np.zeros(env.nS)

    # Membership is tested once per state per sweep, so use a set.
    terminal_states = set(learn_terminal_states(env))

    while True:
        delta = 0
        for s in range(env.nS):
            if s in terminal_states:
                # Terminal states are pinned to zero value and never backed up.
                V[s] = 0
                continue

            v = V[s]
            # In-place update: later states in this sweep already see the new value.
            V[s] = np.max(calculate_sums(env, s, V, gamma))

            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    # Actions are discrete indices, so store them as integers rather than the
    # float64 default of np.zeros.
    policy = np.zeros(env.nS, dtype=int)
    for s in range(env.nS):
        policy[s] = np.argmax(calculate_sums(env, s, V, gamma))

    return policy

In [6]:
# Build the environment and compute a policy with value iteration (gamma = 0.9).
env = gym.make('CliffWalking-v0')
env.reset()

policy = value_iteration(env, 0.9)

# Roll out one episode following the computed policy, accumulating the reward
# and counting steps until the environment reports `done`.
done = False
steps = 0
total_reward = 0
state = env.reset()

while not done:
    # Policy entries may be NumPy floats; pass a plain integer action index.
    state, reward, done, info = env.step(int(policy[state]))
    total_reward += reward
    steps += 1

print('Total reward: ', total_reward, 'in steps:', steps)


Total reward:  -13 in steps: 13


In [13]:
# Build the environment and compute a policy with value iteration (gamma = 0.9).
env = gym.make('FrozenLake-v0')
env.reset()

policy = value_iteration(env, 0.9)

# Roll out one episode following the computed policy, accumulating the reward
# and counting steps until the environment reports `done`.
# NOTE(review): the recorded run stopped after 100 steps with reward 0.0 --
# presumably an episode step limit rather than reaching a goal; confirm.
done = False
steps = 0
total_reward = 0
state = env.reset()

while not done:
    # Policy entries may be NumPy floats; pass a plain integer action index.
    state, reward, done, info = env.step(int(policy[state]))
    total_reward += reward
    steps += 1

print('Total reward: ', total_reward, 'in steps:', steps)

Total reward:  0.0 in steps: 100


In [17]:
# Build the environment and compute a policy with value iteration (gamma = 0.9).
env = gym.make('Taxi-v3')
env.reset()

policy = value_iteration(env, 0.9)

# Roll out one episode following the computed policy, accumulating the reward
# and counting steps until the environment reports `done`.
done = False
steps = 0
total_reward = 0
state = env.reset()

while not done:
    # Policy entries may be NumPy floats; pass a plain integer action index.
    state, reward, done, info = env.step(int(policy[state]))
    total_reward += reward
    steps += 1

print('Total reward: ', total_reward, 'in steps:', steps)

Total reward:  6 in steps: 15
