In [7]:
from hiive.mdptoolbox import mdp
import mdptoolbox
import gym
import numpy as np
import sys
import os
# Seed NumPy's global RNG so NumPy-driven randomness repeats across runs.
# NOTE(review): no env.seed(...) call is visible in this notebook, so the gym
# environments may still be non-deterministic — confirm.
np.random.seed(44)

In [4]:
from gym.envs.toy_text.frozen_lake import generate_random_map, FrozenLakeEnv

In [25]:
# Build a FrozenLake environment on a randomly generated map (size argument 50).
# NOTE(review): `env` is reassigned by the gym.make(...) cells further down, so
# this instance is only in effect if those cells are not run afterwards.
env = FrozenLakeEnv(desc=generate_random_map(50))
# generate_random_map(100)

In [30]:
# Alternative environments (swap in as needed):
# env = gym.make('FrozenLake-v0', map_name="8x8").env
# env = gym.make("Taxi-v3")
env = gym.make('FrozenLake-v0')

# Dense transition (P) and reward (R) arrays indexed [action, state, next_state].
n_actions = env.action_space.n
n_states = env.observation_space.n
P = np.zeros((n_actions, n_states, n_states))
R = np.zeros((n_actions, n_states, n_states))

# Copy the environment's transition table into the dense arrays. Probabilities
# of several transitions with the same destination are summed; the reward
# slot keeps the value of the last such transition.
for state, transitions_by_action in env.P.items():
    for action, transitions in transitions_by_action.items():
        for prob, new_state, reward, done in transitions:
            P[action, state, new_state] += prob
            R[action, state, new_state] = reward

In [44]:
env = gym.make('FrozenLake-v0', map_name="8x8").env
# env = gym.make("Taxi-v3")

# Dense transition (P) and reward (R) arrays indexed [action, state, next_state].
n_actions = env.action_space.n
n_states = env.observation_space.n
P = np.zeros((n_actions, n_states, n_states))
R = np.zeros((n_actions, n_states, n_states))

# Pass 1: copy the environment's transition table into the dense arrays.
for state, transitions_by_action in env.P.items():
    for action, transitions in transitions_by_action.items():
        for prob, new_state, reward, done in transitions:
            P[action, state, new_state] += prob
            R[action, state, new_state] = reward

# Pass 2: rewire every state reached by a `done` transition so that, for the
# action that reached it, it moves to state 0 with probability 1 and reward -10.
# This must run after pass 1 because it overwrites rows that pass 1 filled in.
# NOTE(review): only the `done` flag is checked, so the -10 applies to every
# episode-ending state alike — confirm that is intended for the goal as well.
for state, transitions_by_action in env.P.items():
    for action, transitions in transitions_by_action.items():
        for prob, new_state, reward, done in transitions:
            if not done:
                continue
            P[action, new_state, :] = 0
            P[action, new_state, 0] = 1
            R[action, new_state, 0] = -10


In [45]:
def train_and_test(env, P, R, discount=0.9, epsilon=1e-9, max_iter=1e9,
                   n_iter=1e6):
    """Solve the MDP with three algorithms and evaluate each resulting policy.

    Value iteration, policy iteration and Q-learning are each run on the
    same ``(P, R)`` model; every resulting policy is then rolled out in
    ``env`` through ``test_policy``.

    Args:
        env: Environment handed unchanged to ``test_policy``.
        P: Transition array passed to the mdptoolbox solvers.
        R: Reward array passed to the mdptoolbox solvers.
        discount: Discount factor used by all three solvers.
        epsilon: Stopping threshold; used by value iteration only.
        max_iter: Iteration cap; used by value iteration only.
        n_iter: Number of iterations; used by Q-learning only.

    Returns:
        ``(mean_rewards, mean_eps)`` — two 3-tuples, each ordered
        (value iteration, policy iteration, Q-learning).
    """
    # run value iteration
    vi = mdptoolbox.mdp.ValueIteration(P, R, discount=discount,
                                       epsilon=epsilon,
                                       max_iter=max_iter)
    vi.run()

    # run policy iteration
    pi = mdptoolbox.mdp.PolicyIteration(P, R, discount=discount)
    pi.run()

    # run q-learning
    # Uses the caller's discount: this was previously hard-coded to 0.9, so the
    # `discount` argument was silently ignored for Q-learning.
    q = mdptoolbox.mdp.QLearning(P, R, discount=discount, n_iter=n_iter)
    q.run()

    # Only the two summary statistics are needed; the per-episode lists that
    # test_policy also returns are dropped.
    vi_mrews, vi_meps = test_policy(env, vi.policy)[:2]
    pi_mrews, pi_meps = test_policy(env, pi.policy)[:2]
    q_mrews, q_meps = test_policy(env, q.policy)[:2]

    mean_rewards = (vi_mrews, pi_mrews, q_mrews)
    mean_eps = (vi_meps, pi_meps, q_meps)
    return mean_rewards, mean_eps

In [None]:
# Train and evaluate all three solvers with the default hyperparameters; the
# cell output is the (mean rewards, mean episode lengths) pair it returns.
train_and_test(env, P, R)

In [31]:
# Solve the MDP with value iteration, using a much tighter epsilon and a
# larger iteration cap than the train_and_test defaults.
vi = mdptoolbox.mdp.ValueIteration(P, R, discount=0.9, epsilon=1e-12, max_iter=1e12)
vi.run()
vi_policy = vi.policy # looked up as vi_policy[state] by the evaluation/rollout cells
# Last expression: shown as the cell output (presumably the number of iterations run).
vi.iter

195

In [32]:
# Solve the same MDP with policy iteration.
pi = mdptoolbox.mdp.PolicyIteration(P, R, discount=0.9)
pi.run()
pi_policy = pi.policy  # looked up as pi_policy[state] by the evaluation/rollout cells
# Last expression: shown as the cell output (presumably the number of iterations run).
pi.iter

6

In [33]:
# Learn a policy on the same (P, R) model with Q-learning, n_iter=1e6.
q = mdptoolbox.mdp.QLearning(P, R, discount=0.9, n_iter=1e6)
q.run()
q_policy = q.policy  # looked up as q_policy[state] by the evaluation/rollout cells

In [34]:
# Evaluate the policy-iteration policy with test_policy's default episode count.
pi_mean_reward, pi_mean_eps, pi_rewards, pi_episodes = test_policy(env, pi_policy)

In [35]:
# Evaluate the value-iteration policy with test_policy's default episode count.
vi_mean_reward, vi_mean_eps, vi_rewards, vi_episodes = test_policy(env, vi_policy)

In [36]:
# Evaluate the Q-learning policy with test_policy's default episode count.
q_mean_reward, q_mean_eps, q_rewards, q_episodes = test_policy(env, q_policy)

In [37]:
# Mean reward per episode, displayed in the order
# (Q-learning, policy iteration, value iteration).
q_mean_reward, pi_mean_reward, vi_mean_reward

(0.057, 0.719, 0.707)

In [6]:
def test_policy(env, policy, n_epoch=1000):
    rewards = []
    episode_counts = []
    for i in range(n_epoch):
        current_state = env.reset()
        ep = 0
        done = False
        episode_reward = 0
        while not done:
            ep += 1
            act = int(policy[current_state])
            new_state, reward, done, _ = env.step(act)
            episode_reward += reward
            current_state = new_state
        rewards.append(episode_reward)
        episode_counts.append(ep)
    
    # all done
    mean_reward = sum(rewards)/len(rewards)
    mean_eps = sum(episode_counts)/len(episode_counts)
    return mean_reward, mean_eps, rewards, episode_counts 
            
            
        
        

## Single Step Run

In [137]:
# Start a fresh episode and clear the reward log that the single-step cells
# below append to.
current_state = env.reset()
rewards = []

In [None]:

# Take exactly one environment step using the value-iteration policy.
# Re-running this cell advances the same episode by one more step.
act = int(vi_policy[current_state])

print(act)
new_state, reward, finished, _ = env.step(act)
rewards.append(reward)
# Carry the state forward so the next execution of a step cell continues from here.
current_state = new_state
print(finished, reward)
env.render()

In [None]:

# Take exactly one environment step using the policy-iteration policy.
# Re-running this cell advances the same episode by one more step.
act = int(pi_policy[current_state])

print(act)
new_state, reward, finished, _ = env.step(act)
rewards.append(reward)
# Carry the state forward so the next execution of a step cell continues from here.
current_state = new_state
print(finished, reward)
env.render()

In [84]:

# Take exactly one environment step using the Q-learning policy.
# Re-running this cell advances the same episode by one more step.
act = int(q_policy[current_state])

print(act)
new_state, reward, finished, _ = env.step(act)
rewards.append(reward)
# Carry the state forward so the next execution of a step cell continues from here.
current_state = new_state
print(finished, reward)
env.render()

1
False -1
+---------+
|[35mR[0m:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)


## Multiple Step Run

In [352]:
# Play one full episode under the value-iteration policy, then report how many
# steps it took and render the final board.
current_state, ep, done = env.reset(), 0, False
while True:
    ep += 1
    act = int(vi_policy[current_state])
    new_state, reward, done, _ = env.step(act)
    current_state = new_state
    if done:
        break
print("Finished at step: ", ep)
env.render()

Finished at step:  52
  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


In [336]:
# Play one full episode under the policy-iteration policy, then report how
# many steps it took and render the final board.
current_state, ep, done = env.reset(), 0, False
while True:
    ep += 1
    act = int(pi_policy[current_state])
    new_state, reward, done, _ = env.step(act)
    current_state = new_state
    if done:
        break
print("Finished at step: ", ep)
env.render()

Finished at step:  89
  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


In [278]:
# Play one full episode under the Q-learning policy, then report how many
# steps it took and render the final board.
current_state, ep, done = env.reset(), 0, False
while True:
    ep += 1
    act = int(q_policy[current_state])
    new_state, reward, done, _ = env.step(act)
    current_state = new_state
    if done:
        break
print("Finished at step: ", ep)
env.render()

Finished at step:  44
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
F[41mH[0mHFFFHF
FHFFHFHF
FFFHFFFG
