In [3]:
from hiive.mdptoolbox import mdp
import mdptoolbox
import gym
import numpy as np
import sys
import os
np.random.seed(44)

In [3]:
from gym.envs.toy_text.frozen_lake import generate_random_map, FrozenLakeEnv

In [4]:
# generate_random_map(100)

In [5]:
def test_policy(env, policy, n_epoch=1000):
    rewards = []
    episode_counts = []
    for i in range(n_epoch):
        current_state = env.reset()
        ep = 0
        done = False
        episode_reward = 0
        while not done and ep < 10000:
            ep += 1
            act = int(policy[current_state])
            new_state, reward, done, _ = env.step(act)
            episode_reward += reward
            current_state = new_state
        rewards.append(episode_reward)
        episode_counts.append(ep)
    
    # all done
    mean_reward = sum(rewards)/len(rewards)
    mean_eps = sum(episode_counts)/len(episode_counts)
    return mean_reward, mean_eps, rewards, episode_counts 
            
            
        
        

In [32]:
# Experiment knobs. These used to be literals edited by hand between runs;
# change them here and re-run this cell to rebuild env, P and R together.
MAP_SIZE = 16          # side length handed to generate_random_map
RESTART_STATE = 0      # state that every terminal state is redirected to
TERMINAL_REWARD = -1   # reward attached to that redirect transition

# Alternative environments that were tried:
# env = gym.make('FrozenLake-v0', map_name="8x8").env
# env = gym.make('FrozenLake-v0').env
# env = gym.make("Taxi-v3")
env = FrozenLakeEnv(desc=generate_random_map(MAP_SIZE))

# Dense transition / reward arrays indexed [action][state][next_state].
n_actions = env.action_space.n
n_states = env.observation_space.n
P = np.zeros((n_actions, n_states, n_states))
R = np.zeros((n_actions, n_states, n_states))

# Pass 1: copy env.P (indexed [state][action]) into the dense arrays.
# Probabilities are accumulated because several outcomes of one action can
# land in the same next state.
for state in env.P:
    for action in env.P[state]:
        for prob, new_state, reward, done in env.P[state][action]:
            P[action][state][new_state] += prob
            R[action][state][new_state] = reward

# Pass 2: rewrite the outgoing row of every state reached by a `done`
# transition so it moves to RESTART_STATE with probability 1 and reward
# TERMINAL_REWARD. This must stay a separate pass: merged into pass 1, later
# accumulation could add probability back onto an already rewritten row.
for state in env.P:
    for action in env.P[state]:
        for prob, new_state, reward, done in env.P[state][action]:
            if done:
                P[action][new_state][:] = 0
                P[action][new_state][RESTART_STATE] = 1
                R[action][new_state][RESTART_STATE] = TERMINAL_REWARD


In [34]:
# Raw transition list for state 0, action 0 (env.P is indexed [state][action]);
# each entry unpacks as (prob, new_state, reward, done).
env.P[0][0]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 16, 0.0, False)]

In [35]:
# Dense transition row for action 0, state 0. Note the [action][state] order,
# the reverse of env.P; duplicate next states have had their probabilities summed.
P[0][0]

array([0.66666667, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.33333333, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [29]:
def _evaluate_policy(env, policy):
    """Roll out ``policy`` via ``test_policy`` and return the fields every solver records."""
    mean_reward, mean_eps, _rewards, _eps = test_policy(env, policy)
    return {"mean_reward": mean_reward, "mean_eps": mean_eps, "policy": policy}


def train_and_test(env, P, R, discount=[0.9], epsilon=[1e-9],
                   n_iters=[1e6], mute=False, max_iter=int(1e6)):
    """Solve the MDP with three mdptoolbox solvers and evaluate each policy in ``env``.

    Value iteration is run for every (discount, epsilon) pair, policy
    iteration for every discount, and Q-learning for every
    (discount, n_iter) pair. Each resulting policy is rolled out with
    ``test_policy`` using its default number of episodes.

    Parameters
    ----------
    env : object
        Environment handed to ``test_policy`` for the roll-outs.
    P, R : array-like
        Transition and reward definitions passed unchanged to the solvers.
        This notebook passes arrays of shape (actions, states, states).
    discount : iterable of float
        Discount factors to sweep (shared by all three solvers).
    epsilon : iterable of float
        ``epsilon`` values for value iteration.
    n_iters : iterable of number
        ``n_iter`` values for Q-learning.
    mute : bool
        When False, print a progress line and the evaluation result after
        every solver run.
    max_iter : int
        ``max_iter`` passed to value iteration and policy iteration. The
        default is the value that used to be hard-coded.

    The list defaults are only iterated, never mutated.

    Returns
    -------
    tuple of dict
        ``(vi_dict, pi_dict, q_dict)``:
        ``vi_dict[discount][epsilon]`` and ``pi_dict[discount]`` hold
        ``mean_reward``, ``mean_eps``, ``policy`` and ``iteration``;
        ``q_dict[discount][n_iter]`` holds ``mean_reward``, ``mean_eps`` and
        ``policy``.
    """
    # Value iteration: one run per (discount, epsilon).
    vi_dict = {}
    for dis in discount:
        vi_dict[dis] = {}
        for eps in epsilon:
            vi = mdptoolbox.mdp.ValueIteration(P, R, discount=dis,
                                               epsilon=eps,
                                               max_iter=max_iter)
            vi.run()
            vi_result = _evaluate_policy(env, vi.policy)
            vi_result["iteration"] = vi.iter
            vi_dict[dis][eps] = vi_result
            if not mute:
                print("Value iteration discount: {} eps: {} iter: {}".format(dis, eps, vi.iter))
                print("Mean reward: {} - mean eps: {}".format(vi_result["mean_reward"],
                                                              vi_result["mean_eps"]))

    # Policy iteration: one run per discount.
    pi_dict = {}
    for dis in discount:
        pi = mdptoolbox.mdp.PolicyIteration(P, R, discount=dis,
                                            max_iter=max_iter)
        pi.run()
        pi_result = _evaluate_policy(env, pi.policy)
        pi_result["iteration"] = pi.iter
        pi_dict[dis] = pi_result
        if not mute:
            print("Policy iteration discount: {} iter: {}".format(dis, pi.iter))
            print("Mean reward: {} - mean eps: {}".format(pi_result["mean_reward"],
                                                          pi_result["mean_eps"]))

    # Q-learning: one run per (discount, n_iter); no iteration count is recorded.
    q_dict = {}
    for dis in discount:
        q_dict[dis] = {}
        for n_iter in n_iters:
            q = mdptoolbox.mdp.QLearning(P, R, discount=dis, n_iter=n_iter)
            q.run()
            q_result = _evaluate_policy(env, q.policy)
            q_dict[dis][n_iter] = q_result
            if not mute:
                print("Q-learning iteration for {} discount and {} iterations is done".format(dis, n_iter))
                print("Mean reward: {} - mean eps: {}".format(q_result["mean_reward"],
                                                              q_result["mean_eps"]))

    return vi_dict, pi_dict, q_dict

In [30]:
# Hyper-parameter grids swept by train_and_test.
discounts = [0.1, 0.5, 0.75, 0.9, 0.95, 0.99]  # discount factors, used by all three solvers
epsilones = [1e-3, 1e-6, 1e-9, 1e-12]  # `epsilon` values for value iteration
n_iters = [10000, 100000, 1000000]  # `n_iter` values for Q-learning

In [31]:
# Run on the 16x16 map with the terminal-state redirect reward set to -1.
vid, pid, qd = train_and_test(
    env, P, R,
    discount=discounts,
    epsilon=epsilones,
    n_iters=n_iters,
    mute=False,
)

Value iteration discount: 0.1 eps: 0.001 iter: 3
Mean reward: 0.0 - mean eps: 210.37
Value iteration discount: 0.1 eps: 1e-06 iter: 6
Mean reward: 0.006 - mean eps: 248.083
Value iteration discount: 0.1 eps: 1e-09 iter: 8
Mean reward: 0.011 - mean eps: 304.164
Value iteration discount: 0.1 eps: 1e-12 iter: 11
Mean reward: 0.018 - mean eps: 341.609
Value iteration discount: 0.5 eps: 0.001 iter: 8
Mean reward: 0.018 - mean eps: 320.521
Value iteration discount: 0.5 eps: 1e-06 iter: 16
Mean reward: 0.018 - mean eps: 328.771
Value iteration discount: 0.5 eps: 1e-09 iter: 26
Mean reward: 0.015 - mean eps: 326.445
Value iteration discount: 0.5 eps: 1e-12 iter: 35
Mean reward: 0.015 - mean eps: 332.302
Value iteration discount: 0.75 eps: 0.001 iter: 17
Mean reward: 0.028 - mean eps: 337.71
Value iteration discount: 0.75 eps: 1e-06 iter: 38
Mean reward: 0.019 - mean eps: 341.647
Value iteration discount: 0.75 eps: 1e-09 iter: 61
Mean reward: 0.027 - mean eps: 339.413
Value iteration discount: 

In [22]:
# Run on an 8x8 map with the terminal-state redirect reward set to -1.
# NOTE(review): env, P and R must have been rebuilt for 8x8 before this cell
# ran; the setup cell as saved builds a 16x16 map.
vid, pid, qd = train_and_test(
    env, P, R,
    discount=discounts,
    epsilon=epsilones,
    n_iters=n_iters,
    mute=False,
)

Value iteration discount: 0.1 eps: 0.001 iter: 3
Mean reward: 0.0 - mean eps: 10000.0
Value iteration discount: 0.1 eps: 1e-06 iter: 5
Mean reward: 0.0 - mean eps: 10000.0
Value iteration discount: 0.1 eps: 1e-09 iter: 8
Mean reward: 0.0 - mean eps: 10000.0
Value iteration discount: 0.1 eps: 1e-12 iter: 11
Mean reward: 0.0 - mean eps: 10000.0
Value iteration discount: 0.5 eps: 0.001 iter: 7
Mean reward: 0.0 - mean eps: 10000.0
Value iteration discount: 0.5 eps: 1e-06 iter: 15
Mean reward: 0.918 - mean eps: 91.553
Value iteration discount: 0.5 eps: 1e-09 iter: 24
Mean reward: 0.912 - mean eps: 90.338
Value iteration discount: 0.5 eps: 1e-12 iter: 32
Mean reward: 0.932 - mean eps: 94.671
Value iteration discount: 0.75 eps: 0.001 iter: 15
Mean reward: 0.972 - mean eps: 98.765
Value iteration discount: 0.75 eps: 1e-06 iter: 31
Mean reward: 0.965 - mean eps: 100.961
Value iteration discount: 0.75 eps: 1e-09 iter: 52
Mean reward: 0.963 - mean eps: 99.554
Value iteration discount: 0.75 eps: 1

In [15]:
# Run on a 4x4 map with the terminal-state redirect reward set to 0.
# NOTE(review): env, P and R must have been rebuilt for 4x4 / reward 0 before
# this cell ran; the setup cell as saved builds 16x16 with reward -1.
vid, pid, qd = train_and_test(
    env, P, R,
    discount=discounts,
    epsilon=epsilones,
    n_iters=n_iters,
    mute=False,
)

Value iteration discount: 0.1 eps: 0.001 iter: 3
Mean reward: 0.822 - mean eps: 49.409
Value iteration discount: 0.1 eps: 1e-06 iter: 5
Mean reward: 0.818 - mean eps: 48.353
Value iteration discount: 0.1 eps: 1e-09 iter: 8
Mean reward: 0.835 - mean eps: 48.498
Value iteration discount: 0.1 eps: 1e-12 iter: 11
Mean reward: 0.822 - mean eps: 47.959
Value iteration discount: 0.5 eps: 0.001 iter: 7
Mean reward: 0.806 - mean eps: 49.186
Value iteration discount: 0.5 eps: 1e-06 iter: 14
Mean reward: 0.821 - mean eps: 49.146
Value iteration discount: 0.5 eps: 1e-09 iter: 23
Mean reward: 0.822 - mean eps: 48.364
Value iteration discount: 0.5 eps: 1e-12 iter: 32
Mean reward: 0.827 - mean eps: 47.795
Value iteration discount: 0.75 eps: 0.001 iter: 12
Mean reward: 0.806 - mean eps: 48.351
Value iteration discount: 0.75 eps: 1e-06 iter: 31
Mean reward: 0.824 - mean eps: 48.024
Value iteration discount: 0.75 eps: 1e-09 iter: 50
Mean reward: 0.822 - mean eps: 48.285
Value iteration discount: 0.75 ep

In [None]:
# Display the value-iteration result dict (first element returned by train_and_test).
vid

In [8]:
# Results store. Presumably keyed by map side length, then by terminal
# reward — TODO confirm. Holds the (vid, pid, qd) triple from the latest run.
frozen_lake = {4: {0: (vid, pid, qd)}}

In [10]:
# Run on a 4x4 map with the terminal-state redirect reward set to -1 (output muted).
# NOTE(review): env, P and R must have been rebuilt for 4x4 before this cell
# ran; the setup cell as saved builds a 16x16 map.
vid, pid, qd = train_and_test(
    env, P, R,
    discount=discounts,
    epsilon=epsilones,
    n_iters=n_iters,
    mute=True,
)

In [11]:
# Store the latest (vid, pid, qd) triple under key 4, sub-key -1
# (presumably map size 4 and terminal reward -1 — TODO confirm).
frozen_lake[4][-1] = (vid, pid, qd)

In [13]:
# Display the value-iteration result dict.
# NOTE(review): the saved output below has no 0.1 discount entry and no
# "iteration" field, so it appears to predate the current grid / function —
# re-run to refresh.
vid

{0.5: {0.001: {'mean_reward': 0.421,
   'mean_eps': 26.52,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-06: {'mean_reward': 0.453,
   'mean_eps': 28.354,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-09: {'mean_reward': 0.452,
   'mean_eps': 27.579,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-12: {'mean_reward': 0.46,
   'mean_eps': 26.115,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)}},
 0.75: {0.001: {'mean_reward': 0.436,
   'mean_eps': 27.014,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-06: {'mean_reward': 0.437,
   'mean_eps': 28.812,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-09: {'mean_reward': 0.464,
   'mean_eps': 26.948,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-12: {'mean_reward': 0.447,
   'mean_eps': 27.225,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)}},
 0.9: {0.001: {'mean_reward': 0.716,
