In [7]:
import mdptoolbox.example
import mdptoolbox
import gym
import numpy as np
import sys
import os
np.random.seed(44)

In [4]:
def test_policy(env, policy, n_epoch=1000):
    rewards = []
    episode_counts = []
    for i in range(n_epoch):
        current_state = env.reset()
        ep = 0
        done = False
        episode_reward = 0
        while not done and ep < 10000:
            ep += 1
            act = int(policy[current_state])
            new_state, reward, done, _ = env.step(act)
            episode_reward += reward
            current_state = new_state
        rewards.append(episode_reward)
        episode_counts.append(ep)
    
    # all done
    mean_reward = sum(rewards)/len(rewards)
    mean_eps = sum(episode_counts)/len(episode_counts)
    return mean_reward, mean_eps, rewards, episode_counts 
            
            
        
        

In [142]:
# Build the pymdptoolbox "forest" example MDP with S=100 states; the call
# returns the transition array P and the reward array R.
# NOTE(review): per the pymdptoolbox docs, p is the wildfire probability, so
# p=0 would make the problem deterministic — confirm this is intended.
P, R = mdptoolbox.example.forest(S=100, r1=5.17, r2=10, p=0)

In [154]:
# Scratch check: the literal 1e-3255555 is far too small for a float and
# evaluates to 0.0, so it is not negative (the comparison is False).
1e-3255555 < 0

False

In [155]:
# Solve the current (P, R) MDP with value iteration at discount 0.5, using an
# extremely small stopping tolerance (1e-322) and a very large iteration cap.
vi = mdptoolbox.mdp.ValueIteration(P, R, discount=0.5, 
                                   epsilon=1e-322, 
                                   max_iter=int(1e15))
vi.run()
# Display the solver's iteration counter after the run.
vi.iter

55

In [144]:
# Display the policy found by value iteration as a NumPy array.
np.asarray(vi.policy)

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0])

In [12]:
# env = gym.make('FrozenLake-v0', map_name="8x8").env
env = gym.make('FrozenLake-v0').env
# env = gym.make("Taxi-v3")

# Reward written on the artificial "terminal state -> state 0" transition.
# Kept as a named constant so that changing it between experiments is an
# explicit, visible edit rather than a tweak buried inside the loop.
TERMINAL_RESET_REWARD = -1


def build_mdp_matrices(env, terminal_reward=TERMINAL_RESET_REWARD):
    """Convert a Gym environment's transition table into dense (P, R) arrays.

    Parameters
    ----------
    env : object
        Must expose ``action_space.n``, ``observation_space.n`` and a table
        ``env.P[state][action]`` iterable of
        ``(prob, new_state, reward, done)`` tuples.
    terminal_reward : float, optional
        Reward stored on the transition from a terminal state to state 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(P, R)``, each of shape ``(n_actions, n_states, n_states)``.
    """
    n_actions = env.action_space.n
    n_states = env.observation_space.n
    P = np.zeros((n_actions, n_states, n_states))
    R = np.zeros((n_actions, n_states, n_states))

    # First pass: copy probabilities and rewards. Probabilities are
    # accumulated because several table entries may share a destination.
    for state in env.P:
        for action in env.P[state]:
            for prob, new_state, reward, done in env.P[state][action]:
                P[action][state][new_state] += prob
                R[action][state][new_state] = reward

    # Second pass (must run after the first one has filled everything in):
    # for every transition flagged ``done``, overwrite the destination state's
    # row for that action so it moves to state 0 with probability 1 and
    # yields ``terminal_reward``.
    for state in env.P:
        for action in env.P[state]:
            for prob, new_state, reward, done in env.P[state][action]:
                if done:
                    P[action][new_state][:] = 0
                    P[action][new_state][0] = 1
                    R[action][new_state][0] = terminal_reward
    return P, R


P, R = build_mdp_matrices(env)


In [13]:
def train_and_test(env, P, R, discount=[0.9], epsilon=[1e-9], 
                   n_iters=[1e6], mute=False):
    """Solve the MDP (P, R) with three algorithms and evaluate each policy.

    Value iteration is run for every (discount, epsilon) pair, policy
    iteration for every discount, and Q-learning for every
    (discount, n_iter) pair, in that order. Each resulting policy is scored
    with ``test_policy(env, policy)``.

    Parameters
    ----------
    env : environment passed through to ``test_policy``.
    P, R : transition and reward arrays handed to the mdptoolbox solvers.
    discount : iterable of discount factors.
    epsilon : iterable of value-iteration stopping tolerances.
    n_iters : iterable of Q-learning iteration counts.
    mute : when True, suppress the progress printouts.

    Returns
    -------
    tuple
        ``(vi_dict, pi_dict, q_dict)``:
        ``vi_dict[discount][epsilon]`` and ``pi_dict[discount]`` hold the keys
        ``mean_reward``, ``mean_eps``, ``policy`` and ``iteration``;
        ``q_dict[discount][n_iter]`` holds ``mean_reward``, ``mean_eps`` and
        ``policy``.
    """
    reward_line = "Mean reward: {} - mean eps: {}"

    # Value iteration: one solve per (discount, epsilon) pair.
    vi_dict = {}
    for gamma in discount:
        by_epsilon = {}
        vi_dict[gamma] = by_epsilon
        for tol in epsilon:
            solver = mdptoolbox.mdp.ValueIteration(
                P, R, discount=gamma, epsilon=tol, max_iter=int(1e9))
            solver.run()
            avg_reward, avg_steps, _, _ = test_policy(env, solver.policy)
            by_epsilon[tol] = {
                "mean_reward": avg_reward,
                "mean_eps": avg_steps,
                "policy": solver.policy,
                "iteration": solver.iter,
            }
            if mute:
                continue
            print("Value iteration discount: {} eps: {} iter: {}".format(
                gamma, tol, solver.iter))
            print(reward_line.format(avg_reward, avg_steps))

    # Policy iteration: one solve per discount.
    pi_dict = {}
    for gamma in discount:
        solver = mdptoolbox.mdp.PolicyIteration(
            P, R, discount=gamma, max_iter=int(1e6))
        solver.run()
        avg_reward, avg_steps, _, _ = test_policy(env, solver.policy)
        pi_dict[gamma] = {
            "mean_reward": avg_reward,
            "mean_eps": avg_steps,
            "policy": solver.policy,
            "iteration": solver.iter,
        }
        if mute:
            continue
        print("Policy iteration discount: {} iter: {}".format(gamma, solver.iter))
        print(reward_line.format(avg_reward, avg_steps))

    # Q-learning: one run per (discount, n_iter) pair.
    q_dict = {}
    for gamma in discount:
        by_budget = {}
        q_dict[gamma] = by_budget
        for budget in n_iters:
            learner = mdptoolbox.mdp.QLearning(
                P, R, discount=gamma, n_iter=budget)
            learner.run()
            avg_reward, avg_steps, _, _ = test_policy(env, learner.policy)
            by_budget[budget] = {
                'mean_reward': avg_reward,
                'mean_eps': avg_steps,
                'policy': learner.policy,
            }
            if mute:
                continue
            print("Q-learning iteration for {} discount and {} iterations is done".format(
                gamma, budget))
            print(reward_line.format(avg_reward, avg_steps))

    return vi_dict, pi_dict, q_dict

In [14]:
# Hyper-parameter grids passed to train_and_test.
# Discount factors tried for every solver.
discounts = [0.1, 0.5, 0.75, 0.9, 0.95, 0.99]
# Stopping tolerances tried for value iteration.
epsilones = [1e-3, 1e-6, 1e-9, 1e-12]
# Iteration budgets tried for Q-learning.
n_iters = [10000, 100000, 1000000]

In [None]:
# Experiment on the 4x4 map, labelled "reward is 0" by the author.
# NOTE(review): this label presumably refers to the reward written on the
# terminal -> state 0 transition when P and R are built; that value is not set
# in this cell, so the matrices must be rebuilt with it before running this.
vid, pid, qd = train_and_test(env, P, R, discounts, epsilones, n_iters, False)

Value iteration discount: 0.1 eps: 0.001 iter: 3
Mean reward: 0.822 - mean eps: 49.409
Value iteration discount: 0.1 eps: 1e-06 iter: 5
Mean reward: 0.818 - mean eps: 48.353
Value iteration discount: 0.1 eps: 1e-09 iter: 8
Mean reward: 0.835 - mean eps: 48.498
Value iteration discount: 0.1 eps: 1e-12 iter: 11
Mean reward: 0.822 - mean eps: 47.959
Value iteration discount: 0.5 eps: 0.001 iter: 7
Mean reward: 0.806 - mean eps: 49.186
Value iteration discount: 0.5 eps: 1e-06 iter: 14
Mean reward: 0.821 - mean eps: 49.146
Value iteration discount: 0.5 eps: 1e-09 iter: 23
Mean reward: 0.822 - mean eps: 48.364
Value iteration discount: 0.5 eps: 1e-12 iter: 32
Mean reward: 0.827 - mean eps: 47.795
Value iteration discount: 0.75 eps: 0.001 iter: 12
Mean reward: 0.806 - mean eps: 48.351
Value iteration discount: 0.75 eps: 1e-06 iter: 31
Mean reward: 0.824 - mean eps: 48.024
Value iteration discount: 0.75 eps: 1e-09 iter: 50
Mean reward: 0.822 - mean eps: 48.285
Value iteration discount: 0.75 ep

In [None]:
# Display the value-iteration results dictionary returned by train_and_test.
vid

In [8]:
# Collect experiment results in a nested dictionary.
# NOTE(review): presumably indexed as frozen_lake[map_size][terminal_reward]
# (4 -> 4x4 map, 0 -> the "reward is 0" run) — confirm.
frozen_lake = {}
frozen_lake[4] = {}
# Relies on vid, pid, qd still holding the results of the "reward is 0" run.
frozen_lake[4][0] = (vid, pid, qd)

In [10]:
# Experiment on the 4x4 map, labelled "reward is -1" by the author; progress
# printing is muted (last argument True).
# NOTE(review): the reward value is not set in this cell — presumably P and R
# were rebuilt with it beforehand; confirm before re-running.
vid, pid, qd = train_and_test(env, P, R, discounts, epsilones, n_iters, True)

In [11]:
# Store the latest (vid, pid, qd) results under key -1 of frozen_lake[4].
frozen_lake[4][-1] = (vid, pid, qd)

In [13]:
# Display the value-iteration results dictionary from the most recent run.
vid

{0.5: {0.001: {'mean_reward': 0.421,
   'mean_eps': 26.52,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-06: {'mean_reward': 0.453,
   'mean_eps': 28.354,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-09: {'mean_reward': 0.452,
   'mean_eps': 27.579,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-12: {'mean_reward': 0.46,
   'mean_eps': 26.115,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)}},
 0.75: {0.001: {'mean_reward': 0.436,
   'mean_eps': 27.014,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-06: {'mean_reward': 0.437,
   'mean_eps': 28.812,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-09: {'mean_reward': 0.464,
   'mean_eps': 26.948,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)},
  1e-12: {'mean_reward': 0.447,
   'mean_eps': 27.225,
   'policy': (1, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0)}},
 0.9: {0.001: {'mean_reward': 0.716,
