### Planning Algorithms

In [1]:
import numpy as np
import pandas as pd
import tempfile
import pprint
import json
import sys
import gym

from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

#### Value Iteration

In [16]:
def value_iteration(S, A, P, gamma=.99, theta = 0.0000001):
 
    V = np.random.random(len(S))
    for i in range(100000):
        old_V = V.copy()
        
        Q = np.zeros((len(S), len(A)), dtype=float)
        for s in S:
            for a in A:
                for prob, s_prime, reward, done in P[s][a]:
                    Q[s][a] += prob * (reward + gamma * old_V[s] * (not done))
            V[s] = Q[s].max()
        if np.all(np.abs(old_V - V) < theta):
            break
    
    pi = np.argmax(Q, axis=1)
    return pi, V

In [None]:
mdir = tempfile.mkdtemp()
env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, mdir, force=True)

S = range(env.env.observation_space.n)
A = range(env.env.action_space.n)
P = env.env.env.P

pi, V = value_iteration(S, A, P)

In [13]:
for _ in range(10000):
    state = env.reset()
    while True:
        state, reward, done, info = env.step(pi[state])
        if done:
            break
last_video = env.videos[-1][0]

[2017-04-01 17:52:57,463] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000000.json
[2017-04-01 17:52:57,468] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000001.json
[2017-04-01 17:52:57,478] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000008.json
[2017-04-01 17:52:57,505] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000027.json
[2017-04-01 17:52:57,555] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000064.json
[2017-04-01 17:52:57,606] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000125.json
[2017-04-01 17:52:57,667] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000216.json
[2017-04-01 17:52:57,766] Starting new video recorder writing to /tmp/tmpmedscieh/openaigym.video.3.137.video000343.json
[2017-04-01 17:52:57,897] Starti

In [4]:
out = check_output(["asciinema", "upload", last_video])
out = out.decode("utf-8").replace('\n', '').replace('\r', '')
print(out)

https://asciinema.org/a/9hcmd0xprfc736y7mj52x7vxe


In [5]:
castid = out.split('/')[-1]
html_tag = """
<script type="text/javascript" 
    src="https://asciinema.org/a/{0}.js" 
    id="asciicast-{0}" 
    async data-autoplay="true" data-size="big">
</script>
"""
html_tag = html_tag.format(castid)
HTML(data=html_tag)

In [14]:
V

array([  8.42424131e-01,   1.27597470e-01,   4.46821586e-01,
         8.22393114e-01,   3.27417690e-01,   0.00000000e+00,
         5.83066888e-07,   0.00000000e+00,   5.01252977e-01,
         9.52402699e-01,   4.65184406e-01,   0.00000000e+00,
         0.00000000e+00,   3.80762908e-01,   9.99998223e-01,
         0.00000000e+00])

In [7]:
pi

array([0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0])

In [8]:
env.close()

[2017-04-01 16:55:41,787] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/tmpfukeltbz')


In [9]:
gym.upload(mdir, api_key='<YOUR OPENAI API KEY>')

[2017-04-01 16:55:43,229] [FrozenLake-v0] Uploading 10000 episodes of training data
[2017-04-01 16:55:44,905] [FrozenLake-v0] Uploading videos of 19 training episodes (2158 bytes)
[2017-04-01 16:55:45,131] [FrozenLake-v0] Creating evaluation object from /tmp/tmpfukeltbz with learning curve and training video
[2017-04-01 16:55:45,620] 
****************************************************
You successfully uploaded your evaluation on FrozenLake-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_ycTPCbyiTWK6T0C4DyrvRg

****************************************************


#### Policy Evaluation

In [123]:
def policy_evaluation(pi, S, A, P, gamma=.99, theta=0.0000001):
    
    V = np.zeros(len(S))
    while True:
        delta = 0
        for s in S:
            v = V[s]
            
            V[s] = 0
            for prob, dst, reward, done in P[s][pi[s]]:
                V[s] += prob * (reward + gamma * V[dst] * (not done))
            
            delta = max(delta, np.abs(v - V[s]))
        if delta < theta:
            break
    return V

In [124]:
def policy_improvement(pi, V, S, A, P, gamma=.99):
    for s in S:
        old_a = pi[s]
        
        Qs = np.zeros(len(A), dtype=float)
        for a in A:
            for prob, s_prime, reward, done in P[s][a]:
                Qs[a] += prob * (reward + gamma * V[s] * (not done))
        pi[s] = np.argmax(Qs)
        V[s] = np.max(Qs)
    return pi, V

In [125]:
def policy_iteration(S, A, P, gamma=.99):
    pi = np.random.choice(A, len(S))
    while True:    
        V = policy_evaluation(pi, S, A, P, gamma)
        new_pi, new_V = policy_improvement(
            pi.copy(), V.copy(), S, A, P, gamma)
        if np.all(pi == new_pi):
            break
        pi = new_pi
        V = new_V
    return pi

In [126]:
mdir = tempfile.mkdtemp()
env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, mdir, force=True)

S = range(env.env.observation_space.n)
A = range(env.env.action_space.n)
P = env.env.env.P

pi = policy_iteration(S, A, P)
print(pi)

[2017-04-01 20:39:32,143] Making new env: FrozenLake-v0
[2017-04-01 20:39:32,212] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/tmpj0ku0pbt')


[0 3 0 3 0 0 0 0 3 1 0 0 0 2 1 0]


In [127]:
for _ in range(10000):
    state = env.reset()
    while True:
        state, reward, done, info = env.step(pi[state])
        if done:
            break
last_video = env.videos[-1][0]

[2017-04-01 20:39:34,925] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000000.json
[2017-04-01 20:39:34,932] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000001.json
[2017-04-01 20:39:34,940] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000008.json
[2017-04-01 20:39:34,977] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000027.json
[2017-04-01 20:39:35,024] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000064.json
[2017-04-01 20:39:35,075] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000125.json
[2017-04-01 20:39:35,160] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000216.json
[2017-04-01 20:39:35,260] Starting new video recorder writing to /tmp/tmpyspcx0sa/openaigym.video.96.137.video000343.json
[2017-04-01 20:39:35,382

In [128]:
out = check_output(["asciinema", "upload", last_video])
out = out.decode("utf-8").replace('\n', '').replace('\r', '')
print(out)

https://asciinema.org/a/5k34nstnvcznz9hpqd2gleuxf


In [129]:
castid = out.split('/')[-1]
html_tag = """
<script type="text/javascript" 
    src="https://asciinema.org/a/{0}.js" 
    id="asciicast-{0}" 
    async data-autoplay="true" data-size="big">
</script>
"""
html_tag = html_tag.format(castid)
HTML(data=html_tag)

In [130]:
V

array([  8.16998304e-03,   3.29652708e-03,   1.28990882e-03,
         4.29969608e-04,   1.50867136e-02,   0.00000000e+00,
         3.66919617e-02,   0.00000000e+00,   3.43668617e-02,
         1.03100630e-01,   1.63823969e-01,   0.00000000e+00,
         0.00000000e+00,   1.45477988e-01,   6.24289324e-01,
         0.00000000e+00])

In [131]:
pi

array([0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0])

In [133]:
env.close()

[2017-04-01 20:40:49,419] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/tmpyspcx0sa')


In [134]:
gym.upload(mdir, api_key='<YOUR OPENAI API KEY>')

[2017-04-01 20:40:54,103] [FrozenLake-v0] Uploading 10000 episodes of training data
[2017-04-01 20:40:55,854] [FrozenLake-v0] Uploading videos of 19 training episodes (2278 bytes)
[2017-04-01 20:40:56,102] [FrozenLake-v0] Creating evaluation object from /tmp/tmpyspcx0sa with learning curve and training video
[2017-04-01 20:40:56,451] 
****************************************************
You successfully uploaded your evaluation on FrozenLake-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_vAvbhsGQRVSAe5DZkFNrQ

****************************************************
