In [1]:
import sys

import gym
import numpy as np
from gym import wrappers

def iterate_value_function(v_inp, q_inp, gamma, env):
    """Perform one synchronous Bellman optimality backup over all states.

    Args:
        v_inp: current state-value estimates, indexable by state id.
        q_inp: accepted for call compatibility; it is never read.
        gamma: discount factor applied to the successor state's value.
        env: wrapped environment. ``env.env`` must expose
            ``observation_space.n`` and ``action_space.n``, and
            ``env.env.env.P[state][action]`` must yield
            ``(prob, dst_state, reward, is_final)`` tuples.

    Returns:
        A ``(v_next, q_next)`` pair: ``q_next`` has shape
        ``(n_states, n_actions)`` and holds the backed-up action values,
        ``v_next`` has shape ``(n_states,)`` and holds the per-state maximum
        of ``q_next``.
    """
    wrapped = env.env
    n_states = wrapped.observation_space.n
    n_actions = wrapped.action_space.n

    v_next = np.zeros(n_states)
    q_next = np.zeros((n_states, n_actions))

    for state in range(n_states):
        for act in range(n_actions):
            for prob, nxt, reward, is_final in wrapped.env.P[state][act]:
                # A terminal transition contributes only its immediate reward;
                # otherwise the discounted value of the successor is added.
                target = reward if is_final else reward + gamma * v_inp[nxt]
                q_next[state, act] += prob * target
        v_next[state] = max(q_next[state])
    return v_next, q_next

def build_greedy_policy(v_inp, gamma, env):
    """Derive the greedy policy from a state-value estimate.

    For every state, the expected one-step return of each action is computed
    from the environment's transition table and the best action is selected.

    Args:
        v_inp: state-value estimates, indexable by state id.
        gamma: discount factor applied to the successor state's value.
        env: wrapped environment. ``env.env`` must expose
            ``observation_space.n`` and ``action_space.n``, and
            ``env.env.env.P[state][action]`` must yield
            ``(prob, dst_state, reward, is_final)`` tuples.

    Returns:
        A float array of shape ``(n_states,)`` whose entries are the index of
        the greedy action for each state (ties resolve to the lowest index,
        as ``np.argmax`` does).
    """
    nstates = env.env.observation_space.n
    nactions = env.env.action_space.n
    transitions = env.env.env.P

    new_policy = np.zeros(nstates)

    for state_id in range(nstates):
        profits = np.zeros(nactions)
        for action in range(nactions):
            for (prob, dst_state, reward, is_final) in transitions[state_id][action]:
                # Read the value estimate from the argument, not from a
                # module-level variable, and do not bootstrap past a terminal
                # transition so the backup matches iterate_value_function.
                future = 0.0 if is_final else gamma * v_inp[dst_state]
                profits[action] += prob * (reward + future)
        new_policy[state_id] = np.argmax(profits)
    return new_policy


env = gym.make('Taxi-v2')
env = wrappers.Monitor(env, '/tmp/', force=True)

gamma = 0.9
cum_reward = 0
n_rounds = 1000

# solve the world with value iteration
# (without interacting with the environment)
#
# The planner only reads the environment's transition table, which does not
# change between episodes, so the MDP is solved once up front instead of once
# per episode.
v = np.zeros(env.env.observation_space.n)
q = np.zeros((env.env.observation_space.n,
              env.env.action_space.n))

# At most 200 sweeps; stop early once a sweep leaves the values unchanged.
for _ in range(200):
    v_old = v.copy()
    v, q = iterate_value_function(v, q, gamma, env)
    if np.all(v == v_old):
        break
# Builtin int is used because the np.int alias was removed in NumPy 1.24.
policy = build_greedy_policy(v, gamma, env).astype(int)

for t_rounds in range(n_rounds):
    # start a fresh episode
    observation = env.reset()

    # apply policy (episodes are capped at 1000 steps)
    for t in range(1000):
        action = policy[observation]
        observation, reward, done, info = env.step(action)
        cum_reward += reward
        if done:
            break
    # Report the running average reward per episode every 50 episodes.
    if t_rounds % 50 == 0 and t_rounds > 0:
        print(cum_reward * 1.0 / (t_rounds + 1))

# sys.maxsize disables summarisation; a NaN threshold is rejected by NumPy.
np.set_printoptions(threshold=sys.maxsize)
print('v states')
#print(v.tolist())
print('q values')
#print(q.tolist())


[2017-03-20 13:59:02,002] Making new env: Taxi-v2
[2017-03-20 13:59:02,011] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000000.json
[2017-03-20 13:59:02,083] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000001.json
[2017-03-20 13:59:02,574] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000008.json
[2017-03-20 13:59:03,906] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000027.json


8.666666666666666


[2017-03-20 13:59:06,497] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000064.json


8.504950495049505


[2017-03-20 13:59:10,777] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000125.json


8.516556291390728
8.462686567164178


[2017-03-20 13:59:17,161] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000216.json


8.43824701195219
8.508305647840531


[2017-03-20 13:59:26,066] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000343.json


8.47008547008547
8.4214463840399
8.425720620842572
8.403193612774452


[2017-03-20 13:59:37,971] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000512.json


8.406533575317605
8.372712146422629
8.40552995391705
8.398002853067046


[2017-03-20 13:59:53,286] Starting new video recorder writing to /tmp/openaigym.video.0.27549.video000729.json


8.363515312916112
8.415730337078651
8.428907168037602
8.427302996670367
8.41850683491062
v states
q values


In [29]:
# act greedily: run 10 episodes, always taking the action with the highest
# q-value in the current state
for episode in range(10):
    state = env.reset()

    while True:
        action = np.argmax(q[state])
        # Carry the new observation forward; otherwise every step would reuse
        # the action chosen for the reset state.
        state, _, done, _ = env.step(action)
        if done:
            break

In [33]:
# Roll out a single episode under `policy`, capped at 1000 steps, and add the
# collected rewards to the running `cum_reward` total.
observation, done, t = env.reset(), False, -1
while not done and t < 999:
    t += 1
    action = policy[observation]
    observation, reward, done, info = env.step(action)
    cum_reward += reward

In [34]:
# Display the recordings tracked by the monitored environment; in the saved
# output below each entry is a (recording path, metadata path) pair.
env.videos

[('/tmp/openaigym.video.3.13807.video000000.json',
  '/tmp/openaigym.video.3.13807.video000000.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000001.json',
  '/tmp/openaigym.video.3.13807.video000001.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000008.json',
  '/tmp/openaigym.video.3.13807.video000008.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000027.json',
  '/tmp/openaigym.video.3.13807.video000027.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000064.json',
  '/tmp/openaigym.video.3.13807.video000064.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000125.json',
  '/tmp/openaigym.video.3.13807.video000125.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000216.json',
  '/tmp/openaigym.video.3.13807.video000216.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000343.json',
  '/tmp/openaigym.video.3.13807.video000343.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000512.json',
  '/tmp/openaigym.video.3.13807.video000512.meta.json'),
 ('/tmp/openaigym.video.3.13

In [32]:
# First element of the last entry in `env.videos`: the path of the most
# recently listed recording file.
env.videos[-1][0]

'/tmp/openaigym.video.3.13807.video001000.json'

In [21]:
from subprocess import check_output

# Upload the last listed recording with the asciinema CLI, then decode the
# command's output and strip every newline / carriage return from it.
upload_cmd = ["asciinema", "upload", env.videos[-1][0]]
out = check_output(upload_cmd).decode("utf-8")
out = out.replace('\n', '').replace('\r', '')

In [22]:
# Show the cleaned output of the asciinema upload command (the saved run
# printed the URL of the uploaded cast).
print(out)

https://asciinema.org/a/f30vvah0mw3l7vwaqys3qvubr


In [3]:
# The cast id is whatever follows the final '/' in `out`; this needs `out`
# from the asciinema upload cell to exist in the current kernel.
castid = out.rpartition('/')[2]
castid

NameError: name 'out' is not defined

In [2]:
from IPython.display import HTML

# Build the asciinema embed snippet for `castid` in one step and render it
# inline; the id is substituted into both the script URL and the element id.
html_tag = """
<script type="text/javascript" 
    src="https://asciinema.org/a/{0}.js" 
    id="asciicast-{0}" 
    async data-autoplay="true" data-size="big">
</script>
""".format(castid)
HTML(data=html_tag)

NameError: name 'castid' is not defined