In [15]:
import sys

import gym
import numpy as np
from gym import wrappers

def iterate_value_function(v_inp, q_inp, gamma, env):
    """Perform one synchronous Bellman optimality backup over every state.

    Args:
        v_inp: Current state-value estimates, indexable by state id.
        q_inp: Not read by this function; kept so existing callers that
            pass the previous Q-table keep working.
        gamma: Discount factor applied to the value of the successor state.
        env: Wrapped environment. ``env.env`` must expose
            ``observation_space.n`` and ``action_space.n``, and
            ``env.env.env.P[state][action]`` must yield
            ``(prob, dst_state, reward, is_final)`` tuples.

    Returns:
        A ``(v, q)`` pair: ``q`` is an ``(nstates, nactions)`` array of
        backed-up action values and ``v`` holds the per-state maximum of
        ``q`` over actions.
    """
    sizes = env.env
    n_states = sizes.observation_space.n
    n_actions = sizes.action_space.n
    transitions = env.env.env.P

    values = np.zeros(n_states)
    action_values = np.zeros((n_states, n_actions))

    for state in range(n_states):
        for action in range(n_actions):
            expected = 0.0
            for prob, nxt, reward, terminal in transitions[state][action]:
                # A terminal transition contributes only its immediate reward;
                # the successor's value is deliberately not looked up.
                target = reward if terminal else reward + gamma * v_inp[nxt]
                expected = expected + prob * target
            action_values[state, action] = expected
        values[state] = max(action_values[state])

    return values, action_values

def build_greedy_policy(v_inp, gamma, env):
    """Derive the greedy policy from a state-value function.

    For each state, computes the one-step expected return of every action
    and picks the action with the largest one (first index on ties, as
    ``np.argmax`` does).

    Args:
        v_inp: State-value estimates, indexable by state id.
        gamma: Discount factor applied to the value of the successor state.
        env: Wrapped environment. ``env.env`` must expose
            ``observation_space.n`` and ``action_space.n``, and
            ``env.env.env.P[state][action]`` must yield
            ``(prob, dst_state, reward, is_final)`` tuples.

    Returns:
        A float array of length ``nstates`` holding the chosen action index
        for each state; callers cast it to an integer dtype before indexing.
    """
    nstates = env.env.observation_space.n
    nactions = env.env.action_space.n
    transitions = env.env.env.P

    new_policy = np.zeros(nstates)

    for state_id in range(nstates):
        profits = np.zeros(nactions)
        for action in range(nactions):
            for (prob, dst_state, reward, is_final) in transitions[state_id][action]:
                # Use the value function that was passed in (not a notebook
                # global), and do not bootstrap past a terminal transition so
                # the policy agrees with the Q-values produced by
                # iterate_value_function.
                future = 0.0 if is_final else gamma * v_inp[dst_state]
                profits[action] += prob * (reward + future)
        new_policy[state_id] = np.argmax(profits)
    return new_policy


env = gym.make('Taxi-v2')
env = wrappers.Monitor(env, '/tmp/', force=True)

gamma = 0.9
cum_reward = 0
n_rounds = 1000

# Solve the world with value iteration (without interacting with the
# environment). The transition model does not change between episodes, so
# the MDP is solved once here rather than once per round.
v = np.zeros(env.env.observation_space.n)
q = np.zeros((env.env.observation_space.n,
              env.env.action_space.n))

for _ in range(200):
    v_old = v.copy()
    v, q = iterate_value_function(v, q, gamma, env)
    # Stop as soon as a full sweep leaves every state value unchanged.
    if np.all(v == v_old):
        break
# `np.int` was removed from NumPy; the builtin `int` is the supported spelling.
policy = build_greedy_policy(v, gamma, env).astype(int)

# Evaluate the greedy policy over n_rounds episodes.
for t_rounds in range(n_rounds):
    observation = env.reset()

    # apply policy (the step cap guards against an episode that never ends)
    for t in range(1000):
        action = policy[observation]
        observation, reward, done, info = env.step(action)
        cum_reward += reward
        if done:
            break
    # Report the running average reward per episode every 50 rounds.
    if t_rounds % 50 == 0 and t_rounds > 0:
        print(cum_reward * 1.0 / (t_rounds + 1))

# NaN is rejected as a threshold by current NumPy; sys.maxsize is the
# documented way to disable array summarization when printing.
np.set_printoptions(threshold=sys.maxsize)
print('v states')
#print(v.tolist())
print('q values')
#print(q.tolist())


[2017-03-19 19:12:59,371] Making new env: Taxi-v2
[2017-03-19 19:12:59,378] Clearing 2 monitor files from previous run (because force=True was provided)
[2017-03-19 19:12:59,380] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000000.json
[2017-03-19 19:12:59,463] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000001.json
[2017-03-19 19:12:59,981] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000008.json
[2017-03-19 19:13:01,377] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000027.json


9.27450980392157


[2017-03-19 19:13:04,072] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000064.json


9.089108910891088


[2017-03-19 19:13:08,512] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000125.json


8.649006622516556
8.562189054726367


[2017-03-19 19:13:15,395] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000216.json


8.52191235059761
8.541528239202657


[2017-03-19 19:13:24,924] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000343.json


8.472934472934472
8.446384039900249
8.472283813747229
8.451097804391217


[2017-03-19 19:13:37,624] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000512.json


8.490018148820326
8.472545757071547
8.488479262672811
8.489300998573466


[2017-03-19 19:13:57,616] Starting new video recorder writing to /tmp/openaigym.video.3.13807.video000729.json


8.491344873501998
8.488139825218477
8.475910693301998
8.492785793562708
8.46056782334385
v states
q values


In [29]:
# Act greedily with respect to the Q-table for 10 episodes.
for episode in range(10):
    state = env.reset()

    while True:
        action = np.argmax(q[state])
        # Keep the returned observation: without it `state` stays at the
        # reset state and the same action is repeated for the whole episode.
        state, _, done, _ = env.step(action)
        if done:
            break

In [33]:
# Roll out one more episode under the greedy policy computed earlier,
# adding its rewards to the running total in `cum_reward`.

observation = env.reset()
for t in range(1000):  # hard cap in case the episode never reports `done`
    action = policy[observation]
    step_result = env.step(action)
    observation, reward, done, info = step_result
    cum_reward = cum_reward + reward
    if done:
        break

In [34]:
# Inspect the recordings tracked by the Monitor wrapper; the saved output
# shows a list of (recording .json, .meta.json) path pairs.
env.videos

[('/tmp/openaigym.video.3.13807.video000000.json',
  '/tmp/openaigym.video.3.13807.video000000.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000001.json',
  '/tmp/openaigym.video.3.13807.video000001.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000008.json',
  '/tmp/openaigym.video.3.13807.video000008.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000027.json',
  '/tmp/openaigym.video.3.13807.video000027.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000064.json',
  '/tmp/openaigym.video.3.13807.video000064.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000125.json',
  '/tmp/openaigym.video.3.13807.video000125.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000216.json',
  '/tmp/openaigym.video.3.13807.video000216.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000343.json',
  '/tmp/openaigym.video.3.13807.video000343.meta.json'),
 ('/tmp/openaigym.video.3.13807.video000512.json',
  '/tmp/openaigym.video.3.13807.video000512.meta.json'),
 ('/tmp/openaigym.video.3.13

In [32]:
# First element of the last entry in env.videos: the path of the most
# recent recording file.
env.videos[-1][0]

'/tmp/openaigym.video.3.13807.video001000.json'

In [21]:
from subprocess import check_output

# Upload the most recent recording with the `asciinema` CLI and keep its
# stdout as a single line of text (all CR/LF characters removed).
out = check_output(["asciinema", "upload", env.videos[-1][0]]).decode("utf-8")
out = out.translate(str.maketrans("", "", "\r\n"))

In [22]:
# Show the text returned by the upload command (a recording URL in the saved output).
print(out)

https://asciinema.org/a/f30vvah0mw3l7vwaqys3qvubr


In [23]:
# The cast id is whatever follows the final '/' of the upload output.
castid = out.rsplit('/', 1)[-1]
castid

'f30vvah0mw3l7vwaqys3qvubr'

In [25]:
from IPython.display import HTML

# Build the asciinema embed <script> tag for the uploaded cast and render it
# as the cell's rich output; {0} is filled with the cast id in both places.
html_tag = """
<script type="text/javascript" 
    src="https://asciinema.org/a/{0}.js" 
    id="asciicast-{0}" 
    async data-autoplay="true" data-size="big">
</script>
""".format(castid)
HTML(data=html_tag)