### Planning Algorithms

In [1]:
import numpy as np
import pprint
import json
import sys
import gym

from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

#### Policy Evaluation

In [2]:
def policy_evaluation(policy, env, discount_factor=.75, theta=0.000001):
    """Iteratively estimate the state values of `policy` on `env`.

    Args:
        policy: indexable by state; `policy[s]` is iterated to obtain the
            probability of taking each action `a` in state `s`.
        env: object exposing `env.nS` (number of states) and `env.P[s][a]`,
            an iterable of `(prob, next_state, reward, done)` tuples.
        discount_factor: weight applied to the value of the next state.
        theta: sweeps stop once the largest per-state change in a sweep
            falls below this threshold.

    Returns:
        A numpy array of length `env.nS` holding the estimated values.
    """
    values = np.zeros(env.nS)
    converged = False
    while not converged:
        largest_change = 0
        # One in-place sweep over every state (a "full backup" per state).
        for state in range(env.nS):
            backup = 0
            for action, action_prob in enumerate(policy[state]):
                for prob, next_state, reward, done in env.P[state][action]:
                    # The bootstrapped term is zeroed when the transition is terminal.
                    continuation = discount_factor * values[next_state] * (not done)
                    backup += action_prob * prob * (reward + continuation)
            # Track the biggest change seen in this sweep.
            largest_change = max(largest_change, np.abs(backup - values[state]))
            values[state] = backup
        converged = largest_change < theta
    return np.array(values)

#### Policy Improvement

In [3]:
def policy_improvement(env, policy_eval_fn=policy_evaluation, discount_factor=0.75):
    """Policy iteration: alternate policy evaluation and greedy improvement.

    Args:
        env: object exposing `env.nS`, `env.nA` and `env.P[s][a]`, an iterable
            of `(prob, next_state, reward, done)` tuples.
        policy_eval_fn: callable `(policy, env, discount_factor) -> V` returning
            one value per state for the given policy.
        discount_factor: weight applied to the value of the next state.

    Returns:
        A tuple `(policy, V, iterations)`: the final one-hot policy of shape
        `[env.nS, env.nA]`, the value estimate of that policy, and the number
        of evaluation passes performed.
    """
    # Start with a random stochastic policy. Rows are normalised so each one is
    # a valid probability distribution; un-normalised rows can sum to more than
    # 1 / discount_factor, which makes policy evaluation diverge.
    policy = np.random.random([env.nS, env.nA])
    policy /= policy.sum(axis=1, keepdims=True)
    one_hot = np.eye(env.nA)

    iterations = 0
    while True:
        # Evaluate the current policy
        V = policy_eval_fn(policy, env, discount_factor)
        iterations += 1
        # Will be set to False if we make any changes to the policy
        policy_stable = True
        # For each state...
        for s in range(env.nS):
            # Find the best action by one-step lookahead.
            # Ties are resolved by np.argmax (lowest action index wins).
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    # Do not bootstrap through terminal transitions, matching
                    # the backup used by policy_evaluation.
                    action_values[a] += prob * (
                        reward + discount_factor * V[next_state] * (not done)
                    )
            greedy_row = one_hot[np.argmax(action_values)]

            # Compare whole rows, not just the argmax: the initial policy is
            # stochastic, so V only describes the returned policy once the
            # policy itself (not merely its argmax) has stopped changing.
            if not np.array_equal(policy[s], greedy_row):
                policy_stable = False
            # Greedily update the policy
            policy[s] = greedy_row

        # If the policy is stable we've found an optimal policy. Return it
        if policy_stable:
            return policy, V, iterations

#### Policy Iteration

#### Value Iteration

In [4]:
def iterate_value_function(v_inp, q_inp, gamma, env):
    """Perform one synchronous value-iteration sweep.

    Args:
        v_inp: current state values, indexed by state id.
        q_inp: accepted for signature compatibility; not read by this function.
        gamma: discount applied to the value of non-terminal successor states.
        env: wrapped environment; sizes are read from
            `env.env.observation_space.n` / `env.env.action_space.n` and the
            transition table from `env.env.env.P[state][action]`, an iterable
            of `(prob, dst_state, reward, is_final)` tuples.

    Returns:
        `(v, q)`: the new state values (max over actions) and the new
        action values, shaped `(nstates,)` and `(nstates, nactions)`.
    """
    n_states = env.env.observation_space.n
    n_actions = env.env.action_space.n

    state_values = np.zeros(n_states)
    action_values = np.zeros((n_states, n_actions))

    for state in range(n_states):
        backups = np.zeros(n_actions)
        for act in range(n_actions):
            for prob, nxt, reward, terminal in env.env.env.P[state][act]:
                # Terminal transitions contribute only their immediate reward.
                target = reward if terminal else reward + gamma * v_inp[nxt]
                backups[act] = backups[act] + prob * target
        action_values[state] = backups
        state_values[state] = max(backups)
    return state_values, action_values

def build_greedy_policy(v_inp, gamma, env):
    """Derive the greedy policy for a given state-value function.

    Args:
        v_inp: state values, indexed by state id.
        gamma: discount applied to the value of non-terminal successor states.
        env: wrapped environment; sizes are read from
            `env.env.observation_space.n` / `env.env.action_space.n` and the
            transition table from `env.env.env.P[state][action]`, an iterable
            of `(prob, dst_state, reward, is_final)` tuples.

    Returns:
        A float numpy array of length `nstates` whose entries are the index of
        the highest-valued action in each state (callers cast it to int).
    """
    nstates = env.env.observation_space.n
    nactions = env.env.action_space.n

    new_policy = np.zeros(nstates)

    for state_id in range(nstates):
        profits = np.zeros(nactions)
        for action in range(nactions):
            for (prob, dst_state, reward, is_final) in env.env.env.P[state_id][action]:
                # Use the values passed in (not a notebook-level global), and do
                # not bootstrap through terminal transitions, so the result
                # agrees with the backup in iterate_value_function.
                if is_final:
                    profits[action] += prob * reward
                else:
                    profits[action] += prob * (reward + gamma * v_inp[dst_state])
        new_policy[state_id] = np.argmax(profits)
    return new_policy


env = gym.make('Taxi-v2')
env = wrappers.Monitor(env, '/tmp/', force=True)

gamma = 0.9
cum_reward = 0
n_rounds = 1000

# Solve the world with value iteration (without interacting with the
# environment). The sweep only reads the transition table, so the solution is
# the same for every episode and is computed once, outside the episode loop.
v = np.zeros(env.env.observation_space.n)
q = np.zeros((env.env.observation_space.n,
              env.env.action_space.n))

for _ in range(200):
    v_old = v.copy()
    q_old = q.copy()
    v, q = iterate_value_function(v, q, gamma, env)
    # Stop early once a sweep leaves the values exactly unchanged.
    if np.all(v == v_old):
        break
# Use the builtin int: the np.int alias was removed from NumPy.
policy = build_greedy_policy(v, gamma, env).astype(int)

for t_rounds in range(n_rounds):
    # init env
    observation = env.reset()

    # apply policy
    for t in range(1000):
        action = policy[observation]
        observation, reward, done, info = env.step(action)
        cum_reward += reward
        if done:
            break
    # Report the running average reward per episode every 50 rounds.
    if t_rounds % 50 == 0 and t_rounds > 0:
        print(cum_reward * 1.0 / (t_rounds + 1))

# threshold must be a number (NaN is rejected by current NumPy); sys.maxsize
# is the documented way to request full, untruncated array printing.
np.set_printoptions(threshold=sys.maxsize)
print('v states')
#print(v.tolist())
print('q values')
#print(q.tolist())


[2017-03-24 15:06:57,642] Making new env: Taxi-v2
[2017-03-24 15:06:57,654] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000000.json
[2017-03-24 15:06:57,721] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000001.json
[2017-03-24 15:06:58,172] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000008.json
[2017-03-24 15:06:59,388] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000027.json


8.137254901960784


[2017-03-24 15:07:01,761] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000064.json


8.178217821782178


[2017-03-24 15:07:05,637] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000125.json


8.33774834437086
8.482587064676617


[2017-03-24 15:07:11,447] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000216.json


8.541832669322709
8.445182724252492


[2017-03-24 15:07:19,535] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000343.json


8.450142450142451
8.528678304239401
8.494456762749445
8.479041916167665


[2017-03-24 15:07:30,283] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000512.json


8.491833030852995
8.464226289517471
8.448540706605224
8.419400855920115


[2017-03-24 15:07:44,114] Starting new video recorder writing to /tmp/openaigym.video.0.36.video000729.json


8.427430093209054
8.406991260923846
8.421856639247943
8.426193118756936
8.436382754994742
v states
q values


In [5]:
# Act greedily with respect to the learned action values for 10 episodes.
for episode in range(10):
    state = env.reset()
    done = False

    while not done:
        action = np.argmax(q[state])
        # Keep the observation returned by the step: the next action must be
        # chosen for the state we are now in, not for the reset state.
        state, _, done, _ = env.step(action)

[2017-03-24 15:09:25,380] Starting new video recorder writing to /tmp/openaigym.video.0.36.video001000.json


In [6]:
# Display the first field of the newest entry in the monitor's video list.
env.videos[-1][0]

'/tmp/openaigym.video.0.36.video001000.json'

In [7]:
# Run `asciinema upload` on the newest recording, then decode the command's
# output and strip every newline / carriage return from it.
out = (
    check_output(["asciinema", "upload", env.videos[-1][0]])
    .decode("utf-8")
    .replace('\n', '')
    .replace('\r', '')
)

In [8]:
# Show the cleaned-up text stored in `out`.
print(out)

https://asciinema.org/a/elglsxzuwzhbmtzhbcjk9nc9a


In [9]:
# Keep only the text after the last '/' in `out` and display it.
castid = out.rsplit('/', 1)[-1]
castid

'elglsxzuwzhbmtzhbcjk9nc9a'

In [10]:
# Fill the cast id into the asciinema embed <script> snippet and render it
# as the cell's rich output.
html_tag = """
<script type="text/javascript" 
    src="https://asciinema.org/a/{0}.js" 
    id="asciicast-{0}" 
    async data-autoplay="true" data-size="big">
</script>
""".format(castid)
HTML(data=html_tag)