### Planning Algorithms

In [2]:
import numpy as np
import pprint
import json
import sys
import gym

#from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

#### Policy Evaluation

In [3]:
def policy_evaluation(policy, env, discount_factor=.75, theta=0.000001):
    V = np.zeros(env.nS)
    # print('evaluating policy')
    while True:
        delta = 0
        # For each state, perform a "full backup"
        for s in range(env.nS):
            v = 0
            # Look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at the possible next states...
                for  prob, next_state, reward, done in env.P[s][a]:
                    # Calculate the expected value
                    v += action_prob * prob * (reward + discount_factor * V[next_state] * (not done))
                    # print(s, a, prob, next_state, reward, done, v)
            # How much our value function changed (across any states)
            delta = max(delta, np.abs(v - V[s]))
            # print(delta)
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

#### Policy Improvement

In [4]:
def policy_improvement(env, policy_eval_fn=policy_evaluation, discount_factor=0.75):
    # Start with a random policy
    policy = np.random.random([env.nS, env.nA])

    iterations = 0
    while True:
        # Evaluate the current policy
        V = policy_eval_fn(policy, env, discount_factor)
        iterations += 1
        # Will be set to false if we make any changes to the policy
        policy_stable = True
        # For each state...
        for s in range(env.nS):
            # The best action we would take under the currect policy
            chosen_a = np.argmax(policy[s])

            # Find the best action by one-step lookahead
            # Ties are resolved arbitarily
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    action_values[a] += prob * (reward + discount_factor * V[next_state])
            best_a = np.argmax(action_values)

            # Greedily update the policy
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_a]

        # If the policy is stable we've found an optimal policy. Return it
        if policy_stable:
            return policy, V, iterations

#### Policy Iteration

#### Value Iteration

In [6]:
def iterate_value_function(v_inp, q_inp, gamma, env):
    
    nstates = env.env.observation_space.n
    nactions = env.env.action_space.n
    
    ret = np.zeros(nstates)
    retq = np.zeros((nstates, nactions))

    for sid in range(nstates):
        temp_v = np.zeros(nactions)
        for action in range(nactions):
            for (prob, dst_state, reward, is_final) in env.env.env.P[sid][action]:
                if is_final:
                    temp_v[action] = temp_v[action] + prob * (reward)
                else:
                    temp_v[action] = temp_v[action] + prob * (reward + gamma * v_inp[dst_state])
        retq[sid] = temp_v
        ret[sid] = max(temp_v)
    return ret, retq

def build_greedy_policy(v_inp, gamma, env):
    nstates = env.env.observation_space.n
    nactions = env.env.action_space.n
    
    new_policy = np.zeros(nstates)
    
    for state_id in range(nstates):
        profits = np.zeros(nactions)
        for action in range(nactions):
            for (prob, dst_state, reward, is_final) in env.env.env.P[state_id][action]:
                profits[action] += prob*(reward + gamma*v[dst_state])
        new_policy[state_id] = np.argmax(profits)
    return new_policy


env = gym.make('Taxi-v1')
#env = wrappers.Monitor(env, '/tmp/', force=True)

gamma = 0.9
cum_reward = 0
n_rounds = 1000

# solve the world with value iteration 
# (without interacting with the environment)

for t_rounds in range(n_rounds):
    # init env and value function
    observation = env.reset()
    
    v = np.zeros(env.env.observation_space.n)
    q = np.zeros((env.env.observation_space.n, 
                  env.env.action_space.n))

    # solve MDP
    for _ in range(200):
        v_old = v.copy()
        q_old = q.copy()
        v, q = iterate_value_function(v, q, gamma, env)
        if np.all(v == v_old):
            break
    policy = build_greedy_policy(v, gamma, env).astype(np.int)

    # apply policy
    for t in range(1000):
        action = policy[observation]
        observation, reward, done, info = env.step(action)
        cum_reward += reward
        if done:
            break
    if t_rounds % 50 == 0 and t_rounds > 0:
        print(cum_reward * 1.0 / (t_rounds + 1))

np.set_printoptions(threshold=np.nan)
print('v states')
#print(v.tolist())
print('q values')
#print(q.tolist())


[2017-03-27 19:03:41,332] Making new env: Taxi-v1


AttributeError: 'TaxiEnv' object has no attribute 'env'

In [5]:
# act greedily
for episode in range(10):
    state = env.reset()

    while True:
        action = np.argmax(q[state])
        _, _, done, _ = env.step(action)
        if done:
            break

[2017-03-24 15:09:25,380] Starting new video recorder writing to /tmp/openaigym.video.0.36.video001000.json


In [6]:
env.videos[-1][0]

'/tmp/openaigym.video.0.36.video001000.json'

In [7]:
out = check_output(["asciinema", "upload", env.videos[-1][0]])
out = out.decode("utf-8").replace('\n', '').replace('\r', '')

In [8]:
print(out)

https://asciinema.org/a/elglsxzuwzhbmtzhbcjk9nc9a


In [9]:
castid = out.split('/')[-1]
castid

'elglsxzuwzhbmtzhbcjk9nc9a'

In [10]:
html_tag = """
<script type="text/javascript" 
    src="https://asciinema.org/a/{0}.js" 
    id="asciicast-{0}" 
    async data-autoplay="true" data-size="big">
</script>
"""
html_tag = html_tag.format(castid)
HTML(data=html_tag)

In [9]:
import gym
import pandas as pd
import numpy as np
import random

# https://gym.openai.com/envs/CartPole-v0
# Carlos Aguayo - carlos.aguayo@gmail.com


class QLearner(object):
    def __init__(self,
                 num_states=100,
                 num_actions=4,
                 alpha=0.2,
                 gamma=0.9,
                 random_action_rate=0.5,
                 random_action_decay_rate=0.99):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.random_action_rate = random_action_rate
        self.random_action_decay_rate = random_action_decay_rate
        self.state = 0
        self.action = 0
        self.qtable = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))

    def set_initial_state(self, state):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = state
        self.action = self.qtable[state].argsort()[-1]
        return self.action

    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with given reward and returns action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        state = self.state
        action = self.action
        qtable = self.qtable

        choose_random_action = (1 - self.random_action_rate) <= np.random.uniform(0, 1)

        if choose_random_action:
            action_prime = random.randint(0, self.num_actions - 1)
        else:
            action_prime = self.qtable[state_prime].argsort()[-1]

        self.random_action_rate *= self.random_action_decay_rate

        qtable[state, action] = (1 - alpha) * qtable[state, action] + alpha * (reward + gamma * qtable[state_prime, action_prime])

        self.state = state_prime
        self.action = action_prime

        return self.action


def cart_pole_with_qlearning():
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env.monitor.start(experiment_filename, force=True)

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    def build_state(features):
        return int("".join(map(lambda feature: str(int(feature)), features)))

    def to_bin(value, bins):
        return np.digitize(x=[value], bins=bins)[0]

    learner = QLearner(num_states=10 ** number_of_features,
                       num_actions=env.action_space.n,
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in xrange(50000):
        observation = env.reset()
        cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation
        state = build_state([to_bin(cart_position, cart_position_bins),
                             to_bin(pole_angle, pole_angle_bins),
                             to_bin(cart_velocity, cart_velocity_bins),
                             to_bin(angle_rate_of_change, angle_rate_bins)])
        action = learner.set_initial_state(state)

        for step in xrange(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation

            state_prime = build_state([to_bin(cart_position, cart_position_bins),
                                       to_bin(pole_angle, pole_angle_bins),
                                       to_bin(cart_velocity, cart_velocity_bins),
                                       to_bin(angle_rate_of_change, angle_rate_bins)])

            if done:
                reward = -200

            action = learner.move(state_prime, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print "Goal reached!"
            print "Episodes before solve: ", episode + 1
            print u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus minus sign
                                                                  last_time_steps.std())
            break

    env.monitor.close()

if __name__ == "__main__":
    random.seed(0)
    cart_pole_with_qlearning()

[2017-03-27 19:19:21,581] Making new env: CartPole-v0
[2017-03-27 19:19:21,586] Creating monitor directory ./cartpole-experiment-1
[2017-03-27 19:19:21,590] Starting new video recorder writing to /mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video000000.mp4
[2017-03-27 19:19:22,409] Starting new video recorder writing to /mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video000001.mp4
[2017-03-27 19:19:22,600] Starting new video recorder writing to /mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video000008.mp4
[2017-03-27 19:19:23,155] Starting new video recorder writing to /mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video000027.mp4
[2017-03-27 19:19:24,055] Starting new video recorder writing to /mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video000064.mp4
[2017-03-27 19:19:25,189] Starting new video recorder writing to /mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video000125.mp4
[2017-03-27 19:19:27,387] Starting 

KeyboardInterrupt: 

In [10]:
env.videos[-1][0]

AttributeError: 'TaxiEnv' object has no attribute 'videos'

In [12]:
import io
import json
import base64

from IPython.display import HTML

#video_path, meta_path = env.videos[-1]

video = io.open('/mnt/notebooks/cartpole-experiment-1/openaigym.video.0.49.video008000.mp4', 'r+b').read()
encoded = base64.b64encode(video)

#with open(meta_path) as data_file:    
#    meta = json.load(data_file)

html_tag = """
<h2>{0}<h2/>
<video width="960" height="540" controls>
    <source src="data:video/mp4;base64,{1}" type="video/mp4" />
</video>"""
strm = html_tag.format('Episode ' + str('008000'), encoded.decode('ascii'))
HTML(data=strm)

In [None]:
strm = ''
for video_path, meta_path in env.videos:
    video = io.open(video_path, 'r+b').read()
    encoded = base64.b64encode(video)
    
    with open(meta_path) as data_file:    
        meta = json.load(data_file)

    html_tag = """
    <h2>{0}<h2/>
    <video width="960" height="540" controls>
        <source src="data:video/mp4;base64,{1}" type="video/mp4" />
    </video>"""
    strm += html_tag.format('Episode ' + str(meta['episode_id']), encoded.decode('ascii'))
HTML(data=strm)