In [1]:
################################################################
# EXPERIMENT 1:
#
# In this experiment we will train an actor according to a critic
# while simultaneously training sub-critics, and then plot the
# resulting global critic Q-values.
################################################################
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython import display
from itertools import chain
import gc
import argparse
from mpl_toolkits.mplot3d import Axes3D
gc.enable()

from brain import DDPG
from brain import SubCritics
from brain.common.filter_env import makeFilteredEnv
import brain.common.utils as utils
from brain.common.utils import episode_stats, write_row
%matplotlib inline

In [2]:


def test(env, agent, num_tests):
    """
    Tests the agent.
    """
    total_reward = 0
    for i in range(num_tests):
        state = env.reset()
        for j in range(env.spec.timestep_limit):
            #env.render()
            action = agent.action(state) # direct action for test
            state,reward,done,_ = env.step(action)
            total_reward += reward
            if done:
                break
    avg_reward = total_reward/num_tests
    return avg_reward


def run_experiment(exp_name, ENV_NAME='MountainCarContinuous-v0', EPISODES=10000, TEST=10):
    """
    Runs the experiment on the target environment.

    Builds a filtered gym environment, a DDPG agent and linear (order 1)
    sub-critics, trains them for `EPISODES` episodes while writing
    TensorBoard summaries, and finally draws a contour plot of the global
    critic's Q-value over a position/velocity grid for the fixed action 1.

    Args:
        exp_name: Sub-directory of /tmp/tboard that receives the
            TensorBoard event files.
        ENV_NAME: Id passed to `gym.make`.
        EPISODES: Number of training episodes.
        TEST: Number of episodes passed to `test` at each evaluation
            (evaluations happen when `episode % 100 == 0 and episode > 100`).
    """
    env = makeFilteredEnv(gym.make(ENV_NAME))

    # Create the standard DDPG agent.
    agent = DDPG(env)

    sub_critics = SubCritics(agent, order=1) # Make linear (order 1) subcritics

    # Set up tensorboard.
    merged = tf.merge_all_summaries()
    train_writer = tf.train.SummaryWriter('/tmp/tboard/{}'.format(exp_name),
                                          agent.sess.graph)
    # To see the graph run: tensorboard --logdir=/tmp/tboard
    init_op = tf.initialize_all_variables()
    agent.sess.run(init_op)

    try:
        _train_with_subcritics(env, agent, sub_critics, merged, train_writer, EPISODES, TEST)
    finally:
        # Always release the event file, even if training is interrupted.
        train_writer.close()

    _plot_global_q_value(agent)


def _train_with_subcritics(env, agent, sub_critics, merged, train_writer, EPISODES, TEST):
    """Runs the exploration/training loop, logging summaries and periodically testing the agent."""
    t = 0  # Step counter across all episodes; used as the summary step.
    for episode in range(EPISODES):
        state = env.reset()
        # Values from the previous step; None until one step of this episode is done.
        activations = None
        action = None
        print("Episode: ", episode, end="")
        r_tot = 0

        for step in range(env.spec.timestep_limit):
            t += 1
            # Explore state space.
            next_action, next_activations = agent.noise_action_activations(state)

            # Deal with the environment
            next_state, reward, done, _ = env.step(next_action)
            r_tot += reward
            # env.render()

            # Train subcritics and write summaries for tensorboard
            if activations is not None and action is not None:
                ops, feeds = sub_critics.get_perceive_run(activations, next_activations, reward, done)
                # Build fresh containers instead of extending the returned ones
                # in place, so whatever sub_critics handed back is left untouched.
                ops = [merged] + list(ops) + [
                    agent.critic_network.q_value_output,
                    agent.critic_network.target_q_value_output]
                feeds = dict(feeds)
                # NOTE(review): `action` is the action taken from the previous
                # state while `state` is the current one -- confirm this
                # pairing is what the logged Q-values are meant to use.
                feeds.update({
                    agent.critic_network.state_input: [state],
                    agent.critic_network.action_input: [action],
                    agent.critic_network.target_state_input: [next_state],
                    agent.critic_network.target_action_input: [next_action]
                    })
                result = agent.sess.run(ops, feeds)
                # use ordered dict for stats_map
                stats_result = agent.sess.run(episode_stats['variables'], feeds)
                write_row(episode, step, stats_result)
                # result[0] is the merged summary (first op in the list).
                train_writer.add_summary(result[0], t)

            # Train DDPG
            agent.perceive(state, next_action, reward, next_state, done)
            if done:
                break
            # Move on to next frame.
            state = next_state
            activations = next_activations
            action = next_action
        print(" ", r_tot)

        # Testing:
        if episode % 100 == 0 and episode > 100:
            avg_reward = test(env, agent, TEST)
            print(('episode: ',episode,'Evaluation Average Reward:',avg_reward))


def _plot_global_q_value(agent, action_value=1, nx=400, ny=400):
    """Draws a contour plot of the critic's Q-value over position/velocity for one fixed action."""
    # NOTE(review): these are the unfiltered MountainCar bounds, but the
    # environment is wrapped by makeFilteredEnv -- confirm the critic should
    # be queried with raw rather than filtered observations.
    minPos, maxPos = -1.2, 0.6
    minVelocity, maxVelocity = -0.07, 0.07
    xRange = np.linspace(minPos, maxPos, nx)
    yRange = np.linspace(minVelocity, maxVelocity, ny)
    # With the default 'xy' indexing meshgrid returns arrays of shape
    # (ny, nx): rows follow velocity, columns follow position.
    xv, yv = np.meshgrid(xRange, yRange)

    # Loop-invariant pieces of every critic query.
    ops = [agent.critic_network.q_value_output]
    action = np.array([[action_value]])

    zToPlot = []
    for row in range(ny):
        for col in range(nx):
            state = np.array([[xv[row, col], yv[row, col]]])
            feeds = {
                agent.critic_network.state_input: state,
                agent.critic_network.action_input: action,
                }
            result = agent.sess.run(ops, feeds)
            zToPlot.append(result[0].squeeze())
    # Values were collected row by row, so they reshape to the mesh shape,
    # which is also the (len(y), len(x)) layout contourf expects.
    zToPlot = np.array(zToPlot).reshape(xv.shape)
    h = plt.contourf(xRange, yRange, zToPlot)
    plt.title('Global q value')
    plt.xlabel('position')
    plt.ylabel('velocity')
    plt.colorbar(h)
    plt.show()



In [3]:
# TODO remove output dir
output_dir = 'output/'
exp_name='tboard'
import shutil

# Clear results from a previous run. On a fresh checkout the directory does
# not exist yet, which is not an error: there is simply nothing to remove.
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    pass
utils.set_output_dir(output_dir)
# run_experiment(exp_name, EPISODES=70)

In [None]:
# Experiment settings read by the training cell below.
ENV_NAME = 'MountainCarContinuous-v0'  # id passed to gym.make
exp_name = 'tboard'                    # sub-directory of /tmp/tboard for the event files
EPISODES = 70                          # number of training episodes
TEST = 10                              # episodes per evaluation run

In [None]:
# Runs the experiment on the target environment: trains a DDPG agent and
# linear sub-critics for EPISODES episodes, logging summaries for TensorBoard.
env = makeFilteredEnv(gym.make(ENV_NAME))

# Create the standard DDPG agent.
agent = DDPG(env)

sub_critics = SubCritics(agent, order=1) # Make linear (order 1) subcritics

# Set up tensorboard.
merged = tf.merge_all_summaries()
train_writer = tf.train.SummaryWriter('/tmp/tboard/{}'.format(exp_name),
                                      agent.sess.graph)
# To see the graph run: tensorboard --logdir=/tmp/tboard
init_op = tf.initialize_all_variables()
agent.sess.run(init_op)

t = 0  # Step counter across all episodes; used as the summary step.
try:
    for episode in range(EPISODES):
        state = env.reset()
        # Values from the previous step; None until one step of this episode is done.
        activations = None
        action = None
        print("Episode: ", episode, end="")
        r_tot = 0

        for step in range(env.spec.timestep_limit):
            t += 1
            # Explore state space.
            next_action, next_activations = agent.noise_action_activations(state)

            # Deal with the environment
            next_state, reward, done, _ = env.step(next_action)
            r_tot += reward
            # env.render()

            # Train subcritics and write summaries for tensorboard
            if activations is not None and action is not None:
                ops, feeds = sub_critics.get_perceive_run(activations, next_activations, reward, done)
                # Build fresh containers instead of extending the returned ones
                # in place, so whatever sub_critics handed back is left untouched.
                ops = [merged] + list(ops) + [
                    agent.critic_network.q_value_output,
                    agent.critic_network.target_q_value_output]
                feeds = dict(feeds)
                # NOTE(review): `action` is the action taken from the previous
                # state while `state` is the current one -- confirm this
                # pairing is what the logged Q-values are meant to use.
                feeds.update({
                    agent.critic_network.state_input: [state],
                    agent.critic_network.action_input: [action],
                    agent.critic_network.target_state_input: [next_state],
                    agent.critic_network.target_action_input: [next_action]
                    })
                result = agent.sess.run(ops, feeds)
                # use ordered dict for stats_map
                stats_result = agent.sess.run(episode_stats['variables'], feeds)
                write_row(episode, step, stats_result)
                # result[0] is the merged summary (first op in the list).
                train_writer.add_summary(result[0], t)

            # Train DDPG
            agent.perceive(state, next_action, reward, next_state, done)
            if done:
                break
            # Move on to next frame.
            state = next_state
            activations = next_activations
            action = next_action
        print(" ", r_tot)

        # Testing:
        if episode % 100 == 0 and episode > 100:
            avg_reward = test(env, agent, TEST)
            print(('episode: ',episode,'Evaluation Average Reward:',avg_reward))
finally:
    # Push buffered summaries to disk even if the run is interrupted; the
    # writer is left open so the cell's globals stay usable afterwards.
    train_writer.flush()


[2016-11-29 16:12:18,028] Making new env: MountainCarContinuous-v0


True action space: [-1.], [ 1.]
True state space: [-1.2  -0.07], [ 0.6   0.07]
Filtered action space: [-1.], [ 1.]
Filtered state space: [-1. -1.], [ 1.  1.]
DIMS 2 1
Episode:  0  -13.396874741333322
Episode:  1  -14.646075579940293
Episode:  2  87.75209003655684
Episode:  3  90.32768424231654
Episode:  4

## Plot a = 1

In [None]:
# Contour plot of the global critic's Q-value over a position/velocity grid
# for the fixed action a = 1.
criticToPlot = sub_critics.critics[0]  # not used by this plot; kept in case other cells rely on it
# NOTE(review): these are the true (unfiltered) state bounds, while the
# training log reports a filtered state space of [-1, 1] per dimension --
# confirm the critic should be queried with raw observations.
minPos = -1.2
maxPos = 0.6
minVelocity = -0.07
maxVelocity = 0.07
nx = 400
ny = 400
xRange = np.linspace(minPos, maxPos, nx)
yRange = np.linspace(minVelocity, maxVelocity, ny)
# With the default 'xy' indexing meshgrid returns arrays of shape (ny, nx):
# rows follow velocity, columns follow position.
xv, yv = np.meshgrid(xRange, yRange)
xToPlot = xv[0]
yToPlot = yv[:, 0]

# Loop-invariant pieces of every critic query.
ops = [agent.critic_network.q_value_output]
action = np.array([[1]])

zToPlot = []
for i in range(ny):
    for j in range(nx):
        x, y = xv[i, j], yv[i, j]
        state = np.array([[x, y]])
        feeds = {
            agent.critic_network.state_input: state,
            agent.critic_network.action_input: action,
            }
        result = agent.sess.run(ops, feeds)
        zToPlot.append(result[0].squeeze())
# Values were collected row by row, so they reshape to the mesh shape, which
# is also the (len(y), len(x)) layout contourf expects.
zToPlot = np.array(zToPlot).reshape(xv.shape)
h = plt.contourf(xToPlot, yToPlot, zToPlot)
plt.title('Global q value')
plt.xlabel('position')
plt.ylabel('velocity')
plt.colorbar(h)
plt.show()

## Plot a = -1

In [None]:
# Contour plot of the global critic's Q-value over a position/velocity grid
# for the fixed action a = -1.
criticToPlot = sub_critics.critics[0]  # not used by this plot; kept in case other cells rely on it
# NOTE(review): these are the true (unfiltered) state bounds, while the
# training log reports a filtered state space of [-1, 1] per dimension --
# confirm the critic should be queried with raw observations.
minPos = -1.2
maxPos = 0.6
minVelocity = -0.07
maxVelocity = 0.07
nx = 400
ny = 400
xRange = np.linspace(minPos, maxPos, nx)
yRange = np.linspace(minVelocity, maxVelocity, ny)
# With the default 'xy' indexing meshgrid returns arrays of shape (ny, nx):
# rows follow velocity, columns follow position.
xv, yv = np.meshgrid(xRange, yRange)
xToPlot = xv[0]
yToPlot = yv[:, 0]

# Loop-invariant pieces of every critic query.
ops = [agent.critic_network.q_value_output]
action = np.array([[-1]])

zToPlot = []
for i in range(ny):
    for j in range(nx):
        x, y = xv[i, j], yv[i, j]
        state = np.array([[x, y]])
        feeds = {
            agent.critic_network.state_input: state,
            agent.critic_network.action_input: action,
            }
        result = agent.sess.run(ops, feeds)
        zToPlot.append(result[0].squeeze())
# Values were collected row by row, so they reshape to the mesh shape, which
# is also the (len(y), len(x)) layout contourf expects.
zToPlot = np.array(zToPlot).reshape(xv.shape)
h = plt.contourf(xToPlot, yToPlot, zToPlot)
plt.title('Global q value')
plt.xlabel('position')
plt.ylabel('velocity')
plt.colorbar(h)
plt.show()