This notebook was written by Mathias Rose Bjare (muthissar@gmail.com) and answers Deep RL Assignment 1, http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw1.pdf.
The code base for this file can be found at https://github.com/muthissar/homework.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from behavioral_cloning import *
from load_policy import load_policy
import matplotlib.pyplot as plt

# DAGGER
Implementation of the DAgger algorithm. The algorithm is run on the 'Hopper-v2' task, for which behavioral cloning failed to perform as well as the expert. The setup is identical to the setup of the previous section: the network is identical, and the algorithm is initialized with the same number of expert rollouts (20). About 10 iterations are needed to perform as well as the expert. At each iteration 20 rollouts are made, and the return is computed and stored. This is printed in the table below.

In [None]:
# --- DAgger experiment configuration ---------------------------------------
# Mini-batch size handed to `dataset.batch` and to the network constructor.
batch_size = 32
# Expert rollouts in the initial data set / policy rollouts per DAgger iteration.
expert_rollout, dagger_rollout = 20, 5
# Step limit per rollout; 0 = run until termination.
max_timesteps = 0
# Read the expert rollouts from the cached pickle instead of regenerating them;
# `store` is forwarded to `get_dataset` when the rollouts are regenerated.
use_expert_cache, store = True, False
# How many times the initial expert data set is repeated.
training_epochs = 1
# Accumulates the values returned by `behavioral_net.train`.
losses = []

from IPython.display import display, clear_output
import matplotlib.pyplot as plt

# Number of DAgger iterations per task.
it_dagger = 20
# Upper bound on the aggregated data set size; also used as the shuffle
# buffer size so that the whole (capped) data set is shuffled.
max_dataset_size = 10000
# A checkpoint is written every `checkpoint_every` iterations, a video path is
# passed to the evaluator every `video_every` iterations.
checkpoint_every = 5
video_every = 10

#tasks = ['Hopper-v2', 'Ant-v2', 'HalfCheetah-v2',
# 'Humanoid-v2', 'Reacher-v2','Walker2d-v2']
tasks = ['Hopper-v2']

# One record (dict) per task and DAgger iteration. The DataFrame is built once
# after the loop instead of being grown row by row with `DataFrame.append`,
# which copies the frame on every call and no longer exists in pandas 2.x.
result_records = []

for task in tasks:
    # Every task gets its own graph and session.
    tf.reset_default_graph()
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        #device_count = {'GPU': 2},
        intra_op_parallelism_threads=1,
        gpu_options=tf.GPUOptions(
            #per_process_gpu_memory_fraction=1./16. # 1gb
            allow_growth=True
        )
    )
    with tf.Session(config=tf_config) as sess:

        expert_policy = load_policy('./experts/{}.pkl'.format(task))
        if use_expert_cache:
            dataset, expert_returns = get_dataset(
                './expert_data/{}-{}.pkl'.format(task, expert_rollout))
        else:
            # Honour the configured `max_timesteps` instead of a hard-coded 0.
            dataset, expert_returns = get_dataset(
                envname=task, render=False,
                expert_policy_file='experts/' + task + '.pkl',
                max_timesteps=max_timesteps, num_rollouts=expert_rollout,
                store=store)
        # The expert statistics do not change between DAgger iterations.
        expert_mean_return = np.mean(expert_returns)
        expert_std_return = np.std(expert_returns)

        dataset = dataset.map(lambda x, y: (tf.cast(x, tf.float32), y))
        obs_dim = dataset.output_shapes[0].as_list()
        action_dim = dataset.output_shapes[1].as_list()[1:]
        dataset = dataset.repeat(training_epochs)
        batched_dataset = dataset.batch(batch_size)
        iterator = batched_dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        # FNN network
        behavioral_net = BehavioralCloningNet(obs_dim, action_dim,
                                              batch_size, learning_rate=0.001)
        losses = []
        returns = []
        # object used for interacting with gym environment
        evaluate = EvaluateDagger(task, expert_policy, behavioral_net,
                                  render=False)
        saver = tf.train.Saver()
        for i in range(it_dagger):
            if i % checkpoint_every == 0:
                print("iiiiii " + str(i))
                saver.save(sess, 'dagger_model{}-{}.ckpt'.format(task, i))

            # 1) Train the policy on the data aggregated so far.
            losses += behavioral_net.train(next_element)

            # 2) Roll out the current policy. The evaluator's buffers are
            #    cleared first so they only hold this iteration's data.
            evaluate.expert_obs = []
            evaluate.expert_actions = []
            video_path = "{}_{}".format(task, i) if i % video_every == 0 else None
            return_ = evaluate.evaluate(rollouts=dagger_rollout,
                                        video_path=video_path, render=False)
            mean_return = np.mean(return_)
            print("dagger it {}, mean return {}".format(i, mean_return))
            returns.append(return_)

            # 3) Aggregate the observations/actions collected by the evaluator
            #    into the training data set.
            expert_obs_arr = np.asarray(evaluate.expert_obs, dtype=np.float32)
            expert_actions_arr = np.asarray(evaluate.expert_actions,
                                            dtype=np.float32)
            expert_labeled_new_data = tf.data.Dataset.from_tensor_slices(
                (expert_obs_arr, expert_actions_arr))
            dataset = dataset.concatenate(expert_labeled_new_data)
            dataset = dataset.shuffle(buffer_size=max_dataset_size)
            # Avoid that the data set explodes exponentially by having a
            # cutoff, after which the data set can only grow linearly.
            dataset = dataset.take(max_dataset_size)
            batched_dataset = dataset.batch(batch_size, drop_remainder=True)
            iterator = batched_dataset.make_one_shot_iterator()
            next_element = iterator.get_next()

            result_records.append({'Task': task,
                                   'mean return': mean_return,
                                   'std return': np.std(return_),
                                   'expert mean return': expert_mean_return,
                                   'expert std return': expert_std_return,
                                   'expert rollouts': expert_rollout,
                                   'training epochs': training_epochs,
                                   'dagger its': i})

# Build the result table once and cache it on disk for the plotting cell.
results = pd.DataFrame(result_records)
#fig = plt.figure()
#plt.plot(losses)
#plt.xlabel('Batch')
#plt.ylabel('Loss (mean square)')
#plt.legend(['Train loss {}'.format(task)],loc='upper center')
#plt.show(fig)
results.to_csv('results_dagger.csv')

obs (1, 11) (1, 11)


Alternatively, load the cached results of a previous run instead of re-running the training cell above.

In [None]:
# The cache is written with `results.to_csv('results_dagger.csv')`, which also
# stores the row index as the first column. Reading it back as the index keeps
# the reloaded table identical in columns to the one produced by training
# (otherwise pandas adds a spurious 'Unnamed: 0' column).
results = pd.read_csv('results_dagger.csv', index_col=0)

## Learning curve
The learning curve is plotted over the DAgger iterations, together with the expert's return. At some iterations the variance over the 20 rollouts is high, and the convergence towards the expert's return is jumpy.

In [None]:
# Learning curve: mean return (+/- one standard deviation) of the DAgger policy
# per iteration, compared with the expert's mean return on the same task.
fig, ax = plt.subplots()
hopper_data = results[results['Task'] == 'Hopper-v2']
dagger_its = hopper_data['dagger its'].values

# A 1-D `yerr` draws symmetric error bars, so the values need not be passed
# twice. The line style is given once via `fmt` (dotted line, circle markers);
# combining fmt='-o' with linestyle=':' defines the style redundantly.
ax.errorbar(dagger_its, hopper_data['mean return'].values,
            yerr=hopper_data['std return'].values,
            color='blue', fmt=':o', label='Hopper')
ax.errorbar(dagger_its, hopper_data['expert mean return'].values,
            yerr=hopper_data['expert std return'].values,
            color='red', label='Hopper expert')

ax.set_xlabel("Dagger iterations")
ax.set_ylabel("Return")
# Labels are attached to the series above, so the legend no longer depends on
# the order of the plotting calls.
ax.legend()
# `plt.show` takes no figure argument; a positional argument is interpreted as
# `block`, not as the figure to display.
plt.show()

# Demo

In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd
from behavioral_cloning import *
from load_policy import load_policy
import matplotlib.pyplot as plt
import gym

# Demo: restore a DAgger checkpoint and render one episode of the learned
# policy in the Hopper-v2 environment.

# The cached expert data set is loaded only to recover the observation/action
# shapes needed to rebuild the network.
expert_policy = load_policy('./experts/{}.pkl'.format('Hopper-v2'))
dataset, expert_returns = get_dataset(
    './expert_data/{}-{}.pkl'.format('Hopper-v2', 20))
dataset = dataset.map(lambda x, y: (tf.cast(x, tf.float32), y))
obs_dim = dataset.output_shapes[0].as_list()
action_dim = dataset.output_shapes[1].as_list()[1:]

# Start from an empty graph so the restored network is the only model in it.
tf.reset_default_graph()
tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                           device_count={'GPU': 0},
                           #intra_op_parallelism_threads=1,
                           #gpu_options = tf.GPUOptions(
                           #    per_process_gpu_memory_fraction=1./16. # 1gb
                           #    allow_growth=True
                           #    )
                           )

env = gym.make('Hopper-v2')
try:
    with tf.Session(config=tf_config) as sess:
        # Batch size 1: the policy is queried with one observation at a time.
        policy_estimator = BehavioralCloningNet(obs_dim, action_dim, 1)
        # `tf.initialize_all_variables` is deprecated in favour of
        # `tf.global_variables_initializer`.
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        model = 'dagger_modelHopper-v2-10.ckpt'
        saver.restore(sess, model)

        state = env.reset()
        # Render once so that the viewer object exists before it is configured.
        env.render('human')
        viewer = env.env.viewer
        # NOTE(review): `glfw` and `const` are not imported in this cell;
        # presumably they are provided by `from behavioral_cloning import *`
        # -- confirm.
        # Hide controls.
        viewer.key_callback(None, glfw.KEY_H, None, glfw.RELEASE, None)
        # Pause:
        #viewer.key_callback(None,glfw.KEY_SPACE,None,glfw.RELEASE,None)
        # Follow agent:
        #viewer.key_callback(None,glfw.KEY_TAB,None,glfw.RELEASE,None)
        # Zoom out.
        viewer.move_camera(const.MOUSE_ZOOM, 0, -0.05 * 30)
        #viewer.move_camera(const.MOUSE_ROTATE_H, 0, -0.05 * 20)
        # Play slower (three "S" key presses).
        for _ in range(3):
            viewer.key_callback(None, glfw.KEY_S, None, glfw.RELEASE, None)

        print(policy_estimator.output_layer)

        # Step the environment with the learned policy until the episode ends.
        done = False
        while not done:
            env.render('human')
            action = policy_estimator.predict([state])
            state, reward, done, _ = env.step(action)
        # Drop the viewer reference before the environment is closed.
        viewer = None
finally:
    # Always release the environment, even if restoring the checkpoint or the
    # rollout raises.
    env.close()

obs (1, 11) (1, 11)
INFO:tensorflow:Restoring parameters from dagger_modelHopper-v2-10.ckpt
Creating window glfw
Tensor("behavioral_cloning/fully_connected_1/BiasAdd:0", shape=(?, 3), dtype=float32)


%%latex
\lstinputlisting[language=Python]{behavioral_cloning.py}