In [None]:
# Import helper libraries
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()  # NOQA
import argparse
import logging
import sys

#___________________________________________________________________
# Import scientific libraries

import numpy as np               # Numpy, a good library to deal with matrices in python.
import matplotlib.pyplot as plt  # Matplotlib, a good library for plotting in python.
from matplotlib import style
#___________________________________________________________________

import gym                       # Gym, a collection of RL environemnts.
gym.undo_logger_setup()  # NOQA
from gym import spaces
import gym.wrappers

from osim.env import ProstheticsEnv   # Open simulator (OSIM), an open source simnulation for biomechanical modeling.

#___________________________________________________________________



import tensorflow as tf                      # Tensorflow, a deep learning library.
import chainer                               # Chainer, a python-based deep learning framework. Chainerrl, a reinforcement learning library based on chainer framework.
from chainer import optimizers               # a collection of Neural Network optimizers.
from chainerrl.agents.ddpg import DDPG       # a DDPG agent
from chainerrl.agents.ddpg import DDPGModel  # a DDPG model, responsibles to combine the policy network and the value function network.
from chainerrl import explorers              # a collection of explores functions.
from chainerrl import misc                   # a collection of utility functions to manipulate the environemnts.
from chainerrl import policy                 # a policy network
from chainerrl import q_functions            # a value function network
from chainerrl import replay_buffer          # a Replay buffer to store a set of observations for the DDPG agent.


from ExpertAgents import ExpertDDPGAgent
style.use('ggplot')

# A set of helper functions 

In [None]:
# Helper functions

def clip_action_filter(a):
    """Clamp an action to the bounds of the global ``action_space``.

    Input: a -- the action to clamp
    Output: the action with every component limited to the range
            [action_space.low, action_space.high]
    """
    lower, upper = action_space.low, action_space.high
    return np.clip(a, lower, upper)

def reward_filter(r):
    """Rescale a raw environment reward.

    The scale factor is currently 1, so the value passes through unchanged.

    Input: r -- reward
    Output: the scaled reward
    """
    scale = 1  # commented-out alternative in the original: 1e-2
    return r * scale


def phi(obs):
    """Turn an observation into a float32 NumPy array.

    Input: obs -- observation (anything ``np.array`` accepts)
    Output: a new array holding the same values cast to float32
    """
    return np.array(obs).astype(np.float32)


def random_action():
    """Draw one random action from the global ``action_space``.

    Input: None
    Output: the sampled action; a NumPy array sample is cast to float32,
            any other sample type is returned as-is
    """
    sample = action_space.sample()
    return sample.astype(np.float32) if isinstance(sample, np.ndarray) else sample


def make_env(test, render=False):
    """Build a ``ProstheticsEnv`` instance and apply the ChainerRL env modifiers.

    Input:
        test   -- truthy for a test environment, falsy for a training one
        render -- forwarded to ``ProstheticsEnv`` as ``visualize``
    Output: the configured "ProstheticsEnv" environment (env)
    """
    env = ProstheticsEnv(visualize=render)

    # Train and test environments get different seeds, both derived from
    # the global `seed`.
    if test:
        env_seed = 2 ** 32 - 1 - seed
    else:
        env_seed = seed
    env.seed(env_seed)

    # Box action spaces: pass every action through `clip_action_filter`.
    if isinstance(env.action_space, spaces.Box):
        misc.env_modifiers.make_action_filtered(env, clip_action_filter)

    # Reward filtering and rendering are applied to training environments only.
    if not test:
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
        if render:
            misc.env_modifiers.make_rendered(env)
    return env

In [None]:
# Base random seed; make_env() derives the train/test environment seeds from it.
seed = 0

In [None]:
# Create the training environment and cache its space descriptors.
env = make_env(test=False, render=False)
#timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
action_space = env.action_space

# Flattened sizes: the product of the entries of each space's shape.
obs_size = np.prod(np.asarray(env.observation_space.shape))
action_size = np.prod(np.asarray(action_space.shape))

In [None]:
# Load the expert agent through the project's ExpertDDPGAgent helper,
# pointing it at the "Models/Model" path.
load_expert_DDPG = ExpertDDPGAgent(path="Models/Model")
expert = load_expert_DDPG.load_agent()

In [None]:
# Rollout settings used by the data-collection loops.
number_of_episodes = 50     # episodes per data-collection pass
max_episode_length = 1000   # hard cap on the number of steps in one episode

In [None]:
# STEP 1: Generate data using the expert.
#
# Roll the expert out for `number_of_episodes` episodes, storing every
# observation together with the action the expert chose for it.

returns = []        # summed reward of each episode
observations = []   # observations the expert acted on
actions = []        # expert action for each stored observation

for episode in range(number_of_episodes):
    print('episode: ', episode)
    obs = env.reset()
    done = False
    G = 0.
    steps = 0
    while not done:
        action = expert.act(obs)

        # Store the pair before the environment advances.
        observations.append(obs)
        actions.append(action)

        obs, reward, done, _ = env.step(action)
        G += reward
        steps += 1

        if steps % 100 == 0:
            print("%i/%i" % (steps, max_episode_length))
        # Cut the episode off once the step cap is reached.
        if steps >= max_episode_length:
            break
    returns.append(G)

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))

# Build the policy network for the target agent.

In [None]:
# Stack the expert rollouts into arrays for imitation learning;
# squeeze() drops any length-1 axes.
obs_data = np.array(observations).squeeze()
act_data = np.array(actions).squeeze()

In [None]:
# Persist the expert dataset as text files (observations and actions separately).
np.savetxt('Datasets/obs_data_naive_agent-action_value_function.txt', obs_data, fmt='%f')
# The action file must hold act_data; writing obs_data here would silently
# store a second copy of the observations under the action file name.
np.savetxt('Datasets/act_data_naive_agent-action_value_function.txt', act_data, fmt='%f')

In [None]:
#===========================================================================
# Build the policy network for the Target agent.
#===========================================================================

In [None]:
# Network input/output widths, read from the second axis of each dataset.
obs_dim = np.shape(obs_data)[1]
act_dim = np.shape(act_data)[1]

In [None]:
# Architecture of the target policy function
# (TensorFlow 1.x graph-mode API: tf.placeholder / tf.layers / tf.train).
# The statement order is kept as-is: the auto-generated layer variable names
# depend on creation order and are what the Saver below writes to checkpoints.

# x    -- batch of observations, obs_dim columns per row.
# yhot -- batch of target actions to regress onto, act_dim columns per row.
x = tf.placeholder(tf.float32, shape=[None, obs_dim])
yhot = tf.placeholder(tf.float32, shape=[None, act_dim])
    
# Three ReLU hidden layers (128 -> 64 -> 32 units) followed by a linear
# output layer with one unit per action dimension.
h1 = tf.layers.dense(inputs=x, units=128, activation=tf.nn.relu)
h2 = tf.layers.dense(inputs=h1, units=64, activation=tf.nn.relu)
h3 = tf.layers.dense(inputs=h2, units=32, activation=tf.nn.relu)
yhat = tf.layers.dense(inputs=h3, units=act_dim, activation=None)
# tf.squeeze removes every size-1 dimension, so evaluating yhat on a single
# observation gives a 1-D action vector.
# NOTE(review): for a batch of exactly one sample (or act_dim == 1) yhat and
# yhot then have different ranks and the loss relies on broadcasting -- confirm
# this is intended.
yhat= tf.squeeze(yhat)
    
# Mean squared error between target and predicted actions, minimised with
# Adam using its default hyper-parameters.
loss = tf.reduce_mean(tf.square(yhot - yhat))
train = tf.train.AdamOptimizer().minimize(loss)

# Saver object to save the model
saver = tf.train.Saver()

In [None]:
# Training configuration for the target network.
number_of_iterations = 100   # number of DAgger iterations
batch_size = 25              # samples drawn per optimisation step

# Metrics: mean return of the target agent, one entry per DAgger iteration.
global_mean_G = []

In [None]:
#___________ STEP 2: Train the target agent with the data collected by the expert agent. ____________
#
# One DAgger iteration consists of:
#   STEP 2 - fit the policy network on the current (obs_data, act_data) dataset,
#   STEP 3 - roll out the freshly trained target ("naive") agent in the environment,
#   STEP 4 - label every visited observation with the expert's or the target's
#            action, whichever has the larger summed future reward,
#   STEP 5 - append the newly labelled samples to the dataset.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # record return and std for plotting
    save_mean = []
    save_std = []
    save_train_size = []

    # loop for the DAgger algorithm
    for dagger_iter in range(number_of_iterations):
        print('DAgger iteration ', dagger_iter)

        # train a policy by fitting the MLP on random mini-batches
        for step in range(10000):
            batch_i = np.random.randint(0, obs_data.shape[0], size=batch_size)
            train.run(feed_dict={x: obs_data[batch_i, ], yhot: act_data[batch_i, ]})
            if (step % 1000 == 0):
                print('opmization step ', step)
                print('obj value is ', loss.eval(feed_dict={x: obs_data, yhot: act_data}))

        print('Optimization Finished!')

        # save naive agent (same path and global_step on every iteration)
        saver.save(sess, 'NaiveAgents/Model_V1/naive_agent-action_value_function', global_step=1000, write_meta_graph=False)

        #______________________ STEP 3: Generate data using target Agent. ________________________

        returns = []
        naive_observations = []
        naive_actions = []
        expert_actions = []
        naive_actions_values = []
        expert_actions_values = []
        for i in range(number_of_episodes):
            print('iter', i)
            obs = env.reset()
            done = False
            G = 0.
            steps = 0
            naive_rewards = []
            expert_rewards = []
            while not done:
                obs = np.array(obs)
                naive_observations.append(obs)

                # Query both policies on the same observation.
                action = yhat.eval(feed_dict={x: obs[None, :]})
                expert_action = expert.act(obs)

                expert_actions.append(expert_action)
                naive_actions.append(action)

                # Double steps in the environment, an expert and an agent.
                # NOTE(review): both calls advance the SAME environment, so the
                # expert's step is taken from the state reached by the target's
                # step, and its observation and done flag are discarded. The
                # next `obs` therefore lags one step behind the simulator, and
                # the expert step may run on an already finished episode.
                # Confirm this is intended, or evaluate the expert on a
                # separate copy of the environment.
                obs, r, done, _ = env.step(action)
                _, expert_reward, _, _ = env.step(expert_action)

                # Collecting the rewards.
                naive_rewards.append(r)
                expert_rewards.append(expert_reward)

                G += r
                steps += 1
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_episode_length))
                if steps >= max_episode_length:
                    break

            # calculating action values: for every step of the episode, the
            # sum of the rewards collected from that step to the episode end.
            for j in range(len(naive_rewards)):
                naive_actions_values.append(sum(naive_rewards[j:]))
                expert_actions_values.append(sum(expert_rewards[j:]))

            returns.append(G)

        global_mean_G.append(np.mean(returns))

        # Checkpoint the learning curve every 10 iterations.
        if dagger_iter % 10 == 0:
            global_mean_G_numpy = np.array(global_mean_G)
            np.savetxt('Variables/global_mean_G_action_value_function.txt', global_mean_G_numpy, fmt='%f')

        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        #_________________________ STEP 4: Expert labeling ____________________________________
        # Use the expert's action when its action value is bigger or equal,
        # otherwise keep the target's own action. A two-way choice guarantees
        # exactly one label per stored observation (an if/elif pair would
        # append nothing when a value is NaN and misalign the dataset).
        corrected_act = [
            expert_act if expert_value >= naive_value else naive_act
            for expert_act, naive_act, expert_value, naive_value in zip(
                expert_actions, naive_actions,
                expert_actions_values, naive_actions_values)
        ]

        # record training size (dataset size used for this iteration's fit)
        train_size = obs_data.shape[0]

        #_________________________ STEP 5: data aggregation _____________________________________

        obs_data = np.concatenate((obs_data, np.array(naive_observations)), axis=0)
        act_data = np.concatenate((act_data, np.squeeze(np.array(corrected_act))), axis=0)

        # record mean return & std
        save_mean = np.append(save_mean, np.mean(returns))
        save_std = np.append(save_std, np.std(returns))
        save_train_size = np.append(save_train_size, train_size)

    dagger_results = {'means': save_mean, 'stds': save_std, 'train_size': save_train_size,}

    print('DAgger iterations finished!')


In [None]:
# Write the mean returns (one value per DAgger iteration) to a text file.
global_mean_G = np.array(global_mean_G)
np.savetxt('Variables/global_mean_G_action_value_function.txt', global_mean_G, fmt='%f')

In [None]:
# Learning curve: mean return of the target agent per DAgger iteration,
# also saved as a PNG.
plt.plot(global_mean_G)
plt.xlabel('Number of iterations')
plt.ylabel('Average of Returns')
plt.savefig("Plots/Average_of_Returns_iterations_action_value_function.png")