# Unity ML Agents
## Proximal Policy Optimization (PPO)
This notebook trains agents using the PPO implementation in the `ppo` package. The algorithm is described in [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).

In [1]:
import numpy as np
import os
import tensorflow as tf

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [5]:
### General parameters
max_steps = 10000000 # Set maximum number of steps to run environment.
run_path = "Tennis_Z_4" # The sub-directory name for model and summary statistics
load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.
summary_freq = 1000 # Frequency at which to save training statistics.
save_freq = 50000 # Frequency at which to save model.
env_name = "Tennis_Z" # Name of the training environment file.
curriculum_file = None # Curriculum passed to UnityEnvironment; None turns off progress reporting on reset.

### Algorithm-specific parameters for tuning
gamma = 0.99 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 2.5e-3 # Strength of entropy regularization
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 2048 # How large the experience buffer should be before gradient descent.
learning_rate = 3e-4 # Model learning rate.
hidden_units = 64 # Number of units in hidden layer.
batch_size = 64 # How many experiences per gradient descent update step.
normalize = False # Forwarded to the model and to trainer.take_action (presumably input normalization -- confirm in ppo.models).

### Logging dictionary for hyperparameters
# Written to the summary log once at the start of training. Each key is the
# name of the variable it records, so the logged table can be matched
# against this cell without guesswork.
hyperparameter_dict = {
    'max_steps': max_steps,
    'run_path': run_path,
    'env_name': env_name,
    'curriculum_file': curriculum_file,
    'gamma': gamma,
    'lambd': lambd,
    'time_horizon': time_horizon,
    'beta': beta,
    'num_epoch': num_epoch,
    'num_layers': num_layers,
    'epsilon': epsilon,
    'buffer_size': buffer_size,
    'learning_rate': learning_rate,
    'hidden_units': hidden_units,
    'batch_size': batch_size,
    'normalize': normalize,
}

### Load the environment

In [6]:
# Launch the Unity build named by `env_name` and connect to it.
# NOTE(review): worker_id=1 presumably selects a non-default communication
# port so this run does not collide with another instance -- confirm in unityagents.
env = UnityEnvironment(
    file_name=env_name,
    curriculum=curriculum_file,
    worker_id=1,
)
print(env)

# Train the first brain the environment reports as externally controlled.
brain_name = env.external_brain_names[0]

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 2
        Reset Parameters :
		
Unity brain name: MyBrain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 12
        Action space type: discrete
        Action space size (per agent): 6
        Memory space size (per agent): 0
        Action descriptions: , , , , , 
Unity brain name: TennisBrain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 12
        Action space type: discrete
        Action space size (per agent): 6
        Memory space size (per agent): 0
        Action descriptions: , , , , , 


### Train the Agent(s)

In [None]:
# Clear TensorFlow's default graph before the model is built below.
tf.reset_default_graph()

# Treat the literal string "None" the same as having no curriculum at all.
curriculum_file = None if curriculum_file == "None" else curriculum_file


def get_progress():
    """Return the curriculum measure to pass to ``env.reset``.

    Reads the notebook-level ``curriculum_file``, ``env``, ``steps``,
    ``max_steps`` and ``last_reward`` at call time.

    Returns:
        ``steps / max_steps`` when the curriculum measures "progress",
        ``last_reward`` when it measures "reward", and ``None`` when no
        curriculum file is configured or the measure type is anything else.
    """
    if curriculum_file is None:
        return None

    measure = env._curriculum.measure_type
    if measure == "progress":
        return steps / max_steps
    if measure == "reward":
        return last_reward
    return None

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps,
                               normalize=normalize, num_layers=num_layers)

# Describe the brain being trained so the trainer knows which inputs to feed.
is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)

model_path = './models/{}'.format(run_path)
summary_path = './summaries/{}'.format(run_path)

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(model_path, exist_ok=True)
os.makedirs(summary_path, exist_ok=True)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

config = tf.ConfigProto(device_count = {'GPU': 0})
#config.gpu_options.allow_growth = True

# Bound before the try so the cleanup below can tell whether the writer was
# ever created.
summary_writer = None
try:
    with tf.Session(config = config) as sess:
        # Instantiate model parameters
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            if ckpt is None:
                # Without this check the next line fails with an opaque
                # AttributeError on None.
                raise FileNotFoundError(
                    "load_model is True but no checkpoint was found in {}".format(model_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        # `steps` and `last_reward` stay notebook-level names because
        # get_progress() reads them.
        steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])
        summary_writer = tf.summary.FileWriter(summary_path)
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
        if train_model:
            trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
        while steps <= max_steps:
            if env.global_done:
                info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
            # Decide and take an action
            info = trainer.take_action(info, env, brain_name, steps, normalize)
            trainer.process_experiences(info, time_horizon, gamma, lambd)
            if len(trainer.training_buffer['actions']) > buffer_size and train_model:
                # Perform gradient descent with experience buffer
                trainer.update_model(batch_size, num_epoch)
            if steps % summary_freq == 0 and steps != 0 and train_model:
                # Write training statistics to tensorboard.
                trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
            if steps % save_freq == 0 and steps != 0 and train_model:
                # Save Tensorflow model
                save_model(sess, model_path=model_path, steps=steps, saver=saver)
                export_graph(model_path, env_name+"_"+str(steps))
            # Keep the Python-side counter and the graph's step variable in sync.
            steps += 1
            sess.run(ppo_model.increment_step)
            if len(trainer.stats['cumulative_reward']) > 0:
                mean_reward = np.mean(trainer.stats['cumulative_reward'])
                sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
                last_reward = sess.run(ppo_model.last_reward)
        # Final save Tensorflow model
        if steps != 0 and train_model:
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
finally:
    # Always release the summary file and the Unity environment, including
    # when training raises or the kernel is interrupted. Otherwise the Unity
    # process is left open and the next run cannot connect.
    try:
        if summary_writer is not None:
            summary_writer.close()
    finally:
        env.close()
# Reached only when the session block finished without an exception.
export_graph(model_path, env_name)

Step: 1000. Mean Reward: -0.027136752136752137. Std of Reward: 0.07803849652497628.
Step: 2000. Mean Reward: -0.024904086738949123. Std of Reward: 0.08144833639326328.
Step: 3000. Mean Reward: -0.025503715937241954. Std of Reward: 0.08154762193650911.
Step: 4000. Mean Reward: -0.025004108463434677. Std of Reward: 0.08216825923784211.
Step: 5000. Mean Reward: -0.02413382594417077. Std of Reward: 0.0825627898906324.
Step: 6000. Mean Reward: -0.02465983606557377. Std of Reward: 0.08280808759091669.
Step: 7000. Mean Reward: -0.024537643207855973. Std of Reward: 0.08263752000323545.
Step: 8000. Mean Reward: -0.024087947882736153. Std of Reward: 0.08350417152791866.
Step: 9000. Mean Reward: -0.02491823385118561. Std of Reward: 0.0825981753048072.
Step: 10000. Mean Reward: -0.024747351263243685. Std of Reward: 0.08283582548889544.
Step: 11000. Mean Reward: -0.024387755102040817. Std of Reward: 0.08272780948573082.
Step: 12000. Mean Reward: -0.024677023712183156. Std of Reward: 0.0827839765522

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-50000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 51000. Mean Reward: -0.024155844155844156. Std of Reward: 0.08361969127470005.
Step: 52000. Mean Reward: -0.023982683982683984. Std of Reward: 0.08377293394683218.
Step: 53000. Mean Reward: -0.025996533795493933. Std of Reward: 0.08195035899741492.
Step: 54000. Mean Reward: -0.024112554112554113. Std of Reward: 0.0836580618605029.
Step: 55000. Mean Reward: -0.025450216450216452. Std of Reward: 0.08245655231352854.
Step: 56000. Mean Reward: -0.02365684575389948. Std of Reward: 0.08405952392394984.
Step: 57000. Mean Reward: -0.025086655112651644. Std of Reward: 0.08278483383282557.
Step: 58000. Mean Reward: -0.024025974025974027. Std of Reward: 0.08373468314174129.
Step: 59000. Mean Reward: -0.02519047619047619. Std of Reward: 0.08269347297137553.
Step: 60000. Mean Reward: -0.024675324675324677. Std of Reward: 0.08315610546422215.
Step: 61000. Mean Reward: -0.02504325259515571. Std of Reward: 0.08282427011998665.
Step: 62000. Mean Reward: -0.0247

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-100000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 101000. Mean Reward: -0.024638186573670447. Std of Reward: 0.08302857591205796.
Step: 102000. Mean Reward: -0.02605217391304348. Std of Reward: 0.0817758169289952.
Step: 103000. Mean Reward: -0.025529513888888893. Std of Reward: 0.08206100053914721.
Step: 104000. Mean Reward: -0.02470945359930616. Std of Reward: 0.08288555557271869.
Step: 105000. Mean Reward: -0.024171725932350393. Std of Reward: 0.08353961516061832.
Step: 106000. Mean Reward: -0.024241110147441458. Std of Reward: 0.08342703534643144.
Step: 107000. Mean Reward: -0.025364583333333336. Std of Reward: 0.08234648424724607.
Step: 108000. Mean Reward: -0.024948006932409013. Std of Reward: 0.08289522149875414.
Step: 109000. Mean Reward: -0.025433275563258233. Std of Reward: 0.08246911801718498.
Step: 110000. Mean Reward: -0.02560553633217993. Std of Reward: 0.08231131366277833.
Step: 111000. Mean Reward: -0.025541125541125545. Std of Reward: 0.08237038511092384.
Step: 112000. Mean Rew

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-150000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 151000. Mean Reward: -0.024833625218914183. Std of Reward: 0.08249512212290615.
Step: 152000. Mean Reward: -0.024540682414698162. Std of Reward: 0.08253758579792957.
Step: 153000. Mean Reward: -0.02491703056768559. Std of Reward: 0.0822394900727565.
Step: 154000. Mean Reward: -0.02452048823016565. Std of Reward: 0.08270761975107281.
Step: 155000. Mean Reward: -0.02487772925764192. Std of Reward: 0.08221288736562049.
Step: 156000. Mean Reward: -0.024221347331583553. Std of Reward: 0.0830487634591597.
Step: 157000. Mean Reward: -0.024846356453028972. Std of Reward: 0.0823227769636418.
Step: 158000. Mean Reward: -0.024166666666666666. Std of Reward: 0.08315209348862838.
Step: 159000. Mean Reward: -0.025253496503496505. Std of Reward: 0.08215586460608315.
Step: 160000. Mean Reward: -0.025048076923076923. Std of Reward: 0.0821179294184048.
Step: 161000. Mean Reward: -0.025061349693251536. Std of Reward: 0.0824441526402948.
Step: 162000. Mean Reward:

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-200000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 201000. Mean Reward: -0.02434262948207171. Std of Reward: 0.08330909024463425.
Step: 202000. Mean Reward: -0.024331337325349305. Std of Reward: 0.08331338281749814.
Step: 203000. Mean Reward: -0.025434131736526946. Std of Reward: 0.08231288071760429.
Step: 204000. Mean Reward: -0.025891434262948206. Std of Reward: 0.08190949581286283.
Step: 205000. Mean Reward: -0.025743512974051898. Std of Reward: 0.08204592017157666.
Step: 206000. Mean Reward: -0.024626121635094714. Std of Reward: 0.08307244850437252.
Step: 207000. Mean Reward: -0.02525974025974026. Std of Reward: 0.08248426403695888.
Step: 208000. Mean Reward: -0.023448103792415166. Std of Reward: 0.0841070081726232.
Step: 209000. Mean Reward: -0.026335. Std of Reward: 0.08147771336383958.
Step: 210000. Mean Reward: -0.026394422310756973. Std of Reward: 0.08144944297979004.
Step: 211000. Mean Reward: -0.025199203187250996. Std of Reward: 0.08255422073955329.
Step: 212000. Mean Reward: -0.025

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-250000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 251000. Mean Reward: -0.024623115577889446. Std of Reward: 0.08292355060273608.
Step: 252000. Mean Reward: -0.02317404426559356. Std of Reward: 0.08392913971501116.
Step: 253000. Mean Reward: -0.02501010101010101. Std of Reward: 0.08241279381769727.
Step: 254000. Mean Reward: -0.024120934959349593. Std of Reward: 0.08300773551436887.
Step: 255000. Mean Reward: -0.023579373104145603. Std of Reward: 0.08352042440887832.
Step: 256000. Mean Reward: -0.023293051359516617. Std of Reward: 0.08387836900874596.
Step: 257000. Mean Reward: -0.022996926229508198. Std of Reward: 0.08357462057147906.
Step: 258000. Mean Reward: -0.024502032520325203. Std of Reward: 0.08254190852249255.
Step: 259000. Mean Reward: -0.02469450101832994. Std of Reward: 0.08215378722805124.
Step: 260000. Mean Reward: -0.024136874361593463. Std of Reward: 0.08259730176137103.
Step: 261000. Mean Reward: -0.022786377708978327. Std of Reward: 0.08350906206540777.
Step: 262000. Mean Re

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-300000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 301000. Mean Reward: -0.022943121693121692. Std of Reward: 0.08301129484435546.
Step: 302000. Mean Reward: -0.023768606224627876. Std of Reward: 0.08177045027460421.
Step: 303000. Mean Reward: -0.022456021650879565. Std of Reward: 0.0829275922054298.
Step: 304000. Mean Reward: -0.022586666666666665. Std of Reward: 0.08256376821719211.
Step: 305000. Mean Reward: -0.023787262872628726. Std of Reward: 0.0819996083693388.
Step: 306000. Mean Reward: -0.021793478260869564. Std of Reward: 0.0837798665815567.
Step: 307000. Mean Reward: -0.023435374149659866. Std of Reward: 0.08230254193239406.
Step: 308000. Mean Reward: -0.023519515477792734. Std of Reward: 0.08143023723225074.
Step: 309000. Mean Reward: -0.022799174690508944. Std of Reward: 0.08260505527813164.
Step: 310000. Mean Reward: -0.02241758241758242. Std of Reward: 0.0821905205536537.
Step: 311000. Mean Reward: -0.022017783857729137. Std of Reward: 0.08199107229110003.
Step: 312000. Mean Rewa

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-350000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 351000. Mean Reward: -0.022410179640718564. Std of Reward: 0.08239146018093242.
Step: 352000. Mean Reward: -0.022068452380952383. Std of Reward: 0.08270379636582086.
Step: 353000. Mean Reward: -0.02373692077727952. Std of Reward: 0.08177183702491159.
Step: 354000. Mean Reward: -0.02336053412462908. Std of Reward: 0.08188649586504455.
Step: 355000. Mean Reward: -0.022578710644677664. Std of Reward: 0.08206083070849605.
Step: 356000. Mean Reward: -0.023164179104477614. Std of Reward: 0.08181150468578974.
Step: 357000. Mean Reward: -0.02297761194029851. Std of Reward: 0.0819556638802857.
Step: 358000. Mean Reward: -0.02264359351988218. Std of Reward: 0.08260580085711701.
Step: 359000. Mean Reward: -0.022312312312312312. Std of Reward: 0.0827106378174339.
Step: 360000. Mean Reward: -0.02477645305514158. Std of Reward: 0.08048605573851397.
Step: 361000. Mean Reward: -0.021792592592592593. Std of Reward: 0.08346594748503584.
Step: 362000. Mean Reward

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-400000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 401000. Mean Reward: -0.02203422053231939. Std of Reward: 0.08286600324834309.
Step: 402000. Mean Reward: -0.023333333333333334. Std of Reward: 0.08191745897835263.
Step: 403000. Mean Reward: -0.02262693156732892. Std of Reward: 0.081523489639434.
Step: 404000. Mean Reward: -0.023300438596491228. Std of Reward: 0.08111858927596945.
Step: 405000. Mean Reward: -0.022571743929359823. Std of Reward: 0.08215171877056932.
Step: 406000. Mean Reward: -0.021488888888888887. Std of Reward: 0.0833466981875433.
Step: 407000. Mean Reward: -0.020951859956236323. Std of Reward: 0.08229125758074846.
Step: 408000. Mean Reward: -0.022070484581497796. Std of Reward: 0.08266158577719945.
Step: 409000. Mean Reward: -0.02339246119733925. Std of Reward: 0.08040514834935758.
Step: 410000. Mean Reward: -0.020695652173913042. Std of Reward: 0.08433985843138962.
Step: 411000. Mean Reward: -0.023211920529801325. Std of Reward: 0.081207453191979.
Step: 412000. Mean Reward:

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-450000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
Step: 451000. Mean Reward: -0.02338046272493573. Std of Reward: 0.0818316142376032.
Step: 452000. Mean Reward: -0.02143044619422572. Std of Reward: 0.08348653225389059.
Step: 453000. Mean Reward: -0.02264030612244898. Std of Reward: 0.08258602940550851.
Step: 454000. Mean Reward: -0.023280423280423283. Std of Reward: 0.08219270612106268.
Step: 455000. Mean Reward: -0.024373368146214103. Std of Reward: 0.08116682701520149.
Step: 456000. Mean Reward: -0.02198684210526316. Std of Reward: 0.08319997998181893.
Step: 457000. Mean Reward: -0.02414021164021164. Std of Reward: 0.08117149147654389.
Step: 458000. Mean Reward: -0.023956185567010308. Std of Reward: 0.08152820736260169.
Step: 459000. Mean Reward: -0.02318537859007833. Std of Reward: 0.08185215351019268.
Step: 460000. Mean Reward: -0.02227034120734908. Std of Reward: 0.08218291733996945.
Step: 461000. Mean Reward: -0.02195822454308094. Std of Reward: 0.08265394421937816.
Step: 462000. Mean Reward: 

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_4\model-500000.cptk


### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [5]:
# Export the saved model as a .bytes file that Unity can embed.
# Depends on `model_path` (set in the training cell) and `env_name` (set in
# the hyperparameters cell) still being defined in the kernel.
# NOTE(review): the training cell already ends with this same call, so this
# cell is only needed to re-export without retraining.
export_graph(model_path, env_name)

INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_2\model-100000.cptk


INFO:tensorflow:Restoring parameters from ./models/Tennis_Z_2\model-100000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
