# Unity ML Agents
## Proximal Policy Optimization (PPO)
This notebook trains an agent with Proximal Policy Optimization (PPO), as described in [Schulman et al., 2017](https://arxiv.org/abs/1707.06347).

In [1]:
import numpy as np
import os
import tensorflow as tf

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [2]:
### General parameters
max_steps = 10000000 # Set maximum number of steps to run environment.
run_path = "ppo" # The sub-directory name for model and summary statistics
load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.
summary_freq = 10000 # Frequency at which to save training statistics.
save_freq = 50000 # Frequency at which to save model.
env_name = "tictac" # Name of the training environment file.
curriculum_file = None # Curriculum passed to UnityEnvironment; None means no curriculum progress is reported.

### Algorithm-specific parameters for tuning
gamma = 0.99 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 64 # How many steps to collect per agent before adding to buffer.
beta = 1e-2 # Strength of entropy regularization
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.3 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 4096 # How large the experience buffer should be before gradient descent.
learning_rate = 3e-4 # Model learning rate.
hidden_units = 27 # Number of units in hidden layer.
batch_size = 32 # How many experiences per gradient descent update step.
normalize = True # Forwarded to the model and to trainer.take_action; presumably enables input normalization (confirm in ppo.models).

### Logging dictionary for hyperparameters
# Each key is spelled exactly like the variable it records so the logged text
# can be matched back to this cell. num_layers and normalize are included
# because they are tunable and are passed to the model.
hyperparameter_dict = {
    'max_steps': max_steps,
    'run_path': run_path,
    'env_name': env_name,
    'curriculum_file': curriculum_file,
    'gamma': gamma,
    'lambd': lambd,
    'time_horizon': time_horizon,
    'beta': beta,
    'num_epoch': num_epoch,
    'num_layers': num_layers,
    'epsilon': epsilon,
    'buffer_size': buffer_size,
    'learning_rate': learning_rate,
    'hidden_units': hidden_units,
    'batch_size': batch_size,
    'normalize': normalize,
}

### Load the environment

In [3]:
# Launch the Unity environment named by env_name, handing it the (optional) curriculum.
env = UnityEnvironment(file_name=env_name, curriculum=curriculum_file)
# Show the environment's textual description (academy and brain details).
print(env)
# Train the first entry in the environment's list of external brain names.
brain_name = env.external_brain_names[0]

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		defence_penalty -> -0.5
		defence_reward -> 0.5
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 10
        Action space type: discrete
        Action space size (per agent): 9
        Memory space size (per agent): 0
        Action descriptions: 0, 1, 2, 3, 4, 5, 6, 7, 8


### Train the Agent(s)

In [4]:
# Clear the default TensorFlow graph before the model is built in this cell.
tf.reset_default_graph()

# The literal string "None" is treated the same as no curriculum at all.
curriculum_file = None if curriculum_file == "None" else curriculum_file


def get_progress():
    """Return the progress value handed to env.reset(), or None.

    Reads the notebook globals curriculum_file, env, steps, max_steps and
    last_reward.

    Returns:
        None when no curriculum file is configured or the curriculum's
        measure_type is neither "progress" nor "reward";
        steps / max_steps when measure_type is "progress";
        last_reward when measure_type is "reward".
    """
    if curriculum_file is None:
        return None

    measure = env._curriculum.measure_type
    if measure == "progress":
        return steps / max_steps
    if measure == "reward":
        return last_reward
    return None

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps,
                               normalize=normalize, num_layers=num_layers)

# Properties of the brain being trained; they tell the Trainer which inputs
# and which action type to use.
is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)

# Checkpoints and TensorBoard summaries live in per-run sub-directories.
model_path = './models/{}'.format(run_path)
summary_path = './summaries/{}'.format(run_path)

if not os.path.exists(model_path):
    os.makedirs(model_path)

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# The session runs inside try/finally so the Unity environment is always
# closed, even when training raises or the kernel is interrupted. Otherwise
# the environment stays open and this cell cannot be re-run cleanly.
try:
    with tf.Session() as sess:
        # Instantiate model parameters
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            if ckpt is None:
                # get_checkpoint_state returns None when the directory holds no
                # checkpoint; fail with a message that says what to do.
                raise ValueError(
                    'No checkpoint found in {}. Train a model first or set '
                    'load_model = False.'.format(model_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        # Start from the step count and reward stored in the graph (non-zero
        # only when they were restored from a checkpoint, presumably).
        steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])
        summary_writer = tf.summary.FileWriter(summary_path)
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
        if train_model:
            trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
        # The bound is inclusive, so the loop ends with steps == max_steps + 1;
        # that value is what the final save below receives.
        while steps <= max_steps:
            if env.global_done:
                info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
            # Decide and take an action
            new_info = trainer.take_action(info, env, brain_name, steps, normalize)
            info = new_info
            trainer.process_experiences(info, time_horizon, gamma, lambd)
            if len(trainer.training_buffer['actions']) > buffer_size and train_model:
                # Perform gradient descent with experience buffer
                trainer.update_model(batch_size, num_epoch)
            if steps % summary_freq == 0 and steps != 0 and train_model:
                # Write training statistics to tensorboard.
                trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
            if steps % save_freq == 0 and steps != 0 and train_model:
                # Save Tensorflow model
                save_model(sess, model_path=model_path, steps=steps, saver=saver)
            steps += 1
            # Advance the step counter held in the graph alongside the Python one.
            sess.run(ppo_model.increment_step)
            if len(trainer.stats['cumulative_reward']) > 0:
                # Store the running mean reward in the graph and read it back;
                # get_progress() returns last_reward for "reward" curricula.
                mean_reward = np.mean(trainer.stats['cumulative_reward'])
                sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
                last_reward = sess.run(ppo_model.last_reward)
        # Final save Tensorflow model
        if steps != 0 and train_model:
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
finally:
    env.close()
# Reached only when training completed without an exception.
export_graph(model_path, env_name)

Step: 10000. Mean Reward: -1.12950600801. Std of Reward: 0.520203202085.
Step: 20000. Mean Reward: -1.13077274806. Std of Reward: 0.539309366336.
Step: 30000. Mean Reward: -1.08160036166. Std of Reward: 0.50869302744.
Step: 40000. Mean Reward: -1.02829099307. Std of Reward: 0.528333331637.
Step: 50000. Mean Reward: -0.969452247191. Std of Reward: 0.568955058983.
Saved Model
Step: 60000. Mean Reward: -0.913595021541. Std of Reward: 0.609347130942.
Step: 70000. Mean Reward: -0.807567432567. Std of Reward: 0.686198264818.
Step: 80000. Mean Reward: -0.707080998472. Std of Reward: 0.697476191901.
Step: 90000. Mean Reward: -0.640689834025. Std of Reward: 0.782002358072.
Step: 100000. Mean Reward: -0.519936204147. Std of Reward: 0.885264625327.
Saved Model
Step: 110000. Mean Reward: -0.444872476089. Std of Reward: 0.938276570155.
Step: 120000. Mean Reward: -0.308962264151. Std of Reward: 1.02200341853.
Step: 130000. Mean Reward: -0.266518600777. Std of Reward: 1.06583563097.
Step: 140000. Mea

Saved Model
Step: 1110000. Mean Reward: 2.72779304029. Std of Reward: 0.651515548764.
Step: 1120000. Mean Reward: 2.75854108957. Std of Reward: 0.601904100031.
Step: 1130000. Mean Reward: 2.7371559633. Std of Reward: 0.62950773216.
Step: 1140000. Mean Reward: 2.82597583643. Std of Reward: 0.511236633397.
Step: 1150000. Mean Reward: 2.78026802218. Std of Reward: 0.592385620174.
Saved Model
Step: 1160000. Mean Reward: 2.81464318814. Std of Reward: 0.574039583639.
Step: 1170000. Mean Reward: 2.83589385475. Std of Reward: 0.561210277612.
Step: 1180000. Mean Reward: 2.87348272642. Std of Reward: 0.455392945959.
Step: 1190000. Mean Reward: 2.87663246269. Std of Reward: 0.43233587947.
Step: 1200000. Mean Reward: 2.89165103189. Std of Reward: 0.447644129403.
Saved Model
Step: 1210000. Mean Reward: 2.90662931839. Std of Reward: 0.405998362375.
Step: 1220000. Mean Reward: 2.90262172285. Std of Reward: 0.458735090356.
Step: 1230000. Mean Reward: 2.92011278195. Std of Reward: 0.37806962509.
Step: 

Step: 2190000. Mean Reward: 2.99099526066. Std of Reward: 0.131215444474.
Step: 2200000. Mean Reward: 2.9864800759. Std of Reward: 0.156666656484.
Saved Model
Step: 2210000. Mean Reward: 2.98671726755. Std of Reward: 0.156497606076.
Step: 2220000. Mean Reward: 2.98816287879. Std of Reward: 0.15721907361.
Step: 2230000. Mean Reward: 2.98436018957. Std of Reward: 0.189766489403.
Step: 2240000. Mean Reward: 2.9710626186. Std of Reward: 0.289756178774.
Step: 2250000. Mean Reward: 2.98484848485. Std of Reward: 0.195250363165.
Saved Model
Step: 2260000. Mean Reward: 2.99049429658. Std of Reward: 0.104121085942.
Step: 2270000. Mean Reward: 2.99762808349. Std of Reward: 0.046142159353.
Step: 2280000. Mean Reward: 2.97798295455. Std of Reward: 0.243379387428.
Step: 2290000. Mean Reward: 2.98907882241. Std of Reward: 0.177361709856.
Step: 2300000. Mean Reward: 2.9855450237. Std of Reward: 0.176436720156.
Saved Model
Step: 2310000. Mean Reward: 2.98837760911. Std of Reward: 0.168485061786.
Step: 

Step: 3270000. Mean Reward: 2.99120722433. Std of Reward: 0.142896326404.
Step: 3280000. Mean Reward: 2.9824311491. Std of Reward: 0.215276002217.
Step: 3290000. Mean Reward: 2.99857685009. Std of Reward: 0.0344083260072.
Step: 3300000. Mean Reward: 2.99050332384. Std of Reward: 0.163153960575.
Saved Model
Step: 3310000. Mean Reward: 2.9990530303. Std of Reward: 0.0307582988048.
Step: 3320000. Mean Reward: 2.99904852521. Std of Reward: 0.030831306843.
Step: 3330000. Mean Reward: 2.99620853081. Std of Reward: 0.0753181820865.
Step: 3340000. Mean Reward: 2.99714013346. Std of Reward: 0.0925819679098.
Step: 3350000. Mean Reward: 2.99810066477. Std of Reward: 0.0435399558836.
Saved Model
Step: 3360000. Mean Reward: 2.99004739336. Std of Reward: 0.167630539443.
Step: 3370000. Mean Reward: 2.99288425047. Std of Reward: 0.145118651574.
Step: 3380000. Mean Reward: 2.99715370019. Std of Reward: 0.0923623192007.
Step: 3390000. Mean Reward: 2.98789173789. Std of Reward: 0.176108939721.
Step: 3400

Step: 4370000. Mean Reward: 2.99405893536. Std of Reward: 0.141372174008.
Step: 4380000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 4390000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 4400000. Mean Reward: 2.99619410086. Std of Reward: 0.123325227372.
Saved Model
Step: 4410000. Mean Reward: 2.99195075758. Std of Reward: 0.152493732607.
Step: 4420000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 4430000. Mean Reward: 2.99455492424. Std of Reward: 0.103358337095.
Step: 4440000. Mean Reward: 2.99619771863. Std of Reward: 0.108938844927.
Step: 4450000. Mean Reward: 2.99952426261. Std of Reward: 0.0154156534215.
Saved Model
Step: 4460000. Mean Reward: 2.99619771863. Std of Reward: 0.123266654581.
Step: 4470000. Mean Reward: 2.99619771863. Std of Reward: 0.09495241391.
Step: 4480000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 4490000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 4500000. Mean Reward: 2.99952516619. Std of Reward: 0.015401020646.
Saved Model
Step: 4510000. Mean Reward: 2.996

Saved Model
Step: 5560000. Mean Reward: 2.99857278782. Std of Reward: 0.0462469602646.
Step: 5570000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 5580000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 5590000. Mean Reward: 2.99619410086. Std of Reward: 0.123325227372.
Step: 5600000. Mean Reward: 2.99241706161. Std of Reward: 0.173994978663.
Saved Model
Step: 5610000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 5620000. Mean Reward: 2.99573459716. Std of Reward: 0.124034873888.
Step: 5630000. Mean Reward: 2.99905033238. Std of Reward: 0.030802041292.
Step: 5640000. Mean Reward: 2.99098671727. Std of Reward: 0.180038931627.
Step: 5650000. Mean Reward: 3.0. Std of Reward: 0.0.
Saved Model
Step: 5660000. Mean Reward: 2.99667616334. Std of Reward: 0.107807144522.
Step: 5670000. Mean Reward: 2.99905033238. Std of Reward: 0.030802041292.
Step: 5680000. Mean Reward: 2.99621212121. Std of Reward: 0.123033195219.
Step: 5690000. Mean Reward: 2.9943019943. Std of Reward: 0.137698529614.
Step: 570000

Saved Model
Step: 6760000. Mean Reward: 2.9971563981. Std of Reward: 0.0923185767625.
Step: 6770000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6780000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6790000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6800000. Mean Reward: 2.99952426261. Std of Reward: 0.0154156534215.
Saved Model
Step: 6810000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6820000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6830000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6840000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6850000. Mean Reward: 3.0. Std of Reward: 0.0.
Saved Model
Step: 6860000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6870000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6880000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6890000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6900000. Mean Reward: 2.9990521327. Std of Reward: 0.0307728589208.
Saved Model
Step: 6910000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 6920000. Mean Reward: 2.9981024667

Step: 7980000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 7990000. Mean Reward: 2.99690770695. Std of Reward: 0.10020174724.
Step: 8000000. Mean Reward: 3.0. Std of Reward: 0.0.
Saved Model
Step: 8010000. Mean Reward: 2.99667931689. Std of Reward: 0.107756039067.
Step: 8020000. Mean Reward: 2.99952516619. Std of Reward: 0.015401020646.
Step: 8030000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 8040000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 8050000. Mean Reward: 2.9990512334. Std of Reward: 0.0307874397336.
Saved Model
Step: 8060000. Mean Reward: 2.99904852521. Std of Reward: 0.030831306843.
Step: 8070000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 8080000. Mean Reward: 2.99123222749. Std of Reward: 0.153108077877.
Step: 8090000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 8100000. Mean Reward: 2.99691943128. Std of Reward: 0.100011791493.
Saved Model
Step: 8110000. Mean Reward: 2.99810066477. Std of Reward: 0.061604082584.
Step: 8120000. Mean Reward: 3.0. Std of Reward: 0.0.
St

Step: 9220000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9230000. Mean Reward: 2.99714828897. Std of Reward: 0.092449990936.
Step: 9240000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9250000. Mean Reward: 3.0. Std of Reward: 0.0.
Saved Model
Step: 9260000. Mean Reward: 2.99857414449. Std of Reward: 0.046224995468.
Step: 9270000. Mean Reward: 2.99904852521. Std of Reward: 0.030831306843.
Step: 9280000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9290000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9300000. Mean Reward: 2.99904942966. Std of Reward: 0.0308166636453.
Saved Model
Step: 9310000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9320000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9330000. Mean Reward: 2.9981042654. Std of Reward: 0.0615457178417.
Step: 9340000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9350000. Mean Reward: 3.0. Std of Reward: 0.0.
Saved Model
Step: 9360000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9370000. Mean Reward: 3.0. Std of Reward: 0.0.
Step: 9380

INFO:tensorflow:Restoring parameters from ./models/ppo/model-10000001.cptk


INFO:tensorflow:Froze 7 variables.


INFO:tensorflow:Froze 7 variables.


Converted 7 variables to const ops.


### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [5]:
# Export the saved model for Unity. Uses model_path (set in the training cell)
# and env_name (set in the hyperparameter cell), so both cells must have run.
export_graph(model_path, env_name)

INFO:tensorflow:Restoring parameters from ./models/ppo/model-10000001.cptk


INFO:tensorflow:Restoring parameters from ./models/ppo/model-10000001.cptk


INFO:tensorflow:Froze 7 variables.


INFO:tensorflow:Froze 7 variables.


Converted 7 variables to const ops.
