# Continuous Control

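Training notebook: trains a PPO agent on the Unity Reacher environment, plots the training statistics, and runs one test episode with the trained agent.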

In [1]:
from unityagents import UnityEnvironment
import numpy as np
from dist_environment import UnityEnv
from agent import Agent

import numpy as np
import os
import matplotlib.pyplot as plt

%matplotlib inline

In [8]:
def plot_scores(scores):
    """Plot one score per episode as a line chart and display it.

    Args:
        scores: Sized sequence of numeric scores; element ``k`` is drawn at
            x-position ``k``.

    Returns:
        None. The chart is rendered through ``plt.show()``.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Draw on the axes created above rather than on pyplot's implicit
    # "current axes"; previously ``ax`` was created but never used.
    ax.plot(np.arange(len(scores)), scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    plt.show()

In [9]:
# Training configuration. The whole mapping is handed to Agent(...); selected
# entries are also read directly by the environment wrapper and training loop.
hyperparameters = dict(
    # Number of iterations of the training loop (one env.steps + one agent.learn each).
    max_iter=2500,
    # The next four are only consumed inside Agent -- presumably the PPO
    # minibatch size, optimisation epochs per batch, gradient-clipping value
    # and surrogate clipping epsilon; confirm against agent.py.
    batch_size=64,
    ppo_epochs=4,
    clip_gradients=0.2,
    clip_eps=0.2,
    # Passed to UnityEnv together with gamma and gae_lambda.
    trajectory_size=33,
    gae_lambda=0.6,
    # Also passed to agent.learn() on every iteration.
    gamma=0.99,
    # Presumably the learning rates of the policy and value networks -- confirm in agent.py.
    policy_lr=0.0001,
    value_lr=0.0005,
)

In [10]:
# Start the Unity build stored in 'Reacher.app' (path is relative to the
# notebook's working directory).
raw_env = UnityEnvironment(file_name='Reacher.app')

# Wrap the raw environment in the project's UnityEnv helper, which also
# receives the trajectory size, discount factor and GAE lambda.
env = UnityEnv(
    raw_env,
    hyperparameters['trajectory_size'],
    hyperparameters['gamma'],
    hyperparameters['gae_lambda']
)

# Report the sizes exposed by the wrapper; they are used to build the agent.
print('state space size: ', env.observation_n)
print('action space size: ', env.action_n)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


state space size:  33
action space size:  4


### Training

Network training process starts here.

In [11]:
# Train the agent. Each iteration collects one batch from the environment,
# records the reward statistics, runs a learning update and saves the agent.
agent = Agent([env.observation_n], [env.action_n], hyperparameters)
reward_log = []
for i in range(hyperparameters['max_iter']):
    batch = env.steps(agent)
    # Compute the moving average over the last 100 recorded rewards once, so
    # the value that is logged and the value handed to agent.save() are
    # guaranteed to be the same number.
    rolling_mean = np.mean(env.last_games_rews[-100:])
    reward_log.append({
        'reward': env.last_game_rew,
        'rollingmean_100': rolling_mean
    })
    agent.learn(batch, hyperparameters['gamma'])
    agent.save(rollingmean=rolling_mean)

step: 0; pg_loss: -0.0010535494777248106; vl_loss: 0.007092178188031539
step: 100; pg_loss: -0.0008674081838268954; vl_loss: 0.00045955694586155007
step: 200; pg_loss: -0.0004932256585852849; vl_loss: 0.0003386459741705039
step: 300; pg_loss: -0.0011189747971318401; vl_loss: 0.0006646480629569851
step: 400; pg_loss: -0.0018558499707620163; vl_loss: 0.0005535603711905423
step: 500; pg_loss: -0.002190141972257721; vl_loss: 0.0005559008250202169
step: 600; pg_loss: -0.001487493669061348; vl_loss: 0.0009355714311823249
step: 700; pg_loss: -0.002022782722507166; vl_loss: 0.0016069542438344798
step: 800; pg_loss: -0.0026153851915566425; vl_loss: 0.0011470237011963035
step: 900; pg_loss: -0.002774622412990313; vl_loss: 0.0012993787993764272
step: 1000; pg_loss: -0.002137664744391682; vl_loss: 0.0012082470631867182
step: 1100; pg_loss: -0.0016717256187552953; vl_loss: 0.0020272251873393542
step: 1200; pg_loss: -0.002967177223303763; vl_loss: 0.0029260033923492303
step: 1300; pg_loss: -0.003126

In [29]:
from bokeh.plotting import figure 
from bokeh.models import Legend
from bokeh.layouts import column
from bokeh.io import output_notebook, show
# Set up Bokeh's notebook output before any show(...) call below
# (presumably makes plots render inline in the notebook -- see bokeh.io docs).
output_notebook()

### Training stats

In [31]:
# Shared styling for every chart produced by get_figure().
LINE_WIDTH = 2     # passed as line_width to fig.line(...)
PLOT_HEIGHT = 300  # passed as plot_height to figure(...)
PLOT_WIDTH = 900   # passed as plot_width to figure(...)


def get_figure(data, x_axis_label, y_axis_label):
    """Build a line chart of ``data`` plotted against its index.

    Args:
        data: Sized sequence of values; element ``k`` is drawn at x = ``k``.
        x_axis_label: Label for the horizontal axis.
        y_axis_label: Label for the vertical axis.

    Returns:
        The object returned by ``figure(...)`` with the line added to it.
    """
    axis_labels = dict(y_axis_label=y_axis_label, x_axis_label=x_axis_label)
    chart = figure(plot_width=PLOT_WIDTH, plot_height=PLOT_HEIGHT, **axis_labels)
    positions = range(len(data))
    chart.line(positions, data, line_width=LINE_WIDTH)
    return chart

# One chart per training statistic, stacked vertically in this order:
# raw reward, 100-entry moving average, policy loss, value loss.
plots = [
    get_figure([entry['reward'] for entry in reward_log],
               'Episodes', 'Avg Episodic Reward over 20 arms'),
    get_figure([entry['rollingmean_100'] for entry in reward_log],
               'Episodes', 'Moving Avg Episodic Reward over 20 arms'),
    get_figure([entry['pg_loss'] for entry in agent.logs],
               'Episodes', 'Clipped surrogate loss'),
    get_figure([entry['vl_loss'] for entry in agent.logs],
               'Episodes', 'Value network loss'),
]

main_row = column(*plots)
show(main_row)

### Trained agent test run

In [24]:
def get_actions(agent, states):
    """Query ``agent.act`` for every state and regroup the results by position.

    Args:
        agent: Object with an ``act(obs)`` method that returns a sequence
            (the caller below unpacks two entries per call).
        states: Non-empty iterable of observations, one per agent.

    Returns:
        A ``zip`` iterator; its k-th item is the tuple of the k-th entries of
        every ``agent.act`` result, in the order of ``states``.
    """
    # Star-unpacking consumes the map immediately, so every act() call has
    # already happened by the time this function returns.
    return zip(*map(agent.act, states))

# Run one episode with the trained agent (train_mode=False) and report the
# score averaged over all agents.
env_info = env.env.reset(train_mode=False)[env.brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(env.num_agents)                          # initialize the score (for each agent)
while True:
    # state_values are returned alongside the actions but are not needed here.
    actions, state_values = get_actions(agent, states)
    env_info = env.env.step(np.array(actions))[env.brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations        # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += rewards                                  # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 36.169499191548674


In [None]:
# Call close() on the environment object held by the wrapper now that the
# run is finished (presumably shuts down the Unity process -- confirm).
env.env.close()