# Continuous Control

---



### 1. Start the Environment



Please select one of the two options below for loading the environment.

In [1]:
%load_ext autoreload
%autoreload 2

from unityagents import UnityEnvironment
import numpy as np
import pandas as pd
import torch
from collections import namedtuple, deque
from ddpg_agent import Agent 

# select this option to load version 2 (with 20 agents) of the environment for Windows
#env = UnityEnvironment(file_name='Reacher_Windows_x86_64_parallel/Reacher.exe')

# select this option to load version 2 (with 20 agents) of the environment for Linux
env = UnityEnvironment(file_name='Reacher_Linux_NoVis/Reacher.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [3]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


### 3. Train

In [4]:
agent = Agent(state_size=state_size, action_size=action_size, random_seed=10)

def ddpg(n_episodes=150, max_t=1000):
    scores_deque = deque(maxlen=10)
    scores = []
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name] 
        states = env_info.vector_observations
        agent.reset()
        score = 0
        for t in range(max_t):
            actions = agent.act(states)
            actions = np.clip(actions, -1, 1)    
            env_info = env.step(actions)[brain_name]           # send all actions to tne environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            if np.all(dones):
                break

            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)

            states = next_states
            score += np.mean(rewards)

        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_e%s.pth' % i_episode)
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_e%s.pth' % i_episode)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))   
    return scores

scores = ddpg()


  torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), GRAD_CLIP)


Episode 10	Average Score: 1.39	Score: 1.96
Episode 20	Average Score: 2.56	Score: 3.54
Episode 30	Average Score: 5.82	Score: 7.25
Episode 40	Average Score: 9.61	Score: 10.26
Episode 50	Average Score: 14.67	Score: 16.50
Episode 60	Average Score: 17.19	Score: 19.40
Episode 70	Average Score: 18.11	Score: 17.10
Episode 80	Average Score: 18.73	Score: 18.27
Episode 90	Average Score: 23.99	Score: 27.58
Episode 100	Average Score: 34.09	Score: 33.17
Episode 110	Average Score: 36.51	Score: 35.93
Episode 120	Average Score: 37.94	Score: 38.80
Episode 130	Average Score: 37.76	Score: 36.23
Episode 140	Average Score: 37.82	Score: 38.06
Episode 150	Average Score: 37.04	Score: 37.00


In [5]:
data = pd.DataFrame({'score': scores})
data.to_csv('scores.csv', sep='\t', index=False, header=True)

### 4. Test Model

every checkpoint is loaded and evaluated for 100 episodes.

In [10]:
avg_scores = []

for train_checkpoint in range(10,151,10):
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=10)
    agent.actor_local.load_state_dict(torch.load('checkpoint_actor_e%s.pth' % train_checkpoint))
    agent.critic_local.load_state_dict(torch.load('checkpoint_critic_e%s.pth'% train_checkpoint))
    for i_episode in range(1, 101):
        scores = []
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        for t in range(2000):
            actions = agent.act(states, False)
            actions = np.clip(actions, -1, 1)    
            env_info = env.step(actions)[brain_name]           # send all actions to tne environment
            states = env_info.vector_observations              # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            if np.all(dones):
                break

            score += np.mean(rewards)

        scores.append(score)       # save most recent score
        print('\rEpisode {}\t Score: {:.2f}'.format(i_episode, np.mean(scores)), end="")
    print('\rTrain Checkpoint {}\tAverage Score: {:.2f}'.format(train_checkpoint, np.mean(scores)))    
    avg_scores.append(np.mean(scores))



Train Checkpoint 10	Average Score: 2.02
Train Checkpoint 20	Average Score: 3.45
Train Checkpoint 30	Average Score: 6.21
Train Checkpoint 40	Average Score: 15.02
Train Checkpoint 50	Average Score: 15.41
Train Checkpoint 60	Average Score: 18.56
Train Checkpoint 70	Average Score: 17.91
Train Checkpoint 80	Average Score: 22.09
Train Checkpoint 90	Average Score: 30.54
Train Checkpoint 100	Average Score: 34.57
Train Checkpoint 110	Average Score: 34.09
Train Checkpoint 120	Average Score: 38.31
Train Checkpoint 130	Average Score: 34.91
Train Checkpoint 140	Average Score: 34.55
Train Checkpoint 150	Average Score: 36.33


In [11]:
data_avg = pd.DataFrame({'avg score': avg_scores})
data_avg.index = np.arange(10,151,10)
data_avg.to_csv('avg_scores.csv', sep='\t', index=True, header=True)

When finished, you can close the environment.

In [12]:
env.close()