# Udacity Deep Reinforcement Learning Nanodegree - Project 2: Continuous Control

In [1]:
from collections import deque

from unityagents import UnityEnvironment
import numpy as np

from agent import Agent

## Set everything up
* Unity environment
* Hyperparameters
* Agent

In [4]:
# Seed handed to the UnityEnvironment constructor in the next cell.
env_seed = 42

In [5]:
# Start the Unity environment from the local 'Reacher.app' build, seeded with env_seed.
env = UnityEnvironment(file_name='Reacher.app', seed=env_seed)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [6]:
# get the default brain
# Get the default brain: the first name the environment reports, plus the
# brain object registered under that name. Both are reused by later cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [7]:
# reset the environment
# Reset the environment with train_mode=True and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect every dimension first; the report is printed afterwards.
num_agents = len(env_info.agents)               # number of agents
action_size = brain.vector_action_space_size    # size of each action
states = env_info.vector_observations           # initial observations, one row per agent
state_size = states.shape[1]                    # length of a single state vector

# Report what was found (same lines, same order as before).
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -7.79496002e+00 -1.00000000e+00
  1.79960966e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.64491057e-01]


## Params

In [8]:
n_episodes = 1000  # number of training episodes to run
max_t = 300 # upper bound on steps per episode; TODO: check later whether 300 is the right cap
agent_seed = 42  # passed to Agent as random_seed

In [None]:
# Placeholder for agent hyperparameters: currently empty and not referenced
# by any other cell shown in this notebook.
hyperparams = {}

## Instantiate DDPG Agent

* Each action is a vector of four numbers, corresponding to the torques applied to the two joints. Every entry in the action vector should be a number between -1 and 1.

In [9]:
# Build the agent from the dimensions read off the environment above;
# agent_seed is forwarded as the agent's random_seed.
agent = Agent(state_size=state_size,
              action_size=action_size,
              random_seed=agent_seed)

## Train the Agent with DDPG (Deep Deterministic Policy Gradients)

In [22]:
def perform_ddpg_training(env, brain_name, agent,
                          solution_threshold, n_episodes, max_t,
                          window_length=100):
    """Run the agent/environment training loop and return the per-episode scores.

    Each episode resets the agent and the environment, then repeatedly asks
    the agent for an action, steps the environment with it, and hands the
    resulting transition to ``agent.step``. Only the first agent's
    observation, reward and done flag (index 0) are used.

    Progress is printed after every episode; the line is kept (not
    overwritten) every ``window_length`` episodes. A "solved" message is
    printed once, the first time the average over a *full* window of
    ``window_length`` episodes reaches ``solution_threshold``.

    Args:
        env: Environment exposing ``reset(train_mode=...)`` and ``step(action)``,
            both returning a mapping indexed by brain name.
        brain_name: Key used to select the brain's info from those mappings.
        agent: Object exposing ``reset()``, ``act(state)`` and
            ``step(state, action, reward, next_state, done)``.
        solution_threshold: Windowed average score that counts as solved.
        n_episodes: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        window_length: Number of most recent episodes averaged for progress
            reporting and for the solved check. Defaults to 100.

    Returns:
        list: The total reward of every episode, in order.
    """
    scores = []
    scores_window = deque(maxlen=window_length)
    env_is_solved = False

    for i_episode in range(1, n_episodes + 1):
        # TODO: Turn to agent.noise.reset() to make clear that noise and not agent params are set back
        agent.reset()
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0

        for _ in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)
        scores.append(score)

        # Compute the windowed average once and reuse it for all reporting below.
        average_score = np.mean(scores_window)

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

        if i_episode % window_length == 0:
            # Same text without end="" so this line survives the next '\r'.
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

        # Only a completely filled window counts, matching how the solved
        # episode is derived from full windows when the scores are analysed.
        window_is_full = len(scores_window) == window_length
        if window_is_full and average_score >= solution_threshold and not env_is_solved:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode,
                                                                                         average_score))
            env_is_solved = True

        # TODO: Adapt checkpoint saving to Reacher (persist the agent's weights
        # once the environment is solved and the episode score is a new best).

    return scores

In [None]:
# Run the DDPG training loop defined above, then shut the Unity environment down.
# (The function is perform_ddpg_training and its step-limit parameter is max_t.)
scores = perform_ddpg_training(env, brain_name, agent,
                               solution_threshold=30,
                               n_episodes=n_episodes,
                               max_t=max_t)
env.close()

## Performance Plot

In [None]:
# `plt` is used throughout this cell but is not imported anywhere else in the notebook.
import matplotlib.pyplot as plt

solution_threshold = 30
window_length = 100

# Rolling mean over every full window: average_scores[i] summarizes
# scores[i : i + window_length], whose last element sits at index
# i + window_length - 1. The "+ 1" keeps the final window as well.
average_scores = np.array([np.mean(scores[i:(i+window_length)])
                           for i in range(len(scores) - window_length + 1)])

fig = plt.figure(figsize=(15, 10))
plt.plot(np.arange(len(scores)), scores, linewidth=0.5)
# Draw each average at the x position of the last episode in its window.
plt.plot(np.arange(window_length - 1, len(scores)), average_scores, 'g-')
plt.plot(np.arange(len(scores)), [solution_threshold]*len(scores), 'r-')
plt.ylabel('Score', fontsize=16)
plt.xlabel('Episode #', fontsize=16)
plt.legend(['Score', 'Average Score (w = 100)', 'Solution Threshold'])
plt.grid(True)
plt.show()

In [None]:
# np.argmax returns 0 for an all-False mask (and raises on an empty one), so
# confirm that some window actually reached the threshold before reporting.
solved_mask = average_scores >= solution_threshold
if solved_mask.any():
    # The first qualifying window starts at index argmax, so it ends at
    # (1-based) episode argmax + window_length.
    print("Environment was solved in Episode {}!".format(np.argmax(solved_mask)+window_length))
else:
    print("Environment was not solved within {} episodes.".format(len(scores)))