### 1. Initialize the project environment and create an agent

In [None]:
from unityagents import UnityEnvironment
import numpy as np
# Create the Unity environment handle from the local 'Reacher.app' build;
# every later cell talks to the simulator through this `env` object.
env = UnityEnvironment(file_name='Reacher.app')

In [None]:
# Pick the first brain the environment exposes and keep both its name and
# its parameter object.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Restart the environment in training mode and keep this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the dimensions that later cells rely on.
num_agents = len(env_info.agents)              # how many agents are simulated
action_size = brain.vector_action_space_size   # length of one action vector
states = env_info.vector_observations          # one observation row per agent
state_size = states.shape[1]                   # length of one observation

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
summary_template = 'There are {} agents. Each observes a state with length: {}'
print(summary_template.format(states.shape[0], state_size))


### 2. Train the agent and store the trained model weights in the checkpoint_actor.pth and checkpoint_critic.pth files

FYI: Instead of training the agent from scratch, it is possible to load the stored model weights of an already trained agent and run it in the project environment. In that case, please skip steps 2 and 3 and jump directly to step 4.

In [None]:
from collections import deque
import torch

def ddpg(n_episodes=1000, max_t=2000, print_every=100, target_score=30.0):
    """Train the module-level ``agent`` in the module-level ``env``.

    Only the first agent's observation, reward and done flag are used
    (index 0 of each per-agent array returned by the environment).

    After every episode the actor and critic weights are written to
    'checkpoint_actor.pth' and 'checkpoint_critic.pth', so the files always
    hold the most recent weights, including those of the solving episode.

    Args:
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of environment steps per episode.
        print_every: Size of the sliding window used for the mean score;
            a persistent progress line is printed every `print_every`
            episodes.
        target_score: Windowed mean score at which training stops early.

    Returns:
        A tuple ``(scores, scores_mean)``: the score of every episode and
        the sliding-window mean recorded after every episode.
    """
    scores_deque = deque(maxlen=print_every)
    scores = []
    scores_mean = []  # list the means of the window scores
    for i_episode in range(1, n_episodes+1):
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]
        agent.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]        # pass action into environment
            next_state = env_info.vector_observations[0]   # get next state from updated environment
            reward = env_info.rewards[0]                   # get reward from updated environment
            done = env_info.local_done[0]                  # get info whether episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        scores_mean.append(np.mean(scores_deque))  # save most recent mean score
        # '\r' with end="" rewrites the same console line on every episode.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, scores_mean[-1]), end="")
        # Checkpoint every episode so an interrupted run keeps its latest weights.
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, scores_mean[-1]))

        if scores_mean[-1] >= target_score:
            # The weights of this episode were already saved just above, so
            # no second save is needed before stopping.
            print('\nEnvironment solved in {:d} Episodes \tAverage Score: {:.2f}'.format(i_episode, scores_mean[-1]))
            break

    return scores, scores_mean

In [None]:
from ddpg_agent import Agent

# Build the agent from the state and action sizes read from the environment,
# with a fixed seed.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=13)
# Train for at most 600 episodes. `scores` receives the per-episode scores and
# `mean` the sliding-window means returned by ddpg().
scores, mean = ddpg(n_episodes=600)


### 3. Visualize results of agent's training

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
## Plot the scores
fig = plt.figure(figsize=(12,7))
ax = fig.add_subplot(111)

# Both series hold one value per episode, so both are drawn on the same
# 1-based episode axis; a 0-based axis for the mean would shift that curve
# one episode to the left of the scores.
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.plot(np.arange(1, len(mean) + 1), mean)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
ax.legend(('Score', 'Mean'))

plt.show()

### 4. Run the agent with pretrained model weights

Once this cell is executed, a Unity window with the loaded project environment should pop up, where the trained agent can be observed in action as it moves through the environment. The agent's 'experience' is loaded from the stored 'checkpoint_actor.pth' and 'checkpoint_critic.pth' files. These files contain the weights of the agent's neural network models learned during the training phase, so there is no need to train the agent from scratch every time.

In [None]:
import torch
from ddpg_agent import Agent

# Build the agent here as well, so this cell also works when the training
# cells (steps 2 and 3) were skipped, then restore the stored weights.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=13)
# map_location='cpu' lets a checkpoint written on a GPU machine be read on a
# CPU-only one; load_state_dict then copies the values into the model.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth', map_location='cpu'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth', map_location='cpu'))

env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    # NOTE(review): assumes Agent.act accepts the per-agent state array and
    # an `add_noise` flag - confirm against ddpg_agent.py.
    actions = agent.act(states, add_noise=False)       # select an action with the trained policy
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += rewards                                  # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

After the agent has finished, the Unity window with the project environment can be closed.

In [None]:
# Close the Unity environment handle created in step 1.
env.close()