# Collaboration and Competition

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment


The environment is already saved in the Workspace and can be accessed at the file path provided below. 

In [1]:
%load_ext autoreload
%autoreload 2

import itertools
from unityagents import UnityEnvironment
import numpy as np
import pandas as pd
import torch
from collections import namedtuple, deque

# Launch the Unity Tennis environment from its build on disk.
# Windows build (uncomment to use it instead of the Linux build below):
#env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")
# Linux build; the "NoVis" directory name suggests a build without rendering — TODO confirm.
env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
# Use the first brain the environment reports as the one controlled from Python.
brain_name = env.brain_names[0]
# Brain object for that name; the next cell reads the action-space size from it.
brain = env.brains[brain_name]

### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [3]:
# Reset in training mode and keep only the info belonging to our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the quantities that describe the problem first ...
num_agents = len(env_info.agents)                 # how many agents are in the scene
action_size = brain.vector_action_space_size      # length of one agent's action vector
states = env_info.vector_observations             # one observation row per agent
state_size = states.shape[1]                      # length of one agent's observation

# ... then report them.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print(
    'There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size
    )
)
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


### 3. Train

In [7]:
# Agent class from the project-local ddpg_agent module.
from ddpg_agent import Agent 

# One Agent instance is shared by both players: ddpg() below passes it the
# experiences of each player in turn.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=10)

def ddpg(n_episodes=2000, max_t=1000):
    """Train the notebook-level ``agent`` on the Unity environment ``env``.

    Every player's experience is handed to the same ``agent``. The score of an
    episode is the larger of the two players' summed rewards. The actor and
    critic weights are written to disk every 100 episodes.

    Relies on the notebook globals ``env``, ``brain_name`` and ``agent``.

    Args:
        n_episodes: Number of training episodes to run.
        max_t: Maximum number of environment steps per episode.

    Returns:
        A list with one entry per episode: the maximum over the two players
        of their summed rewards in that episode.
    """
    # Rolling windows over the last 100 episodes, used for the progress line.
    left_scores_deque = deque(maxlen=100)
    right_scores_deque = deque(maxlen=100)
    max_scores_deque = deque(maxlen=100)
    max_scores = []

    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        left_score = 0
        right_score = 0

        # range(max_t), not range(1, max_t): allow exactly max_t steps.
        for _ in range(max_t):
            actions = agent.act(states)
            actions = np.clip(actions, -1, 1)
            env_info = env.step(actions)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            left_score += rewards[0]
            right_score += rewards[1]

            # Hand each player's transition to the shared agent, in player order.
            for experience in zip(states, actions, rewards, next_states, dones):
                agent.step(*experience)

            states = next_states

            if np.any(dones):
                break

        max_score = max(left_score, right_score)
        left_scores_deque.append(left_score)
        right_scores_deque.append(right_score)
        max_scores_deque.append(max_score)
        max_scores.append(max_score)

        avg_left = np.mean(left_scores_deque)
        avg_right = np.mean(right_scores_deque)
        avg_max = np.mean(max_scores_deque)

        # '\r' with end="" keeps rewriting the same console line each episode.
        print('\rEpisode {}\tAvg Left Score: {:.3f}\tAvg Right Score: {:.3f}\tAvg Max Score: {:.3f}\tLast Score: {:.3f}'
              .format(i_episode, avg_left, avg_right, avg_max, max_score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_e%s.pth' % i_episode)
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_e%s.pth' % i_episode)

            # Re-print with a trailing newline so this line stays in the log.
            print('\rEpisode {}\tAvg Left Score: {:.3f}\tAvg Right Score: {:.3f}\tAvg Max Score: {:.3f}'
                  .format(i_episode, avg_left, avg_right, avg_max))
    return max_scores

# Train with the default settings; ddpg() returns the per-episode max score list.
scores = ddpg()

Episode 100	Avg Left Score: 0.003	Avg Right Score: -0.004	Avg Max Score: 0.009	Last Score: 0.000
Episode 200	Avg Left Score: 0.019	Avg Right Score: 0.015	Avg Max Score: 0.040	Last Score: 0.1000
Episode 300	Avg Left Score: 0.042	Avg Right Score: 0.050	Avg Max Score: 0.082	Last Score: 0.090
Episode 400	Avg Left Score: 0.053	Avg Right Score: 0.055	Avg Max Score: 0.093	Last Score: 0.100
Episode 500	Avg Left Score: 0.055	Avg Right Score: 0.050	Avg Max Score: 0.096	Last Score: 0.090
Episode 600	Avg Left Score: 0.054	Avg Right Score: 0.037	Avg Max Score: 0.096	Last Score: 0.100
Episode 700	Avg Left Score: 0.061	Avg Right Score: 0.076	Avg Max Score: 0.101	Last Score: 0.100
Episode 800	Avg Left Score: 0.285	Avg Right Score: 0.287	Avg Max Score: 0.306	Last Score: 0.100
Episode 900	Avg Left Score: 0.718	Avg Right Score: 0.736	Avg Max Score: 0.764	Last Score: 0.200
Episode 1000	Avg Left Score: 1.068	Avg Right Score: 1.074	Avg Max Score: 1.102	Last Score: 2.100
Episode 1100	Avg Left Score: 1.813	Av

In [8]:
# Save the per-episode scores as a tab-separated file with a single 'score'
# column (header row written, no index column).
data = pd.DataFrame({'score': scores})
data.to_csv('scores.csv', sep='\t', index=False, header=True)

### 4. Test Model

In [9]:
# Build a fresh agent and restore the weights checkpointed at training episode 1900.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=10)
agent.actor_local.load_state_dict(torch.load('checkpoint_actor_e1900.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic_e1900.pth'))

scores = []                                                # one entry per evaluation episode

for i_episode in range(1, 101):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    score = np.zeros(num_agents)                           # summed reward of each agent
    for t in range(2000):
        # NOTE(review): the positional False presumably turns off exploration
        # noise — confirm against Agent.act in ddpg_agent.
        actions = agent.act(states, False)
        actions = np.clip(actions, -1, 1)
        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        states = env_info.vector_observations              # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        score += rewards

        # Stop as soon as any agent is done, the same rule the training loop uses.
        if np.any(dones):
            break
    scores.append(np.max(score))       # episode score = best agent's summed reward

    print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores)), end="")

Episode 100	Average Score: 2.578

When finished, you can close the environment.

In [None]:
# Close the Unity environment now that training and evaluation are finished.
env.close()