# Collaboration and Competition

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment

Run the next code cell to install a few packages.  This line will take a few minutes to run!

In [1]:
!pip -q install ./python

The environment is already saved in the Workspace and can be accessed at the file path provided below. 

In [2]:
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name="/data/Tennis_Linux_NoVis/Tennis")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents 
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.         -6.65278625 -1.5        -0.          0.
  6.83172083  6.         -0.          0.        ]


In [5]:
import torch
from MADDPG_wrapper import MADDPGWrapper
from collections import deque

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class params():
    buffer_size = 100000
    batch_size = 128
    tau = 0.008
    actor_lr = 0.0001
    critic_lr = 0.001
    actor_weight_decay = 0
    critic_weight_decay = 0
    gamma = 0.99
    updates_per_step = 3
    atoms_number = 51
    V_min = -5
    V_max = 5
    delta = (V_max - V_min)/(atoms_number -1)
    n_steps = 5
    EPS_START = 5.0         # initial value for epsilon in noise decay process in Agent.act()
    EPS_EP_END = 300        # episode to end the noise decay process
    EPS_FINAL = 0 # final value for epsilon after decay

MADDPG_agent = MADDPGWrapper(action_size, state_size, device, params, num_agents)

In [6]:
MADDPG_agent.prepopulateBuffer(brain_name, env)

prepopulation start
0
1000
2000
3000
4000
5000
6000
7000
8000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
66000
67000
68000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
92000
93000
93000
94000
95000
96000
96000
97000
98000
99000
prepopulation end


In [7]:
episode_nr = 0
finish_score = 0.5
last_100_scores = deque(maxlen=100)
scores_all = []
max_reward = 0
finished = False
finish_episode_nr = 0
episodes_after = 100

while True:
    if episode_nr%100 == 0:
        name = str(episode_nr) + '.pth'
        MADDPG_agent.save_model(name)
    episode_nr += 1
    env_info = env.reset(train_mode=True)[brain_name]     # reset the environment    
    next_states = states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    while True:
        n_step_rewards = np.zeros(num_agents)
        for i in range(params.n_steps-1):
            states = next_states 
            actions = MADDPG_agent.select_actions(states)
            env_info = env.step(actions)[brain_name]           # send all actions to tne environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            n_step_rewards += [reward * (params.gamma**i) for reward in rewards]
            dones = env_info.local_done                        # see if episode finished
        scores += np.max(n_step_rewards)                         # update the score (for each agent)
        MADDPG_agent.step(states, actions, n_step_rewards, next_states, dones)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    last_100_scores.append(np.max(scores))
    scores_all.append(np.max(scores))
    if np.max(scores) > max_reward:
        max_reward = np.max(scores)
    if episode_nr >= 100 and np.mean(last_100_scores) >= finish_score and finished == False:
        print("Reached  ", np.mean(last_100_scores), " in episode ", episode_nr)
        MADDPG_agent.save_model('final.pth')
        finished = True
        finish_episode_nr = episode_nr
        
    if episode_nr%10 ==0 and episode_nr >= 100:
        print(episode_nr, ': Total score (averaged over agents): {}'.format(np.mean(last_100_scores)), " reward this episode: ", np.max(scores), "max: ", max_reward)
    
    if finished == True and episode_nr >= (finish_episode_nr + episodes_after):
        break

100 : Total score (averaged over agents): 0.017701896263778806  reward this episode:  0.0 max:  0.296029904411
110 : Total score (averaged over agents): 0.015741696234569547  reward this episode:  0.0 max:  0.296029904411
120 : Total score (averaged over agents): 0.00886089813203767  reward this episode:  0.0 max:  0.296029904411
130 : Total score (averaged over agents): 0.012781596190460622  reward this episode:  0.194059802892 max:  0.298010004441
140 : Total score (averaged over agents): 0.011791596175708472  reward this episode:  0.0 max:  0.298010004441
150 : Total score (averaged over agents): 0.03943399058761224  reward this episode:  0.587099708748 max:  0.888099713234
160 : Total score (averaged over agents): 0.052235087778363457  reward this episode:  0.0 max:  0.888099713234
170 : Total score (averaged over agents): 0.07293698308684574  reward this episode:  0.492059807332 max:  0.888099713234
180 : Total score (averaged over agents): 0.13610227002808187  reward this episode

In [8]:
import matplotlib.pyplot as plt
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores_all)), scores_all, label='MADDPG')
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.legend(loc='upper left');
plt.show()
fig.set_size_inches(18.5, 10.5)
fig.savefig('result.png', dpi=100)


<matplotlib.figure.Figure at 0x7f783eeed080>

In [8]:
MADDPG_agent.load_model()
episode = 0
scores =0
for episode in range(5):
    print("episode ", episode)
    scores =0
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
    next_states = states = env_info.vector_observations                  # get the current state (for each agent)
    while True:
        actions = MADDPG_agent.select_actions(states, False)
        env_info = env.step(actions)[brain_name]           # send all actions to tne environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += np.max(rewards)                         # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print("episode ", episode, " score : ", np.max(scores))

episode  0
episode  0  score :  0.10000000149
episode  1
episode  1  score :  1.10000001639
episode  2
episode  2  score :  1.50000002235
episode  3
episode  3  score :  5.20000007749
episode  4
episode  4  score :  0.20000000298


In [None]:
env.close()