# Multi-Agent Tennis

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the third project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
%matplotlib inline

import torch
import time
import config

import numpy as np
import matplotlib.pyplot as plt

from ddpg import DDPGAgent
# from config import Config
from collections import deque
from unityagents import UnityEnvironment
from utilities import ReplayBuffer
# from agent.maddpg import MADDPG, MultiAgentConfig
# from IPython.display import clear_output
# from unityagents import UnityEnvironment



In [2]:
env = UnityEnvironment(file_name="/home/maurice/Documents/udacity_new/data/Tennis_Linux_NoVis/Tennis.x86_64")


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Get default environment parameters
brain_name  = env.brain_names[0]
brain       = env.brains[brain_name]
env_info    = env.reset(train_mode=True)[brain_name]
num_agents  = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size  = env_info.vector_observations.shape[1]

# state = env_info.vector_observations

# print(action_size, state_size)


In [4]:
def transpose_to_tensor(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))


def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

In [5]:
class MADDPG():
    def __init__(self, state_size, action_size, config):
        super().__init__()
        
        self.config = config
        self.seed   = config.seed 
        self.agents = [DDPGAgent(state_size, action_size, self.config) for _ in range(num_agents)]
        self.iter   = 0 
        self.learn_iter = 0
        
        self.beta_function = lambda x: min(1.0, self.config.beta + x * (1.0 - self.config.beta) / self.config.beta_decay)
        self.memory = ReplayBuffer(self.config.buffer_size, self.config.seed)
    
    
    def reset(self):
        for ddpg_agent in self.agents:
            ddpg_agent.reset()
    
    def get_actors(self):
        """get actors of all the agents in the MADDPG object"""
        actors = [ddpg_agent.actor for ddpg_agent in self.agents]
        return actors
    
    
    def get_target_actors(self):
        """get target_actors of all the agents in the MADDPG object"""
        target_actors = [ddpg_agent.target_actor for ddpg_agent in self.agents]
        return target_actors
    
    
    def act(self, obs_all_agents, noise=0.0):
        """get actions from all agents in the MADDPG object"""
        obs_all_agents = torch.tensor(obs_all_agents, dtype=torch.float).to(self.config.device)
        actions = [np.clip(agent.act(obs, noise).cpu().data.numpy(), -1, 1) 
                   for agent, obs in zip(self.agents, obs_all_agents)]
        return actions
    
    
    def target_act(self, obs_all_agents, noise=0.0):
        """get target network actions from all the agents in the MADDPG object """
        target_actions = [ddpg_agent.target_act(obs, noise) for ddpg_agent, obs 
                          in zip(self.agents, obs_all_agents)]
        return target_actions
    
    def step(self, state, action, reward, next_state, done):
        self.memory.push(state, action, reward, next_state, done)
        self.iter += 1

        if(len(self.memory) >= self.config.batch_size) and self.iter % self.config.update_every == 0:
            beta = self.beta_function(self.learn_iter)
            for i in range(len(self.agents)):
                samples = self.memory.sample(self.config.batch_size, beta)
                self.update(samples, i)
            self.learn_iter += 1
            self.update_targets()

    def _prep_samples(self, samples):
        convert = lambda x: torch.tensor(x, dtype=torch.float).to(self.config.device)

        obs, action, reward, next_obs, done, weights, idx = samples

        obs           = np.rollaxis(obs, 1)
        next_obs      = np.rollaxis(next_obs, 1)
        obs_full      = np.hstack(obs)
        next_obs_full = np.hstack(next_obs)

        obs           = convert(obs)
        obs_full      = convert(obs_full)
        action        = convert(action)
        reward        = convert(reward)
        next_obs      = convert(next_obs)
        next_obs_full = convert(next_obs_full)
        done          = convert(np.float32(done))
        weights       = convert(weights)

        return obs, obs_full, action, reward, next_obs, next_obs_full, done, idx, weights            
    
    def update(self, samples, agent_number):
        """update the critics and actors of all the agents """

        # need to transpose each element of the samples
        # to flip obs[parallel_agent][agent_number] to
        # obs[agent_number][parallel_agent]
        
        
        obs, obs_full, action, reward, next_obs, next_obs_full, done, idx, weights = self._prep_samples(samples)        
#         obs, obs_full, action, reward, next_obs, next_obs_full, done = map(transpose_to_tensor, samples)

#         obs_full = torch.stack(obs_full)
#         next_obs_full = torch.stack(next_obs_full)
        
        agent = self.agents[agent_number]
        agent.critic_optimizer.zero_grad()

        #critic loss = batch mean of (y- Q(s,a) from target network)^2
        #y = reward of this timestep + discount * Q(st+1,at+1) from target network
        target_actions = self.target_act(next_obs)
        target_actions = torch.cat(target_actions, dim=1).detach()
        
        target_critic_input = torch.cat((next_obs_full,target_actions), dim=1)
        
        with torch.no_grad():
            q_next = agent.target_critic(target_critic_input)
        
        
#         print('\ncheck A = ', reward[agent_number].view(-1, 1))
#         print('check B = ', self.config.gamma)
#         print('check C = ', q_next)
#         print('check D = ', 1 - done[agent_number].view(-1, 1))
        
#         y = reward[agent_number].view(-1, 1) + self.config.gamma * q_next * (1 - done[agent_number].view(-1, 1))
        y = reward[..., agent_number].unsqueeze(1) + self.config.gamma * q_next * (1 - done[..., agent_number].unsqueeze(1))
        
#         action = torch.cat(action, dim=1)?
        critic_input = torch.cat((obs_full, action.view(self.config.batch_size, -1)), dim=1)
        q = agent.critic(critic_input)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(q, y.detach())
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 0.5)
        agent.critic_optimizer.step()

        #update actor network using policy gradient
        agent.actor_optimizer.zero_grad()
        # make input to agent
        # detach the other agents to save computation
        # saves some time for computing derivative
        q_input = [ self.agents[i].actor(ob) if i == agent_number \
                   else self.agents[i].actor(ob).detach()
                   for i, ob in enumerate(obs) ]
                
        q_input = torch.cat(q_input, dim=1)
        # combine all the actions and observations for input to critic
        # many of the obs are redundant, and obs[1] contains all useful information already
        q_input2 = torch.cat((obs_full, q_input), dim=1)
        
        # get the policy gradient
        actor_loss = -agent.critic(q_input2).mean()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(agent.actor.parameters(),0.5)
        agent.actor_optimizer.step()

        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
#         logger.add_scalars('agent%i/losses' % agent_number,
#                            {'critic loss': cl,
#                             'actor_loss': al},
#                            self.iter)

    def update_targets(self):
        """soft update targets"""
        self.iter += 1
        for ddpg_agent in self.agents:
            soft_update(ddpg_agent.target_actor, ddpg_agent.actor, self.config.tau)
            soft_update(ddpg_agent.target_critic, ddpg_agent.critic, self.config.tau)
      
    def save_checkpoints(self, file_head):
        for i in range(len(self.agents)):
            file_name = file_head + 'agent{}_'.format(i)
            torch.save(self.agents[i].actor.state_dict(), file_name + '_actor.pth')
            torch.save(self.agents[i].critic.state_dict(), file_name + '_critic.pth')



In [6]:
def train(maddpg, n_eps=10000, max_steps=1000):
    
    scores = []
    scores_window = deque(maxlen=100)
    train_mode = True
    
    frame_num = 0
    current_max = 1.0
    
    # training loop
    for i_eps in range(n_eps):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        state    = env_info.vector_observations
        maddpg.reset()
        agent_scores = np.zeros(num_agents)
        
        for step in range(max_steps):
            action     = maddpg.act(state, 1.0)
            env_info   = env.step(action)[brain_name] 
            reward     = env_info.rewards
            next_state = env_info.vector_observations
            done       = env_info.local_done
            maddpg.step(state, action, reward, next_state, done)
            
            state      = next_state
            agent_scores += reward
            frame_num += 1
            
            if np.any(done): break
                
        max_score = np.max(agent_scores)
        scores_window.append(max_score)
        scores.append(max_score)                
    
        # change all these print statements    
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_eps, np.mean(scores_window)), end="")
        
        if i_eps % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_eps, np.mean(scores_window)))

        if max_score > current_max:
            maddpg.save_checkpoints('checkpoints/checkpoint_max_')
            current_max = max_score
            
        if np.mean(scores_window) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_eps, np.mean(scores_window)))
            break
            
            
    return scores
                
    
    
def plot_scores(scores, title=""):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title(title)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()


In [7]:
# config = config()
config.seed = 0
config.buffer_size = int(1e5)
config.batch_size = 512
config.gamma = 0.99
config.tau = 2e-1

config.actor_hidden_sizes = [256, 128]
config.lr_actor = 1e-4
config.critic_hidden_sizes = [256, 128]
config.lr_critic = 3e-4
config.critic_weight_decay = 0.0

config.mu = 0.
config.theta = 0.15
config.sigma = 0.2

config.update_every = 5
config.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# config.prioritized_replay = True
config.beta = 0.4
config.beta_decay = 10000
# config.alpha = 0.6

# print(config.batch_size)

maddpg = MADDPG(state_size, action_size, config)
# print(config)
scores = train(maddpg, 3000, 2000)

plot_scores(scores)


Episode 0	Average Score: 0.00
Episode 100	Average Score: 0.01
Episode 200	Average Score: 0.00
Episode 300	Average Score: 0.00
Episode 400	Average Score: 0.00
Episode 500	Average Score: 0.01
Episode 600	Average Score: 0.01
Episode 700	Average Score: 0.02
Episode 800	Average Score: 0.01
Episode 900	Average Score: 0.01
Episode 1000	Average Score: 0.02
Episode 1100	Average Score: 0.05
Episode 1200	Average Score: 0.04
Episode 1300	Average Score: 0.05
Episode 1400	Average Score: 0.04
Episode 1500	Average Score: 0.06
Episode 1600	Average Score: 0.08
Episode 1700	Average Score: 0.08
Episode 1800	Average Score: 0.11
Episode 1900	Average Score: 0.15
Episode 2000	Average Score: 0.14
Episode 2100	Average Score: 0.15
Episode 2200	Average Score: 0.18
Episode 2300	Average Score: 0.18
Episode 2400	Average Score: 0.20
Episode 2500	Average Score: 0.25
Episode 2511	Average Score: 0.25

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/checkpoint_max_agent0__actor.pth'