In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from IPython.display import clear_output
from collections import namedtuple, deque

from unityagents import UnityEnvironment

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

In [None]:
from buffer import ReplayBuffer
from model import ActorNetwork, CriticNetwork

In [None]:
import warnings
# Suppress every UserWarning for the remainder of the notebook session.
warnings.simplefilter('ignore', UserWarning)

In [None]:
# Apply matplotlib's 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

## Set Configs

In [None]:
# Place networks and tensors on the GPU when CUDA is available, else on the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [None]:
# --- PPO update ---
BATCH_SIZE = 32          # minibatch size used when iterating over a collected rollout
GAMMA = 0.995            # reward discount factor
ENTROPY_WEIGHT = 0.001   # initial weight of the entropy bonus in the policy loss
EPS_CLIP = 0.1           # clipping range applied to the probability ratio
GRAD_CLIP = 0.5          # max gradient norm applied to actor and critic separately

# --- bookkeeping ---
DEQUE_SIZE = 100         # number of most recent episodes kept in the score windows

# --- per-role learning rates ---
GOALIE_LR = 8e-5
STRIKER_LR = 1e-4

In [None]:
# Index of each team inside the per-brain observation / reward / action arrays:
# slot 0 is driven by the learned policies, slot 1 by uniformly random actions.
TRAINED_AGENT_KEY, RANDOM_AGENT_KEY = 0, 1

## Set Environment

In [None]:
# Launch the Soccer build without a window and reset it once so the
# observation shapes of both brains can be inspected.
env = UnityEnvironment(file_name='./Soccer.app', no_graphics=True)
env_info = env.reset(train_mode=True)

# The first brain drives the goalies, the second one the strikers.
goalie_brain_name, striker_brain_name = env.brain_names[0], env.brain_names[1]

# Length of one observation vector per role (second axis of the observation matrix).
goalie_state_size = env_info[goalie_brain_name].vector_observations.shape[1]
striker_state_size = env_info[striker_brain_name].vector_observations.shape[1]

# Size of the action space per role, as reported by the brain parameters.
goalie_action_size = env.brains[goalie_brain_name].vector_action_space_size
striker_action_size = env.brains[striker_brain_name].vector_action_space_size

## Define [PPO](https://arxiv.org/pdf/1707.06347.pdf) Agent

In [None]:
class PPOAgent():
    """An agent that interacts with and learns from the environment using PPO.

    Each agent owns an actor (policy over this role's own observation), a
    critic (value estimate over a wider, concatenated observation), a single
    Adam optimizer that updates both networks, and a buffer that holds the
    transitions of the current rollout until `learn()` consumes them.
    """

    def __init__(self, env, mode='Goalie', seed=90):
        """Initialize an Agent object.

        Params
        ======
            env: Unity environment; it is reset once here to read observation shapes
            mode (str): 'Goalie' or 'Striker'; selects the brain and the learning rate
            seed (int): seed forwarded to the networks and the buffer

        Raises
        ======
            ValueError: if `mode` is neither 'Goalie' nor 'Striker'
        """

        # Validate before touching the environment so a typo fails with a clear
        # message instead of an unbound-variable error further down.
        if mode == 'Goalie':
            brain_index, self.lr = 0, GOALIE_LR
        elif mode == 'Striker':
            brain_index, self.lr = 1, STRIKER_LR
        else:
            raise ValueError(f"mode must be 'Goalie' or 'Striker', got {mode!r}")

        self.env_info = env.reset(train_mode=True)

        brain_name = env.brain_names[brain_index]
        brain_info = self.env_info[brain_name]

        self.state_size = brain_info.vector_observations.shape[1]
        self.action_size = env.brains[brain_name].vector_action_space_size

        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.entropy_weight = ENTROPY_WEIGHT
        self.eps_clip = EPS_CLIP
        self.grad_clip = GRAD_CLIP

        self.actor = ActorNetwork(self.state_size, self.action_size, seed).to(device)

        # The critic input is sized for the observations of two roles times the
        # number of agents of this brain. The count is read from this agent's own
        # reset result rather than from notebook-level globals.
        # NOTE(review): assumes both roles share this observation size and agent
        # count, which is what the training loop's concatenation relies on - confirm.
        num_agents = len(brain_info.agents)
        self.critic = CriticNetwork(2 * num_agents * self.state_size, seed).to(device)

        # one optimizer steps actor and critic together
        self.optimizer = torch.optim.Adam(list(self.actor.parameters()) + list(self.critic.parameters()), lr=self.lr)

        self.buffer = ReplayBuffer(self.batch_size, seed)

    def act(self, states):
        """Sample an action for one observation without tracking gradients.

        Params
        ======
            states: a single observation vector for this agent

        Returns
        =======
            tuple: (action, log-probability of that action) as Python scalars
        """

        states = torch.FloatTensor(states).unsqueeze(0).to(device)  # add a batch axis

        self.actor.eval()

        with torch.no_grad():
            actions, action_log_probs, _ = self.actor(states)

        self.actor.train()

        return actions.item(), action_log_probs.item()

    def memorize(self, actor_state, critic_state, action, log_prob, reward):
        """Store one transition in the rollout buffer."""

        self.buffer.add(actor_state, critic_state, action, log_prob, reward)

    def learn(self):
        """Run one PPO pass over the buffered rollout, then clear the buffer.

        Returns
        =======
            float: mean of the per-minibatch total losses
        """

        actor_states, critic_states, actions, old_log_probs, rewards, num_experiences = self.buffer.get_experiences()

        # Reward k is scaled by gamma**k, with k counted from the first buffered
        # step; the reversed cumulative sum then gives, for every step, the sum
        # of those scaled rewards from that step to the end of the rollout.
        discount = self.gamma ** np.arange(num_experiences)
        discounted_rewards = rewards.squeeze(1) * discount
        future_rewards = discounted_rewards[::-1].cumsum(axis=0)[::-1]

        actor_states = torch.FloatTensor(actor_states).to(device)
        critic_states = torch.FloatTensor(critic_states).to(device)
        actions = torch.LongTensor(actions).squeeze(1).to(device)
        old_log_probs = torch.FloatTensor(old_log_probs).squeeze(1).to(device)
        # .copy() because the reversed view has negative strides
        rewards = torch.FloatTensor(future_rewards.copy()).to(device)

        self.critic.eval()
        with torch.no_grad():
            state_values = self.critic(critic_states)

        self.critic.train()

        advantages = (rewards - state_values.squeeze()).detach()

        # Normalise the advantages. With a single sample the standard deviation
        # is NaN, which would propagate into the weights, so only centre then.
        # The result is already a tensor on `device`; it must not be passed
        # through torch.FloatTensor(...) again, which rejects CUDA tensors.
        advantages_normalized = advantages - advantages.mean()
        if advantages.numel() > 1:
            advantages_normalized = advantages_normalized / (advantages.std() + 1e-10)

        batches = BatchSampler(SubsetRandomSampler(range(num_experiences)), self.batch_size, drop_last=False)
        losses = []

        for batch_indices in batches:

            batch_indices = torch.LongTensor(batch_indices).to(device)

            sampled_actor_states = actor_states[batch_indices]
            sampled_critic_states = critic_states[batch_indices]
            sampled_actions = actions[batch_indices]
            sampled_old_log_probs = old_log_probs[batch_indices]
            sampled_rewards = rewards[batch_indices]
            sampled_advantages_normalized = advantages_normalized[batch_indices]

            # re-evaluate the stored actions under the current policy
            _, log_probs, dist_entropies = self.actor(sampled_actor_states, sampled_actions)
            predicted_values = self.critic(sampled_critic_states).squeeze()

            ratios = torch.exp(log_probs - sampled_old_log_probs.detach())

            # clipped surrogate objective
            surrogate1 = ratios * sampled_advantages_normalized
            surrogate2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * sampled_advantages_normalized

            policy_loss = -torch.min(surrogate1, surrogate2).mean() - self.entropy_weight * dist_entropies.mean()
            value_loss = 0.5 * (sampled_rewards - predicted_values).pow(2).mean()

            total_loss = policy_loss + value_loss

            self.optimizer.zero_grad()
            total_loss.backward()
            # gradient norms are clipped per network, not jointly
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_clip)
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_clip)
            self.optimizer.step()

            losses.append(total_loss.item())

        self.buffer.reset()

        self.eps_clip *= 1.            # factor 1.0: the clip range is currently held constant
        self.entropy_weight *= 0.995   # shrink the entropy bonus after every update

        return np.average(losses)

    def save(self, actor_path, critic_path):
        """Write the actor and critic weights, creating parent directories as needed.

        Params
        ======
            actor_path (str): destination file for the actor state dict
            critic_path (str): destination file for the critic state dict
        """

        for path in (actor_path, critic_path):
            parent = os.path.dirname(path)
            if parent:  # a bare file name has no directory to create
                os.makedirs(parent, exist_ok=True)

        torch.save(self.actor.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    def load(self, actor_path, critic_path):
        """Load actor and critic weights from disk into the existing networks.

        Tensors are deserialised onto the CPU first so that checkpoints written
        on a GPU machine can be restored anywhere; `load_state_dict` then copies
        the values into the networks' existing parameters.
        """

        self.actor.load_state_dict(torch.load(actor_path, map_location='cpu'))
        self.critic.load_state_dict(torch.load(critic_path, map_location='cpu'))

In [None]:
# One independent PPO learner per role, both built with the same seed.
goalie = PPOAgent(env, mode='Goalie', seed=90)
striker = PPOAgent(env, mode='Striker', seed=90)

## Train The Agent

In [None]:
def train_agent(num_episodes=10000):
    """Train the goalie and striker against a team that acts uniformly at random.

    Uses the notebook-level `env`, `goalie`, `striker`, the brain names, the
    action sizes, DEQUE_SIZE and the TRAINED/RANDOM agent keys.

    Every episode is played to completion while each transition of the trained
    team is stored in the agents' buffers; afterwards both agents run one
    `learn()` pass. Training stops early once the trained team has a positive
    episode score in at least 95 of the last DEQUE_SIZE episodes. Weights are
    checkpointed at a few fixed episodes and once more when training ends.

    Params
    ======
        num_episodes (int): maximum number of episodes to play

    Returns
    =======
        list: one entry per episode - the sum of the trained team's episode
              scores over the trailing window of at most DEQUE_SIZE episodes
    """

    all_scores = []
    scores_target = 95.0                                  # wins needed inside the trailing window
    checkpoint_episodes = {1, 50, 100, 1000, 2000, 4000}  # episodes after which weights are saved

    trained_agent_scores_window = deque(maxlen=DEQUE_SIZE)
    trained_agent_scores_window_wins = deque(maxlen=DEQUE_SIZE)

    random_agent_scores_window = deque(maxlen=DEQUE_SIZE)
    random_agent_scores_window_wins = deque(maxlen=DEQUE_SIZE)

    draws = deque(maxlen=DEQUE_SIZE)

    def print_standings(trained_agent_score, random_agent_score):
        # Win counts, last episode score and windowed mean score for both teams.
        print(f'Red Wins: {np.count_nonzero(trained_agent_scores_window_wins)}, Score: {trained_agent_score:.4f}, Average Score: {np.mean(trained_agent_scores_window):.4f}')
        print(f'Blue Wins: {np.count_nonzero(random_agent_scores_window_wins)}, Score: {random_agent_score:.4f}, Average Score: {np.mean(random_agent_scores_window):.4f}')
        print(f'Draws: {np.count_nonzero(draws)}\n')

    # training loop
    for i_episode in range(1, num_episodes+1):

        env_info = env.reset(train_mode=True)

        goalie_states = env_info[goalie_brain_name].vector_observations
        striker_states = env_info[striker_brain_name].vector_observations

        goalie_scores = np.zeros(len(env_info[goalie_brain_name].agents))
        striker_scores = np.zeros(len(env_info[striker_brain_name].agents))

        while True:

            trained_goalie_action, trained_goalie_log_prob = goalie.act(goalie_states[TRAINED_AGENT_KEY])
            trained_striker_action, trained_striker_log_prob = striker.act(striker_states[TRAINED_AGENT_KEY])

            # Draw the opponents' actions as plain scalars so that each per-brain
            # array below is a homogeneous 1-D array (trained slot, random slot).
            # Mixing a scalar with a shape-(1,) array builds a ragged array,
            # which recent NumPy versions reject.
            random_goalie_action = np.random.choice(goalie_action_size)
            random_striker_action = np.random.choice(striker_action_size)

            goalie_actions = np.array((trained_goalie_action, random_goalie_action))
            striker_actions = np.array((trained_striker_action, random_striker_action))

            actions = dict(zip([goalie_brain_name, striker_brain_name], [goalie_actions, striker_actions]))

            env_info = env.step(actions)

            # agents get next states
            goalie_next_states = env_info[goalie_brain_name].vector_observations
            striker_next_states = env_info[striker_brain_name].vector_observations

            # agents get rewards
            goalie_rewards = env_info[goalie_brain_name].rewards
            striker_rewards = env_info[striker_brain_name].rewards

            # agents update scores
            goalie_scores += goalie_rewards
            striker_scores += striker_rewards

            # agents memorize experiences; the critic state is the concatenation of
            # all four observations, own role first, trained team before random team
            trained_goalie_reward = goalie_rewards[TRAINED_AGENT_KEY]
            goalie.memorize(goalie_states[TRAINED_AGENT_KEY],
                            np.concatenate((goalie_states[TRAINED_AGENT_KEY],
                                            striker_states[TRAINED_AGENT_KEY],
                                            goalie_states[RANDOM_AGENT_KEY],
                                            striker_states[RANDOM_AGENT_KEY]), axis=0),
                            trained_goalie_action, trained_goalie_log_prob, trained_goalie_reward)

            trained_striker_reward = striker_rewards[TRAINED_AGENT_KEY]
            striker.memorize(striker_states[TRAINED_AGENT_KEY],
                             np.concatenate((striker_states[TRAINED_AGENT_KEY],
                                             goalie_states[TRAINED_AGENT_KEY],
                                             striker_states[RANDOM_AGENT_KEY],
                                             goalie_states[RANDOM_AGENT_KEY]), axis=0),
                             trained_striker_action, trained_striker_log_prob, trained_striker_reward)

            # agents roll over states to next states
            goalie_states = goalie_next_states
            striker_states = striker_next_states

            # check if episode finished
            done = np.any(env_info[goalie_brain_name].local_done)
            if done: break

        # agents learn from collected experiences
        goalie_loss = goalie.learn()
        striker_loss = striker.learn()

        # agents record scores; a team "wins" an episode when its summed score is positive
        trained_agent_score = goalie_scores[TRAINED_AGENT_KEY] + striker_scores[TRAINED_AGENT_KEY]
        trained_agent_scores_window.append(trained_agent_score)
        trained_agent_scores_window_wins.append(1 if trained_agent_score > 0 else 0)

        random_agent_score = goalie_scores[RANDOM_AGENT_KEY] + striker_scores[RANDOM_AGENT_KEY]
        random_agent_scores_window.append(random_agent_score)
        random_agent_scores_window_wins.append(1 if random_agent_score > 0 else 0)

        draws.append(trained_agent_score == random_agent_score)

        print(f'\rEpisode: {i_episode}, Goalie Loss: {goalie_loss:.4f}, Striker Loss: {striker_loss:.4f}')
        print_standings(trained_agent_score, random_agent_score)

        # the returned curve keeps the windowed sum, as callers plot it
        all_scores.append(np.sum(trained_agent_scores_window))

        if i_episode in checkpoint_episodes:
            goalie.save(f'./agents/Actor_{goalie_brain_name}_episode{i_episode}.pth', f'./agents/Critic_{goalie_brain_name}_episode{i_episode}.pth')
            striker.save(f'./agents/Actor_{striker_brain_name}_episode{i_episode}.pth', f'./agents/Critic_{striker_brain_name}_episode{i_episode}.pth')

        if np.count_nonzero(trained_agent_scores_window_wins) >= scores_target:
            # clamp at zero: the target can be reached before the window is full
            print(f'\rEnvironment solved in {max(i_episode - DEQUE_SIZE, 0):d} episodes!')
            print_standings(trained_agent_score, random_agent_score)
            break

    goalie.save(f'./agents/Actor_{goalie_brain_name}.pth', f'./agents/Critic_{goalie_brain_name}.pth')
    striker.save(f'./agents/Actor_{striker_brain_name}.pth', f'./agents/Critic_{striker_brain_name}.pth')
    print('Training completed.')

    return all_scores

In [None]:
# Run training for at most 10000 episodes; `scores` holds the list returned by train_agent.
scores = train_agent(num_episodes=10000)

## Evaluate The Agent

In [None]:
# Draw the per-episode values returned by training and store the figure under ./images/.
os.makedirs('./images/', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(np.arange(len(scores)), scores, color='green')
ax.set_xlabel('Num of episodes')
ax.set_ylabel('Score')
fig.savefig('./images/plot_of_ppo_evaluation.png')
plt.show()

## 🎬 Watch The Smart Agent

In [None]:
def watch_agent(goalie, striker, num_episodes=10):
    """Play episodes with the given agents against a uniformly random team, without learning.

    This is a plain function, so it takes no `self`; it is called as
    `watch_agent(goalie, striker, num_episodes=...)`. It uses the notebook-level
    `env`, the brain names, the action sizes, DEQUE_SIZE and the TRAINED/RANDOM
    agent keys.

    Params
    ======
        goalie: agent exposing `act(state) -> (action, log_prob)` for the goalie role
        striker: agent exposing `act(state) -> (action, log_prob)` for the striker role
        num_episodes (int): number of episodes to play
    """

    trained_agent_scores_window = deque(maxlen=DEQUE_SIZE)
    trained_agent_scores_window_wins = deque(maxlen=DEQUE_SIZE)

    random_agent_scores_window = deque(maxlen=DEQUE_SIZE)
    random_agent_scores_window_wins = deque(maxlen=DEQUE_SIZE)

    draws = deque(maxlen=DEQUE_SIZE)

    # evaluation loop
    for i_episode in range(1, num_episodes+1):

        # NOTE(review): train_mode=True is kept from the original; whether
        # train_mode=False is needed for real-time playback should be confirmed
        # against the unityagents documentation.
        env_info = env.reset(train_mode=True)

        goalie_states = env_info[goalie_brain_name].vector_observations
        striker_states = env_info[striker_brain_name].vector_observations

        goalie_scores = np.zeros(len(env_info[goalie_brain_name].agents))
        striker_scores = np.zeros(len(env_info[striker_brain_name].agents))

        while True:

            # log-probabilities are only needed for learning, so they are dropped here
            trained_goalie_action, _ = goalie.act(goalie_states[TRAINED_AGENT_KEY])
            trained_striker_action, _ = striker.act(striker_states[TRAINED_AGENT_KEY])

            # scalar draws keep each per-brain action array homogeneous
            # (trained slot, random slot); a scalar mixed with a shape-(1,)
            # array is ragged and rejected by recent NumPy versions
            random_goalie_action = np.random.choice(goalie_action_size)
            random_striker_action = np.random.choice(striker_action_size)

            goalie_actions = np.array((trained_goalie_action, random_goalie_action))
            striker_actions = np.array((trained_striker_action, random_striker_action))

            actions = dict(zip([goalie_brain_name, striker_brain_name], [goalie_actions, striker_actions]))

            env_info = env.step(actions)

            # agents get next states
            goalie_next_states = env_info[goalie_brain_name].vector_observations
            striker_next_states = env_info[striker_brain_name].vector_observations

            # agents get rewards
            goalie_rewards = env_info[goalie_brain_name].rewards
            striker_rewards = env_info[striker_brain_name].rewards

            # agents update scores
            goalie_scores += goalie_rewards
            striker_scores += striker_rewards

            # agents roll over states to next states
            goalie_states = goalie_next_states
            striker_states = striker_next_states

            # check if episode finished
            done = np.any(env_info[goalie_brain_name].local_done)
            if done: break

        # agents record scores; a team "wins" an episode when its summed score is positive
        trained_agent_score = goalie_scores[TRAINED_AGENT_KEY] + striker_scores[TRAINED_AGENT_KEY]
        trained_agent_scores_window.append(trained_agent_score)
        trained_agent_scores_window_wins.append(1 if trained_agent_score > 0 else 0)

        random_agent_score = goalie_scores[RANDOM_AGENT_KEY] + striker_scores[RANDOM_AGENT_KEY]
        random_agent_scores_window.append(random_agent_score)
        random_agent_scores_window_wins.append(1 if random_agent_score > 0 else 0)

        draws.append(trained_agent_score == random_agent_score)

        # no learning happens here, so there are no losses to report
        print(f'\rEpisode: {i_episode}')
        print(f'Red Wins: {np.count_nonzero(trained_agent_scores_window_wins)}, Score: {trained_agent_score:.4f}, Average Score: {np.mean(trained_agent_scores_window):.4f}')
        print(f'Blue Wins: {np.count_nonzero(random_agent_scores_window_wins)}, Score: {random_agent_score:.4f}, Average Score: {np.mean(random_agent_scores_window):.4f}')
        print(f'Draws: {np.count_nonzero(draws)}\n')

In [None]:
# Restore the final checkpoints that training writes when it finishes.
goalie.load(
    f'./agents/Actor_{goalie_brain_name}.pth',
    f'./agents/Critic_{goalie_brain_name}.pth',
)
striker.load(
    f'./agents/Actor_{striker_brain_name}.pth',
    f'./agents/Critic_{striker_brain_name}.pth',
)

In [None]:
# Play 10 episodes with the loaded goalie and striker.
watch_agent(goalie, striker, num_episodes=10)

---