In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from IPython.display import clear_output
from collections import namedtuple, deque

from unityagents import UnityEnvironment

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [None]:
from buffer import ReplayBuffer
from model import PolicyNetwork

In [None]:
import warnings
warnings.simplefilter('ignore', UserWarning)

## Set Configuration

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [None]:
# --- Data collection ---------------------------------------------------------
# BATCH_SIZE is handed to ReplayBuffer; together with AGENT_BATCH_SIZE it also
# sets how much experience PPOAgent.learn waits for before running an update
# (len(buffer) * num_agents >= BATCH_SIZE * AGENT_BATCH_SIZE).
BATCH_SIZE = 1024
AGENT_BATCH_SIZE = 32

# --- Return / advantage estimation -------------------------------------------
GAMMA = 0.99            # discount factor applied to bootstrapped returns
TAU = 0.99              # extra decay (with GAMMA) on the running advantage

# --- PPO loss ----------------------------------------------------------------
EPS_CLIP = 0.2          # probability ratio is clamped to [1 - EPS_CLIP, 1 + EPS_CLIP]
ENTROPY_WEIGHT = 0.01   # weight of the entropy bonus in the policy loss

# --- Optimizer (Adam) --------------------------------------------------------
LR = 1e-4               # learning rate
EPSILON = 1e-5          # Adam `eps` term
WEIGHT_DECAY = 1e-4     # Adam `weight_decay` term
GRAD_CLIP = 1           # max gradient norm passed to clip_grad_norm_

# --- Exploration / logging ---------------------------------------------------
NOISE_REDUCE = 0.999    # std_scale is multiplied by this after every learn() call
PRINT_EVERY = 10        # not used in this notebook section; presumably a logging interval

## Set Up the Environment

In [None]:
# Create the Unity environment from the local Crawler build.
# NOTE(review): './Crawler.app' looks like a macOS build path — adjust per platform.
env = UnityEnvironment(file_name='./Crawler.app')

brain_name = env.brain_names[0]  # name of the first brain exposed by the environment
brain = env.brains[brain_name]   # brain object registered under that name

## Define [PPO](https://arxiv.org/pdf/1707.06347.pdf) Agent

In [None]:
class PPOAgent():
    """Agent that interacts with and learns from the environment using PPO.

    The agent owns a single `PolicyNetwork` (which produces actions, log
    probabilities, entropies and state values), an Adam optimizer and a
    `ReplayBuffer`. Hyper-parameters are read from the notebook-level
    constants (BATCH_SIZE, GAMMA, ...) and tensors are placed on the
    notebook-level `device`.
    """

    def __init__(self, env, seed):
        """Initialize an Agent object.

        Args:
            env: Unity environment. Its first brain is used, and the
                environment is reset once (train mode) to read the number of
                agents and the observation size.
            seed: seed forwarded to `PolicyNetwork` and `ReplayBuffer`.
        """

        # Keep handles on the environment and its brain so that the methods do
        # not depend on notebook-level globals.
        self.env = env
        self.brain_name = env.brain_names[0]
        self.brain = env.brains[self.brain_name]

        self.env_info = env.reset(train_mode=True)[self.brain_name]

        self.num_agents = len(self.env_info.agents)

        self.state_size = self.env_info.vector_observations.shape[1]
        self.action_size = self.brain.vector_action_space_size

        self.batch_size = BATCH_SIZE
        self.agent_batch_size = AGENT_BATCH_SIZE
        self.entropy_weight = ENTROPY_WEIGHT
        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.epsilon = EPSILON
        self.weight_decay = WEIGHT_DECAY

        self.eps_clip = EPS_CLIP
        self.grad_clip = GRAD_CLIP
        self.noise_reduce = NOISE_REDUCE

        self.policy = PolicyNetwork(self.state_size, self.action_size, seed).to(device)
        self.policy_optimizer = torch.optim.Adam(
            self.policy.parameters(),
            lr=self.lr,
            eps=self.epsilon,
            weight_decay=self.weight_decay,
        )

        # Steps of the rollout currently being collected (see memorize_step).
        self.trajectory = []
        # Scale passed to the policy when acting; decayed after every learn().
        self.std_scale = 1.

        # set buffer
        self.buffer = ReplayBuffer(self.batch_size, self.num_agents, seed)

    @staticmethod
    def _as_float_tensor(data):
        """Return `data` as a float32 tensor on `device`.

        Accepts array-likes as well as tensors that are already on the target
        device, so a value can safely be converted more than once.
        """
        if not torch.is_tensor(data):
            data = torch.FloatTensor(data)
        return data.float().to(device)

    def act(self, states):
        """Query the policy for the given states without tracking gradients.

        Args:
            states: array-like or tensor of observations.

        Returns:
            Tuple `(actions, action_log_probs, state_values, dist_entropies)`
            where `actions` is a numpy array and the rest are whatever the
            policy returned.
        """

        states = self._as_float_tensor(states)

        self.policy.eval()

        with torch.no_grad():
            # NOTE(review): here the policy output is unpacked as
            # (actions, log_probs, entropies, values) whereas `learn` unpacks
            # it as (_, log_probs, values, entropies). One of the two orders
            # must be wrong — confirm against PolicyNetwork.forward.
            actions, action_log_probs, dist_entropies, state_values = self.policy(states=states, scale=self.std_scale)

        actions = actions.detach().cpu().numpy()

        self.policy.train()

        return actions, action_log_probs, state_values, dist_entropies

    def memorize_step(self, trajectory):
        """Append one step to the current rollout.

        Args:
            trajectory: a sequence laid out as
                `[states, values, actions, log_probs, rewards, dones]`
                (the order `memorize_trajectory` unpacks).
        """

        self.trajectory.append(trajectory)

    def memorize_trajectory(self, states):
        """Close the current rollout and push it, post-processed, to the buffer.

        The value of `states` (the observations following the last stored
        step) is used to bootstrap. Walking the rollout backwards, discounted
        returns and advantages are accumulated per step, and
        `[states, actions, log_probs, returns, advantages]` is stored for each
        step. The rollout is cleared afterwards.

        Args:
            states: observations that follow the last memorized step.
        """

        states = self._as_float_tensor(states)

        _, _, pending_values, _ = self.act(states)
        # Sentinel entry: only its value estimate (index 1) is read below.
        self.trajectory.append([states, pending_values, None, None, None, None])

        num_steps = len(self.trajectory) - 1
        processed_trajectory = [None] * num_steps
        advantages = torch.zeros((self.num_agents, 1), device=device)
        returns = pending_values.detach()

        for i in reversed(range(num_steps)):

            states, values, actions, log_probs, rewards, dones = self.trajectory[i]

            states = self._as_float_tensor(states)
            actions = self._as_float_tensor(actions)
            rewards = self._as_float_tensor(rewards).unsqueeze(1)
            dones = self._as_float_tensor(dones).unsqueeze(1)

            # NOTE(review): `dones` multiplies every bootstrapped term, so it
            # has to be 0 where an episode ended (i.e. a "not done" mask) for
            # returns to be cut at episode boundaries — confirm what the
            # training loop stores.
            next_values = self.trajectory[i + 1][1]
            returns = rewards + self.gamma * dones * returns
            TD_error = rewards + self.gamma * dones * next_values.detach() - values.detach()
            advantages = advantages * self.tau * self.gamma * dones + TD_error

            processed_trajectory[i] = [states, actions, log_probs, returns, advantages]

        self.buffer.add(processed_trajectory)
        self.trajectory = []

    def learn(self, next_states):
        """Store the finished rollout and, once enough data is buffered, run PPO updates.

        An update pass runs when `len(buffer) * num_agents` reaches
        `batch_size * agent_batch_size`; the buffer is then emptied. The
        action scale `std_scale` is decayed on every call, whether or not an
        update happened.

        Args:
            next_states: observations following the last memorized step.
        """

        self.memorize_trajectory(next_states)

        if len(self.buffer) * self.num_agents >= self.batch_size * self.agent_batch_size:

            for states, actions, old_log_probs, returns, advantages in self.buffer.sample():

                # NOTE(review): output order differs from the one used in
                # `act` (values/entropies swapped) — confirm against
                # PolicyNetwork.forward.
                _, log_probs, state_values, dist_entropies = self.policy(states, actions)

                ratios = torch.exp(log_probs - old_log_probs.detach())

                # Clipped surrogate objective.
                surrogate1 = ratios * advantages
                surrogate2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

                policy_loss = -torch.min(surrogate1, surrogate2).mean() - self.entropy_weight * dist_entropies.mean()
                value_loss = 0.5 * (returns - state_values).pow(2).mean()

                total_loss = policy_loss + value_loss

                self.policy_optimizer.zero_grad()
                total_loss.backward()
                nn.utils.clip_grad_norm_(self.policy.parameters(), self.grad_clip)
                self.policy_optimizer.step()

            self.buffer.reset()

        self.std_scale = self.std_scale * self.noise_reduce

    def save(self, policy_path):
        """Write the policy weights to `policy_path`, creating its directory if needed."""

        directory = os.path.dirname(policy_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.policy.state_dict(), policy_path)

    def load(self, policy_path):
        """Load policy weights from `policy_path` into the current policy."""

        # Deserialize onto the CPU so that checkpoints written on a GPU machine
        # also load on a CPU-only one; load_state_dict then copies the values
        # into the policy's parameters on whatever device they live.
        state_dict = torch.load(policy_path, map_location=lambda storage, location: storage)
        self.policy.load_state_dict(state_dict)

    def watch(self, num_episodes=10, max_time=2000):
        """Run the current policy in the environment and print per-episode scores.

        Each episode resets the environment with `train_mode=False`, steps it
        exactly `max_time` times and prints the mean and max of the summed
        per-agent rewards. The environment is closed when all episodes are
        finished, so it cannot be reused afterwards.

        Args:
            num_episodes: number of episodes to play.
            max_time: number of environment steps per episode.
        """

        for i_episode in range(1, num_episodes+1):

            env_info = self.env.reset(train_mode=False)[self.brain_name]
            states = env_info.vector_observations

            agent_scores = np.zeros(len(env_info.agents))

            for _ in range(max_time):

                actions, _, _, _ = self.act(states)

                env_info = self.env.step(actions)[self.brain_name]

                states = env_info.vector_observations
                agent_scores += env_info.rewards

            print(f'\rEpisode: {i_episode}, Average Score: {np.mean(agent_scores):.3f}, Max Score: {np.max(agent_scores):.3f}')

        self.env.close()

In [None]:
# Build the agent on top of the environment created above.
agent = PPOAgent(env=env, seed=90)

## 🎬 Watch The Smart Agent

In [None]:
# Load the trained policy weights.
# Other checkpoints (presumably saved at the named training episodes) can be
# loaded instead by swapping in one of these paths:
#   f'./agents/PPO_{brain_name}_episode1.pth'
#   f'./agents/PPO_{brain_name}_episode10.pth'
#   f'./agents/PPO_{brain_name}_episode100.pth'
#   f'./agents/PPO_{brain_name}_episode200.pth'
#   f'./agents/PPO_{brain_name}_episode500.pth'
agent.load(f'./agents/PPO_{brain_name}.pth')

In [None]:
# Play 100 episodes of 10000 steps each with the loaded policy.
# watch() closes the environment when it finishes, so re-create `env` before running it again.
agent.watch(num_episodes=100, max_time=10000)

---