In [1]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from IPython.display import clear_output

from collections import deque
from unityagents import UnityEnvironment

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
plt.style.use('ggplot')

## Set Configs

In [5]:
is_cuda = torch.cuda.is_available()

if is_cuda: device = torch.device('cuda')
else: device = torch.device('cpu')

In [6]:
MEMORY_SIZE = int(1e5)
BATCH_SIZE = 64
GAMMA = 0.99
TAU = 1e-3 # for soft update of target parameters
LR = 5e-4
PRINT_EVERY = 100
UPDATE_EVERY = 4 # change to 10000?

SEED = 90

## Set Environment

In [7]:
env = UnityEnvironment(file_name='./Banana.app')

brain_name = env.brain_names[0] # get the brain from unity environment
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [8]:
print('Environment Info')
env_info = env.reset(train_mode=False)[brain_name] # reset environment to a new, random state
state = env_info.vector_observations[0] 
action_size = brain.vector_action_space_size

print('Number of agents: {}'.format(len(env_info.agents)))
print('State space: {}'.format(len(state)))
print('Action space: {}'.format(action_size))

Environment Info
Number of agents: 1
State space: 37
Action space: 4


## 🎬 Watch The Agent

In [9]:
total_score = []
num_episodes = 20

for i in range(1, num_episodes+1):
    
    score = 0
    print(f'Watching episode {i}...', end='')
    
    while True:
                
        action = np.random.randint(action_size)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        score += reward
        state = next_state
        
        if done:
            clear_output(wait=True)
            
            total_score.append(score)
            env.reset()
            break

print('Watching finished.')
print('Average score for {} episodes: {}'.format(num_episodes, np.mean(total_score)))

Watching finished.
Average score for 20 episodes: -0.3


## Build DQN Architecture

In [10]:
class DQN(nn.Module):
    """Define DQN architecture."""
    
    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Initialize parameters and build model."""
        
        super(DQN, self).__init__()
        
        self.seed = torch.manual_seed(seed)
        
        self.fc1_layer = nn.Linear(state_size, fc1_units)
        self.fc2_layer = nn.Linear(fc1_units, fc2_units)
        self.fc3_layer = nn.Linear(fc2_units, action_size)
        
    def forward(self, state):
        """Build a network that maps state into action values."""
        
        x = F.relu(self.fc1_layer(state))
        x = F.relu(self.fc2_layer(x))
        Qsa = self.fc3_layer(x)
        
        return Qsa

## Set Prioritized Replay Buffer

In [14]:
class SumTree(object):
    
    data_pointer = 0
    
    def __init__(self, capacity):
        
        self.capacity = capacity
        
        self.tree = np.zeros(2 * capacity - 1)
        
        self.data = np.zeros(capacity, dtype=object)
        
    def add(self, priority, data):
        
        tree_index = self.data_pointer + self.capacity - 1
        
        self.data[self.data_pointer] = data
        
        self.update(tree_index, priority)
        
        self.data_pointer += 1
        
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0
            
    def update(self, tree_index, priority):
        
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        
        while tree_index != 0:
            
            tree_index = (tree_index-1) // 2
            self.tree[tree_index] += change
            
    def get_leaf(self, value):
        
        parent_index = 0
        
        while True:
            
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1
            
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break
            else:
                if value <= self.tree[left_child_index]:
                    parent_index = left_child_index
                else:
                    value -= self.tree[left_child_index]
                    parent_index = right_child_index
            
        data_index = leaf_index - self.capacity + 1
        return leaf_index, self.tree[leaf_index], self.data[data_index]
    
    @property
    def total_priority(self):
        return self.tree[0]

In [15]:
class PrioritizedReplayBuffer(object):
    
    PER_e = 0.01
    PER_a = 0.6
    PER_b = 0.4
    
    PER_b_increment_per_sampling = 1e-3
    absolute_error_upper = 1.
    
    def __init__(self, capacity):
        """Initialize a PrioritizedReplayBuffer object."""
        
        self.tree = SumTree(capacity)
        
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to SumTree."""
        
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        
        if max_priority == 0:
            max_priority = self.absolute_error_upper
            
        experience = state, action, reward, next_state, done
        self.tree.add(max_priority, experience)
        
    def sample(self, n):
        """Sample a batch of prioritized experiences from SumTree."""
        
        experiences = []
        
        batch_id, batch_IS_weights = np.empty((n,), dtype=np.int32), np.empty((n, 1), dtype=np.float32)
        priority_segment = self.tree.total_priority / n
        
        self.PER_batch = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])
        
        p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        max_weight = (p_min * n) ** (-self.PER_b)
        
        # collect state, action, reward, next_state and done information from the tree
        for i in range(n):
            
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            
            index, priority, data = self.tree.get_leaf(value)
            
            sampling_probabilities = priority / self.tree.total_priority
            
            batch_IS_weights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b) / max_weight
            
            batch_id[i] = index
            
            try:
                state, action, reward, next_state, done = data
            except:
                print('--- data')
                print(data)
                print(len(data))
                print(type(data[0]))
                print(data[0])
                
                print(type(data[1]))
                print(data[1])
                
                print(type(data[2]))
                print(data[2])
                
                print(type(data[3]))
                print(data[3])
                
                print(type(data[4]))
                print(data[4])
            experience = [state, action, reward, next_state, done]
            
            experiences.append(experience)
        
        states = torch.from_numpy(np.vstack([exp[0] for exp in experiences if exp is not None])).float()
        states = states.to(device)
        
        actions = torch.from_numpy(np.vstack([exp[1] for exp in experiences if exp is not None])).long()
        actions = actions.to(device)
        
        rewards = torch.from_numpy(np.vstack([exp[2] for exp in experiences if exp is not None])).float()
        rewards = rewards.to(device)
        
        next_states = torch.from_numpy(np.vstack([exp[3] for exp in experiences if exp is not None])).float()
        next_states = next_states.to(device)
        
        dones = torch.from_numpy(np.vstack([exp[4] for exp in experiences if exp is not None]).astype(np.uint8)).float()
        dones = dones.to(device)
        
        return batch_id, (states, actions, rewards, next_states, dones), batch_IS_weights
    
    def update_batch(self, batch_id, abs_errors):
        """Update current batch of internal SumTree."""
        
        abs_errors += self.PER_e
        clipped_errors = np.minimum(abs_errors.detach().numpy(), self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)
        
        for bi, p in zip(batch_id, ps):
            self.tree.update(bi, p)

## Define DQN Agent

In [16]:
class DQNAgent(object):
    """The agent interacting with and learning from the environment."""
    
    def __init__(self, state_size, action_size, seed):
        """Initialize an agent object."""
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        
        # init Q-Network & target network
        self.dqn_net = DQN(state_size, action_size, seed).to(device)
        self.target_net = DQN(state_size, action_size, seed).to(device)
        
        # init optimizer
        self.optimizer = optim.Adam(self.dqn_net.parameters(), lr=LR) 
        
        # init Replay Buffer
        self.buffer = PrioritizedReplayBuffer(MEMORY_SIZE)
        self.time_step = 0
        
    def memorize(self, state, action, reward, next_state, done):
        """Save experience in replay buffer."""
        
        self.buffer.add(state, action, reward, next_state, done)
        
        self.time_step = (self.time_step + 1) % UPDATE_EVERY
        if self.time_step == 0:
            batch_id, experiences, batch_IS_weights = self.buffer.sample(BATCH_SIZE)
            self.learn(batch_id, experiences, batch_IS_weights, GAMMA)
                
    def learn(self, batch_id, experiences, batch_is_weights, gamma):
        """Update value parameters using given batch of experience tuples."""
        
        states, actions, rewards, next_states, dones = experiences
        
        self.optimizer.zero_grad()
        
        # get max predicted Q values (for next states) from target network
        Q_target_next = self.target_net(next_states).detach().max(1)[0].unsqueeze(1)
        
        # compute Q target
        Q_target = rewards + (gamma * Q_target_next * (1 - dones))
        
        # get expected Q values from dqn network
        Q_expected = self.dqn_net(states).gather(1, actions)
        
        # compute loss
        weight = torch.tensor(batch_is_weights)
        loss = torch.sum(weight * (Q_expected - Q_target) ** 2)

        # minimize the loss
        loss.backward()
        self.optimizer.step()
        
        # calculate absolute loss
        absolute_errors = torch.abs(Q_target - Q_expected)
        
        # update priority on replay buffer
        self.buffer.update_batch(batch_id, absolute_errors)
        
        # update target network
        self.soft_update(self.dqn_net, self.target_net, TAU)
        
    def soft_update(self, dqn_net, target_net, tau):
        """Softly update target network parameters."""
        
        for dqn_param, target_param in zip(dqn_net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau*dqn_param.data + (1.0-tau) * target_param.data)
            
    def act(self, state, eps=0.):
        """Return actions for given state as per current policy."""
        
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        
        self.dqn_net.eval()
        with torch.no_grad():
            action_values = self.dqn_net(state)
            
        self.dqn_net.train()
        
        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def save(self, agent_location):
        torch.save(self.dqn_net.state_dict(), agent_location)

    def load(self, agent_location):
        self.dqn_net.load_state_dict(torch.load(agent_location))

In [17]:
agent = DQNAgent(state_size=env_info.vector_observations[0].shape[0], action_size=action_size, seed=SEED)

## Train DQN Agent

In [18]:
def train_agent(num_episodes, max_time=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train DQN agent."""
    
    scores = []
    scores_target = 13.0
    scores_window = deque(maxlen=100)
    eps = eps_start
    
    for i_episode in range(1, num_episodes+1):
        
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for time_step in range(max_time):
            
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state, reward, done = env_info.vector_observations[0], env_info.rewards[0], env_info.local_done[0]
            
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay*eps)
        
        print(f'\rEpisode: {i_episode}, Average Score: {np.mean(scores_window):.2f}', end='')
        
        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode: {i_episode}, Average Score: {np.mean(scores_window):.2f}')
        if np.mean(scores_window) >= scores_target:
            print(f'\nEnvironment solved in {i_episode-100:d} episodes! Average Score: {np.mean(scores_window):.2f}')
            if not os.path.exists('./agents/'): os.makedirs('./agents/')
            agent.save(f'./agents/PER_DQN_{brain_name}.pth')
            
    print('Training completed.')
    return scores

In [19]:
scores = train_agent(num_episodes=1000, max_time=1000)

Episode: 100, Average Score: 0.07
Episode: 200, Average Score: -0.01
Episode: 287, Average Score: -0.12--- data


TypeError: object of type 'int' has no len()

## Evaluate The Agent

In [None]:
plt.figure(figsize=(10,5))
plt.plot(np.arange(len(scores)), scores, color='green')
plt.xlabel('Num of episodes')
plt.ylabel('Score')
if not os.path.exists('./images/'): os.makedirs('./images/')
plt.savefig('./images/plot_of_per_dqn_agent_evaluation.png')
plt.show()

## 🎬 Watch The Smart Agent

In [None]:
agent.load(f'./agents/PER_DQN_{brain_name}.pth')

In [None]:
num_episodes = 5
max_time=1000

scores = []

for i in range(1, num_episodes+1):
    
    print(f'Watching episode {i}...', end='')
    
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    score = 0    
    for time_step in range(max_time):
                
        action = agent.act(state)
        env_info = env.step(action)[brain_name]
        next_state, reward, done = env_info.vector_observations[0], env_info.rewards[0], env_info.local_done[0]
            
        state = next_state
        score += reward
        if done:
            clear_output(wait=True)

            env.reset()
            break
            
    scores.append(score)

print('Watching finished.')
print('Total score for {} episodes: {}'.format(num_episodes, np.mean(scores)))

---