In [1]:
!pip -q install ./python

[31mtensorflow 1.7.1 has requirement numpy>=1.13.3, but you'll have numpy 1.12.1 which is incompatible.[0m
[31mipython 6.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 2.0.9 which is incompatible.[0m


### 1. Import necessary packages

In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from collections import deque
from unityagents import UnityEnvironment

### 2. Examine the state and action spaces

In [3]:
env = UnityEnvironment(file_name="/data/Banana_Linux_NoVis/Banana.x86_64")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# Setting up variables
brain_name = "BananaBrain"
state_size = 37
action_size = 4
brain = env.brains[brain_name]

### 3. Multilayer perceptron for action-value estimation

In [5]:
'''
Multilayer perceptron with 3 hidden layers

Params (init):
1. input size
2. hidden units in 1st, 2nd, 3rd hidden layers
3. output size

Forward propagation:
- need to input state vector

'''

import torch.nn as nn
import torch.nn.functional as F 

#3 hidden layers
class QNetwork(nn.Module):
    
    def __init__(self, input_layer, hidden_1, hidden_2, hidden_3, output_layer):
        
        super(QNetwork,self).__init__()
        
        self.fc1 = nn.Linear(input_layer,hidden_1)
        self.fc2 = nn.Linear(hidden_1,hidden_2)
        self.fc3 = nn.Linear(hidden_2,hidden_3)
        self.fc4 = nn.Linear(hidden_3,output_layer)
    
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        
        return self.fc4(x)

### 4. Replay Buffer for Experience Replay

In [6]:
'''
Replay buffer that stores the followings for experience replay:
- current state
- action (needed as actions are chosen by epsilon-greedy policy)
- reward 
- next state
- dones (for terminal states)

Functions of replay buffer:
1. contains self.memory (local) to store experiences as a double-ended queue
2. add function to add new experience to buffer
3. sample function that returns 64 samples for parallel training
4. len function that returns length of replay buffer

'''

BUFFER_SIZE = 100000
SAMPLE_SIZE = 64

class ReplayBuffer:
    
    def __init__(self):
        
        self.memory = deque(maxlen=BUFFER_SIZE) 
    
    def add(self, state, action, reward, next_state, done):
        
        self.memory.append((state, action, reward, next_state, done))
    
    def sample(self):
        
        experiences = random.sample(self.memory, k=SAMPLE_SIZE)
        
        # Separates states, actions, rewards etc from sampled experience and vstack them for batch learning
        states = torch.from_numpy(np.vstack([e[0] for e in experiences if e is not None])).float().cuda()
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences if e is not None])).long().cuda()
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences if e is not None])).float().cuda()
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences if e is not None])).float().cuda()
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences if e is not None]).astype(np.uint8)).float().cuda() 
        
        return (states, actions, rewards, next_states, dones)
    
    def __len__(self):
        return len(self.memory)

### 5. Agent 

In [7]:
'''
Agent 

Params:
1. QNetwork local - estimates action values for current state
2. QNetwork target - estimates action values for the next state, finds the maximum for the generation of target values

Functions:
1. Act: forward propagation and generate actions based on epsilon-greedy policy
2. Step: - stores experience into the memory of ReplayBuffer and learns when the number of samples in ReplayBuffer is > 64
         - updates target network (occurs together with the learning step) with local network by TAU amount

'''

import torch.optim as optim
import random

GAMMA = 0.99
learning_rate = 0.0005
TAU = 0.001

class Agent():
    
    def __init__(self):
        
        self.qnetwork_local = QNetwork(input_layer=state_size, hidden_1=64, hidden_2=128, hidden_3=64, output_layer=action_size).cuda()
        self.qnetwork_target = QNetwork(input_layer=state_size,hidden_1=64,hidden_2=128,hidden_3=64,output_layer=action_size).cuda()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate) #no need to optimize target network as it will be copies later
        
        self.memory = ReplayBuffer() 
        self.timestep = 0
        
    def act(self, state, eps):
        
        eps = eps
        #unsqueeze as neural network expects dimension of [batch_size, channels, height, width]
        state = torch.from_numpy(state).float().unsqueeze(0).cuda() 
        
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(4))
    
    def step(self, state, action, reward, next_action, done):
        
        # 1. add current experience into memory
        self.memory.add(state, action, reward, next_action, done)
        
        # 2. learning every 5 steps, only if memory has > 64 tuples
        self.timestep += 1
        if self.timestep % 5 == 0:
            if len(self.memory) > 64:
                
                experiences = self.memory.sample()
                states, actions, rewards, next_states, dones = experiences
                
                # Get Q-targets with next_state and QNetwork target
                Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
                Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

                # Get expected Q values from QNetwork local 
                Q_expected = self.qnetwork_local(states).gather(1, actions)

                loss = F.mse_loss(Q_expected, Q_targets)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
                #Update target network by TAU amount
                for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
                    target_param.data.copy_(TAU*local_param.data + (1-TAU)*target_param.data)

### 6. Learning through interacting with Unity environment

In [8]:
'''
Checkpoint file:
- checkpoint.pth that saves current parameters

Note: need to specify the brain when using env.reset and env.step

'''

import math

agent = Agent()
maximum_score = -math.inf

EPS_START = 1
EPS_DECAY = 0.995
EPS_MIN = 0.01

total_for_average = 0
run = 0

eps = EPS_START

for i_episode in range(1, 1000):
    
    env_info = env.reset(train_mode=True)[brain_name]     
    state = env_info.vector_observations[0]
    
    score = 0    
    
    while True:
        
        action = agent.act(state, eps)
        
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]                   
        done = env_info.local_done[0]   
        
        agent.step(state, action, reward, next_state, done)
        
        state = next_state
        
        score += reward
        total_for_average += reward
        
        if done:
            
            # print running average and saves parameters
            run += 1
            average = total_for_average/run
            print("\rEpisode: %s Average Score= %s" % (i_episode,average), end="")
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            
            # print average score every 100 episode
            if i_episode % 100== 0:
                print("\rCheckpoint: Episode= %s Average score= %s" % (i_episode,average))
                total_for_average = 0
                run = 0
            
            break
            
    eps = max(EPS_MIN,eps*EPS_DECAY)

Checkpoint: Episode= 100 Average score= 0.256581
Checkpoint: Episode= 200 Average score= 3.4655
Checkpoint: Episode= 300 Average score= 7.3335
Checkpoint: Episode= 400 Average score= 9.2735
Checkpoint: Episode= 500 Average score= 12.445
Checkpoint: Episode= 600 Average score= 13.566
Checkpoint: Episode= 700 Average score= 15.141
Checkpoint: Episode= 800 Average score= 15.017
Checkpoint: Episode= 900 Average score= 15.414
Episode: 999 Average Score= 16.131313131313132