In [1]:
#Only run this once.  If it is run again, the workspace must be restarted.
!pip -q install ./python

[31mtensorflow 1.7.1 has requirement numpy>=1.13.3, but you'll have numpy 1.12.1 which is incompatible.[0m
[31mipython 6.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 3.0.4 which is incompatible.[0m


## Set up initial environment 
The cell below instantiates the environment and sets some initial variables:
- brain_name
- action_size: the number of actions that can be performed in the environment
- state_size: the number of values returned from the environment to represent the current state

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# instantiate the Unity environment wrapper for the Banana executable at this path
env = UnityEnvironment(file_name="/data/Banana_Linux_NoVis/Banana.x86_64")

# collect information about the environment
# use the first brain listed by the environment
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print('Brain Name:', brain_name)
# reset the environment in training mode and keep the info reported for our brain
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions (action_size is read as a notebook-level global by later cells)
action_size = brain.vector_action_space_size
print('Number of actions (action_size):', action_size)

# examine the state space using the first agent's observation
state = env_info.vector_observations[0]
#print('States look like:', state)
# state_size is the length of one observation vector (read as a notebook-level global by later cells)
state_size = len(state)
print('State size (state_size):', state_size)

ModuleNotFoundError: No module named 'unityagents'

## Define the Model
Define the NN model with:
 - action_size: the number of actions that can be performed in the environment
 - state_size: the number of values returned from the environment to represent the current state
 
A helper function (soft_update_target) is also included to update a target model with weights from the local model

In [3]:
import torch

#define the NN Model
class BananaModel(torch.nn.Module):
    """Fully connected Q-network: a state vector goes in, one Q-value per action comes out.

    Architecture: state_size -> hidden_size -> hidden_size -> action_count, with a ReLU
    after each of the two hidden layers and no activation on the output layer.

    Params
    ======
        state_size (int): number of values in a state vector
        action_count (int): number of actions that can be performed in the environment
        hidden_size (int): width of the two hidden layers (default 128)
    """

    def __init__(self, state_size, action_count, hidden_size=128):
        super(BananaModel, self).__init__()
        self.state_size   = state_size
        self.action_count = action_count

        # layer names are part of the saved state_dict keys, so they must stay fc1/fc2/out
        self.fc1 = torch.nn.Linear(state_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, action_count)

    def forward(self, x):
        """Return the Q-values for each action.

        Params
        ======
            x (torch.Tensor): states whose last dimension is state_size

        Returns
        =======
            torch.Tensor: same leading dimensions as x, last dimension action_count
        """
        x = torch.nn.functional.relu(self.fc1(x))
        x = torch.nn.functional.relu(self.fc2(x))
        return self.out(x)
    
def soft_update_target(local_model, target_model, tau):
    """Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    Params
    ======
        local_model (PyTorch model): weights will be copied from
        target_model (PyTorch model): weights will be copied to (modified in place)
        tau (float): interpolation parameter; 0 leaves the target unchanged, 1 copies the local weights

    Parameters are paired by position, so both models must have the same architecture.
    """
    # Blend under no_grad rather than through `.data`: the update is still kept out of the
    # autograd graph, but the in-place write stays visible to autograd's version tracking.
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.copy_(tau*local_param + (1.0-tau)*target_param)

## Define classes and functions used when training the model
 - BananaStepInfo class:  Store information (state, next state, action, reward) about a single step performed in the environment
 - play_game function: Play a single game, adding a new BananaStepInfo to the history with each step.  Returns the final score to help measure the performance of the model.
 - get_training_batch function: Get a batch of random BananaStepInfo instances for training.  This could be modified for prioritized experience replay, but for now, it's just a random sampling from the history buffer.  Returns a tuple of arrays that are convenient for training the model

In [4]:
from collections import deque
import random

class BananaStepInfo():
    """Record of a single environment step: action, reward, state before, state after, done flag."""

    def __init__(self, action, reward, state, next_state, done):
        self.action, self.reward = action, reward
        self.state, self.next_state = state, next_state
        self.done = done

    def to_string(self, show_states=False):
        """Describe the step as text; the two state vectors are included only when show_states is True."""
        if not show_states:
            return "BananaStepInfo[action: {}, reward: {}, done: {}]".format(self.action, self.reward, self.done)
        return "BananaStepInfo[action: {}, reward: {}, done: {}, state: {}, next_state: {}]".format(self.action, self.reward, self.done, self.state, self.next_state)

def play_game(env, banana_model, epsilon, brain_name, history):
    """Play a single game, appending one BananaStepInfo to history for every step taken.

    Params
    ======
        env: environment exposing reset(train_mode=...) and step(action), each returning
            a mapping from brain name to an info object with vector_observations,
            rewards and local_done
        banana_model (BananaModel): model used for greedy actions; its action_count
            attribute bounds the random actions
        epsilon (float): probability of choosing a random action instead of the greedy one
        brain_name (str): key used to look up the info object returned by env
        history: container with an append method (e.g. deque or list) that receives the steps

    Returns
    =======
        the total reward collected during the game
    """
    # Run inference on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(banana_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    score = 0
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]            # get the current state

    while True:
        # choose an action using epsilon as the probability of the action being random or greedy from the model
        rand = random.uniform(0, 1)
        if rand < epsilon:
            # the model knows how many actions it scores, so no notebook-level global is needed
            action = np.random.randint(banana_model.action_count)
        else:
            # pure inference: no autograd graph is needed to pick the greedy action
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float().to(device)
                state_tensor = state_tensor.view((1,)+state.shape) #reshape for batch size of 1
                output = banana_model(state_tensor)
            action_array = output.cpu().numpy()[0]
            action = np.argmax(action_array)

        # take a step, adding information about the env to the history replay buffer.
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        score += reward                                # track total score
        done = env_info.local_done[0]                  # see if episode has finished

        history.append(BananaStepInfo(action, reward, state, next_state, done))
        state = next_state

        if done:                                       # exit loop if episode finished
            break
    return score
            
def get_training_batch(batch_size, history):
    """Draw batch_size distinct random steps from history and unpack them into arrays.

    Params
    ======
        batch_size (int): number of steps to sample
        history: indexable sequence (e.g. deque or list) of objects with
            state, next_state, action, reward and done attributes

    Returns
    =======
        tuple (state_batch, next_state_batch, actions_batch, reward_batch, done_batch)
        of float64 numpy arrays; the two state arrays have one row per sampled step
        and the other three have one entry per sampled step

    Raises
    ======
        ValueError: if batch_size is negative
        IndexError: if history holds fewer than batch_size steps
    """
    if batch_size < 0:
        raise ValueError("batch_size must be non-negative, got {}".format(batch_size))
    if batch_size > len(history):
        raise IndexError("cannot sample a batch of {} steps from a history of only {} steps".format(batch_size, len(history)))

    # shuffle every index of the replay buffer and keep the first batch_size of them,
    # which samples without replacement
    shuffled_indices = np.arange(len(history))
    np.random.shuffle(shuffled_indices)
    sampled_steps = [history[frame_number] for frame_number in shuffled_indices[:batch_size]]

    # the state width comes from the sampled steps themselves rather than a notebook-level global
    state_batch      = np.array([step.state for step in sampled_steps], dtype=np.float64)
    next_state_batch = np.array([step.next_state for step in sampled_steps], dtype=np.float64)
    reward_batch     = np.array([step.reward for step in sampled_steps], dtype=np.float64)
    actions_batch    = np.array([step.action for step in sampled_steps], dtype=np.float64)
    done_batch       = np.array([step.done for step in sampled_steps], dtype=np.float64)

    return state_batch, next_state_batch, actions_batch, reward_batch, done_batch

#test out the functions
# step_history = deque(maxlen=2000)
       
# model = BananaModel(state_size, action_size).cuda()
# score = play_game(env, model, .5, brain_name, step_history)
# print("score: {}".format(score))

# state_batch, next_state_batch, actions_batch, reward_batch, done_batch = get_training_batch(64, step_history)

# for i, banana_step in enumerate(step_history):
#     print("{}: {}".format(i, banana_step.to_string(show_states=False)))

## Create the models
Create a local model and two target models.  In this case, I am creating two target models to implement a Double DQN training strategy.  Also instantiate the step history so the training loop can be repeated without resetting the models and replay history.

In [5]:
#create the local (trained) model and the two target models; .cuda() moves each one to the GPU
banana_model_local    = BananaModel(state_size, action_size).cuda()
banana_model_target_1 = BananaModel(state_size, action_size).cuda()
banana_model_target_2 = BananaModel(state_size, action_size).cuda()

#instantiate the step history (replay buffer); once 4000 steps are stored, each append discards the oldest step
step_history = deque(maxlen=4000)

## Train the model
Execute the cell below to train the model.  The average game score is printed out after each epoch is completed.

In [10]:
epsilon_max = 0.9            #value of epsilon when training is started
epsilon_min = 0.05           #value of epsilon once training has reached epsilon_decay_epochs
epsilon_decay_epochs = 200   #number of epochs required to reach epsilon_min
TAU = .005                   #controls degree to which a target model is updated after each game.
gamma = .99                  #discounted rewards factor

epochs = 250                 #how many epochs to run for (200 seems to be about enough)
games_per_epoch = 10         #games played per epoch
batch_trainings_per_game = 4 #how many training batches are performed after each game (games last about 300 steps, so 4*64 sounds about right)
batch_size = 64              #training batch size

learning_rate = .0005        #learning rate.  .0005 seems to work OK.
optimizer = torch.optim.Adam(banana_model_local.parameters(), lr=learning_rate)

#build every training tensor on the device the local model lives on instead of assuming CUDA
device = next(banana_model_local.parameters()).device

for epoch in range(epochs):
    #calculate epsilon for this epoch: linear decay from epsilon_max, clamped at epsilon_min.
    epsilon = epsilon_max - (epsilon_max-epsilon_min)*(epoch/epsilon_decay_epochs)
    epsilon = max(epsilon_min, epsilon)
    total_loss = 0
    total_score = 0

    for game_count in range(games_per_epoch):
        #play a game, adding BananaStepInfo steps to step_history
        score = play_game(env, banana_model_local, epsilon, brain_name, step_history)
        total_score += score

        for batch_training_count in range(batch_trainings_per_game):
            #a batch cannot be sampled until the replay buffer holds at least batch_size steps
            if len(step_history) < batch_size:
                break

            optimizer.zero_grad() #zero out gradients.

            #get training batch
            state_batch, next_state_batch, actions_batch, reward_batch, done_batch = get_training_batch(batch_size, step_history)

            #get the actual rewards earned and the episode-finished flags from the playthrough history
            reward_batch_tensor = torch.from_numpy(np.reshape(reward_batch, (batch_size, 1))).float().to(device)
            done_batch_tensor = torch.from_numpy(np.reshape(done_batch, (batch_size, 1))).float().to(device)

            #now, implement Double DQN strategy using one target model to get maximum actions and another target
            #model to get the expected rewards based upon those actions.  Using the two models in this manner helps
            #to ensure rewards that deviate far from reality do not negatively influence training.
            #The target models are never trained by backpropagation, so their forward passes run under no_grad;
            #this keeps the targets constant and avoids accumulating unused gradients on the target models.
            next_state_tensor = torch.from_numpy(next_state_batch).float().to(device)
            with torch.no_grad():
                #using target model 1, get the greedy actions (actions with greatest reward) for each step in the batch (target_indices_1).
                target_q_values_next_1 = banana_model_target_1(next_state_tensor)
                target_q_values_next_max_1, target_indices_1 = torch.max(target_q_values_next_1, dim=1, keepdim=True)

                #using the greedy actions (target_indices_1) from target model 1, get the expected rewards from target model 2 (target_q_values_next)
                target_q_values_next_2 = banana_model_target_2(next_state_tensor)
                target_q_values_next = target_q_values_next_2.gather(1, target_indices_1)

            #get the expected current rewards earned using historical actions performed using the banana_model_local
            history_actions_batch_reshaped = np.reshape(actions_batch, (batch_size, 1))
            history_actions_batch_tensor = torch.from_numpy(history_actions_batch_reshaped).long().to(device)
            state_tensor = torch.from_numpy(state_batch).float().to(device)
            local_q_values = banana_model_local(state_tensor)  #feed historical states into local model to get q_values for each action
            local_q_values_performed = local_q_values.gather(1, history_actions_batch_tensor) #get q_value for each action that was performed from history

            #calculate loss using the deep Q learning equation.  I like seeing the actual equation this way instead of breaking
            #it up across multiple lines.  The (1 - done) factor removes the bootstrapped future reward for steps that
            #ended an episode, since no further reward can follow them.
            loss = torch.mean((reward_batch_tensor + (gamma * target_q_values_next * (1.0 - done_batch_tensor)) - local_q_values_performed)**2)
            total_loss += loss.item() #track total loss

            loss.backward()   #backpropagation is only performed on the local model
            optimizer.step()

        #now, randomly choose a target model to update. Randomly choosing a model makes sure one model is not influenced
        #too much to negate the training stability that Double DQN wants to achieve.
        rand = random.uniform(0, 1)
        if rand < .5:
            soft_update_target(banana_model_local, banana_model_target_1, TAU)
        else:
            soft_update_target(banana_model_local, banana_model_target_2, TAU)

    #score is the average per game; loss is the per-game sum over its batch trainings, averaged over the games
    print("epoch: {},  score: {}, loss: {:.3f}, epsilon: {:.3f}".format(epoch, total_score/games_per_epoch, total_loss/games_per_epoch, epsilon))

epoch: 0,  score: 10.8, loss: 0.056, epsilon: 0.100
epoch: 1,  score: 5.7, loss: 0.047, epsilon: 0.100
epoch: 2,  score: 6.9, loss: 0.037, epsilon: 0.099
epoch: 3,  score: 9.8, loss: 0.038, epsilon: 0.099
epoch: 4,  score: 10.5, loss: 0.053, epsilon: 0.099
epoch: 5,  score: 6.7, loss: 0.057, epsilon: 0.098
epoch: 6,  score: 8.3, loss: 0.033, epsilon: 0.098
epoch: 7,  score: 9.4, loss: 0.043, epsilon: 0.098
epoch: 8,  score: 8.8, loss: 0.039, epsilon: 0.097
epoch: 9,  score: 9.7, loss: 0.063, epsilon: 0.097
epoch: 10,  score: 10.9, loss: 0.053, epsilon: 0.097
epoch: 11,  score: 10.4, loss: 0.050, epsilon: 0.096
epoch: 12,  score: 9.6, loss: 0.057, epsilon: 0.096
epoch: 13,  score: 8.1, loss: 0.052, epsilon: 0.096
epoch: 14,  score: 10.9, loss: 0.050, epsilon: 0.095
epoch: 15,  score: 5.5, loss: 0.043, epsilon: 0.095
epoch: 16,  score: 5.7, loss: 0.033, epsilon: 0.095
epoch: 17,  score: 8.8, loss: 0.050, epsilon: 0.094
epoch: 18,  score: 8.4, loss: 0.047, epsilon: 0.094
epoch: 19,  score

epoch: 155,  score: 14.2, loss: 0.068, epsilon: 0.050
epoch: 156,  score: 14.2, loss: 0.074, epsilon: 0.050
epoch: 157,  score: 15.8, loss: 0.089, epsilon: 0.050
epoch: 158,  score: 14.2, loss: 0.097, epsilon: 0.050
epoch: 159,  score: 13.7, loss: 0.076, epsilon: 0.050
epoch: 160,  score: 14.1, loss: 0.061, epsilon: 0.050
epoch: 161,  score: 12.4, loss: 0.060, epsilon: 0.050
epoch: 162,  score: 14.8, loss: 0.083, epsilon: 0.050
epoch: 163,  score: 15.1, loss: 0.079, epsilon: 0.050
epoch: 164,  score: 11.4, loss: 0.061, epsilon: 0.050
epoch: 165,  score: 15.3, loss: 0.060, epsilon: 0.050
epoch: 166,  score: 14.6, loss: 0.101, epsilon: 0.050
epoch: 167,  score: 13.1, loss: 0.087, epsilon: 0.050
epoch: 168,  score: 17.9, loss: 0.095, epsilon: 0.050
epoch: 169,  score: 13.9, loss: 0.092, epsilon: 0.050
epoch: 170,  score: 14.5, loss: 0.080, epsilon: 0.050
epoch: 171,  score: 13.2, loss: 0.075, epsilon: 0.050
epoch: 172,  score: 14.7, loss: 0.098, epsilon: 0.050
epoch: 173,  score: 11.0, lo

In [11]:
#if you like the model, save the local model's weights (its state_dict) to banana_model.pt
torch.save(banana_model_local.state_dict(), "banana_model.pt")


In [12]:
#load the weights into a new model to play a game and get the score
#the new model must be built with the same state_size and action_size the weights were saved from
banana_model_load  = BananaModel(state_size, action_size).cuda()
#NOTE(review): torch.load unpickles the file, so only load checkpoints you created yourself.
#the trailing ';' keeps the notebook from echoing load_state_dict's return value
banana_model_load.load_state_dict(torch.load("banana_model.pt"));

#print the layer summary of the loaded model
print(banana_model_load)

BananaModel(
  (fc1): Linear(in_features=37, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=4, bias=True)
)

In [None]:
#play a game to see what score is achieved

#a plain list receives the steps here; it is only used to count them after the game
play_history = []
#epsilon of .01 means the loaded model chooses nearly every action greedily
score = play_game(env, banana_model_load, .01, brain_name, play_history)

print("Score {} in {} steps".format(score, len(play_history)))
