In [1]:
!pip -q install ./python

[31mtensorflow 1.7.1 has requirement numpy>=1.13.3, but you'll have numpy 1.12.1 which is incompatible.[0m
[31mipython 6.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 3.0.4 which is incompatible.[0m


In [2]:
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name="/data/Banana_Linux_NoVis/Banana.x86_64")

#collect infomration about the envronment
# reset the environment
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print('Brain Name:', brain_name)
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions (action_size):', action_size)

# examine the state space 
state = env_info.vector_observations[0]
#print('States look like:', state)
state_size = len(state)
print('State size (state_size):', state_size)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Brain Name: BananaBrain
Number of agents: 1
Number of actions (action_size): 4
State size (state_size): 37


In [3]:
import torch

#define the NN Model
class BananaModel(torch.nn.Module):
    def __init__(self, state_size, action_count, dropout=0.25):
        super(BananaModel, self).__init__()
        self.state_size   = state_size
        self.action_count = action_count

        self.fc1 = torch.nn.Linear(state_size, 128)
        self.fc2 = torch.nn.Linear(128, 128)
        self.out = torch.nn.Linear(128, action_count)

    def forward(self, x):
        batch_size = x.size(0)
        x = torch.nn.functional.relu(self.fc1(x))
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.out(x)
        return x
    
def soft_update_target(local_model, target_model, tau):
    """Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    Params
    ======
        local_model (PyTorch model): weights will be copied from
        target_model (PyTorch model): weights will be copied to
        tau (float): interpolation parameter 
    """
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [4]:
from collections import deque
import random

class BananaStepInfo():
        def __init__(self, action, reward, state, next_state, done):
            self.action = action
            self.reward = reward
            self.state = state
            self.next_state = next_state
            self.done = done
            
        def to_string(self, show_states=False):
            if show_states:
                val = "BananaStepInfo[action: {}, reward: {}, done: {}, state: {}, next_state: {}]".format(self.action, self.reward, self.done, self.state, self.next_state)
            else:
                val = "BananaStepInfo[action: {}, reward: {}, done: {}]".format(self.action, self.reward, self.done)
            return val

        
def play_game(env, banana_model, epsilon, brain_name, history):
    score = 0
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]            # get the current state
    while True:
        
        rand = random.uniform(0, 1)
        if rand < epsilon:
            action = np.random.randint(action_size)
        else:
            state_tensor = torch.from_numpy(state).float().cuda()
            state_tensor = state_tensor.view((1,)+state.shape) #reshape for batch size of 1
            output = model(state_tensor)
            action_array = output.detach().cpu().numpy()[0]
            action = np.argmax(action_array)
            
        # select an action TODO - use the model
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        score += reward
        done = env_info.local_done[0]                  # see if episode has finished
        
        bananaStepInfo = BananaStepInfo(action, reward, state, next_state, done)
        history.append(bananaStepInfo)
        state = next_state

        if done:                                       # exit loop if episode finished
            break
    return score
            
def get_training_batch(batch_size, history):
    rand_arr = np.arange(len(history))
    np.random.shuffle(rand_arr)
   
    index_counter = 0
    batch_index_counter = 0
    
    state_batch        = np.zeros((batch_size, state_size))
    next_state_batch   = np.zeros((batch_size, state_size))
    reward_batch       = np.zeros(batch_size)
    actions_batch      = np.zeros(batch_size)
    done_batch         = np.zeros(batch_size)
    
    for batch_index in range(batch_size):
        frame_number = rand_arr[batch_index]
        bananaStepInfo = history[frame_number]
        
        state_batch[batch_index]      = bananaStepInfo.state
        next_state_batch[batch_index] = bananaStepInfo.next_state
        reward_batch[batch_index]     = bananaStepInfo.reward
        actions_batch[batch_index]    = bananaStepInfo.action
        done_batch[batch_index]       = bananaStepInfo.done

    return state_batch, next_state_batch, actions_batch, reward_batch, done_batch
    


In [7]:
#test out the functions
step_history = deque(maxlen=1000)
       
model = BananaModel(state_size, action_size).cuda()
score = play_game(env, model, .5, brain_name, step_history)
print("score: ".format(score))

state_batch, next_state_batch, actions_batch, reward_batch, done_batch = get_training_batch(64, step_history)

for i, banana_step in enumerate(step_history):
    print("{}: {}".format(i, banana_step.to_string(show_states=False)))
    
for state, next_state, action, reward, done in zip(state_batch, next_state_batch, actions_batch, reward_batch, done_batch):
  print(action) 


score: 
0: BananaStepInfo[action: 0, reward: 0.0, done: False]
1: BananaStepInfo[action: 2, reward: 0.0, done: False]
2: BananaStepInfo[action: 2, reward: 0.0, done: False]
3: BananaStepInfo[action: 2, reward: 0.0, done: False]
4: BananaStepInfo[action: 2, reward: 0.0, done: False]
5: BananaStepInfo[action: 1, reward: 0.0, done: False]
6: BananaStepInfo[action: 0, reward: 0.0, done: False]
7: BananaStepInfo[action: 3, reward: 0.0, done: False]
8: BananaStepInfo[action: 0, reward: 0.0, done: False]
9: BananaStepInfo[action: 0, reward: 0.0, done: False]
10: BananaStepInfo[action: 2, reward: 0.0, done: False]
11: BananaStepInfo[action: 2, reward: 0.0, done: False]
12: BananaStepInfo[action: 1, reward: 0.0, done: False]
13: BananaStepInfo[action: 1, reward: 0.0, done: False]
14: BananaStepInfo[action: 0, reward: 0.0, done: False]
15: BananaStepInfo[action: 0, reward: 0.0, done: False]
16: BananaStepInfo[action: 2, reward: 0.0, done: False]
17: BananaStepInfo[action: 0, reward: 0.0, done: F

In [10]:
#create model,optimizer
banana_model_local  = BananaModel(state_size, action_size).cuda()
banana_model_target = BananaModel(state_size, action_size).cuda()
step_history = deque(maxlen=5000)


In [14]:
epsilon_min = 0.1
epsilon_max = 0.8
epsilon_decay_epochs = 30
epochs = 50
games_per_epoch = 10
batch_trainings_per_game = 4
batch_size = 64
learning_rate = .005
optimizer = torch.optim.Adam(banana_model_local.parameters(), lr=learning_rate)
TAU = .001
gamma = .99

for epoch in range(epochs):
    epsilon = epsilon_max - (epsilon_max-epsilon_min)*(epoch/epsilon_decay_epochs)
    epsilon = max(epsilon_min, epsilon)
    total_loss = 0
    total_score = 0
    
    for game_count in range(games_per_epoch):
        optimizer.zero_grad()
        score = play_game(env, banana_model_local, epsilon, brain_name, step_history)
        total_score += score
        
        for batch_training_count in range(batch_trainings_per_game):
            state_batch, next_state_batch, actions_batch, reward_batch, done_batch = get_training_batch(batch_size, step_history)

            #get the expected reward of the next state from the target banana model
            next_state_tensor = torch.from_numpy(next_state_batch).float().cuda()
            target_q_values_next = banana_model_target(next_state_tensor)
            target_q_values_next_max, indices = torch.max(target_q_values_next, dim=1, keepdim=True)
            #print("target_q_values_next_max[:10]: {}".format(target_q_values_next_max[:10]))

            #get the expected rewards earned using actions performed using the local banana model
            actions_batch_reshaped = np.reshape(actions_batch, (batch_size, 1))
            actions_batch_tensor = torch.from_numpy(actions_batch_reshaped).long().cuda()
            state_tensor = torch.from_numpy(state_batch).float().cuda()
            local_q_values = banana_model_local(state_tensor)
            local_q_values_performed = local_q_values.gather(1, actions_batch_tensor)
            #print("local_q_values_performed[:10]: {}".format(local_q_values_performed[:10]))

            #get the actual rewards earned 
            reward_batch_reshaped = np.reshape(reward_batch, (batch_size, 1))
            reward_batch_tensor = torch.from_numpy(reward_batch_reshaped).float().cuda()
            #print("reward_batch_tensor[:10]: {}".format(reward_batch_tensor[:10]))

            #calculate loss
            loss = torch.mean((reward_batch_tensor + (gamma * target_q_values_next_max) - local_q_values_performed)**2)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
    
        soft_update_target(banana_model_local, banana_model_target, TAU)
        
    print("epoch: {},  score: {}, loss: {:.3f}, epsilon: {:.3f}".format(epoch, total_score/games_per_epoch, total_loss/games_per_epoch, epsilon))
    
    

epoch: 0,  score: 0.0, loss: 0.026, epsilon: 0.800
epoch: 1,  score: 0.0, loss: 0.023, epsilon: 0.777
epoch: 2,  score: 0.4, loss: 0.016, epsilon: 0.753
epoch: 3,  score: 0.2, loss: 0.036, epsilon: 0.730
epoch: 4,  score: 0.3, loss: 0.013, epsilon: 0.707
epoch: 5,  score: 0.7, loss: 0.011, epsilon: 0.683
epoch: 6,  score: 0.0, loss: 0.035, epsilon: 0.660
epoch: 7,  score: 0.4, loss: 0.032, epsilon: 0.637
epoch: 8,  score: 0.7, loss: 0.030, epsilon: 0.613
epoch: 9,  score: 0.6, loss: 0.033, epsilon: 0.590
epoch: 10,  score: 0.7, loss: 0.014, epsilon: 0.567
epoch: 11,  score: 0.2, loss: 0.023, epsilon: 0.543
epoch: 12,  score: 1.1, loss: 0.026, epsilon: 0.520
epoch: 13,  score: 0.8, loss: 0.024, epsilon: 0.497
epoch: 14,  score: 0.1, loss: 0.020, epsilon: 0.473
epoch: 15,  score: 0.2, loss: 0.026, epsilon: 0.450
epoch: 16,  score: 1.1, loss: 0.040, epsilon: 0.427
epoch: 17,  score: 1.2, loss: 0.037, epsilon: 0.403
epoch: 18,  score: 0.6, loss: 0.057, epsilon: 0.380
epoch: 19,  score: 2.2