In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [3]:
import shutil

In [4]:
from copy import deepcopy

In [5]:
from ShallowMindBot import *
from TowerDefense import *
from TowerDefenseApi import *

In [6]:
def softmax_grad(softmax):
    s = softmax.reshape(-1,1)
    return np.diagflat(s) - np.dot(s, s.T)

In [7]:
def gradW2(p, possible_actions, selected_action, h):
    dsoftmax = softmax_grad(p)[selected_action,:]
    dlog = dsoftmax / p[selected_action]
    # Fill in dlog with zeros at the actions that were not possible
    dlog_filled = np.zeros(NUM_ACTION_TYPES)
    dlog_filled[possible_actions] = dlog
    dW2 = dlog_filled[None,:].T.dot(h[None,:])
    return dW2, dlog_filled

In [8]:
def gradW1(dlog_filled, W2, h, x):
    dh = W2.T.dot(dlog_filled)
    dh[h<=0] = 0
    dW1 = np.outer(dh,x)
    return dW1

## Start training

In [9]:
num_batches = 1
batch_size = 100 # used to perform a RMS prop param update every batch_size steps
learning_rate = 1e-2 # learning rate used in RMS prop
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
reward_win = 10**5
reward_lose = -reward_win

In [10]:
for batch_idx in range(num_batches):
    # Clear out the logs folder
    log_subfolders = [d for d in os.listdir(os.path.join('.','logs')) if os.path.isdir(os.path.join('.','logs',d))]
    for subfolder in log_subfolders:
        shutil.rmtree(os.path.join('.','logs',subfolder))
        
    # Run the games in the batch
    for game_idx in range(batch_size):
        ! java -jar tower-defence-runner-3.0.3.jar

Starting round 0
Error List: []
Player A, Health=100, Energy=     5, Score=     8, IronCurtainAvailable= No
Player B, Health=100, Energy=     5, Score=     8, IronCurtainAvailable= No
[0e0][1 0][2 0][3 0][4 0][5 0][6 0][7 0][7 0][6 0][5 0][4 0][3 0][2 0][1 0][0 0]
[0 1][1 1][2 1][3 1][4 1][5 1][6 1][7 1][7 1][6 1][5 1][4 1][3 1][2 1][1 1][0 1]
[0 2][1 2][2 2][3 2][4 2][5 2][6 2][7 2][7 2][6 2][5 2][4 2][3 2][2 2][1 2][0 2]
[0 3][1 3][2 3][3 3][4 3][5 3][6 3][7 3][7 3][6 3][5 3][4 3][3 3][2 3][1 3][0e3]
[0 4][1 4][2 4][3 4][4 4][5 4][6 4][7 4][7 4][6 4][5 4][4 4][3 4][2 4][1 4][0 4]
[0 5][1 5][2 5][3 5][4 5][5 5][6 5][7 5][7 5][6 5][5 5][4 5][3 5][2 5][1 5][0 5]
[0 6][1 6][2 6][3 6][4 6][5 6][6 6][7 6][7 6][6 6][5 6][4 6][3 6][2 6][1 6][0 6]
[0 7][1 7][2 7][3 7][4 7][5 7][6 7][7 7][7 7][6 7][5 7][4 7][3 7][2 7][1 7][0 7]

Starting round 1
Error List: []
Player A, Health=100, Energy=    13, Score=    16, IronCurtainAvailable= No
Player B, Health=100, Energy=    13, Score=    16, IronCurt

In [12]:
batch_idx = 0

model_params = pickle.load(open('model_params.p','rb'))

advantages = []
win_count = 0
for game_idx in range(batch_size):
    num_rounds = max([int(f.split('.')[0].split('_')[1]) for f in os.listdir(os.path.join('.','logs','game_{}'.format(game_idx)))])
    rewards = np.zeros(num_rounds)
    for round_idx in range(num_rounds):
        with open(os.path.join('.','logs','game_{}'.format(game_idx),'round_{}.pkl'.format(round_idx)), 'rb') as f:
            state_features, possible_action_types, action_type_probabilities, hidden_layer, action_type_choice = pickle.load(f)
        rewards[round_idx] = state_features['myMinusOppScore']
        # If last round, estimate who won or lost and update rewards
        if round_idx == (num_rounds-1):
            if state_features['myMinusOppHealth'] > 0:
                rewards[round_idx] += reward_win
                win_count += 1
            else:
                rewards[round_idx] += reward_lose
    
    # Compute advantages
    advantages.append(np.cumsum(rewards[::-1])[::-1])
    
print('Won {} of {} games'.format(win_count, batch_size))
    
# Standardize advantages
advantages_all = np.hstack(advantages)
advantages_mean = np.mean(advantages_all)
advantages_std = np.std(advantages_all)

# Compute gradients
gW1 = np.zeros_like(model_params['W1'])
gW2 = np.zeros_like(model_params['W2'])
for game_idx in range(batch_size):
    num_rounds = max([int(f.split('.')[0].split('_')[1]) for f in os.listdir(os.path.join('.','logs','game_{}'.format(game_idx)))])
    # Increment gradients
    advantages_game = advantages[game_idx]
    for round_idx in range(num_rounds):
        with open(os.path.join('.','logs','game_{}'.format(game_idx),'round_{}.pkl'.format(round_idx)), 'rb') as f:
            state_features, possible_action_types, action_type_probabilities, hidden_layer, action_type_choice = pickle.load(f)
        action_type_choice_idx = np.where(np.array(possible_action_types)==action_type_choice)[0][0]
        gW2_round, dlog_filled = gradW2(action_type_probabilities, possible_action_types, action_type_choice_idx, hidden_layer)
        gW2 += gW2_round*((advantages_game[round_idx]-advantages_mean)/advantages_std)
        x = np.array([float(val) for val in state_features.values()])
        gW1 += gradW1(dlog_filled, model_params['W2'], hidden_layer, x)*((advantages_game[round_idx]-advantages_mean)/advantages_std)
    
# Normalize by number of games
gW1 /= batch_size
gW2 /= batch_size

# Update using gradient
new_W1 = model_params['W1'] + learning_rate*gW1
new_W2 = model_params['W2'] + learning_rate*gW2

# Save
new_model = {}
new_model['W1'] = new_W1
new_model['W2'] = new_W2
pickle.dump(new_model,open('new_model_params.p','wb'))

Won 55 of 100 games


In [16]:
new_model['W2'] - model_params['W2']

array([[ 1.60329322e-03, -1.93172274e-03, -1.21130140e-03,
        -1.92167931e-03,  2.47282646e-02, -1.89999416e-02,
        -5.83502882e-03,  2.61010976e-02, -4.69117669e-04,
         7.45558203e-03,  3.18227553e-03, -1.32405610e-03,
        -9.00863914e-03,  0.00000000e+00, -4.03406563e-02],
       [-3.74108556e-03, -8.82719614e-05, -1.31132209e-03,
         2.94842676e-03,  1.24419779e-02,  1.42203980e-02,
         6.95282341e-03,  2.28292624e-02, -1.44448941e-03,
         7.34337438e-03,  2.41687650e-03, -7.11205575e-06,
         1.27292107e-02,  0.00000000e+00,  3.71089835e-02],
       [-2.61518649e-03,  1.73495208e-03,  8.85588231e-04,
         2.65299919e-03, -1.97212477e-02,  1.46181404e-02,
         6.31412495e-03, -1.21987296e-02,  3.88631939e-04,
        -3.24208731e-03, -4.13564448e-03,  6.38203475e-04,
         7.03669553e-03,  0.00000000e+00,  3.53404192e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000