In [10]:
import numpy as np
import Backgammon as B
import agent as A
import flipped_agent as FA

In [11]:
import tensorflow as tf
import keras
import keras.layers as L

In [3]:
states = tf.placeholder("float32", (None, 29), name = "states")
actions = tf.placeholder("int32", name = "action_ids")
cumulative_rewards = tf.placeholder("float32", name = "cumulative_rewards")

In [4]:
model = keras.models.Sequential()
model.add(L.Dense(32, activation = "relu"))
model.add(L.Dense(64, activation = "relu"))
model.add(L.Dense(1))

logits = model(states)
policy = tf.nn.softmax(logits)
log_policy = tf.nn.log_softmax(logits)

get_action_prob = lambda s: policy.eval({states: s})

indices = tf.stack([tf.range(tf.shape(log_policy)[0]),actions],axis=-1)
log_policy_for_actions = tf.gather_nd(log_policy,indices)
J = tf.reduce_mean(log_policy_for_actions * cumulative_rewards)
entropy = -tf.reduce_sum(tf.multiply(policy, log_policy), 1, name="entropy")
all_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

loss = -J -0.1 * entropy

update = tf.train.AdamOptimizer().minimize(loss,var_list=all_weights)
    
    

In [111]:
board = B.init_board()
dice = B.roll_dice()
legal_moves, legal_boards = B.legal_moves(board, dice, 1)
print(dice)

[6 1]


In [42]:
np.array([board for board in legal_boards]).shape

(30, 29)

In [7]:
np.array([move for move in legal_moves]).shape

(28, 2, 2)

In [120]:
legal_moves[3].T

array([[13,  6],
       [ 7,  5]])

In [110]:
s = tf.InteractiveSession()
s.run(tf.global_variables_initializer())
board = B.init_board()
legal_moves, legal_boards = B.legal_moves(board, B.roll_dice(), 1)

legal_boards = np.array([board for board in legal_boards])

probs = get_action_prob(legal_boards)
n_actions = probs.shape[0]

probs = probs.reshape(n_actions)
probs = probs / np.sum(probs)
action = np.random.choice(np.arange(0, n_actions),
                         p = probs)

move = legal_moves[action]

print(legal_boards, legal_moves)

[[ 0. -2.  1.  0.  0.  0.  5.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
 [ 0. -2.  0.  1.  0.  1.  4.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
 [ 0. -2.  0.  1.  0.  0.  5.  1.  1.  0.  0.  0. -5.  5.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
 [ 0. -2.  0.  1.  0.  0.  5.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  1.  1.  0.  0.  0.  0.]
 [ 0. -2.  0.  0.  0.  1.  4.  0.  4.  0.  0.  0. -5.  4.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
 [ 0. -2.  0.  0.  0.  0.  5.  1.  3.  0.  0.  0. -5.  4.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
 [ 0. -2.  0.  0.  0.  0.  5.  0.  4.  0.  0.  0. -5.  4.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  1.  1.  0.  0.  0.  0.]
 [ 0. -2.  0.  1.  0.  1.  4.  0.  2.  0.  0.  0. -5.  5.  0.  0.  0. -3.
   0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
 [ 0. -2.  0.  0.  0.  1.  4.  0

In [121]:
def generate_session():
    """play env with REINFORCE agent and train at the session end"""
    
    #arrays to record session
    boards, rewards = [],[]
    
    board = B.init_board()
    player = 1
    
    while True:
        
        dice = B.roll_dice()
        
        for i in range(1 + int(dice[0] == dice[1])):
        
            legal_moves, legal_boards = B.legal_moves(board, dice, 1)
            legal_boards = np.array([board for board in legal_boards])
            
            if legal_boards.shape[0] == 0:
                break
            
            probs = get_action_prob(legal_boards)
            n_actions = probs.shape[0]
            probs = probs.reshape(n_actions)
            probs = probs / np.sum(probs)

            action = np.random.choice(np.arange(0, n_actions), 
                                 p = probs)

            move = legal_moves[action]

            next_board = B.update_board(board = board, move = move.T, player = player)

            #record session history to train later
            boards.append(next_board)

            board = next_board
            board = FA.flip_board(board)

            if B.game_over(board):
                rewards.append(1)
            else:
                rewards.append(0)
                
            if B.check_for_error(board):
                return("Error")
            
    train_step(boards, rewards)
            
    return sum(rewards)

In [122]:
def get_cumulative_rewards(rewards, #rewards at each step
                           gamma = 0.99 #discount for reward
                           ):
    """
    take a list of immediate rewards r(s,a) for the whole session 
    compute cumulative rewards R(s,a) (a.k.a. G(s,a) in Sutton '16)
    R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
    
    The simple way to compute cumulative rewards is to iterate from last to first time tick
    and compute R_t = r_t + gamma*R_{t+1} recurrently
    
    You must return an array/list of cumulative rewards with as many elements as in the initial rewards.
    """
    
    rewards = np.array(rewards)
    R = np.zeros_like(rewards, dtype= "float32")
    r = 0.
    for i, reward in enumerate(reversed(rewards)):
        r += reward
        R[-(i + 1)] = r
        r *= gamma
        
    return R
    
    

In [123]:
def train_step(_states,_actions,_rewards):
    """given full session, trains agent with policy gradient"""
    _cumulative_rewards = get_cumulative_rewards(_rewards)
    update.run({states:_states,actions:_actions,cumulative_rewards:_cumulative_rewards})

In [149]:
boards, moves, rewards = [], [], []
    
board = B.init_board()
player = 1

k = 1
error = False
while True:
    dice = B.roll_dice()

    for i in range(1 + int(dice[0] == dice[1])):

        legal_moves, legal_boards = B.legal_moves(board, dice, 1)
        legal_boards = np.array([board for board in legal_boards])

        if len(legal_moves) == 0:
            break

        probs = get_action_prob(legal_boards)
        n_actions = probs.shape[0]
        probs = probs.reshape(n_actions)
        probs = probs / np.sum(probs)

        action = np.random.choice(np.arange(0, n_actions), 
                             p = probs)

        move = legal_moves[action]

        next_board = B.update_board(board = board, move = move.T, player = 1)

        #record session history to train later
        boards.append(next_board)
        moves.append(move)
        board = next_board

        if B.game_over(board):
            rewards.append(1)
            break
        else:
            rewards.append(0)
            
    board = FA.flip_board(board)
    k += 1
    if B.check_for_error(board):
        Error = True
        print("Error at game step ", k)
        break
        
if not Error:        
    train_step(boards, rewards)

Too many or too few pieces on board!
Error at game step  6


In [147]:
np.array(boards).shape

(7, 29)

In [148]:
[(sum(board[board>0]), sum(board[board<0])) for board in boards]

[(15.0, -15.0),
 (15.0, -15.0),
 (15.0, -15.0),
 (15.0, -15.0),
 (15.0, -15.0),
 (15.0, -15.0),
 (14.0, -15.0)]

In [127]:
boards

[array([ 0., -2.,  0.,  0.,  0.,  1.,  5.,  0.,  4.,  0.,  0.,  0., -5.,
         5.,  0.,  0.,  0., -3., -1., -5.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.])]

In [135]:
FA.flip_board(board)

array([ 0., -2.,  1.,  0.,  1.,  0.,  4.,  0.,  3.,  0.,  0.,  0., -5.,
        5.,  0.,  0.,  0., -3.,  0., -5.,  0.,  0.,  0.,  0.,  2.,  0.,
        0.,  0.,  0.])

In [136]:
board

array([-0., -2., -0., -0., -0., -0.,  5., -0.,  3., -0., -0., -0., -5.,
        5., -0., -0., -0., -3., -0., -4., -0., -1., -0., -1.,  2., -0.,
       -0., -0., -0.])

In [145]:
len(legal_moves)

4

In [152]:
get_cumulative_rewards(rewards)

array([0.94148016, 0.95099   , 0.960596  , 0.970299  , 0.9801    ,
       0.99      , 1.        ], dtype=float32)