In [105]:
import numpy as np
import Backgammon as B
import agent as A
import flipped_agent as FA
import tensorflow as tf
import keras
import keras.layers as L
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [106]:
def get_cumulative_rewards(rewards, gamma = 0.99):
   
    rewards = np.array(rewards)
    R = np.zeros_like(rewards, dtype= "float32")
    r = 0.
    for i, reward in enumerate(reversed(rewards)):
        r += reward
        R[-(i + 1)] = r
        r *= gamma
        
    return R

In [107]:
def train_step(_states, _rewards):
    _cumulative_rewards = get_cumulative_rewards(_rewards)
    update.run({states: _states, 
                cumulative_rewards: _cumulative_rewards})

In [113]:
'''
Þetta fall spilar heilan leik, uppfærir tauganetið og skilar K, sem er fjöldi umferða í leiknum.
'''

def generate_session(Debug = False, Verbose = False):

    # Spila leikinn
    boards, moves, rewards = [], [], []

    board = B.init_board()
    player = 1
    
    Error = False
    GameOver = False
    k = 0
    '''
    Pælingin hér er að spila einn leik til enda og geyma öll boards og actions. 
    Reikna svo eligibility trace og update'a modelið með öllu episodeinu.
    '''
    while True:
        dice = B.roll_dice()
        for i in range(1 + int(dice[0] == dice[1])):

            legal_moves, legal_boards = B.legal_moves(board, dice, 1)
            legal_boards = np.array([board for board in legal_boards])

            if len(legal_moves) == 0:
                break

            #probs = np.array([get_action_prob(state.reshape(1, 29)) for state in legal_boards])
            probs = get_action_prob(legal_boards)
            n_actions = probs.shape[0]
            probs = probs.reshape(n_actions)
            probs = probs / np.sum(probs)

            action = np.random.choice(np.arange(0, n_actions), 
                                 p = probs)

            move = legal_moves[action]


            if Debug:
                print("Action: \n", action)
                print("Board now: \n", board)
                print("Chosen move:\n", move)

            if len(move) != 0:
                for m in move:
                    board = B.update_board(board = board, move = m, player = 1)

            #record session history to train later
            boards.append(board)
            moves.append(move)
            GameOver = B.game_over(board)
            if GameOver:
                rewards.append(1)
                break
            else:
                rewards.append(0)

        board = FA.flip_board(board)

        if GameOver:
            if Verbose:
                print("Game is over.")
            break

        if B.check_for_error(board):
            Error = True
            print("Error at game step ", k)
            break
        k += 1
    if not Error:        
        train_step(boards,  rewards)
            
    return k

In [122]:
'''
Þetta fall spilar heilan leik gegn random agent
'''

def PlayRandomAgent(Debug = False, Verbose = False):

    board = B.init_board()
    player = 1
    
    Error = False
    GameOver = False
    k = 1
    '''
    Pælingin hér er að spila einn leik til enda og geyma öll boards og actions. 
    Reikna svo eligibility trace og update'a modelið með öllu episodeinu.
    '''
    while True:
        dice = B.roll_dice()
        for i in range(1 + int(dice[0] == dice[1])):
            
            legal_moves, legal_boards = B.legal_moves(board, dice, 1)
            legal_boards = np.array([board for board in legal_boards])

            if len(legal_moves) == 0:
                break

            
            probs = get_action_prob(legal_boards)
            n_actions = probs.shape[0]
            probs = probs.reshape(n_actions)
            probs = probs / np.sum(probs)

            action = np.random.choice(np.arange(0, n_actions), 
                                 p = probs)

            move = legal_moves[action]


            if Debug:
                print("Action: \n", action)
                print("Board now: \n", board)
                print("Chosen move:\n", move)

            if len(move) != 0:
                for m in move:
                    board = B.update_board(board = board, move = m, player = 1)
            GameOver = B.game_over(board)
            if GameOver:
                break
        if GameOver:
                break
                
                
        board = FA.flip_board(board)
        player *= -1
        dice = B.roll_dice()
            
            
        for i in range(1 + int(dice[0] == dice[1])):
            
            legal_moves, legal_boards = B.legal_moves(board, dice, 1)
            legal_boards = np.array([board for board in legal_boards])

            if len(legal_moves) == 0:
                break

            
            move = legal_moves[np.random.randint(len(legal_moves))]


            if Debug:
                print("Action: \n", action)
                print("Board now: \n", board)
                print("Chosen move:\n", move)

            if len(move) != 0:
                for m in move:
                    board = B.update_board(board = board, move = m, player = 1)
            
            GameOver = B.game_over(board)
            if GameOver:
                break
            player *= -1

        if GameOver:
            if Verbose:
                print("Game is over.")
            break

        if B.check_for_error(board):
            Error = True
            print("Error at game step ", k)
            break
        k += 1
            
    return (player + 1) / 2

In [110]:
# Skilgreina inputs í model
states = tf.placeholder("float32", (None, 29), name = "states")
actions = tf.placeholder("int32", name = "action_ids")
cumulative_rewards = tf.placeholder("float32", name = "cumulative_rewards")

# Skilgreina model (arkitektúrinn skiptir litlu máli þangað til að þjálfunin gengur)
model = keras.models.Sequential()
model.add(L.Dense(32, activation = "relu"))
model.add(L.Dense(64, activation = "relu"))
model.add(L.Dense(1))



logits = model(states)
policy = tf.nn.softmax(logits)
log_policy = tf.nn.log_softmax(logits)

get_action_prob = lambda s: policy.eval({states: s})


J = tf.reduce_mean(log_policy * cumulative_rewards)
entropy = -tf.reduce_sum(tf.multiply(policy, log_policy), 1, name="entropy")
all_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

loss = -J - 0.1 * entropy

update = tf.train.AdamOptimizer().minimize(loss,var_list=all_weights)

In [111]:
s = tf.InteractiveSession()
s.run(tf.global_variables_initializer())



In [None]:
all_k = []
win_pct = []

for i in range(1000):
    k = [generate_session() for _ in range(20)]
    #all_k.append(np.mean(k))
    
    if i % 10 == 0:
        clear_output(True)
        print("Playing random agent")
        wins = [PlayRandomAgent() for _ in range(20)]
        win_pct.append(np.mean(wins))
        print("Win percentage: ", np.mean(wins))
        plt.plot(win_pct)
        plt.show()
    
    
    #print("Mean turns to win game: ", np.mean(k))
    #plt.plot(all_k)
    #plt.show()

Playing random agent


### Allt hér fyrir neðan var notað sem fikkt

Útgáfa þar sem við flippum borðið þ.a. agentinn er bara alltaf nr 1 og flippum aldrei moveinu hans

In [73]:
# Spila leikinn
boards, moves, rewards = [], [], []
    
board = B.init_board()
player = 1

k = 1 # Halda utan um hvenær ég fæ villu
error = False
Debug = False


'''
Pælingin hér er að spila einn leik til enda og geyma öll boards og actions. 
Reikna svo eligibility trace og update'a modelið með öllu episodeinu.
'''
while True:
    dice = B.roll_dice()
    for i in range(1 + int(dice[0] == dice[1])):

        legal_moves, legal_boards = B.legal_moves(board, dice, 1)
        legal_boards = np.array([board for board in legal_boards])

        if len(legal_moves) == 0:
            break

        #probs = np.array([get_action_prob(state.reshape(1, 29)) for state in legal_boards])
        probs = get_action_prob(legal_boards)
        n_actions = probs.shape[0]
        probs = probs.reshape(n_actions)
        probs = probs / np.sum(probs)

        action = np.random.choice(np.arange(0, n_actions), 
                             p = probs)
        
        move = legal_moves[action]
        
        
        if Debug:
            print("Action: \n", action)
            print("Board now: \n", board)
            print("Chosen move:\n", move)
            
        if len(move) != 0:
            for m in move:
                board = B.update_board(board = board, move = m, player = 1)

        #record session history to train later
        boards.append(board)
        moves.append(move)

        if B.game_over(board):
            rewards.append(1)
            break
        else:
            rewards.append(0)
            
        board = FA.flip_board(board)
    
    if B.game_over(board):
            print("Game is over.")
            break
    
    if B.check_for_error(board):
        Error = True
        print("Error at game step ", k)
        break
    k += 1
        
if not Error:        
    train_step(boards, rewards)

Game is over.


Útgáfa þar sem við flippum boardið fyrir Player -1 og flippum múvið hans. 
Virkar ekki eins og er

In [64]:
# Spila leikinn
boards, moves, rewards = [], [], []
    
board = B.init_board()
player = 1

k = 1 # Halda utan um hvenær ég fæ villu
error = False
Debug = True


'''
Pælingin hér er að spila einn leik til enda og geyma öll boards og actions. 
Reikna svo eligibility trace og update'a modelið með öllu episodeinu.
'''
Player = -1
while True:
    Player *= -1
    dice = B.roll_dice()
    for i in range(1 + int(dice[0] == dice[1])):
        
        if Player == -1:
            real_board = np.copy(board)
            board = FA.flip_board(board)

        legal_moves, legal_boards = B.legal_moves(board, dice, Player)
        legal_boards = np.array([board for board in legal_boards])

        if len(legal_moves) == 0:
            break

        #probs = np.array([get_action_prob(state.reshape(1, 29)) for state in legal_boards])
        probs = get_action_prob(legal_boards)
        n_actions = probs.shape[0]
        probs = probs.reshape(n_actions)
        probs = probs / np.sum(probs)

        action = np.random.choice(np.arange(0, n_actions), 
                             p = probs)
        
        move = legal_moves[action]
        
        if Player == -1:
            move = FA.flip_move(move)
            board = real_board
        
        if Debug:
            print("Action: \n", action)
            print("Board now: \n", board)
            print("Chosen move:\n", move)
            
        if len(move) != 0:
            for m in move:
                board = B.update_board(board = board, move = m, player = Player)

        #record session history to train later
        boards.append(board)
        moves.append(move)

        if B.game_over(board):
            rewards.append(1)
            break
        else:
            rewards.append(0)
            
    
    k += 1
    if B.check_for_error(board):
        Error = True
        print("Error at game step ", k)
        break
        
if not Error:        
    train_step(boards, rewards)

Action: 
 1
Board now: 
 [ 0. -2.  0.  0.  0.  0.  5.  0.  3.  0.  0.  0. -5.  5.  0.  0.  0. -3.
  0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
Chosen move:
 [[ 6  2]
 [13  8]]
Action: 
 14
Board now: 
 [ 0. -2.  1.  0.  0.  0.  4.  0.  4.  0.  0.  0. -5.  4.  0.  0.  0. -3.
  0. -5.  0.  0.  0.  0.  2.  0.  0.  0.  0.]
Chosen move:
 [[24 23]
 [24 20]]
Too many or too few pieces on board!
Error at game step  3
