In [None]:
import torch
import copy
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import Backgammon as B
import agent as A
import flipped_agent as FA

import pubeval_tpr

In [None]:
class backgammon:
    """Small environment wrapper around the ``Backgammon`` module (``B``).

    Attributes:
        board: the current board, as returned by ``B.init_board()`` and
            updated through ``B.update_board``.
        done:  True once ``B.game_over`` has reported the game as finished.
    """

    def __init__(self):
        # Delegate to reset() so that `done` exists from construction on,
        # not only after the first reset()/step() call.
        self.reset()

    def reset(self):
        """Start a new game from the initial position."""
        self.board = B.init_board()
        self.done = False

    def choose_board(self, board):
        """Install `board` as the current position and return a copy of it.

        Note that `board` itself (not a copy) is stored on the environment.
        """
        self.board = board
        self.done = False
        return np.copy(self.board)

    def legal_moves(self, dice, player):
        """List the legal moves for `player` with the roll `dice`.

        Returns:
            (moves, tesauro): `moves` as returned by ``B.legal_moves`` and
            `tesauro`, an (n_boards, 198) array holding ``features`` of each
            resulting board. When there are no resulting boards the return
            value is ``([], self.board)`` -- the raw board, not features.
        """
        moves, boards = B.legal_moves(board=self.board, dice=dice, player=player)
        if len(boards) == 0:
            return [], self.board
        n_boards = np.shape(boards)[0]
        tesauro = np.zeros((n_boards, 198))
        for b in range(n_boards):
            tesauro[b, :] = features(boards[b], player)
        return moves, tesauro

    def swap_player(self):
        """Replace the board with its flipped version (``FA.flip_board``)."""
        self.board = FA.flip_board(board_copy=np.copy(self.board))

    # opponent's random move
    def make_move(self, dice):
        """Play a uniformly random legal move for player -1.

        Returns the same ``(features, reward, done)`` triple as ``step``.
        With no legal move the board is left untouched.
        """
        moves, _ = self.legal_moves(dice, -1)
        if len(moves) == 0:
            return self.step([], -1)
        move = moves[np.random.randint(len(moves))]
        return self.step(move, -1)

    def step(self, move, player):
        """Apply every sub-move in `move` for `player` and report the result.

        Returns:
            (tesauro_board, reward, done): the feature vector of the new
            board, `reward` equal to `player` if the game is over after the
            move (0 otherwise), and the game-over flag.
        """
        if len(move) != 0:
            for m in move:
                self.board = B.update_board(board=self.board, move=m, player=player)
        reward = 0
        self.done = False
        if self.iswin():
            # The reward carries the sign of the player who just moved.
            reward = player
            self.done = True
        tesauro_board = features(np.copy(self.board), player)
        return tesauro_board, reward, self.done

    def iswin(self):
        """Return ``B.game_over`` for the current board."""
        return B.game_over(self.board)

    def render(self):
        """Print the current board with ``B.pretty_print``."""
        B.pretty_print(self.board)

In [None]:
def reset_graph(seed=43):
    """Seed the global torch and numpy random generators with `seed`."""
    for seed_fn in (torch.manual_seed, np.random.seed):
        seed_fn(seed)


reset_graph()

In [None]:
# Layer widths: D_in input features, H1 and H2 hidden units, D_out outputs.
D_in, H1, H2, D_out = 198, 256, 128, 1


def _two_hidden_layer_net(output_activation):
    """Build Linear -> LeakyReLU -> Linear -> LeakyReLU -> Linear -> output_activation."""
    stack = [
        torch.nn.Linear(D_in, H1),
        torch.nn.LeakyReLU(),
        torch.nn.Linear(H1, H2),
        torch.nn.LeakyReLU(),
        torch.nn.Linear(H2, D_out),
        output_activation,
    ]
    return torch.nn.Sequential(*stack)


# The actor's softmax runs over dim 0, i.e. across the rows of a batch of
# candidate boards; the critic squashes its single output with tanh.
# The actor is built first so parameter initialisation order is unchanged.
actor = _two_hidden_layer_net(torch.nn.Softmax(dim=0))
critic = _two_hidden_layer_net(torch.nn.Tanh())

In [None]:
def get_action_and_value(actor, boards):
    """Sample an action from the actor's output and return it with its probability.

    `boards` is a numpy array that is converted to a float tensor and fed to
    `actor`. The sampled index is a plain int; the returned probability is the
    matching entry of the actor output and stays attached to the graph.
    """
    probs = actor(torch.from_numpy(boards).float())
    # Sampling itself needs no gradient tracking.
    with torch.no_grad():
        sampled = torch.multinomial(probs.view(1, -1), 1)
        chosen = int(sampled)
    return chosen, probs[chosen]

def get_action_and_value_greedy(actor, boards):
    """Pick the most probable action and return it with its probability.

    Args:
        actor:  callable mapping a float tensor of boards to one probability
                per board.
        boards: numpy array of candidate boards (one row per candidate).

    Returns:
        (action, action_value): `action` is the int index of the largest
        actor output; `action_value` is that entry of the actor output and
        remains attached to the autograd graph.
    """
    boards = torch.from_numpy(boards).float()
    possible_actions_probs = actor(boards)
    # torch.argmax works directly on a tensor that requires grad; np.argmax
    # would first have to convert it to a numpy array, which fails for such
    # tensors.
    action = int(torch.argmax(possible_actions_probs))
    action_value = possible_actions_probs[action]
    return action, action_value

def get_action_value(actor, boards, action):
    """Return the actor's output for `boards` at index `action`.

    `boards` is a numpy array; it is converted to a float tensor first.
    """
    probs = actor(torch.from_numpy(boards).float())
    return probs[action]

def get_action(actor, boards):
    """Sample an action index from the actor's output, without tracking gradients."""
    with torch.no_grad():
        probs = actor(torch.from_numpy(boards).float())
        # One draw from the probabilities laid out as a single row.
        draw = torch.multinomial(probs.view(1, -1), 1)
    return int(draw)

def get_action_greedy(actor, boards):
    """Return the index of the largest actor output, without tracking gradients."""
    candidates = torch.from_numpy(boards).float()
    with torch.no_grad():
        best = np.argmax(actor(candidates))
    return int(best)

def get_state_value(nn_model, after_state):
    """Evaluate `nn_model` on `after_state` (a numpy array) as a float tensor."""
    return nn_model(torch.from_numpy(after_state).float())

def epsilon_greedy(critic, possible_boards, epsilon=1):
    """Choose a board index, greedily or uniformly at random.

    Note the meaning of `epsilon` here: with probability `epsilon` the index
    with the highest critic value is returned, otherwise a uniformly random
    index. The default of 1 therefore always picks the greedy index.
    """
    candidates = torch.from_numpy(possible_boards).float()
    scores = critic(candidates)
    go_greedy = np.random.random() < epsilon
    if go_greedy:
        # max over dim 0 returns (values, indices); only the index is needed.
        choice = scores.max(0)[1]
    else:
        choice = np.random.randint(0, len(candidates))
    return int(choice)

In [None]:
"""
Use: f = features(board, player)
Input: board is a 29-vector; player is 1 or -1
Output: f is a 198-vector of features that follows Tesauro's procedure.
        See p. 423 in Sutton & Barto
"""
def features(board, player):
    """Encode a board as a 198-element feature vector.

    Args:
        board:  indexable board; entries 1..24 are read as signed checker
                counts per point and entries 25..28 as four extra counters.
        player: 1 or -1; selects which of the last two features is set.

    Returns:
        numpy array `f` of length 198:
          * 8 slots per point i (1..24) starting at 8*(i-1). The first four
            slots describe a positive count, the last four a negative count:
            one slot each for exactly 1, 2 or 3 checkers, and a fourth slot
            holding (n-3)/2 for n > 3 checkers.
          * f[192], f[193] = board[25]/2, board[26]/2
          * f[194], f[195] = board[27]/15, board[28]/15
          * f[196], f[197] = indicators for player == 1 and player == -1
    """
    f = np.zeros(198)

    # define features for points on board
    for i in range(1, 25):
        point = board[i]
        if point == 0:
            continue
        # Positive counts use slots 0-3 of the point, negative counts 4-7.
        offset = 8 * (i - 1) + (0 if point > 0 else 4)
        magnitude = abs(point)
        if magnitude in (1, 2, 3):
            # Bug fix: the original wrote `f[p+4] == 1` (a comparison) for a
            # single negative checker, so that feature was never set.
            f[offset + int(magnitude) - 1] = 1
        else:
            f[offset + 3] = (magnitude - 3) / 2

    f[192] = board[25] / 2
    f[193] = board[26] / 2
    f[194] = board[27] / 15
    f[195] = board[28] / 15
    f[196] = int(player == 1)
    f[197] = int(player == -1)
    return f

In [None]:
import time
# Win rate of each playAgainstRandom() call, in call order.
rew_plt = []

def playAgainstRandom(num_games = 100, clear = True):
    """Play `num_games` games of the sampling actor (player 1) against the
    random mover (`env.make_move`, player -1), then plot the win-rate history.

    The fraction of games whose final reward is 1 is appended to the global
    `rew_plt`, the whole history is plotted and the latest value printed.
    If `clear` is true the cell output is cleared before plotting.
    """
    global rew_plt
    outcomes = []
    for _game in range(num_games):
        env.reset()
        finished = False

        while not finished:
            # Actor's turn: a double roll gives two consecutive moves.
            roll = B.roll_dice()
            for _sub in range(1 + int(roll[0] == roll[1])):
                candidate_moves, candidate_boards = env.legal_moves(roll, 1)
                if len(candidate_moves) == 0:
                    break
                choice = get_action(actor, candidate_boards)
                _, reward, finished = env.step(candidate_moves[choice], player = 1)
                if finished:
                    break

            if finished:
                continue

            # Random opponent's turn.
            roll = B.roll_dice()
            for _sub in range(1 + int(roll[0] == roll[1])):
                _, reward, finished = env.make_move(roll)
                if finished:
                    break

        outcomes.append(reward)

    if clear:
        clear_output(True)
    win_rate = np.mean(np.equal(outcomes, 1))
    rew_plt.append(win_rate)
    plt.plot(rew_plt)
    plt.axhline(0.5, color="gray")
    plt.show()
    print("Win rate:", rew_plt[-1])

In [None]:
import time
# Win rate of each playPubeval() call, in call order.
pubRew_plt = []

def playPubeval(num_games = 100, clear = True):
    """Play `num_games` games of the greedy actor (player 1) against
    `pubeval_tpr.agent_pubeval` (player -1), then plot the win-rate history.

    The fraction of games whose final reward is 1 is appended to the global
    `pubRew_plt`, the whole history is plotted and the latest value printed.
    If `clear` is true the cell output is cleared before plotting.
    """
    global pubRew_plt
    outcomes = []
    for _game in range(num_games):
        env.reset()
        finished = False

        while not finished:
            # Actor's turn: a double roll gives two consecutive moves.
            roll = B.roll_dice()
            for _sub in range(1 + int(roll[0] == roll[1])):
                candidate_moves, candidate_boards = env.legal_moves(roll, 1)
                if len(candidate_moves) == 0:
                    break
                choice = get_action_greedy(actor, candidate_boards)
                _, reward, finished = env.step(candidate_moves[choice], player = 1)
                if finished:
                    break

            if finished:
                continue

            # Pubeval opponent's turn.
            roll = B.roll_dice()
            for _sub in range(1 + int(roll[0] == roll[1])):
                reply = pubeval_tpr.agent_pubeval(np.copy(env.board), roll, oplayer = -1)
                _, reward, finished = env.step(reply, player = -1)
                if finished:
                    reward = -1
                    break

        outcomes.append(reward)

    if clear:
        clear_output(True)
    win_rate = np.mean(np.equal(outcomes, 1))
    pubRew_plt.append(win_rate)
    plt.plot(pubRew_plt)
    plt.axhline(0.5, color="gray")
    plt.show()
    print("Win rate:", pubRew_plt[-1])

In [None]:
# Actor-critic self-play training with per-seat eligibility traces.
# Both seats share the same actor and critic; after every turn the board is
# flipped (env.swap_player) so the side to move always plays as player 1.
gamma = 1               # discount factor
actor_alpha = 0.001     # actor step size
critic_alpha = 0.001    # critic step size
actor_lambda = 0.7      # actor trace decay
critic_lambda = 0.7     # critic trace decay
forever = 2000000       # number of training episodes
score = []              # per episode: seat (0 or 1) that was on move when the game ended

plt_iter = 200          # evaluate and plot every plt_iter episodes

env = backgammon()
tic = time.time()

for episode in range(1,forever+1):

    env.reset()
    done = False
    step = 1
    
    I = 1
    # One eligibility trace per seat and per parameter tensor.
    actor_Z = [ [0 for layer in actor.parameters()] for player in range(2) ]
    critic_Z = [ [0 for layer in critic.parameters()] for player in range(2) ]
    value = [[0,0],[0,0]] #value[player][0(old),1(new)]
        
    player = 0
    while not done:
        dice = B.roll_dice()
        # A double roll gives the mover two consecutive moves.
        for _ in range(1 + int(dice[0] == dice[1])):
            possible_moves, possible_boards = env.legal_moves(dice, 1)
            if len(possible_moves) == 0:
                # Fix: without a move the previous code kept `after_state`
                # from the opponent's last move (computed before the board
                # was flipped). Re-evaluate the unchanged board instead.
                after_state, reward, done = env.step([], player = 1)
                break
            action, pi = get_action_and_value(actor, possible_boards) # Using actor
            # Fix: clamp() is out-of-place, so its result must be kept;
            # otherwise log(0) can still produce inf/nan gradients.
            pi = pi.clamp(min=1e-8)
            log_pi = torch.log(pi) 
            actor.zero_grad()
            log_pi.backward()
            with torch.no_grad():
                for idx, param in enumerate(actor.parameters()):
                    actor_Z[player][idx] = actor_lambda * I * actor_Z[player][idx] + param.grad
            after_state, reward, done = env.step(possible_moves[action], player = 1)
            if done:
                break
                
        if not done:
            value[player][0] = float(value[player][1]) # old_value
            value[player][1] = get_state_value(critic, after_state)
            critic.zero_grad()
            value[player][1].backward()
            with torch.no_grad():
                for idx, param in enumerate(critic.parameters()):
                    critic_Z[player][idx] = critic_lambda * critic_Z[player][idx] + param.grad
            # NOTE(review): `step` only advances after seat 0's turn, so this
            # guard skips seat 0's second turn (step == 2) but applies seat 1's
            # second turn (step == 3) -- confirm the asymmetry is intended.
            if (step>2):
                with torch.no_grad():
                    reward = 0 # Reward is 0
                    delta = reward + gamma*value[player][1] - value[player][0]
                    
                    for idx, param in enumerate(actor.parameters()):
                        param += actor_alpha * delta * actor_Z[player][idx] 
                    for idx, param in enumerate(critic.parameters()):
                        param += critic_alpha * delta * critic_Z[player][idx]
            
            if player==0:
                I *= gamma
                step += 1
            # Hand the turn to the other seat and flip the board for it.
            player = 1 - player
            env.swap_player()
            
    score.append(player)
            
    # Terminal update: the value after the end of the game is 0 for both seats.
    value[player][0] = float(value[player][1])
    value[player][1] = 0
    
    value[1 - player][0] = float(value[1 - player][1])
    value[1 - player][1] = 0
    
    with torch.no_grad():
        # `player` made the last move and receives `reward`; the other seat
        # receives the negated reward.
        delta1 = reward + gamma*value[player][1] - value[player][0]
        delta2 = -reward + gamma*value[1 - player][1] - value[1 - player][0]
        
        for idx, param in enumerate(actor.parameters()):
            param += actor_alpha * delta1 * actor_Z[player][idx]
            param += actor_alpha * delta2 * actor_Z[1 - player][idx] 
        for idx, param in enumerate(critic.parameters()):
            param += critic_alpha * delta1 * critic_Z[player][idx]
            param += critic_alpha * delta2 * critic_Z[1 - player][idx]
            
    # Periodic checkpoint of both networks.
    if episode%5000 == 0:
        torch.save(critic.state_dict(), "critic_nn_nott.pth")
        torch.save(actor.state_dict(), "actor_nn_nott.pth")

    # Periodic evaluation against pubeval and the random player.
    if episode%plt_iter == 0:
        toc = time.time()
        playPubeval(num_games = 30, clear = True)
        playAgainstRandom(num_games = 30, clear = False)
        print('++++++++++++++++++++++++++++++')
        print('Win-rate in self-play: ', np.mean(np.equal(score,1)))
        print("Time per {}: {}".format(plt_iter, toc-tic))
        print("EPISODE: ", episode, "Steps:", step)
        # NOTE(review): 50 s pause after every evaluation; the reason is not
        # evident from this notebook.
        time.sleep(50)
        tic = time.time()
