In [1]:
import torch
import copy
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import Backgammon as B
import agent as A
import flipped_agent as FA

In [2]:
class backgammon:
    """Small environment wrapper around the `Backgammon` module (imported as `B`).

    Attributes:
        board: the current board array, as produced by `B.init_board()` /
            `B.update_board()`.
        done: True once `step` has detected a finished game; cleared by
            `reset` and `choose_board`.
    """

    def __init__(self):
        # Go through reset() so that `done` exists on a fresh environment
        # (previously it was only created by reset/choose_board/step).
        self.reset()

    def reset(self):
        """Put the board back to `B.init_board()` and clear the `done` flag."""
        self.board = B.init_board()
        self.done = False

    def choose_board(self, board):
        """Continue the game from `board` and clear the `done` flag.

        The array is stored as given (it is not copied here), so callers that
        want to keep their own array untouched should pass a copy.

        Returns:
            A copy of the board now held by the environment.
        """
        self.board = board
        self.done = False
        return np.copy(self.board)

    def legal_moves(self, dice, player):
        """Ask `B.legal_moves` for the moves available to `player` with `dice`.

        Returns:
            `(moves, boards)` where `boards` is the `np.vstack` of the boards
            reported by `B.legal_moves`. When no board is reported the result
            is `([], self.board)`, i.e. the current board itself, not a stack.
        """
        moves, boards = B.legal_moves(board=self.board, dice=dice, player=player)
        if len(boards) == 0:
            return [], self.board
        return moves, np.vstack(boards)

    def swap_player(self):
        """Replace the board with `FA.flip_board` applied to a copy of it."""
        self.board = FA.flip_board(board_copy=np.copy(self.board))

    def make_move(self, dice):
        """Play a uniformly random legal move for the opponent (player -1).

        When the opponent has no legal move the board is left unchanged, but
        `step` is still called so the game-over check runs.

        Returns:
            The `(board_copy, reward, done)` triple produced by `step`.
        """
        moves, _ = self.legal_moves(dice, -1)
        if len(moves) == 0:
            return self.step([], -1)
        move = moves[np.random.randint(len(moves))]
        return self.step(move, -1)

    def step(self, move, player):
        """Apply every sub-move in `move` for `player`, then check for game over.

        Args:
            move: sequence of sub-moves, each forwarded to `B.update_board`;
                an empty sequence leaves the board untouched.
            player: the player making the move (1 or -1 in this notebook).

        Returns:
            `(board_copy, reward, done)`. `reward` is `player` when
            `B.game_over` reports a finished game after the move and 0
            otherwise; `done` mirrors that check.
        """
        for m in move:
            self.board = B.update_board(board=self.board, move=m, player=player)
        self.done = bool(self.iswin())
        reward = player if self.done else 0
        return np.copy(self.board), reward, self.done

    def iswin(self):
        """Return `B.game_over` evaluated on the current board."""
        return B.game_over(self.board)

    def render(self):
        """Print the current board with `B.pretty_print`."""
        B.pretty_print(self.board)

In [3]:
def reset_graph(seed=42):
    """Seed the global PyTorch and NumPy random generators with `seed`."""
    for seed_fn in (torch.manual_seed, np.random.seed):
        seed_fn(seed)


# Fix the seeds once, right away, so the cells below are reproducible.
reset_graph()

In [4]:
# Layer widths: D_in inputs, three hidden layers (H1, H2, H3) and D_out outputs.
D_in, H1, H2, H3, D_out = 29, 32, 64, 128, 1


def _build_net(output_activation):
    """Return the D_in-H1-H2-H3-D_out MLP shared by all three networks.

    LeakyReLU follows every hidden layer; `output_activation` is applied to
    the output layer.
    """
    widths = (D_in, H1, H2, H3, D_out)
    layers = []
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        layers += [torch.nn.Linear(n_in, n_out), torch.nn.LeakyReLU()]
    # The output layer gets the requested activation instead of LeakyReLU.
    layers[-1] = output_activation
    return torch.nn.Sequential(*layers)


# The actor's softmax runs over dim 0, i.e. across the rows it is given.
actor = _build_net(torch.nn.Softmax(dim=0))
# Critic and transient memory squash their output with tanh.
critic = _build_net(torch.nn.Tanh())
memory = _build_net(torch.nn.Tanh())

# Snapshot of the transient memory's initial parameters.
initial_memory = list(map(copy.deepcopy, memory.parameters()))

In [5]:
def get_action_and_value(actor, boards):
    """Sample a candidate board from the actor and return its index and probability.

    `boards` (a NumPy array) is converted to a float tensor and fed to
    `actor`; the output is flattened into one row of sampling weights.

    Returns:
        `(action, action_value)`: the sampled index as an int and the actor
        output at that index, still attached to the autograd graph.
    """
    probs = actor(torch.from_numpy(boards).float())
    # Only the sampling is excluded from autograd; `probs` keeps its graph.
    with torch.no_grad():
        sampled = torch.multinomial(probs.view(1, -1), 1)
        action = int(sampled)
    return action, probs[action]

def get_action_value(actor, boards, action):
    """Return the actor's output for the candidate board at index `action`.

    `boards` (a NumPy array) is converted to a float tensor first; the result
    stays attached to the autograd graph.
    """
    board_tensor = torch.from_numpy(boards).float()
    return actor(board_tensor)[action]

def get_action(actor, boards):
    """Sample the index of a candidate board from the actor, without autograd.

    The actor output for `boards` is flattened into one row of sampling
    weights and a single index is drawn from it.
    """
    with torch.no_grad():
        weights = actor(torch.from_numpy(boards).float()).view(1, -1)
        choice = torch.multinomial(weights, 1)
    return int(choice)

def get_state_value(nn_model, after_state):
    """Evaluate `nn_model` on `after_state` (a NumPy array) as a float tensor.

    Autograd is left enabled, so the caller can backpropagate through the
    returned value.
    """
    return nn_model(torch.from_numpy(after_state).float())

def epsilon_greedy(critic, possible_boards, epsilon=.9):
    """Pick the index of a candidate board, greedily with probability `epsilon`.

    Note the convention: `epsilon` is the chance of taking the board the
    critic rates highest; otherwise a uniformly random index is returned.
    """
    candidates = torch.from_numpy(possible_boards).float()
    scores = critic(candidates)
    be_greedy = np.random.random() < epsilon
    if be_greedy:
        chosen = torch.max(scores, dim=0)[1]
    else:
        chosen = np.random.randint(0, len(candidates))
    return int(chosen)

def composite_greedy(critic, memory, possible_boards, epsilon=1):
    """Pick a candidate board using the sum of critic and memory values.

    With probability `epsilon` (always, at the default of 1) the index of the
    highest combined value is returned; otherwise a uniformly random index.
    """
    candidates = torch.from_numpy(possible_boards).float()
    # Composite value: critic estimate plus the transient memory's estimate.
    combined = critic(candidates) + memory(candidates)
    be_greedy = np.random.random() < epsilon
    if be_greedy:
        chosen = torch.max(combined, dim=0)[1]
    else:
        chosen = np.random.randint(0, len(candidates))
    return int(chosen)

In [8]:
def search(pre_state, pre_value, n_dreams, max_steps = 1000, verbose = True):
    """
    Use: search(pre_state, pre_value, n_dreams, max_steps)
    Input: pre_state is the board every dream starts from,
           pre_value is the value of the last after state (start of the TD chain),
           n_dreams is number of dreams,
           max_steps is maximum number of rounds played in each dream,
           verbose switches the per-dream / per-step debug printout (default on)
    Output: nothing is returned. The global transient `memory` network is reset
            to `initial_memory` and then updated in place from the dreamed
            episodes. The global `env` is used to play the dreams and is put
            back on a copy of pre_state before returning, even when a dream
            raises.
    Globals used: env, critic, memory, initial_memory, gamma, memory_lambda,
                  memory_alpha.
    """
    # Forget what earlier searches learned: restore the transient memory
    with torch.no_grad():
        for param, initial in zip(memory.parameters(), initial_memory):
            param.data.copy_(initial)

    try:
        # Dream n_dreams
        for dreams in range(n_dreams):
            # Clear eligibility trace (one entry per memory parameter)
            memory_Z = [0 for layer in memory.parameters()]
            done = False
            step = 1
            env.choose_board(np.copy(pre_state))
            old_value = pre_value

            if verbose:
                print(' +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ')
                print('dream: ', dreams)
                print('pre_state: ', pre_state)
                env.render()
                print(' +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ')

            # play one round and update
            while not (done or step > max_steps):
                # Sanity check: each side's signed checker counts must sum to +/-15
                if (sum(env.board[env.board>0])!=15 or sum(env.board[env.board<0])!=-15):
                    print('++++++++++ --------- \n ++++++++++')
                    print('TOOO MANY!!!!!!!!!!!!!!!!!!!!!!!!!!')
                    print('++++++++++ --------- \n ++++++++++')
                dice = B.roll_dice()
                if verbose:
                    print(dice)
                # A double is played as two consecutive moves with the same dice
                for i in range(1 + int(dice[0] == dice[1])):
                    if verbose:
                        print('step: ', step)
                    possible_moves, possible_boards = env.legal_moves(dice, 1)

                    if len(possible_moves) == 0:
                        # No legal move: legal_moves hands back the current board,
                        # which becomes the after state of this round
                        after_state = possible_boards
                        break

                    if verbose:
                        env.render()
                    # Use composite value function to choose action
                    action = composite_greedy(critic, memory, possible_boards)
                    after_state, reward, done = env.step(possible_moves[action], player = 1)
                    if done:
                        break
                if not done:
                    critic_value = get_state_value(critic, after_state)
                    memory_value = get_state_value(memory, after_state)
                    value = critic_value + memory_value
                    # Gradient of the composite value; only the memory's part is used
                    memory.zero_grad()
                    critic.zero_grad()
                    value.backward()
                    with torch.no_grad():
                        for i, param in enumerate(memory.parameters()):
                            memory_Z[i] = memory_lambda * memory_Z[i] + param.grad
                else:
                    # Terminal after state has value 0
                    value = 0

                with torch.no_grad():
                    # other players move
                    if not done:
                        dice = B.roll_dice()
                        for i in range(1 + int(dice[0] == dice[1])):
                            _, reward, done = env.make_move(dice)
                            if done:
                                break

                    # TD error between consecutive after states
                    delta = reward + gamma*value - old_value
                    old_value = value

                    # apply gradients (only the transient memory learns while dreaming)
                    for i, param in enumerate(memory.parameters()):
                        param += memory_alpha * delta * memory_Z[i]

                step +=1
    finally:
        # Hand the shared environment back on the real position, also when a
        # dream failed part-way through
        env.choose_board(np.copy(pre_state))


In [9]:
# TD(lambda) training of the critic on after states. From the second round on,
# the move is chosen with the composite (critic + transient memory) value after
# a dream search; the opponent plays random legal moves.

# Discount factor
gamma = 0.99
# Step sizes
actor_alpha = 0.01
critic_alpha = 0.01
memory_alpha = 0.01
# Eligibility-trace decay rates
actor_lambda = 0.7
critic_lambda = 0.7
memory_lambda = 0.7
# Number of training episodes
forever = 49

# Progress is plotted on every plt_iter-th episode
plt_iter = 100
rew = []
rew_plt = []

from time import time
tic = time()

for episode in range(forever):
    print('\n \n \n +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n \n \n')
    print('episode: ', episode)
    print('\n \n \n +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n \n \n')
    env = backgammon()
    done = False
    I = 1
    step = 1
    # actor_Z and I are only needed by the disabled actor update further down
    actor_Z = [0 for layer in actor.parameters()]
    critic_Z = [0 for layer in critic.parameters()]
    n_dreams = 5
    max_rounds = 5

    while not done:
        dice = B.roll_dice()
        # If the player cannot move this round, the after state is simply the
        # current board (instead of a stale after state from an earlier round).
        after_state = np.copy(env.board)
        # A double is played as two consecutive moves with the same dice
        for i in range(1 + int(dice[0] == dice[1])):
            possible_moves, possible_boards = env.legal_moves(dice, 1)

            if len(possible_moves) == 0:
                break

            if (step > 1):
                # Dream from the board as it is right now. search() leaves env
                # on the board it was given, so passing the pre-turn state here
                # would undo the first move of a double before the second one
                # is applied.
                search(np.copy(env.board), old_value, n_dreams, max_rounds)
                print('\n ------------- \n POST DREAM')
                env.render()
                action = composite_greedy(critic, memory, possible_boards)
                print('\n ------------- \n POST ACTION')
                env.render()
            else:
                action = epsilon_greedy(critic, possible_boards) # No search on first step

            # Disabled actor update, kept for reference:
            #   action, pi = get_action_and_value(actor, possible_boards) # Using actor
            #   log_pi = torch.log(pi.clamp(min=1e-8)) # so that log does not become nan
            #   actor.zero_grad()
            #   log_pi.backward()
            #   with torch.no_grad():
            #       for i, param in enumerate(actor.parameters()):
            #           actor_Z[i] = actor_lambda * I * actor_Z[i] + param.grad

            after_state, reward, done = env.step(possible_moves[action], player = 1)
            if done:
                break
        if not done:
            # Accumulate the critic's eligibility trace for this after state
            value = get_state_value(critic, after_state)
            critic.zero_grad()
            value.backward()
            with torch.no_grad():
                for i, param in enumerate(critic.parameters()):
                    critic_Z[i] = critic_lambda * critic_Z[i] + param.grad
        else:
            # Terminal after state has value 0
            value = 0

        with torch.no_grad():
            # other players move
            if not done:
                dice = B.roll_dice()
                for i in range(1 + int(dice[0] == dice[1])):
                    next_state, reward, done = env.make_move(dice)
                    if done:
                        break
                print('\n ------------- \n POST OPPONENT')
                env.render()

            # TD error between consecutive after states; the first round has no
            # previous after state, so no update is made there
            if step > 1:
                delta = reward + gamma*value - old_value
                for j, param in enumerate(critic.parameters()):
                    param += critic_alpha * delta * critic_Z[j]
                #for j, param in enumerate(actor.parameters()):
                #    param += actor_alpha * delta * actor_Z[j]
            old_value = value
        I *= gamma
        step += 1

    ###### plot
    if episode%plt_iter == 0:
        clear_output(True)
        print('Reward: ',reward)
        # Share of wins (reward 1) since the last plot; skipped while no
        # finished episode has been recorded, which would only add a NaN point
        if rew:
            rew_plt.append(np.mean(np.equal(rew,1)))
        rew = []
        plt.plot(rew_plt)
        plt.axhline(0.5, color="gray")
        plt.show()
        print("Episode: {}".format(episode))
        toc=time()
        print('time per',plt_iter,':',toc-tic)
        tic=toc
    ######

    rew.append(reward)
    #actor_alpha *= 0.99
    #critic_alpha *= 0.99


 
 
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
 
 

episode:  0

 
 
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
 
 


 ------------- 
 POST OPPONENT
board: 
 [-2.  0.  0.  0.  3.  2.  0.  3.  0.  0.  0. -4.]
[-1.  1.  0.  0.  0. -4.  0. -3. -1.  0.  0.  5.]
[1. 0. 0. 0.]
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
dream:  0
pre_state:  [ 0. -2.  0.  0.  0.  3.  2.  0.  3.  0.  0.  0. -4.  5.  0.  0. -1. -3.
  0. -4.  0.  0.  0.  1. -1.  1.  0.  0.  0.]
board: 
 [-2.  0.  0.  0.  3.  2.  0.  3.  0.  0.  0. -4.]
[-1.  1.  0.  0.  0. -4.  0. -3. -1.  0.  0.  5.]
[1. 0. 0. 0.]
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
[2 6]
step:  1
board: 
 [-2.  0.  0.  0.  3.  2.  0.  3.  0.  0.  0. -4.]
[-1.  1.  0.  0.  0. -4.  0. -3. -1.  0.  0.  5.]
[1. 0. 0. 0.]
[5 3]
step:  2
board: 
 [-2.  0.  0.  0.  3.  2.  1.  3.  0.  0.  0. -2.]
[-1.  2.  0. -1.  0. -4.  0. -2. -1.  0. -2.  4.]

ValueError: zero-size array to reduction operation maximum which has no identity

In [None]:
# Win rate recorded by the training loop: each entry of rew_plt is the share of
# episodes with reward 1 since the previous plot.
fig, ax = plt.subplots()
ax.plot(rew_plt)
ax.axhline(0.5, color="gray")  # reference line at a 50% win rate
ax.set_xlabel("Plot checkpoint (one every plt_iter episodes)")
ax.set_ylabel("Share of episodes won")
ax.set_title("Win rate against the random opponent")
plt.show()

In [None]:
    # Hand-built test position: board index -> signed checker count.
    # The positive entries sum to 15 and the negative ones to -15.
    placement = {1: -2, 12: -5, 17: -3, 19: -5, 6: 5, 8: 3, 13: 5, 24: 2}
    board = np.zeros(29)
    for point, checkers in placement.items():
        board[point] = checkers

In [None]:
# Display the hand-built test position.
board

In [None]:
# Total of the positive entries of the test position.
board[board > 0].sum()

In [None]:
# True when the negative entries of the test position do NOT sum to -15.
board[board < 0].sum() != -15

In [None]:
# NOTE(review): `param` is not defined in this cell; it is whatever a previous
# loop over network parameters left behind, so this fails on a fresh kernel.
print(param)

In [None]:
# Actor output for one sampled move from a fresh board.
# legal_moves needs both the dice and the player, and get_action_value needs
# the index of the chosen board; the previous calls omitted them and raised
# TypeError.
opening_dice = B.roll_dice()
possible_moves, possible_boards = backgammon().legal_moves(opening_dice, 1)
sampled_action = get_action(actor, possible_boards)
get_action_value(actor, possible_boards, sampled_action)

In [None]:
# Print the board of a freshly created environment.
backgammon().render()