In [46]:
import math
import os
import pickle
import random
import unittest
from random import shuffle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [47]:
class DraftGame:
    """Rules of a two-player draft over a fixed row of valued slots.

    A board is a 1-D integer array with one entry per slot: 0 means the slot
    is still free, 1 / -1 mean it was taken by the respective player.  Once
    every slot is taken, the player whose slots sum to the larger total of
    ``player_vals`` wins.
    """

    def __init__(self, vals):
        """Store the per-slot values (``vals[i]`` is the worth of slot ``i``)."""
        self.columns = 4
        self.player_vals = vals
        self.rounds = 0

    def get_board(self):
        """Return a fresh, empty board (all zeros)."""
        # The builtin ``int`` replaces ``np.int``, which was removed from NumPy.
        return np.zeros((self.columns,), dtype=int)

    def get_board_size(self):
        """Number of slots on the board."""
        return self.columns

    def get_action_size(self):
        """Number of distinct actions (one per slot)."""
        return self.columns

    def next_state(self, board, player, action):
        """Return ``(new_board, next_player)`` after ``player`` takes slot ``action``.

        The input board is not modified; the returned player is ``-player``.
        """
        b = np.copy(board)
        b[action] = player
        # Hand the turn to the other player.
        return (b, -player)

    def legal_moves(self, board):
        """Return True while at least one slot on ``board`` is still free."""
        return any(board[i] == 0 for i in range(len(board)))

    def valid_moves(self, board):
        """Return a 0/1 list with a 1 for every slot that is still free."""
        return [1 if board[index] == 0 else 0 for index in range(self.columns)]

    def is_win(self, board, player):
        """Return True if the board is full and ``player`` out-scores the opponent.

        Always False while free slots remain.  Every slot not held by
        ``player`` is credited to the opponent.
        """
        if self.legal_moves(board):
            return False

        playerscore = 0
        otherscore = 0
        for index in range(self.columns):
            if board[index] == player:
                playerscore += self.player_vals[index]
            else:
                otherscore += self.player_vals[index]
        return bool(playerscore > otherscore)

    def reward_for_player(self, board, player):
        """Score ``board`` from ``player``'s point of view.

        Returns 1 for a win, -1 for a loss, 0 for a finished game with equal
        totals, and None while the game is still in progress.
        """
        if self.is_win(board, player):
            return 1
        if self.is_win(board, -player):
            return -1
        if self.legal_moves(board):
            return None

        return 0

    def canonical_board(self, board, player):
        """Return the board as seen by ``player`` (their own slots become +1)."""
        return player * board

In [48]:
class GameModel(nn.Module):
    """Small fully connected network with a policy head and a value head."""

    def __init__(self, board_size, action_size, device):
        """Build the layers and move the module to ``device``."""
        super().__init__()

        self.device = device
        self.size = board_size
        self.action_size = action_size

        # Shared trunk: two 16-unit linear layers.
        self.fc1 = nn.Linear(self.size, 16)
        self.fc2 = nn.Linear(16, 16)

        # Output heads: one score per action, and a single scalar value.
        self.action_head = nn.Linear(16, self.action_size)
        self.value_head = nn.Linear(16, 1)

        self.to(device)

    def forward(self, x):
        """Return ``(policy, value)`` for a batch of boards.

        The policy is a softmax over dim 1; the value is squashed by tanh.
        """
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        policy = F.softmax(self.action_head(hidden), dim=1)
        value = torch.tanh(self.value_head(hidden))
        return policy, value

    def predict(self, board):
        """Run a single board through the network without tracking gradients.

        ``board`` is a NumPy array; it is cast to float32 and reshaped to a
        batch of one.  Returns the policy and value of that single sample as
        NumPy arrays.  Switches the module to eval mode as a side effect.
        """
        batch = torch.FloatTensor(board.astype(np.float32)).to(self.device)
        batch = batch.view(1, self.size)
        self.eval()
        with torch.no_grad():
            pi, v = self.forward(batch)

        pi_np = pi.data.cpu().numpy()
        v_np = v.data.cpu().numpy()
        return pi_np[0], v_np[0]

In [49]:
def ucb_score(parent, child):
    """Score ``child`` for selection from ``parent``.

    The score is the sum of a prior-weighted exploration bonus and, once the
    child has been visited, the negated mean value of the child.
    """
    exploration = child.prior * math.sqrt(parent.visit_count) / (child.visit_count + 1)
    # Unvisited children contribute no value term.
    exploitation = -child.value() if child.visit_count > 0 else 0
    return exploitation + exploration


class Node:
    """One node of the search tree: visit statistics, prior and children."""

    def __init__(self, prior, to_play):
        self.visit_count = 0
        self.to_play = to_play
        self.prior = prior
        self.value_sum = 0
        self.children = {}
        self.state = None

    def expanded(self):
        """True once the node has at least one child."""
        return len(self.children) > 0

    def value(self):
        """Mean backed-up value, or 0 for a node that was never visited."""
        if self.visit_count == 0:
            return 0
        return self.value_sum / self.visit_count

    def select_action(self, temperature):
        """
        Pick an action from the children's visit counts.

        ``temperature == 0`` takes the most visited child, ``inf`` picks
        uniformly at random, anything else samples proportionally to
        ``visit_count ** (1 / temperature)``.
        """
        actions = list(self.children.keys())
        visit_counts = np.array([self.children[a].visit_count for a in actions])

        if temperature == 0:
            return actions[np.argmax(visit_counts)]
        if temperature == float("inf"):
            return np.random.choice(actions)

        weights = visit_counts ** (1 / temperature)
        weights = weights / sum(weights)
        return np.random.choice(actions, p=weights)

    def select_child(self):
        """Return ``(action, child)`` with the highest ``ucb_score``.

        Returns ``(-1, None)`` when the node has no children.
        """
        best = (-np.inf, -1, None)
        for action, child in self.children.items():
            score = ucb_score(self, child)
            if score > best[0]:
                best = (score, action, child)

        return best[1], best[2]

    def expand(self, state, to_play, action_probs):
        """
        Attach ``state`` to this node and create one child per action whose
        probability is non-zero, storing that probability as the child's prior.
        """
        self.to_play = to_play
        self.state = state
        for a, prob in enumerate(action_probs):
            if prob == 0:
                continue
            self.children[a] = Node(prior=prob, to_play=self.to_play * -1)

    def __repr__(self):
        # Compact one-line summary, intended for debugging.
        prior = "{0:.2f}".format(self.prior)
        return "{} Prior: {} Count: {} Value: {}".format(
            str(self.state), prior, self.visit_count, self.value()
        )


class MCTS:
    """Monte Carlo tree search guided by the network's priors and values."""

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args

    def _masked_priors(self, model, state):
        """Return ``(priors, value)`` for ``state`` with invalid moves zeroed
        out and the remaining priors renormalised to sum to 1."""
        action_probs, value = model.predict(state)
        valid_moves = self.game.valid_moves(state)
        action_probs = action_probs * valid_moves  # mask invalid moves
        action_probs /= np.sum(action_probs)
        return action_probs, value

    def run(self, model, state, to_play):
        """Build a search tree rooted at ``state`` and return the root node.

        Runs ``args['num_simulations']`` simulations.  Each one walks down by
        ``select_child`` until it reaches an unexpanded node, evaluates that
        node (game result if finished, network value otherwise) and backs the
        value up along the path.
        """
        root = Node(0, to_play)

        # Expand the root with the network's masked priors.
        root_priors, _ = self._masked_priors(model, state)
        root.expand(state, to_play, root_priors)

        for _ in range(self.args['num_simulations']):
            leaf = root
            path = [leaf]

            # Selection: descend until an unexpanded node is reached.
            while leaf.expanded():
                action, leaf = leaf.select_child()
                path.append(leaf)

            parent = path[-2]
            # States are stored canonically, so the mover is always player 1;
            # the result is then flipped to the other player's perspective.
            moved_board, _ = self.game.next_state(parent.state, player=1, action=action)
            leaf_state = self.game.canonical_board(moved_board, player=-1)

            leaf_player = parent.to_play * -1
            value = self.game.reward_for_player(leaf_state, player=1)
            if value is None:
                # Game still running: expand the leaf using the network.
                leaf_priors, value = self._masked_priors(model, leaf_state)
                leaf.expand(leaf_state, leaf_player, leaf_priors)

            self.backpropagate(path, value, leaf_player)

        return root

    def backpropagate(self, search_path, value, to_play):
        """Add ``value`` to every node on the path, from leaf to root.

        Nodes whose ``to_play`` equals ``to_play`` receive ``+value``, all
        others ``-value``; every node's visit count grows by one.
        """
        for node in reversed(search_path):
            if node.to_play == to_play:
                node.value_sum += value
            else:
                node.value_sum += -value
            node.visit_count += 1

In [50]:
class Trainer:
    """Generates self-play games with MCTS and fits the model to them."""

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)

    def exceute_episode(self):
        """Play one self-play game and return its training examples.

        Returns:
            A list of ``(canonical_board, policy_target, value_target)``
            tuples, one per move.  The policy target is the normalised root
            visit-count distribution; the value target is the final reward
            seen from the player who was to move in that position.
        """
        train_examples = []
        current_player = 1
        state = self.game.get_board()

        while True:
            canonical_board = self.game.canonical_board(state, current_player)

            # A fresh search tree is built for every move.
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            # Policy target: root visit counts normalised to sum to 1.
            visit_counts = np.zeros(self.game.get_action_size())
            for action, child in root.children.items():
                visit_counts[action] = child.visit_count
            action_probs = visit_counts / np.sum(visit_counts)
            train_examples.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=0)
            state, current_player = self.game.next_state(state, current_player, action)
            reward = self.game.reward_for_player(state, current_player)

            if reward is not None:
                # ``reward`` is from the view of ``current_player``; flip the
                # sign for positions in which the other player was to move.
                return [
                    (hist_state, hist_action_probs,
                     reward if hist_player == current_player else -reward)
                    for hist_state, hist_player, hist_action_probs in train_examples
                ]

    def learn(self):
        """Alternate self-play and training for ``args['numIters']`` iterations.

        Each iteration plays ``args['numEps']`` games, trains on the shuffled
        examples and writes a checkpoint to ``args['checkpoint_path']``.

        Returns:
            ``(pi_losses, v_losses)``: one mean policy loss and one mean value
            loss per iteration.
        """
        pi_losses = []
        v_losses = []
        for i in range(1, self.args['numIters'] + 1):

            print("{}/{}".format(i, self.args['numIters']))

            train_examples = []
            for _ in range(self.args['numEps']):
                train_examples.extend(self.exceute_episode())

            shuffle(train_examples)
            pi_loss, v_loss = self.train(train_examples)
            pi_losses.append(pi_loss)
            v_losses.append(v_loss)
            self.save_checkpoint(folder=".", filename=self.args['checkpoint_path'])
        return pi_losses, v_losses

    def train(self, examples):
        """Fit the model to ``examples`` for ``args['epochs']`` epochs.

        Batches are drawn with replacement.  At least one batch is run per
        epoch, even when there are fewer examples than ``args['batch_size']``.

        Args:
            examples: sequence of ``(board, policy_target, value_target)``.

        Returns:
            ``(mean_policy_loss, mean_value_loss)`` over every batch processed.

        Raises:
            ValueError: if ``examples`` is empty.
        """
        if len(examples) == 0:
            raise ValueError("train() needs at least one training example")

        optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        # Train on whatever device the model lives on instead of assuming CUDA.
        device = next(self.model.parameters()).device
        batch_size = self.args['batch_size']
        num_batches = max(1, len(examples) // batch_size)

        pi_losses = []
        v_losses = []

        for epoch in range(self.args['epochs']):
            self.model.train()

            for _ in range(num_batches):
                sample_ids = np.random.randint(len(examples), size=batch_size)
                boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                boards = torch.tensor(np.array(boards), dtype=torch.float32, device=device)
                target_pis = torch.tensor(np.array(pis), dtype=torch.float32, device=device)
                target_vs = torch.tensor(np.array(vs), dtype=torch.float32, device=device)

                # Forward pass and combined loss.
                out_pi, out_v = self.model(boards)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                pi_losses.append(float(l_pi))
                v_losses.append(float(l_v))

                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

            print("Policy Loss", np.mean(pi_losses))
            print("Value Loss", np.mean(v_losses))
            print("Examples:")
            print(out_pi[0].detach())
            print(target_pis[0])

        # Returned only after every epoch has run.
        return np.mean(pi_losses), np.mean(v_losses)

    def loss_pi(self, targets, outputs):
        """Mean cross-entropy between target and predicted distributions."""
        # Clamp so an exact-zero probability cannot turn 0 * log(0) into NaN.
        loss = -(targets * torch.log(outputs.clamp_min(1e-8))).sum(dim=1)
        return loss.mean()

    def loss_v(self, targets, outputs):
        """Mean squared error between target and predicted values."""
        loss = torch.sum((targets - outputs.view(-1)) ** 2) / targets.size()[0]
        return loss

    def save_checkpoint(self, folder, filename):
        """Save the model's ``state_dict`` to ``folder/filename``."""
        os.makedirs(folder, exist_ok=True)

        filepath = os.path.join(folder, filename)
        torch.save({
            'state_dict': self.model.state_dict(),
        }, filepath)

In [51]:
# Train a model on a single value set

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

args = dict(
    batch_size=64,
    numIters=500,                         # training iterations
    num_simulations=100,                  # MCTS simulations run per move decision
    numEps=100,                           # self-play games per iteration
    numItersForTrainExamplesHistory=20,
    epochs=2,                             # training epochs per iteration
    checkpoint_path='latest.pth',         # file receiving the latest weights
)

vals = [62, 91, 63, 55]
game = DraftGame(vals)

# Network input and output sizes both come from the game definition.
board_size, action_size = game.get_board_size(), game.get_action_size()

model = GameModel(board_size, action_size, device)

trainer = Trainer(game, model, args)
pi_losses, v_losses = trainer.learn()

In [52]:
# Plot model losses
# One row per training iteration.  Columns are built by name so each curve
# keeps its own label (passing v_losses as the index mislabelled them).
losses = pd.DataFrame({'pi_loss': pi_losses, 'v_loss': v_losses})
figfile = str(vals) + '.png'
ax = losses.plot.line(figsize=(16, 8))
ax.set(xlabel='training iteration', ylabel='mean loss')
ax.figure.savefig(figfile)



## Draft Game Test

In [53]:
# Draft Game Test: let the trained policy head play both sides of one game.
# Dedicated names are used so this demo does not overwrite the `vals` / `game`
# that describe the trained model (the Save Model cell builds its file name
# from `vals`).
demo_vals = [12, 10, 8, 4]
demo_game = DraftGame(demo_vals)
b = np.zeros((demo_game.get_board_size(),), dtype=int)
player = 1
legal = True
while legal:
    canonical_b = demo_game.canonical_board(b, player)
    print("canonical: ", canonical_b, "normal: ", b)
    priors = model.predict(canonical_b)[0]
    print(priors)
    valid = demo_game.valid_moves(b)
    # Take the highest-prior move that is still available.
    for action in np.argsort(priors, axis=0)[::-1]:
        if valid[action] == 1:
            b, player = demo_game.next_state(b, player, action)
            break
    legal = demo_game.legal_moves(b)
    

## Learned Strategy vs Random and Greedy

In [54]:
def learned_choice(model, game, b, player=1):
    """Play the model's highest-prior valid move for ``player``.

    Works for any number of actions: candidates are tried from the highest
    to the lowest prior and the first one that is still valid is played.

    Args:
        model: object whose ``predict(board)`` returns ``(priors, value)``.
        game: game rules providing ``canonical_board``, ``valid_moves`` and
            ``next_state``.
        b: current board (not modified).
        player: the side to move, 1 by default.

    Returns:
        The new board after the move.

    Raises:
        ValueError: if ``b`` has no valid move left.
    """
    canonical_b = game.canonical_board(b, player)
    priors = model.predict(canonical_b)[0]
    valid = game.valid_moves(b)
    for action in np.argsort(priors, axis=0)[::-1]:
        if valid[action] == 1:
            new_board, _ = game.next_state(b, player, action)
            return new_board
    raise ValueError("learned_choice called on a board with no valid moves")

def rand_choice(b, game, player = -1):
    """Claim a uniformly random free slot for ``player``.

    Args:
        b: current board (not modified).
        game: game rules providing ``valid_moves``.
        player: marker written into the chosen slot, -1 by default.

    Returns:
        A copy of ``b`` with the chosen slot set to ``player``.

    Raises:
        IndexError: if no slot is free.
    """
    options = [i for i, flag in enumerate(game.valid_moves(b)) if flag == 1]
    choice_index = random.choice(options)
    new_board = np.copy(b)
    # Honour the ``player`` argument rather than always writing -1.
    new_board[choice_index] = player
    return new_board
    

def greedy_choice(b, game, player = -1):
    """Claim the free slot with the largest ``game.player_vals`` entry.

    Ties go to the lowest slot index.

    Args:
        b: current board (not modified).
        game: game rules providing ``valid_moves`` and ``player_vals``.
        player: marker written into the chosen slot, -1 by default.

    Returns:
        A copy of ``b`` with the chosen slot set to ``player``.

    Raises:
        IndexError: if no slot is free.
    """
    options = [i for i, flag in enumerate(game.valid_moves(b)) if flag == 1]
    if not options:
        raise IndexError("greedy_choice called on a board with no valid moves")

    # max() keeps the first of equal candidates, i.e. the lowest index.
    best_ind = max(options, key=lambda i: game.player_vals[i])
    new_board = np.copy(b)
    # Honour the ``player`` argument rather than always writing -1.
    new_board[best_ind] = player
    return new_board

In [55]:
def _play_series(num_games, starting_player, model, vals, opponent_move, opponent_label):
    """Play ``num_games`` games of learned policy (+1) versus ``opponent_move`` (-1).

    Prints the win/tie/loss percentages and returns the learned policy's win
    fraction.  ``opponent_move(board, game)`` must return the next board.
    """
    if num_games <= 0:
        raise ValueError("num_games must be a positive integer")

    # The game object holds no per-game state, so one instance serves all games.
    game = DraftGame(vals)
    learn_tally = 0
    opponent_tally = 0
    tie_tally = 0

    for _ in range(num_games):
        # Builtin ``int`` replaces ``np.int``, which was removed from NumPy.
        board = np.zeros((game.get_board_size(),), dtype=int)
        current_player = starting_player

        # Alternate moves until no free slot remains.
        while 0 in board:
            if current_player == 1:
                board = np.copy(learned_choice(model, game, board, player=1))
                current_player = -1
            else:
                board = np.copy(opponent_move(board, game))
                current_player = 1

        if game.is_win(board, 1):
            learn_tally += 1
        elif game.is_win(board, -1):
            opponent_tally += 1
        else:
            tie_tally += 1

    print("learned strategy win rate was: ", (learn_tally/num_games)*100, "%")
    print("tie rate was: ", (tie_tally/num_games)*100, "%")
    print(opponent_label + " strategy win rate was: ", (opponent_tally/num_games)*100, "%")
    return (learn_tally/num_games)


def against_rand(num_games, starting_player,model, game, vals):
    """Evaluate the learned policy against a uniformly random opponent.

    Args:
        num_games: number of games to play (must be positive).
        starting_player: 1 if the learned policy moves first, otherwise the
            random opponent does.
        model: trained model handed to ``learned_choice``.
        game: unused; a ``DraftGame(vals)`` is built internally.  Kept so
            existing callers keep working.
        vals: per-slot values defining the game.

    Returns:
        Fraction of games won by the learned policy.
    """
    return _play_series(num_games, starting_player, model, vals, rand_choice, "random")


def against_greedy(num_games, starting_player, model, game, vals):
    """Evaluate the learned policy against the greedy (highest value) opponent.

    Args:
        num_games: number of games to play (must be positive).
        starting_player: 1 if the learned policy moves first, otherwise the
            greedy opponent does.
        model: trained model handed to ``learned_choice``.
        game: unused; a ``DraftGame(vals)`` is built internally.  Kept so
            existing callers keep working.
        vals: per-slot values defining the game.

    Returns:
        Fraction of games won by the learned policy.
    """
    return _play_series(num_games, starting_player, model, vals, greedy_choice, "greedy")

In [56]:
def results(model, game, vals):
    """Print the learned policy's record against the greedy and random
    opponents, once moving first and once moving second for each."""
    matchups = (
        ('Against greedy -- learned going first: ', against_greedy, 5, 1),
        ('Against greedy -- greedy going first: ', against_greedy, 5, -1),
        ('Against rand -- learned going first: out of 100', against_rand, 100, 1),
        ('Against rand -- rand going first: out of 100', against_rand, 100, -1),
    )
    for banner, play_series, num_games, first_player in matchups:
        print(banner)
        play_series(num_games, first_player, model, game, vals)

In [57]:
def train(vals):
    """Train a fresh model on the draft game defined by ``vals``.

    Also saves a plot of the per-iteration losses to ``'<vals>.png'``.

    Args:
        vals: per-slot values defining the game.

    Returns:
        ``(model, game)``: the trained ``GameModel`` and its ``DraftGame``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    args = {
        'batch_size': 64,
        'numIters': 500,                                # Total number of training iterations
        'num_simulations': 100,                         # Total number of MCTS simulations to run when deciding on a move to play
        'numEps': 100,                                  # Number of full games (episodes) to run during each iteration
        'numItersForTrainExamplesHistory': 20,
        'epochs': 2,                                    # Number of epochs of training per iteration
        'checkpoint_path': 'latest.pth'                 # location to save latest set of weights
    }

    game = DraftGame(vals)
    board_size = game.get_board_size()
    action_size = game.get_action_size()

    model = GameModel(board_size, action_size, device)

    trainer = Trainer(game, model, args)
    pi_losses, v_losses = trainer.learn()

    # Build the columns by name so each curve keeps its own label; passing
    # v_losses as the index and renaming afterwards swapped the two labels.
    losses = pd.DataFrame({'pi_loss': pi_losses, 'v_loss': v_losses})
    figfile = str(vals) + '.png'
    losses.plot.line(figsize=(16, 8)).figure.savefig(figfile)

    return model, game


def evaluate(model, vals):
    """Print the evaluation report of ``model`` on the game defined by ``vals``."""
    print('FINAL RESULTS')
    results(model, DraftGame(vals), vals)

In [58]:
# Save Model: pickle the whole model object under a name derived from `vals`.
model_name = 'model' + str(vals)
filename = model_name
# The context manager closes the file even if pickling fails.
with open(model_name, 'wb') as model_file:
    pickle.dump(model, model_file)

In [59]:
# Load the list of value sets from value_sets.txt.
# NOTE(review): pickle.load can execute arbitrary code; only open files you
# created yourself.
with open("value_sets.txt", "rb") as new_filename:
    vals_list_test = pickle.load(new_filename)
# The following cells iterate over `vals_list`, so bind it here instead of
# relying on a variable left over from an earlier kernel session.
vals_list = vals_list_test
vals_list

In [60]:
# Train and save one model for each value set in value_sets.txt.
for vals in vals_list:
    print('CURRENT VALS IS: ', vals)
    model, game = train(vals)
    # Save the model under a name derived from its value set.
    strvals = str(vals)
    model_name = 'model' + strvals
    filename = model_name
    # The context manager closes each file before the next model is trained.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)
    

In [61]:
# Load a previously pickled model and evaluate it.
# The file name follows the 'model' + str(vals) pattern used when saving, so
# the value set appears both in the path and in the evaluate() call below.
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# produced by this notebook.
with open('model[62, 91, 63, 55]', 'rb') as file:  
    model = pickle.load(file)

# Prints the learned policy's record against the greedy and random opponents.
evaluate(model, [62, 91, 63, 55])