# Learn to Score a Tic-Tac-Toe Board by Example

## Introduction 


We want to use machine learning to support intelligent agents playing Tic-Tac-Toe (see [rules](https://en.wikipedia.org/wiki/Tic-tac-toe)). Here is the approach:

1. Simulate playouts to estimate create training data labeled with if they lead to a win.
2. Learn a model to predict the label for each board.
3. The model can be applied as:
   - the heuristic evaluation function for Heuristic Minimax Search.
   - a playout policy for better simulated games used in Pure Monte Carlo Search/Monte Carlo Tree Search.

## The board

I represent the board as a vector of length 9. The values are `' ', 'x', 'o'`.  

In [1]:
import numpy as np
import math

In [2]:
def empty_board():
    return [' '] * 9

board = empty_board()
display(board)

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

Some helper functions.

In [3]:
def show_board(board):
    """display the board"""
    b = np.array(board).reshape((3,3))
    print(b)

board = empty_board()
show_board(board)    

print()
print("Add some x's")
board[0] = 'x'; board[3] = 'x'; board[6] = 'x';  
show_board(board)

[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]

Add some x's
[['x' ' ' ' ']
 ['x' ' ' ' ']
 ['x' ' ' ' ']]


In [4]:
def check_win(board):
    """check the board and return one of x, o, d (draw), or n (for next move)"""
    
    board = np.array(board).reshape((3,3))
    
    diagonals = np.array([[board[i][i] for i in range(len(board))], 
                          [board[i][len(board)-i-1] for i in range(len(board))]])
    
    for a_board in [board, np.transpose(board), diagonals]:
        for row in a_board:
            if len(set(row)) == 1 and row[0] != ' ':
                return row[0]
    
    # check for draw
    if(np.sum(board == ' ') < 1):
        return 'd'
    
    return 'n'

show_board(board)
print('Win? ' + check_win(board))

print()
show_board(empty_board())
print('Win? ' + check_win(empty_board()))

[['x' ' ' ' ']
 ['x' ' ' ' ']
 ['x' ' ' ' ']]
Win? x

[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]
Win? n


In [5]:
def get_actions(board):
    """return possible actions as a vector ot indices"""
    return np.where(np.array(board) == ' ')[0].tolist()

    # randomize the action order
    #actions = np.where(np.array(board) == ' ')[0]
    #np.random.shuffle(actions)
    #return actions.tolist()


show_board(board)
get_actions(board)

[['x' ' ' ' ']
 ['x' ' ' ' ']
 ['x' ' ' ' ']]


[1, 2, 4, 5, 7, 8]

In [6]:
def result(state, player, action):
    """Add move to the board."""
    
    state = state.copy()
    state[action] = player
  
    return state

show_board(empty_board())

print()
print("State for placing an x at position 4:")
show_board(result(empty_board(), 'x', 4))

[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]

State for placing an x at position 4:
[[' ' ' ' ' ']
 [' ' 'x' ' ']
 [' ' ' ' ' ']]


In [7]:
def other(player): 
    if player == 'x': return 'o'
    else: return 'x'

In [8]:
def utility(state, player = 'x'):
    """check is a state is terminal and return the utility if it is. None means not a terminal mode."""
    goal = check_win(state)        
    if goal == player: return +1 
    if goal == 'd': return 0  
    if goal == other(player): return -1  # loss is failure
    return None # continue

print(utility(['x'] * 9))
print(utility(['o'] * 9))
print(utility(empty_board()))

1
-1
None


# Create Training Data using Playouts


We will try to learn a function $y = h(X)$ where $X$ is a board and $y$ is the utility. The data we need can be creating by running playouts (complete games) and recording the boards and if the playout lead to a win or not.  

To make the data useful for learning, I recode `x` and `o` into numbers.

In [9]:
tr = {' ': 0, 'x': 1, 'o': -1} # I translate the board into numbers

def encode_state(state):
    """Represent the board using numbers."""
    return [tr[s] for s in state]

def playout_record(record = 'x'):
    """Run a playout and record the boards after the player record's move."""
    state = empty_board()
    player = 'x'
    current_player = 'x'
    
    boards = []
    
    while(True):
        # reached terminal state?
        u = utility(state, player)
        if u is not None: return(boards, [u] * len(boards))
  
        # we use a random playout policy
        a = np.random.choice(get_actions(state))
        state = result(state, current_player, a)   
  
        if current_player == record:
            boards.append(encode_state(state))

        # switch between players
        current_player = other(current_player)

playout_record()

([[0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 0, 1, 0, 0, -1, 1],
  [0, 0, 0, 0, 1, 1, -1, -1, 1],
  [1, 0, 0, -1, 1, 1, -1, -1, 1]],
 [1, 1, 1, 1])

Run `N` playouts and create a pandas dataframe for `X` and a list for y.

In [10]:
import pandas as pd

def create_data(N = 100, record = 'x'):
    board = []
    utility = []
    
    for i in range(N):
        b, u = playout_record(record)
        board.extend(b)
        utility.extend(u)
        
    return {'X': pd.DataFrame(board), 'y': np.array(utility)}
        
data = create_data(1000)
X = data['X']
y = data['y']

print("X")
display(X)

print("y")
display(y[0:10])

X


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,-1,0,0,0,1
2,0,-1,1,0,-1,0,1,0,1
3,0,-1,1,-1,-1,0,1,1,1
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
4139,-1,1,-1,0,1,0,1,0,0
4140,-1,1,-1,1,1,-1,1,0,0
4141,0,0,0,1,0,0,0,0,0
4142,0,-1,0,1,0,0,1,0,0


y


array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

## Train a Model

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

Split the data in training and testing data.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

We learn a ANN to approximate $y = f(X)$ by $\hat{y} = h(X)$. See
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [13]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(
                    hidden_layer_sizes = (100),
                    #verbose = True,
                    max_iter = 1000)              # max. number of iterations
                    
%time clf.fit(X_train, y_train)

CPU times: user 11.2 s, sys: 10.7 ms, total: 11.2 s
Wall time: 11.2 s


MLPClassifier(hidden_layer_sizes=100, max_iter=1000)

Test the model against the test data.

In [14]:
pred = clf.predict(X_test)

print("y_test:\t", list(y_test)[0:10])
print("pred:\t",   pred[0:10])

print("Confusion matrix:\n", confusion_matrix(pred, y_test))
print("Accuracy:", accuracy_score(pred, y_test))

y_test:	 [1, 1, -1, 1, 1, 1, -1, 1, -1, 1]
pred:	 [ 1  1  1 -1  1 -1  1  1  1  0]
Confusion matrix:
 [[ 82  16  56]
 [  9  52  16]
 [120  49 429]]
Accuracy: 0.6791314837153196


__Note:__ The accuracy is not great since we have many boards with only a few moves on it. Since these boards can easily lead to wins, losses or ties, they produce many errors.

Here is the number of empty cells for each board in the test set.

In [15]:
(X_test == 0).sum(axis=1)

1929    8
160     2
3433    4
377     8
2671    8
       ..
3903    4
3703    8
624     6
1006    4
3141    4
Length: 829, dtype: int64

Test only on boards that have only two cells left to play.

In [16]:
take = list((X_test == 0).sum(axis=1)<=2)

X_test2 = X_test[take]
y_test2 = y_test[take]

In [17]:
pred2 = clf.predict(X_test2)
print("Accuracy:", accuracy_score(pred2, y_test2))

Accuracy: 0.8268398268398268


## Some Tests

Given that o is moving next, who is predicted to win (-1 = `o`, 0 = draw, 1 = `x`). 

In [18]:
# x will win

board = ['x', 'x', ' ',
         'o', 'x', ' ',
         'o', ' ', ' ']

print("Board:")
show_board(board)

%time display(clf.predict(pd.DataFrame([encode_state(board)])))
print(clf.predict_proba(pd.DataFrame([encode_state(board)])))

Board:
[['x' 'x' ' ']
 ['o' 'x' ' ']
 ['o' ' ' ' ']]


array([1])

CPU times: user 25.5 ms, sys: 35 ms, total: 60.5 ms
Wall time: 9.03 ms
[[0.00543554 0.00355217 0.99101229]]


In [19]:
# o will win

board = ['o', 'x', ' ',
         'x', 'o', 'x',
         ' ', ' ', ' ']
    
print("Board:")
show_board(board)

%time display(clf.predict(pd.DataFrame([encode_state(board)])))
print(clf.predict_proba(pd.DataFrame([encode_state(board)])))

Board:
[['o' 'x' ' ']
 ['x' 'o' 'x']
 [' ' ' ' ' ']]


array([-1])

CPU times: user 18.1 ms, sys: 31.1 ms, total: 49.1 ms
Wall time: 6.8 ms
[[0.60756948 0.03865519 0.35377533]]


In [20]:
# o can draw if it chooses 7.

board = ['x', 'o', 'x',
         ' ', 'o', 'x',
         ' ', ' ', ' ']

print("Board:")
show_board(board)

%time display(clf.predict(pd.DataFrame([encode_state(board)])))
print(clf.predict_proba(pd.DataFrame([encode_state(board)])))

Board:
[['x' 'o' 'x']
 [' ' 'o' 'x']
 [' ' ' ' ' ']]


array([-1])

CPU times: user 23.9 ms, sys: 32 ms, total: 55.9 ms
Wall time: 7.73 ms
[[0.37998027 0.31633482 0.30368491]]


In [21]:
# A bad situation for x

board = ['o', 'x', 'x',
         ' ', ' ', 'x',
         ' ', ' ', 'o']

print("Board:")
show_board(board)

%time display(clf.predict(pd.DataFrame([encode_state(board)])))
print(clf.predict_proba(pd.DataFrame([encode_state(board)])))

Board:
[['o' 'x' 'x']
 [' ' ' ' 'x']
 [' ' ' ' 'o']]


array([-1])

CPU times: user 5.92 ms, sys: 167 µs, total: 6.09 ms
Wall time: 4.91 ms
[[9.93887736e-01 9.33134309e-04 5.17912976e-03]]


## Using the Predictions

The predict function can be used for
   - the heuristic evaluation function for Heuristic Minimax Search.
   - a playout policy for better simulated games used in Pure Monte Carlo Search/Monte Carlo Tree Search.

Use the model as a heuristic evaluation function for all possible moves given a board. `x` moves next.

In [25]:
def eval_fun_ML(state, player = 'x'):
    p = clf.predict_proba(pd.DataFrame([encode_state(state)]))
    val = np.sum(p * [-1, 0 , 1])
    #print(state, "; win prob=", p, "; expeced utility =", val)
    return val
    

def eval_moves(state, player = 'x'):
    actions = get_actions(state)    

    for a in actions: 
        b = result(board.copy(), player, a)
        val = eval_fun_ML(b, player)
        print("%s chooses %d; predicted utility = %+1.2f" % (player, a, val))

print("Empty board: Place in the center.")
board = empty_board()
show_board(board)
eval_moves(board)

print("\nPlay 7 to avoid loss.")
board = ['x', 'o', 'x',
         ' ', 'o', ' ',
         ' ', ' ', ' ']
show_board(board)
eval_moves(board)

print("\nPlay 4 to win.")
board = ['o', 'x', ' ',
         ' ', ' ', 'x',
         ' ', ' ', 'o']
show_board(board)
eval_moves(board)


Empty board: Place in the center.
[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]
x chooses 0; predicted utility = +0.44
x chooses 1; predicted utility = -0.01
x chooses 2; predicted utility = +0.40
x chooses 3; predicted utility = +0.03
x chooses 4; predicted utility = +0.54
x chooses 5; predicted utility = +0.18
x chooses 6; predicted utility = +0.35
x chooses 7; predicted utility = +0.25
x chooses 8; predicted utility = +0.43

Play 7 to avoid loss.
[['x' 'o' 'x']
 [' ' 'o' ' ']
 [' ' ' ' ' ']]
x chooses 3; predicted utility = -0.75
x chooses 5; predicted utility = -0.08
x chooses 6; predicted utility = -0.24
x chooses 7; predicted utility = +0.75
x chooses 8; predicted utility = +0.66

Play 4 to win.
[['o' 'x' ' ']
 [' ' ' ' 'x']
 [' ' ' ' 'o']]
x chooses 2; predicted utility = -0.99
x chooses 3; predicted utility = -0.67
x chooses 4; predicted utility = +0.36
x chooses 6; predicted utility = -0.08
x chooses 7; predicted utility = -0.49
