In [1]:
import tensorflow as tf
import numpy as np
import random

In [2]:
#Basic tic tac toe class
class ttt:
    """Minimal tic-tac-toe board used by the training and evaluation cells.

    The board is a flat sequence of 9 cells in row-major order (indices 0-8).
    A cell holds ``None`` when empty, ``1`` for player 1 (shown as 'X') and
    ``2`` for player 2 (shown as 'O'). The static helpers accept any indexable
    board of length 9 (NumPy object array or plain list).
    """

    # Every triple of cell indices that forms a line: rows, columns, diagonals.
    WIN_STATES = (
        (0, 1, 2),
        (3, 4, 5),
        (6, 7, 8),
        (0, 3, 6),
        (1, 4, 7),
        (2, 5, 8),
        (0, 4, 8),
        (2, 4, 6),
    )

    def __init__(self):
        # Object dtype so cells can hold None as well as the integers 1 and 2.
        self.board = np.array([None] * 9)

    def update_board(self, player, move):
        """Write ``player`` into cell ``move``; no legality check is performed."""
        self.board[move] = player

    @staticmethod
    def check_for_win(board):
        """Score the current position.

        Returns:
            10 if player 1 owns a complete line, -10 if player 2 does,
            5 if neither does and no empty cell is left (tie),
            0 if the game is still in progress.
        """
        for a, b, c in ttt.WIN_STATES:
            if board[a] == board[b] == board[c]:
                if board[a] == 1:
                    return 10
                if board[a] == 2:
                    return -10
        if len(ttt.legal_moves(board)) == 0:
            return 5
        return 0

    @staticmethod
    def legal_moves(board):
        """Return a 1-D integer array with the indices of all empty cells.

        Emptiness is tested per cell with ``is None`` so the result is also
        correct for plain Python lists, where ``board == None`` would be a
        single scalar comparison rather than an elementwise one.
        """
        return np.flatnonzero([cell is None for cell in board])

    @staticmethod
    def display_board(board):
        """Print the board as a 3x3 grid, using 'X' for 1 and 'O' for 2."""
        symbols = {1: 'X', 2: 'O'}
        readable_board = [symbols.get(cell, ' ') for cell in board]
        for row in range(3):
            if row > 0:
                print('-----------')
            print(' {:1} | {:1} | {:1}'.format(*readable_board[3 * row:3 * row + 3]))

In [3]:
class Network:
    """Convolutional Q-network for tic-tac-toe (TensorFlow 1.x graph API).

    Building an instance creates the graph nodes and opens a session with all
    global variables initialised.

    Attributes:
        inputs: float32 placeholder of shape (batch, 3, 3, 3), read by the
            convolutions as channels-last.
        target_q: float32 placeholder of shape (batch, 9) with regression targets.
        output_q: linear output layer, 9 values per board (one per cell).
        probabilities: softmax over ``output_q``.
        mse: mean squared error between ``output_q`` and ``target_q``.
        train_step: one gradient-descent update (learning rate 0.005) on ``mse``.
        session: the ``tf.Session`` used for every run call.
    """

    def __init__(self):
        self.inputs = None
        self.output_q = None
        self.target_q = None
        self.train_step = None
        self.session = None
        self.probabilities = None
        self.mse = None
        self.build_network()
        self.get_session()

    def __del__(self):
        # The finalizer also runs when __init__ failed part-way (e.g. the graph
        # could not be built), in which case no session exists yet. Guarding
        # here keeps a secondary AttributeError from being raised on cleanup.
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()
            self.session = None

    def build_network(self):
        """Create placeholders, the conv/dense stack, the loss and the train op."""
        with tf.variable_scope('q_network', reuse=tf.AUTO_REUSE):
            self.inputs = tf.placeholder(tf.float32, shape=(None, 3, 3, 3))
            self.target_q = tf.placeholder(tf.float32, [None, 9])

            net = self.inputs
            # Three 3x3 'SAME' convolutions keep the 3x3 spatial size throughout.
            # NOTE(review): the kernel regularizers are attached to the layers,
            # but only `self.mse` is minimised below - presumably the
            # regularization terms never reach the optimiser; confirm intent.
            for filters in (128, 128, 64):
                net = tf.layers.conv2d(inputs=net, filters=filters, kernel_size=3,
                                       kernel_regularizer=tf.contrib.layers.l1_l2_regularizer(),
                                       data_format="channels_last", padding='SAME', activation=tf.nn.relu)
            net = tf.layers.flatten(net)
            net = tf.layers.dense(net, 243, activation=tf.nn.relu,
                                  kernel_initializer=tf.contrib.layers.variance_scaling_initializer())
            # No activation: the 9 outputs are unbounded Q-value estimates.
            self.output_q = tf.layers.dense(net, 9, activation=None,
                                            kernel_initializer=tf.contrib.layers.variance_scaling_initializer())
            self.probabilities = tf.nn.softmax(self.output_q)
            self.mse = tf.losses.mean_squared_error(predictions=self.output_q, labels=self.target_q)
            self.train_step = tf.train.GradientDescentOptimizer(learning_rate=0.005).minimize(self.mse)

    def get_session(self):
        """Open a session and run the global variable initialiser in it."""
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

In [4]:
class Player:
    """Trains the Q-network as player 1 ('X') against a uniformly random player 2.

    One training step is run after every finished game, using only the
    positions of that game (this is not experience replay).

    Attributes:
        nn: the ``Network`` being trained.
        games: list of ``ttt`` boards played by ``random_play``.
        board_position_log: boards seen by player 1 before each of its moves.
        action_log: the cell player 1 chose at each of those positions.
        values_log: the network's 9 Q-values at each of those positions.
        next_max_log: for each logged position, the value bootstrapped from
            what followed it (see ``calculate_targets``).
        rewards: final score of every game played so far (10 / -10 / 5).
        epsilon: probability of picking a random move instead of the greedy one.
    """

    GAMMA = 0.95              # discount applied to the bootstrapped value
    EPSILON_DECAY = 0.999991  # multiplicative decay applied after each game
    EPSILON_MIN = 0.05        # decay stops once epsilon is at or below this

    def __init__(self):
        self.nn = Network()
        self.games = [ttt() for i in range(1000)]
        self.board_position_log = []
        self.action_log = []
        self.next_max_log = []
        self.values_log = []
        self.rewards = []
        self.epsilon = 1

    @staticmethod
    def nn_game_state(board):
        """One-hot encode a 9-cell board as an integer array of shape (3, 3, 3).

        The result is indexed ``[row, col, plane]`` so that it matches the
        channels-last layout the network's convolutions are declared with:
        plane 0 marks player 1's cells, plane 1 player 2's cells and plane 2
        the empty cells. Exactly one plane is set for every cell.
        """
        planes = np.zeros((3, 3, 3), dtype=int)
        for i in range(len(board)):
            if board[i] == 1:
                plane = 0
            elif board[i] == 2:
                plane = 1
            else:
                plane = 2
            planes[i // 3, i % 3, plane] = 1
        return planes

    def new_game(self):
        """Clear the per-game logs (``rewards`` and ``epsilon`` are kept)."""
        self.board_position_log = []
        self.action_log = []
        self.next_max_log = []
        self.values_log = []

    def calculate_targets(self):
        """Build one regression target (9 values) per logged position.

        Each target is a copy of the network's own prediction with only the
        entry of the action taken replaced by ``GAMMA * next_max_log[i]``, so
        the other eight outputs contribute no error. The last entry of
        ``next_max_log`` is the final game score, which is therefore
        discounted by ``GAMMA`` as well.
        """
        targets = []
        for values, action, next_value in zip(self.values_log, self.action_log, self.next_max_log):
            target = np.copy(values)
            target[action] = self.GAMMA * next_value
            targets.append(target)
        return targets

    @staticmethod
    def _greedy_legal_move(scores, legal_moves):
        """Return the legal cell with the highest score (illegal cells count as -1)."""
        masked = np.full(9, -1.0)
        masked[legal_moves] = scores[legal_moves]
        return np.argmax(masked)

    @staticmethod
    def _p2_turn(game):
        """Let the random opponent mark one empty cell, if any is left."""
        legal_moves = ttt.legal_moves(game.board)
        if len(legal_moves) != 0:
            game.update_board(2, np.random.choice(legal_moves))

    def _p1_turn(self, game):
        """Play and log one epsilon-greedy move for player 1, then let player 2 reply."""
        legal_moves = ttt.legal_moves(game.board)

        self.board_position_log.append(game.board.copy())
        nn_board = self.nn_game_state(game.board).reshape(1, 3, 3, 3)
        q_values, probabilities = self.nn.session.run([self.nn.output_q, self.nn.probabilities],
                                                      feed_dict={self.nn.inputs: nn_board})
        q_values = np.copy(q_values).flatten()
        if random.uniform(0, 1) > self.epsilon:
            move = self._greedy_legal_move(np.array(probabilities).flatten(), legal_moves)
        else:
            move = np.random.choice(legal_moves)

        # From the second move on, the value of the action chosen now becomes
        # the bootstrapped value for the previous position. Note this is the
        # Q-value of the move actually played, not the maximum over all moves.
        if len(self.action_log) > 0:
            self.next_max_log.append(q_values[move])

        self.action_log.append(move)
        self.values_log.append(q_values)
        game.update_board(1, move)

        # Player 2 only replies while the game is still open. Letting it move
        # after player 1 has already won could add a second completed line and
        # make check_for_win report the wrong winner.
        if ttt.check_for_win(game.board) == 0:
            self._p2_turn(game)

    def random_play(self):
        """Play every game in ``self.games`` against the random opponent and train.

        Each call starts from fresh boards, so it can be run repeatedly to
        continue training. For every game: a coin flip decides who moves
        first, the game is played to the end, the final score closes
        ``next_max_log``, one training step is run on the game's positions and
        epsilon is decayed. Finally the win/loss/tie rates over all games
        recorded in ``self.rewards`` are printed.
        """
        # Boards left over from a previous call are finished; replaying them
        # would produce an empty training batch.
        self.games = [ttt() for _ in range(len(self.games))]

        for game in self.games:
            self.new_game()

            # randint(1, 2) == 2 means the random opponent opens the game.
            if random.randint(1, 2) == 2:
                self._p2_turn(game)
            while ttt.check_for_win(game.board) == 0:
                self._p1_turn(game)

            # The last logged position has no successor: its bootstrapped
            # value is the final score of the game.
            outcome = ttt.check_for_win(game.board)
            self.next_max_log.append(outcome)
            nn_input = [self.nn_game_state(x) for x in self.board_position_log]
            targets = self.calculate_targets()

            self.nn.session.run(self.nn.train_step,
                                feed_dict={self.nn.inputs: nn_input, self.nn.target_q: targets})
            self.rewards = np.append(self.rewards, outcome)

            if self.epsilon > self.EPSILON_MIN:
                self.epsilon *= self.EPSILON_DECAY

        results = np.asarray(self.rewards)
        total = len(results)
        print()
        print("Player 1 Wins: ", np.count_nonzero(results == 10) / total)
        print("Player 2 Wins: ", np.count_nonzero(results == -10) / total)
        print("Ties: ", np.count_nonzero(results == 5) / total)

In [None]:
# Build a player (which constructs its own Network) and train it by playing
# all of its games against the random opponent. The global `p` is read by the
# evaluation cells below (q_play / human_play), so this cell must run first.
p = Player()
p.random_play()

In [None]:
# Encode one sample mid-game position as a batch of one board, shape
# (1, 3, 3, 3), for the inspection cells below. Use `[None] * 9` instead to
# inspect the empty board.
sample_board = [1, None, 1, None, None, 2, None, None, 2]
b = Player.nn_game_state(sample_board).reshape(1, 3, 3, 3)
print(b)

In [None]:
# Raw network outputs (`output_q`) of the trained player for the sample position `b`.
p.nn.session.run(p.nn.output_q, feed_dict={p.nn.inputs: b})

In [None]:
# Softmax of those outputs (`probabilities`) for the same sample position `b`.
p.nn.session.run(p.nn.probabilities, feed_dict={p.nn.inputs: b})

In [None]:
def q_play(n_games=10000):
    """Evaluate the trained global player `p` against a uniformly random opponent.

    Player 1 always plays the legal move with the highest network probability
    (no exploration); player 2 picks uniformly among the empty cells. Who
    moves first is decided by a coin flip per game. No training happens here.

    Args:
        n_games: number of games to play (default 10000).

    Raises:
        ValueError: if ``n_games`` is not positive.

    Prints the fraction of player-1 wins, player-2 wins and ties.
    """
    if n_games <= 0:
        raise ValueError('n_games must be positive')

    def greedy_move(board, legal_moves):
        """Return the legal cell the network rates highest for `board`."""
        nn_board = Player.nn_game_state(board).reshape(1, 3, 3, 3)
        probabilities = p.nn.session.run(p.nn.probabilities,
                                         feed_dict={p.nn.inputs: nn_board})
        scores = np.array(probabilities).flatten()
        # Illegal cells get -1, below any softmax probability.
        masked = np.full(9, -1.0)
        masked[legal_moves] = scores[legal_moves]
        return np.argmax(masked)

    def p_turn(player, game):
        """Play one move for `player` (1 = network, 2 = random)."""
        legal_moves = ttt.legal_moves(game.board)
        if player == 1:
            move = greedy_move(game.board, legal_moves)
        else:
            # The random player needs no forward pass through the network.
            move = np.random.choice(legal_moves)
        game.update_board(player, move)

    outcomes = []
    for _ in range(n_games):
        game = ttt()
        first = random.randint(1, 2)
        p_order = (first, 3 - first)

        # Alternate turns until check_for_win reports a win or a tie.
        mover = 0
        while ttt.check_for_win(game.board) == 0:
            p_turn(p_order[mover], game)
            mover = 1 - mover

        outcomes.append(ttt.check_for_win(game.board))

    rewards = np.array(outcomes)
    total = len(rewards)
    # Trailing numbers are rates noted from earlier runs of this cell.
    print()
    print("Player 1 Wins: ", np.count_nonzero(rewards == 10) / total)  # 0.9122, 0.9005, 0.904
    print("Player 2 Wins: ", np.count_nonzero(rewards == -10) / total)  # 0.0127, 0.0035, 0.0014
    print("Ties: ", np.count_nonzero(rewards == 5) / total)  # 0.071, 0.096, 0.094

In [None]:
# Evaluate the trained global player `p` against the random opponent and print the rates.
q_play()

In [None]:
def human_play():
    """Play one interactive game: the trained global player `p` ('X') vs. a human ('O').

    A coin flip decides who moves first. The human is prompted for a cell
    index (0-8, row-major) and is asked again until the answer is an empty
    cell. The final board and the result are printed when the game ends.
    """
    game = ttt()

    def p_turn(game):
        """Let the network play the legal move it rates highest."""
        legal_moves = ttt.legal_moves(game.board)
        nn_board = Player.nn_game_state(game.board).reshape(1, 3, 3, 3)
        probabilities = p.nn.session.run(p.nn.probabilities,
                                         feed_dict={p.nn.inputs: nn_board})
        scores = np.array(probabilities).flatten()
        # Illegal cells get -1, below any softmax probability.
        masked = np.full(9, -1.0)
        masked[legal_moves] = scores[legal_moves]
        game.update_board(1, np.argmax(masked))

    def h_turn(game):
        """Show the board and read a legal move from the human."""
        ttt.display_board(game.board)
        open_cells = {int(cell) for cell in ttt.legal_moves(game.board)}
        while True:
            answer = input('Where would you like to play (0-8): ')
            try:
                move = int(answer)
            except ValueError:
                print('Please enter a whole number between 0 and 8.')
                continue
            # Rejects out-of-range numbers (including negatives, which would
            # otherwise index from the end) and cells that are already taken.
            if move in open_cells:
                break
            print('That square is not available; choose an empty square (0-8).')
        game.update_board(2, move)

    first = random.randint(1, 2)
    turns = (p_turn, h_turn) if first == 1 else (h_turn, p_turn)

    # Alternate turns until check_for_win reports a win or a tie.
    mover = 0
    while ttt.check_for_win(game.board) == 0:
        turns[mover](game)
        mover = 1 - mover

    # Always show the final position, whoever made the last move.
    ttt.display_board(game.board)

    print()
    result = ttt.check_for_win(game.board)
    if result == 10:
        print('AI wins')
    elif result == -10:
        print('You somehow beat the AI')
    else:
        print('You both played a perfect game. Tie')

In [None]:
# Start one interactive game against the trained global player `p` (prompts for input).
human_play()