# Basic Network Notebook

This notebook contains the definition for a basic network that will 
play tic tac toe. This notebook also contains code to train this 
network using Q-learning.


In [1]:
# Import the Tic Tac Toe game implementation from the local `game` module
from game import TicTacToe

# Make a game
# This single instance is shared by the rest of the notebook: later cells
# read `toe.size` and `toe.empty` from it and run matches via `toe.play_game`
toe = TicTacToe()

# Define Network

Below is the code to create a neural network to play the 
tic tac toe game. 

In [2]:
# Import Tensorflow (the TF 1.x graph/session API is used throughout)
import tensorflow as tf

# Reset graph
# Clears the default graph so that re-running the graph-building cells
# below does not pile duplicate ops onto a previous run's graph
tf.reset_default_graph()


In [3]:
# Define settings for the network

# Batch Size
# None is used as the leading dimension of the placeholders defined below,
# so the number of boards per feed is not fixed in advance
batch_size = None

# Define settings
# Number of cells on the board, i.e. the length of a flattened board vector
board_size = toe.size * toe.size

# Trying for simple DNN right now
# NOTE(review): presumably chosen to match board_size on a 3x3 board —
# TODO confirm against toe.size
hidden = 9

# Define output space size, one output for each space
# Network will output the space it wants to play on
action_space = board_size

# Learning Rate
# Step-size setting intended for the optimizer
learning_rate = 1e-2

In [4]:
# Define network itself
# A single linear layer: each flattened board is mapped to one score per
# board space, and the highest-scoring space is the chosen action.

# Board Input: one row per board, one column per board space
board = tf.placeholder(dtype=tf.float32, 
                       shape=[batch_size, board_size],
                       name="Board")

# Weights
# The first dimension must equal the width of `board` (board_size) for
# tf.matmul(board, W) to be valid. It was previously `hidden`, which only
# worked because hidden happened to equal board_size.
W = tf.Variable(tf.random_normal(shape=[board_size, action_space]),
                name="Weights")
# Biases: one per output space, starting at zero
b = tf.Variable(tf.zeros(shape=[action_space]),
                name="Biases")

# Output: predicted score for every space, shape [batch, action_space]
pred = tf.nn.bias_add(tf.matmul(board, W), b)

# Action: index of the best-scoring space for each board in the batch
action = tf.argmax(pred,
                   name="Action",
                   axis=1)


In [5]:
# Define training for the network

# Targets for future learning: one row per board, one value per action
targets = tf.placeholder(dtype=tf.float32,
                         shape=[batch_size, action_space],
                         name="Targets")

# Define loss as distance from targets squared, averaged over every
# element of the batch
loss = tf.reduce_mean(tf.square(pred - targets))

# Define training function to minimize loss
# The step size comes from the `learning_rate` setting declared with the
# other network settings rather than a separate hard-coded value, so there
# is exactly one place to tune it.
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)


In [6]:
# Initialize network 
# Open the session that every later cell uses to run graph ops
sess = tf.Session()
# Op that assigns the initial values to all variables created above
init = tf.global_variables_initializer()

# Variables hold no values until this op has been run in the session
sess.run(init)

# Training the Network

This will define how the AI will learn from previous games. 

Basically two steps - 
- Observation : play some games, record each win or loss, and assign rewards
- Training    : based on the observations and calculated rewards, update values in 
the neural network to better fit the problem

Repeat observation for a given number of games. After making observations, 
train in batches to improve on the previous steps. Repeat and train against 
the previous version of itself. Keep iterating until it can consistently beat its 
previous version, and then make the current version the previous version.

In [7]:
# Play against random player for testing

from game import random_player

# Opponent handed to Qplayer.play_game / Qplayer.win_ratio in the cells below
enemy = random_player


In [8]:
# Define variables for training

# Number of recorded steps sampled (without replacement) for each
# mini-batch update inside Qplayer.train
train_batch_size = 1000

# Games to play before training
observation_size = 10000

# Number of mini-batch updates performed per Qplayer.train call
training_steps = 1000


In [9]:
# Define method to save rewards and results from a game

def label_step(reward, state, action):
    """Bundle one move into a step record with state, action and reward."""
    step = {}
    step["state"] = state
    step["action"] = action
    step["reward"] = reward
    return step


def get_reward(step):
    """Return the reward stored in a step record."""
    return step["reward"]


def get_state(step):
    """Return the board state stored in a step record."""
    return step["state"]


def get_action(step):
    """Return the action stored in a step record."""
    return step["action"]


def get_target(step):
    """Return the training target for a step: its action scaled by its reward."""
    chosen = get_action(step)
    payoff = get_reward(step)
    return chosen * payoff


def label_states(reward, states, actions, decay = 0.85):
    """Turn one finished game into a list of labelled steps.

    States must be fed in playing order (first turn, second turn, third
    turn, and so on), and only the actions of a single player should be
    fed.

    The step at index i receives ``reward * decay ** (turns - i)``, where
    ``turns`` is ``len(states)``, so earlier moves are discounted more
    than later ones.

    Returns a list of labelled steps, one per state.
    """
    total_turns = len(states)
    labelled = []
    for turn in range(total_turns):
        discounted = reward * decay ** (total_turns - turn)
        labelled.append(label_step(discounted, states[turn], actions[turn]))
    return labelled
    

In [10]:
# Define a Q player based on previous functions

import numpy as np

class Qplayer:
    """Network-backed tic tac toe player that records its games and trains on them.

    The player asks a TensorFlow session for a move (``action_op``), can
    record the (board, action) pairs it produced, turns finished games into
    reward-labelled observations, and fits the network to those
    observations with ``train_op``.

    Relies on notebook-level names: ``toe`` / ``TicTacToe`` (the game),
    ``label_states`` / ``get_state`` / ``get_target`` (observation helpers)
    and ``training_steps`` / ``train_batch_size`` (training defaults).
    """

    def __init__(self, sess, action_op, train_op, board_input, target_input, action_space):
        """Store the session and graph handles and create empty buffers.

        Args:
            sess: session used to run ``action_op`` and ``train_op``.
            action_op: op evaluated to obtain the chosen action index.
            train_op: op evaluated to perform one training update.
            board_input: feed key for board vectors.
            target_input: feed key for training targets.
            action_space: number of distinct actions; used to one-hot
                encode recorded actions.
        """
        # Moves recorded during the game currently being played
        self.state_batch = []
        self.action_batch = []
        # Labelled steps from finished games, consumed by train()
        self.observations = []

        self.board_size = toe.size * toe.size

        self.sess = sess
        self.train_op = train_op
        self.action_op = action_op
        self.board_input = board_input
        self.target_input = target_input
        self.action_space = action_space

    def get_player(self, record=False):
        """Return a ``get_move(board, player)`` callable driven by the network.

        Args:
            record: when true, every board vector and one-hot action the
                callable produces is appended to the recording buffers.
        """
        def get_move(board, player):
            # Encode the board from this player's point of view:
            # own piece -> 1, empty space -> 0, anything else -> -1
            board_vector = board.get_board_vector(
                lambda x: 1 if x == player
                else 0 if x == toe.empty
                else -1)

            network_input = np.reshape(board_vector, [1, self.board_size])
            act = self.sess.run(self.action_op,
                                feed_dict={self.board_input: network_input})
            # A batch of one board was fed, so take the single result
            act = act[0]

            if record:
                # One-hot encode with this instance's action_space rather
                # than the notebook global of the same name
                act_vector = np.eye(self.action_space)[act]
                self.action_batch.append(act_vector)
                self.state_batch.append(board_vector)

            # Convert the flat action index into (row, column)
            return TicTacToe.make_move(player, act // board.size, act % board.size)
        return get_move

    def process_game(self, reward):
        """Label the moves recorded for the finished game and reset the buffers.

        Args:
            reward: final reward of the game, passed to ``label_states``.
        """
        self.observations.extend(label_states(reward, self.state_batch, self.action_batch))
        self.state_batch.clear()
        self.action_batch.clear()

    def train(self, steps=None, batch_size=None):
        """Fit the network to the stored observations, then discard them.

        Args:
            steps: number of mini-batch updates to run. Defaults to the
                notebook-level ``training_steps``.
            batch_size: observations sampled without replacement for each
                update. Defaults to the notebook-level
                ``train_batch_size`` and is capped at the number of stored
                observations.

        Does nothing when no observations are stored.
        """
        if steps is None:
            steps = training_steps
        if batch_size is None:
            batch_size = train_batch_size

        total = len(self.observations)
        if total == 0:
            # Sampling from an empty pool would raise; nothing to learn from
            return
        # Sampling without replacement cannot draw more items than exist
        sample_size = min(batch_size, total)

        for _ in range(steps):
            picks = np.random.choice(total, size=sample_size, replace=False)
            mini_batch = [self.observations[index] for index in picks]
            board_batch = [get_state(observation) for observation in mini_batch]
            target_batch = [get_target(observation) for observation in mini_batch]
            self.sess.run(self.train_op,
                          feed_dict={
                              self.board_input: np.array(board_batch),
                              # Feed the reward targets here; the boards
                              # themselves must not be used as targets
                              self.target_input: np.array(target_batch),
                          })
        self.observations.clear()

    def play_game(self, enemy, train=True):
        """Play one game as 'X' against ``enemy`` and return the outcome.

        Args:
            enemy: opponent move function, playing as 'O'.
            train: when true, moves are recorded and the finished game is
                stored as observations with reward 1 for a win, -1 for a
                loss and 0 otherwise.

        Returns:
            "win", "tie" or "lose" from this player's point of view.
        """
        winner = toe.play_game(self.get_player(train), enemy,
                               player_1_piece='X', player_2_piece='O', log=False)
        if train:
            if winner == 'X':
                reward = 1
            elif winner == 'O':
                reward = -1
            else:
                reward = 0
            self.process_game(reward)
        return ("win" if winner == 'X'
                else "tie" if winner == toe.tie
                else "lose")

    def win_ratio(self, enemy, games = 1000):
        """Play ``games`` non-recorded games against ``enemy`` and tally outcomes.

        Returns:
            Dict with the counts of "win", "tie" and "lose" results.
        """
        results = {"win":0, "tie":0, "lose":0}
        for _ in range(games):
            results[self.play_game(enemy, train=False)] += 1
        return results
    

In [11]:
# Make Q player
# Build the Q player from the session, the graph ops and the placeholders
# defined in the network and training cells above
q_player = Qplayer(sess, action, train_step, board, targets, action_space)
q_player

<__main__.Qplayer at 0x29ac9280ba8>

In [12]:
# Play 1000 test games (the win_ratio default) against the random player
# to get a baseline before any training
q_player.win_ratio(enemy)

{'lose': 972, 'tie': 4, 'win': 24}

In [13]:
# Viewing the win ratio should not save observations, because win_ratio
# plays its games with train=False
q_player.observations
# Result should be []

[]

In [None]:
# Have the Q player play against the random player for one observation
# period, recording every game, and report progress every 1000 games
for games_played in range(1, observation_size + 1):
    q_player.play_game(enemy)
    if games_played % 1000 == 0:
        print("played", games_played, "games")

played 1000 games
played 2000 games


In [None]:
# After playing for an observation size, train the network on the
# recorded observations
q_player.train()
# Play 1000 test games against the random player to measure the effect
q_player.win_ratio(enemy)


In [None]:
# Repeat the observe -> train -> evaluate cycle 1000 times
for _ in range(1000):
    # Observation phase: record one observation period of games
    for games_played in range(1, observation_size + 1):
        q_player.play_game(enemy)
        if games_played % 1000 == 0:
            print("played", games_played, "games")

    # Training phase: fit the network to what was just observed
    q_player.train()
    # Evaluation phase: 1000 test games against the random player
    print(q_player.win_ratio(enemy))


In [None]:
# Try a game against the AI

from game import human_player

# train=False so the human game is not recorded as a training observation
q_player.play_game(human_player, train=False)
