In [833]:
class Game:
    """Abstract game interface.

    Every method here unconditionally raises NotImplementedError; a concrete
    game is expected to override all of them.
    """

    def allowed_moves(self):
        """Hook for listing the moves that can currently be made."""
        raise NotImplementedError

    def make_move(self, next_state):
        """Hook for advancing the game."""
        raise NotImplementedError

    def playable(self):
        """Hook for telling whether the game can still continue."""
        raise NotImplementedError

    def predict_winner(self, state):
        """Hook for working out who has won."""
        raise NotImplementedError

In [834]:
# Copied from https://github.com/neilslater/game_playing_scripts

'''
   Copyright 2017 Neil Slater

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
'''

import numpy as np
import csv
import random
from itertools import groupby
from datetime import datetime

class TicTacToeGame(Game):
    """A tic-tac-toe game that is advanced one move at a time.

    The board is a tuple of N_CELLS ints in row-major order (cells 0-2 are the
    top row, as drawn by `print_board`).

    Attributes:
        state: the board tuple.
        player: side to move, 1 or -1; becomes 0 as soon as somebody has won.
        winner: 1 or -1 once that side owns a complete line, otherwise 0.
    """
    # 1 is for 'X', -1 is for 'O', 0 is for empty
    N_CELLS = 9

    # Cell-index triples for the three rows, three columns and two diagonals.
    WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    @staticmethod
    def output_mark(num):
        """Translate a cell/player code into 'X', 'O' or ' ' (anything else)."""
        return 'X' if num == 1 else 'O' if num == -1 else ' '

    def __init__(self):
        """Start with an empty board and 'X' (1) to move."""
        self.state = (0,) * TicTacToeGame.N_CELLS
        self.player = 1
        self.winner = 0

    def allowed_moves(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i in range(TicTacToeGame.N_CELLS) if self.state[i] == 0]

    def move_to_state(self, move):
        """Return the board that results from the side to move marking cell
        `move`; the game itself is left untouched."""
        return self.state[:move] + (self.player,) + self.state[move+1:]

    def make_move(self, move):
        """Mark cell `move` for the side to move and update winner/player.

        Raises:
            Exception: if the game already has a winner, or `move` is not one
                of `allowed_moves()`.
        """
        if self.winner:
            raise Exception("Game already completed, cannot make another move!")
        if not self.__valid_move(move):
            raise Exception("Cannot make move from position '{}' to '{}' for player {}".format(
                    self.state, move, self.player))

        self.state = self.move_to_state(move)
        self.winner = self.predict_winner()
        # Nobody is "to move" once the game is decided.
        self.player = 0 if self.winner else -self.player

    def playable(self):
        """Return True while nobody has won and at least one cell is empty."""
        # The moves are cell indices, so truth-testing the *elements* (any())
        # wrongly reports "no moves" when cell 0 is the only empty one.
        # Test the list for emptiness instead.
        return (not self.winner) and len(self.allowed_moves()) > 0

    def predict_winner(self):
        """Return 1 or -1 if that side fills a whole line of the current board,
        otherwise 0."""
        winner = 0
        for a, b, c in TicTacToeGame.WIN_LINES:
            # Marks are +1/-1, so only three equal marks can sum to +/-3.
            line_state = self.state[a] + self.state[b] + self.state[c]
            if line_state == 3:
                winner = 1
            elif line_state == -3:
                winner = -1
        return winner

    def __valid_move(self, move):
        """Tell whether `move` is the index of an empty cell."""
        return move in self.allowed_moves()

    def print_board(self):
        """Print the board; empty cells show their index so it can be typed in
        as a move."""
        s = self.state
        def cell(index):
            return TicTacToeGame.output_mark(s[index]) if s[index] != 0 else index
        print('     {} | {} | {} '.format(cell(0), cell(1), cell(2)))
        print('    -----------')
        print('     {} | {} | {} '.format(cell(3), cell(4), cell(5)))
        print('    -----------')
        print('     {} | {} | {} '.format(cell(6), cell(7), cell(8)))


In [835]:
class Agent:
    """Shared machinery for tic-tac-toe agents: move selection, self-play
    episodes against a random opponent, and the outer training loop.

    Used directly (without subclassing) it is a random player: the default
    `predict_state_value` scores every position 0.0 and ties are broken at
    random.

    Attributes:
        name: label used in printed reports.
        epsilon: probability of playing a random move while learning.
        player: 1 for 'X', -1 for 'O', or None when no side is assigned.
        total_games_learned: number of episodes run through `learn_game`.
    """

    def __init__(self, name, epsilon=0.1, player=None):
        self.name = name
        self.epsilon = epsilon
        self.player = player
        self.total_games_learned = 0

    def games_learned(self):
        """Return how many training episodes have been played so far."""
        return self.total_games_learned

    def get_full_name(self):
        """Return the name followed by the agent's mark, e.g. ``name(X)``."""
        return "{}({})".format(self.name, TicTacToeGame.output_mark(self.player))

    def learn_game(self, num_episodes=1000):
        """Play `num_episodes` training games, calling `train` after each."""
        for _ in range(num_episodes):
            self.train(self.learn_from_episode())

        self.total_games_learned += num_episodes

    def learn_from_episode(self):
        """Play one game against a fresh base `Agent` and collect train data.

        The agent's own moves are delegated to `learn_from_move`, which is
        responsible for appending to the returned list. A final entry holding
        the end position and `reward(game)` is always appended.

        If the agent has no side assigned, one is picked at random for this
        episode only.
        """
        game = TicTacToeGame()
        original_player = self.player
        if not self.player:
            self.player = 1 if random.random() < 0.5 else -1

        try:
            opponent = Agent("opponent", player=-self.player)
            train_data = []

            while game.playable():
                # save train data only for current player
                if game.player == self.player:
                    self.learn_from_move(game, train_data)
                else:
                    game.make_move(opponent.play_select_move(game))

            train_data.append({'state' : game.state, 'reward' : self.reward(game) })
            return train_data
        finally:
            # Restore the side even when the episode raises, so a temporary
            # random assignment never leaks out of this method.
            self.player = original_player

    def learn_from_move(self, game, train_data):
        """Make one learning move on `game` and record it in `train_data`.

        Must be provided by learning subclasses.
        """
        raise NotImplementedError()

    def learn_select_move(self, game):
        """Return ``(best_move, selected_move)`` for an epsilon-greedy step.

        `selected_move` is a uniformly random legal move with probability
        `epsilon`, otherwise it equals `best_move`.
        """
        best_move = self.play_select_move(game)

        if random.random() < self.epsilon:
            selected_move = random.choice(game.allowed_moves())
        else:
            selected_move = best_move

        return best_move, selected_move

    def play_select_move(self, game):
        """Return the highest-valued legal move; it must be this agent's turn."""
        assert game.player == self.player
        allowed_state_values = self.predict_state_values(game)
        return self.__argmax_V(allowed_state_values)

    def __argmax_V(self, state_values):
        """Pick a key with the maximal value, breaking ties at random."""
        max_V = max(state_values.values())
        return random.choice([move for move, v in state_values.items() if v == max_V])

    def reward(self, game):
        """Return 1 if this agent won, -1 if it lost, 0 when there is no winner."""
        return game.winner * self.player

    def train(self, train_data):
        """Update the agent from one episode's `train_data`.

        Must be provided by learning subclasses. The signature matches the
        call in `learn_game`, which passes the episode data only; the former
        extra `game` parameter made that call fail with TypeError before this
        method could raise NotImplementedError.
        """
        raise NotImplementedError()

    def predict_state_value(self, state):
        """Return the estimated value of `state`; the base agent knows nothing."""
        return 0.0

    def predict_state_values(self, game):
        """Map every legal move to the value of the position it leads to."""
        return dict((move, self.predict_state_value(game.move_to_state(move))) for move in game.allowed_moves())

    def state_after_my_move(self, state):
        """Tell, from the piece counts, whether `state` is a position in which
        this agent has just moved ('X' is one piece ahead after X's move,
        counts are equal after O's move)."""
        if self.player == 1:
            return state.count(1) == state.count(-1) + 1
        elif self.player == -1:
            return state.count(1) == state.count(-1)
        else:
            assert False, "incorrect player"

In [876]:
def demo_game(agent1, agent2, verbose=False):
    """Let two agents play one game against each other.

    When neither agent has a side, sides are drawn at random for this game and
    reset to None afterwards. With `verbose` the board is printed before every
    move and once more at the end, followed by the result.

    Returns:
        The winner code stored on the game (1 or -1), or 0 for a draw.
    """
    sides_assigned_here = not agent1.player and not agent2.player
    if sides_assigned_here:
        if random.random() < 0.5:
            agent1.player, agent2.player = 1, -1
        else:
            agent1.player, agent2.player = -1, 1
    assert agent1.player != agent2.player

    game = TicTacToeGame()
    turn = 0

    def show_position():
        print(" \nTurn {}\n".format(turn))
        game.print_board()

    # Whoever holds side 1 opens; after that the two agents simply alternate.
    mover = agent1 if agent1.player == 1 else agent2
    while game.playable():
        if verbose:
            show_position()
        move = mover.play_select_move(game)
        mover = agent2 if mover == agent1 else agent1
        game.make_move(move)
        turn += 1

    if verbose:
        show_position()

    if sides_assigned_here:
        agent1.player = None
        agent2.player = None

    if not game.winner:
        if verbose:
            print("\nIt's a draw!")
        return 0

    if verbose:
        print("\n{} is the winner!".format(game.winner))
    return game.winner

def request_human_move(game):
    """Ask on stdin for a move until one of the game's legal moves is entered.

    Input that is not an integer, or an integer that is not currently legal,
    is reported and asked for again. Errors from `input()` itself (for example
    EOFError on a closed stdin) propagate instead of being retried forever.

    Returns:
        The chosen move as an int; always a member of `game.allowed_moves()`.
    """
    allowed_moves = game.allowed_moves()
    prompt = "Choose move for '{}', from {} : ".format(
        TicTacToeGame.output_mark(game.player), allowed_moves)
    while True:
        answer = input(prompt)
        try:
            move = int(answer)
        except ValueError:
            print("'{}' is not a number, try again.".format(answer))
            continue
        if move in allowed_moves:
            return move
        # Rejecting it here keeps make_move() from raising mid-game.
        print("Move {} is not available, try again.".format(move))

def interactive_game(agent):
    """Play one game between `agent` and a human typing moves on stdin.

    The board is printed before every move and once more when the game is
    over. The agent moves whenever the side to move equals `agent.player`;
    otherwise the human is prompted.

    Returns:
        'X' or 'O' for the winning mark, or '-' for a draw.
    """
    game = TicTacToeGame()
    turn = 0
    while True:
        print(" \nTurn {}\n".format(turn))
        game.print_board()
        if not game.playable():
            break

        if game.player == agent.player:
            move = agent.play_select_move(game)
        else:
            move = request_human_move(game)
        game.make_move(move)
        turn += 1

    if not game.winner:
        print("\nIt's a draw!")
        return '-'

    winner = TicTacToeGame.output_mark(game.winner)
    print("\n{} is the winner!".format(winner))
    return winner

In [837]:
def demo_game_stats(agent1, agent2, num_games=1000, verbose=False):
    """Play `num_games` demo games and summarise the outcomes.

    Returns:
        A dict keyed by 'X', 'O' and ' ' (draw) whose values are the share of
        games with that outcome, formatted like ``'12.3%'``.
    """
    results = []
    for _ in range(num_games):
        results.append(demo_game(agent1, agent2, verbose))

    game_stats = {}
    for outcome in (1, -1, 0):
        if num_games == 0:
            share = 0
        else:
            share = results.count(outcome)/num_games*100
        game_stats[TicTacToeGame.output_mark(outcome)] = "{:.1f}%".format(share)
    return game_stats

In [838]:
def train_agent(agent, learn_games=1000):
    """Run `learn_games` training episodes on `agent`, printing a line before
    and after with the agent's name and its running total of learned games."""
    announcement = 'Training {} by {} games'.format(agent.get_full_name(), learn_games)
    print(announcement)

    agent.learn_game(learn_games)

    summary = "{}: {} games learned".format(agent.get_full_name(), agent.games_learned())
    print(summary)

In [839]:
def test_agents(agent1, agent2, demo_games=1000, verbose=False):
    game_stats = demo_game_stats(agent1, agent2, num_games=demo_games, verbose=verbose)
    print ("{} vs {}: {}".format(agent1.get_full_name(), agent2.get_full_name(), game_stats))

In [840]:
def test_agent(agent, demo_games=1000, verbose=False):
    opponent = Agent("agent", player=None if agent.player is None else -agent.player)
    test_agents(agent, opponent, demo_games=demo_games, verbose=verbose)

In [841]:
def train_and_test(agent, rounds=10, demo_games=1000, epsilon_coeff=0.1, verbose=False):
    """Benchmark `agent`, then alternate training and benchmarking.

    Each of the `rounds` rounds first multiplies `agent.epsilon` by
    `epsilon_coeff`, then trains with `train_agent`'s default number of games
    and benchmarks over `demo_games` games.

    The agent's original epsilon is put back when the function exits, also
    when a round is interrupted or raises; otherwise an aborted run would
    leave the decayed value behind and a re-run would decay it further.
    """
    test_agent(agent, demo_games, verbose=verbose)
    initial_epsilon = agent.epsilon
    try:
        for _ in range(rounds):
            agent.epsilon *= epsilon_coeff
            train_agent(agent)
            test_agent(agent, demo_games, verbose=verbose)
    finally:
        agent.epsilon = initial_epsilon

In [842]:
class AgentDict(Agent):
    """Agent that keeps its state values in a plain dictionary.

    `V` maps a board tuple (a position reached right after this agent's move)
    to its learned value; unseen positions count as 0.0. `alpha` is the step
    size of the value update in `train`.
    """

    def __init__(self, name, epsilon=0.1, alpha=1.0, player=None):
        super().__init__(name, epsilon, player)
        self.V = {}
        self.alpha = alpha

    def learn_from_move(self, game, train_data):
        """Play one epsilon-greedy move and record it.

        The recorded entry pairs the position reached by the move actually
        played with the current value of the position the greedy move would
        have reached. Returns the move that was played.
        """
        greedy_move, played_move = self.learn_select_move(game)
        greedy_value = self.predict_state_value(game.move_to_state(greedy_move))

        game.make_move(played_move)

        train_data.append({'state' : game.state, 'reward' : greedy_value })
        return played_move

    def train(self, train_data):
        """Move the value of every recorded position towards the 'reward'
        stored in the entry that follows it, by a fraction `alpha`."""
        for entry, following in zip(train_data, train_data[1:]):
            state = entry['state']
            td_target = following['reward']
            current_state_value = self.V.get(state, 0.0)
            self.V[state] = current_state_value + self.alpha * (td_target - current_state_value)

    def predict_state_value(self, state):
        """Look up the value of a position reached by this agent's own move."""
        assert self.state_after_my_move(state)
        return self.V.get(state, 0.0)

    def round_V(self):
        """Round every stored value to one decimal place."""
        # After training, this makes action selection random from equally-good choices
        for state, value in self.V.items():
            self.V[state] = round(value, 1)

    def save_v_table(self, filename):
        """Write the value table to `filename` as CSV: a 'State','Value'
        header followed by one row per state, in sorted state order."""
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['State', 'Value'])
            writer.writerows([state, self.V[state]] for state in sorted(self.V))

In [843]:
# Testing agents with dictionary
# Train agent_dict_X to play for 'X'.
# train_and_test benchmarks the untrained agent first, then runs its default
# number of rounds, multiplying epsilon by its default coefficient each round.
agent_dict_X = AgentDict("agent_dict_X", epsilon=0.8, alpha=0.8, player=1)
train_and_test(agent_dict_X)

agent_dict_X(X) vs agent(O): {'X': '55.3%', 'O': '27.7%', ' ': '17.0%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 1000 games learned
agent_dict_X(X) vs agent(O): {'X': '92.9%', 'O': '4.4%', ' ': '2.7%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 2000 games learned
agent_dict_X(X) vs agent(O): {'X': '94.5%', 'O': '1.8%', ' ': '3.7%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 3000 games learned
agent_dict_X(X) vs agent(O): {'X': '95.2%', 'O': '0.0%', ' ': '4.8%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 4000 games learned
agent_dict_X(X) vs agent(O): {'X': '95.7%', 'O': '0.1%', ' ': '4.2%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 5000 games learned
agent_dict_X(X) vs agent(O): {'X': '97.0%', 'O': '0.0%', ' ': '3.0%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 6000 games learned
agent_dict_X(X) vs agent(O): {'X': '96.1%', 'O': '0.0%', ' ': '3.9%'}
Training agent_dict_X(X) by 1000 games
agent_dict_X(X): 70

In [844]:
# Round the learned values to one decimal (so near-equal moves become exact
# ties) and dump the value table to a CSV file for inspection.
# Note: round_V() modifies agent_dict_X.V in place.
agent_dict_X.round_V()
agent_dict_X.save_v_table("state_values.csv")

In [845]:
# Learned value of the position in which 'X' has opened in cell 4 (the centre).
state = (0,0,0,0,1,0,0,0,0)
print (agent_dict_X.predict_state_value(state))
# Values of all nine opening moves from an empty board, keyed by cell index.
g = TicTacToeGame()
print (agent_dict_X.predict_state_values(g))

0.0
{0: 0.0, 1: 1.0, 2: 0.1, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.1, 8: 0.0}


In [846]:
# Check agent_dict_X interactively: the agent plays 'X', the human is
# prompted on stdin for the 'O' moves.
interactive_game(agent_dict_X)

In [847]:
# Train agent_dict_O to play for 'O' (player=-1), using train_and_test's
# default rounds, demo games and epsilon decay.
agent_dict_O = AgentDict("agent_dict_O", epsilon=0.8, alpha=0.9, player=-1)

train_and_test(agent_dict_O)

agent_dict_O(O) vs agent(X): {'X': '52.7%', 'O': '32.4%', ' ': '14.9%'}
Training agent_dict_O(O) by 1000 games
agent_dict_O(O): 1000 games learned
agent_dict_O(O) vs agent(X): {'X': '27.4%', 'O': '55.8%', ' ': '16.8%'}
Training agent_dict_O(O) by 1000 games
agent_dict_O(O): 2000 games learned
agent_dict_O(O) vs agent(X): {'X': '10.2%', 'O': '73.5%', ' ': '16.3%'}
Training agent_dict_O(O) by 1000 games
agent_dict_O(O): 3000 games learned
agent_dict_O(O) vs agent(X): {'X': '7.9%', 'O': '74.5%', ' ': '17.6%'}
Training agent_dict_O(O) by 1000 games
agent_dict_O(O): 4000 games learned
agent_dict_O(O) vs agent(X): {'X': '4.3%', 'O': '76.0%', ' ': '19.7%'}
Training agent_dict_O(O) by 1000 games
agent_dict_O(O): 5000 games learned
agent_dict_O(O) vs agent(X): {'X': '3.7%', 'O': '75.6%', ' ': '20.7%'}
Training agent_dict_O(O) by 1000 games
agent_dict_O(O): 6000 games learned
agent_dict_O(O) vs agent(X): {'X': '2.1%', 'O': '76.3%', ' ': '21.6%'}
Training agent_dict_O(O) by 1000 games
agent_dict_

In [877]:
# Check agent_dict_O interactively: the human moves first as 'X', the agent
# answers as 'O'.
interactive_game(agent_dict_O)

 
Turn 0

     0 | 1 | 2 
    -----------
     3 | 4 | 5 
    -----------
     6 | 7 | 8 
Choose move for 'X', from [0, 1, 2, 3, 4, 5, 6, 7, 8] : 4
 
Turn 1

     0 | 1 | 2 
    -----------
     3 | X | 5 
    -----------
     6 | 7 | 8 
 
Turn 2

     O | 1 | 2 
    -----------
     3 | X | 5 
    -----------
     6 | 7 | 8 
Choose move for 'X', from [1, 2, 3, 5, 6, 7, 8] : 2
 
Turn 3

     O | 1 | X 
    -----------
     3 | X | 5 
    -----------
     6 | 7 | 8 
 
Turn 4

     O | 1 | X 
    -----------
     3 | X | 5 
    -----------
     O | 7 | 8 
Choose move for 'X', from [1, 3, 5, 7, 8] : 5
 
Turn 5

     O | 1 | X 
    -----------
     3 | X | X 
    -----------
     O | 7 | 8 
 
Turn 6

     O | 1 | X 
    -----------
     O | X | X 
    -----------
     O | 7 | 8 

O is the winner!


'O'

In [881]:
# Test trained 'agent_dict_X' and 'agent_dict_O' agents together
# (1000 demo games by default; prints the share of X wins, O wins and draws).
test_agents(agent_dict_X, agent_dict_O)

agent_dict_X(X) vs agent_dict_O(O): {'X': '0.0%', 'O': '0.0%', ' ': '100.0%'}


In [852]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [988]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(27, 144)
        self.fc2 = nn.Linear(144, 288)        
        self.fc3 = nn.Linear(288, 144)        
        self.fc4 = nn.Linear(144, 36)        
        self.fc5 = nn.Linear(36, 9)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        x = torch.relu(x)
        x = self.fc4(x)
        x = torch.relu(x)
        x = self.fc5(x)
        x = torch.tanh(x)
        return x

In [1074]:
class AgentNN(Agent):
    """Q-learning agent whose action values come from the `Net` network.

    Boards are multiplied by the agent's own side before being encoded, so the
    network always sees its own marks as 1 and the opponent's as -1. The
    network has one output per cell, read as the value of moving there.

    Attributes:
        model: the network being trained.
        model_copy: a periodically refreshed copy used for bootstrap targets.
        gamma: discount applied to the bootstrapped value of the next state.
        batch_data: episodes collected since the last call to `optimize`.
        update_freq / update_counter: countdown controlling when `model_copy`
            is refreshed from `model`.
    """
    INPUT_SIZE = TicTacToeGame.N_CELLS * 3 # for X, O and empty

    def __init__(self, name, epsilon=0.1, gamma=1.0, lr=0.001, player=1):
        super(AgentNN, self).__init__(name, epsilon, player)
        self.model = Net()
        self.gamma = gamma
        import copy
        self.model_copy = copy.deepcopy(self.model)

        self.optim = torch.optim.SGD(self.model.parameters(), lr=lr)
        # Stored as the class; `optimize` instantiates it on use.
        self.loss = nn.MSELoss

        self.batch_data = []

        self.update_freq = 10
        self.update_counter = self.update_freq

        # NOTE(review): `ReplayMemory` is not defined in the visible notebook
        # and nothing in this class reads `self.memory`. Only instantiate it
        # when the name exists, so constructing the agent does not raise
        # NameError on a fresh kernel.
        replay_memory_cls = globals().get('ReplayMemory')
        self.memory = replay_memory_cls(capacity=300) if replay_memory_cls is not None else None

    def learn_from_move(self, game, train_data):
        """Play one epsilon-greedy move and record the (side-normalised)
        position it was played from together with the move itself."""
        _, selected_next_move = self.learn_select_move(game)
        state = [s * self.player for s in game.state]
        game.make_move(selected_next_move)
        train_data.append({'state' : state, 'move' : selected_next_move, 'reward' : 0})

    def train(self, train_data):
        """Turn one episode into transitions and learn from them in batches.

        Entry i of `train_data` supplies the state and move, entry i+1 the
        following state and the reward; only the last transition is terminal.
        `optimize` runs once more than 16 episodes have been collected, and
        `model_copy` is refreshed every `update_freq` + 1 calls.
        """
        moves_made = len(train_data) - 1
        if moves_made < 1:
            # No move of ours was recorded, so there is no transition to learn.
            return

        states = [step['state'] for step in train_data[:-1]]
        moves = [step['move'] for step in train_data[:-1]]
        new_states = [step['state'] for step in train_data[1:]]
        rewards = [step['reward'] for step in train_data[1:]]
        done = [0] * (moves_made - 1) + [1]

        self.batch_data.append((states, moves, rewards, new_states, done))
        if len(self.batch_data) > 16:
            self.optimize()
            self.batch_data = []

        if self.update_counter == 0:
            self.model_copy.load_state_dict(self.model.state_dict())
            self.update_counter = self.update_freq
        else:
            self.update_counter -= 1

    def optimize(self):
        """Run one gradient step per collected episode, in random order.

        The regression target for the move that was played is
        ``r + gamma * max_a' Q_copy(s', a')`` over the legal moves of s' for
        non-terminal transitions and just ``r`` for the terminal one. Outputs
        for occupied cells are regressed towards 0, all other outputs towards
        their own current prediction (i.e. they contribute no error).
        """
        def prepare_batch_data(s):
            return self.__tensor_from_states(s).view(-1, AgentNN.INPUT_SIZE)

        random.shuffle(self.batch_data)
        for states, moves, rewards, new_states, done in self.batch_data:
            move_index = torch.tensor(moves)
            reward_values = torch.tensor(rewards).type(torch.float32)
            # `done` arrives as a plain list; comparing a list with 0 yields a
            # single False, which selects nothing when used as an index. A
            # boolean tensor is required to mask per transition.
            not_done = ~torch.tensor(done, dtype=torch.bool)
            boards = torch.tensor(states)
            next_boards = torch.tensor(new_states)

            Q_pred = self.model(prepare_batch_data(states))
            Q = Q_pred.clone().detach()

            with torch.no_grad():
                new_state_values = self.model_copy(prepare_batch_data(new_states))
            # Only empty cells are playable in the next state; exclude the
            # rest from the max so their (trained-to-zero) outputs cannot win.
            new_state_values = new_state_values.masked_fill(next_boards != 0, float('-inf'))
            max_Q_new = new_state_values.max(dim=1).values

            # Q(s, a) = r + gamma * max Q(s', a')
            td_target = reward_values.clone()
            td_target[not_done] = reward_values[not_done] + self.gamma * max_Q_new[not_done]

            Q[torch.arange(len(states)), move_index] = td_target
            Q = Q * (boards == 0)  # 0 for non-legal moves

            loss = self.loss()(Q_pred, Q)
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

    def __tensor_from_states(self, states):
        """One-hot encode boards into float32 rows of INPUT_SIZE values:
        nine flags for cells holding 1, nine for -1, nine for empty."""
        encoded = []
        for state in states:
            assert len(state) == TicTacToeGame.N_CELLS
            encoded.append([1 if sym == mark else 0 for mark in (1, -1, 0) for sym in state])
        return torch.tensor(encoded).type(torch.float32)

    def predict_state_value(self, state, player=None):
        """Not supported: this agent scores moves, not positions."""
        raise NotImplementedError # this method is not used

    def predict_state_values(self, game):
        """Return the network's value for every legal move, keyed by cell."""
        assert self.player == game.player
        state = [s * self.player for s in game.state]
        x = self.__tensor_from_states([state])
        state_values = self.model(x).detach().view(-1).numpy()
        predicted_state_values = {}
        for i in range(TicTacToeGame.N_CELLS):
            if game.state[i] == 0:
                predicted_state_values[i] = state_values[i]
        return predicted_state_values

In [1068]:
# Testing agents with NN
# Train agent_nn_X to play for 'X': 100 rounds, epsilon multiplied by 0.9
# before each round (train_and_test restores the original value at the end).
# NOTE(review): this cell's execution counter (1068) is lower than that of the
# AgentNN definition cell (1074), so the recorded output presumably comes from
# an earlier version of the class - re-run top to bottom to confirm.
agent_nn_X = AgentNN("agent_nn_X", epsilon=0.7, gamma=0.95, lr=0.1, player=1)
train_and_test(agent_nn_X, rounds=100, epsilon_coeff=0.9)

agent_nn_X(X) vs agent(O): {'X': '65.4%', 'O': '16.8%', ' ': '17.8%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 1000 games learned
agent_nn_X(X) vs agent(O): {'X': '83.0%', 'O': '11.3%', ' ': '5.7%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 2000 games learned
agent_nn_X(X) vs agent(O): {'X': '79.8%', 'O': '14.2%', ' ': '6.0%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 3000 games learned
agent_nn_X(X) vs agent(O): {'X': '87.3%', 'O': '11.1%', ' ': '1.6%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 4000 games learned
agent_nn_X(X) vs agent(O): {'X': '87.1%', 'O': '10.1%', ' ': '2.8%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 5000 games learned
agent_nn_X(X) vs agent(O): {'X': '86.2%', 'O': '13.3%', ' ': '0.5%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 6000 games learned
agent_nn_X(X) vs agent(O): {'X': '83.4%', 'O': '14.5%', ' ': '2.1%'}
Training agent_nn_X(X) by 1000 games
agent_nn_X(X): 7000 games learned
agent_nn_X(X) vs ag

In [1075]:
# AgentNN.predict_state_value raises NotImplementedError, so single positions
# cannot be queried here; only the per-move values are inspected below.
# Confirm that epsilon is back at its configured value after training.
print (agent_nn_X.epsilon)
# Network outputs for all nine opening moves from an empty board.
g = TicTacToeGame()
print (agent_nn_X.predict_state_values(g))

0.7
{0: -0.062125415, 1: -0.13336438, 2: -1.5240163e-05, 3: -0.1010155, 4: -0.026680224, 5: -0.18843895, 6: -0.08477706, 7: -0.12190332, 8: -0.023210267}


In [1076]:
# Check agent_nn_X interactively: the agent plays 'X', the human is prompted
# on stdin for the 'O' moves.
interactive_game(agent_nn_X)

 
Turn 0

     0 | 1 | 2 
    -----------
     3 | 4 | 5 
    -----------
     6 | 7 | 8 
 
Turn 1

     0 | 1 | X 
    -----------
     3 | 4 | 5 
    -----------
     6 | 7 | 8 
Choose move for 'O', from [0, 1, 3, 4, 5, 6, 7, 8] : 4
 
Turn 2

     0 | 1 | X 
    -----------
     3 | O | 5 
    -----------
     6 | 7 | 8 
 
Turn 3

     X | 1 | X 
    -----------
     3 | O | 5 
    -----------
     6 | 7 | 8 
Choose move for 'O', from [1, 3, 5, 6, 7, 8] : 1
 
Turn 4

     X | O | X 
    -----------
     3 | O | 5 
    -----------
     6 | 7 | 8 
 
Turn 5

     X | O | X 
    -----------
     3 | O | 5 
    -----------
     6 | X | 8 
Choose move for 'O', from [3, 5, 6, 8] : 3
 
Turn 6

     X | O | X 
    -----------
     O | O | 5 
    -----------
     6 | X | 8 
 
Turn 7

     X | O | X 
    -----------
     O | O | X 
    -----------
     6 | X | 8 
Choose move for 'O', from [6, 8] : 6
 
Turn 8

     X | O | X 
    -----------
     O | O | X 
    -----------
     O | X | 8 
 
Turn

'X'

In [1069]:
# Train agent_nn_O to play for 'O': 100 rounds with epsilon_coeff=1.0, i.e.
# the exploration rate stays at 0.7 for the whole run.
agent_nn_O = AgentNN("agent_nn_O", epsilon=0.7, gamma=0.95, lr=0.1, player=-1)
train_and_test(agent_nn_O, rounds=100, epsilon_coeff=1.0)

agent_nn_O(O) vs agent(X): {'X': '49.0%', 'O': '41.6%', ' ': '9.4%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 1000 games learned
agent_nn_O(O) vs agent(X): {'X': '39.4%', 'O': '52.0%', ' ': '8.6%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 2000 games learned
agent_nn_O(O) vs agent(X): {'X': '34.1%', 'O': '59.2%', ' ': '6.7%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 3000 games learned
agent_nn_O(O) vs agent(X): {'X': '43.6%', 'O': '50.5%', ' ': '5.9%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 4000 games learned
agent_nn_O(O) vs agent(X): {'X': '27.5%', 'O': '68.5%', ' ': '4.0%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 5000 games learned
agent_nn_O(O) vs agent(X): {'X': '32.4%', 'O': '63.1%', ' ': '4.5%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 6000 games learned
agent_nn_O(O) vs agent(X): {'X': '38.7%', 'O': '56.7%', ' ': '4.6%'}
Training agent_nn_O(O) by 1000 games
agent_nn_O(O): 7000 games learned
agent_nn_O(O) vs age

In [1073]:
# Check agent_nn_O interactively: the human moves first as 'X', the agent
# answers as 'O'.
interactive_game(agent_nn_O)

 
Turn 0

     0 | 1 | 2 
    -----------
     3 | 4 | 5 
    -----------
     6 | 7 | 8 
Choose move for 'X', from [0, 1, 2, 3, 4, 5, 6, 7, 8] : 4
 
Turn 1

     0 | 1 | 2 
    -----------
     3 | X | 5 
    -----------
     6 | 7 | 8 
 
Turn 2

     0 | 1 | 2 
    -----------
     3 | X | 5 
    -----------
     6 | 7 | O 
Choose move for 'X', from [0, 1, 2, 3, 5, 6, 7] : 5
 
Turn 3

     0 | 1 | 2 
    -----------
     3 | X | X 
    -----------
     6 | 7 | O 
 
Turn 4

     0 | 1 | 2 
    -----------
     O | X | X 
    -----------
     6 | 7 | O 
Choose move for 'X', from [0, 1, 2, 6, 7] : 7
 
Turn 5

     0 | 1 | 2 
    -----------
     O | X | X 
    -----------
     6 | X | O 
 
Turn 6

     0 | 1 | 2 
    -----------
     O | X | X 
    -----------
     O | X | O 
Choose move for 'X', from [0, 1, 2] : 0
 
Turn 7

     X | 1 | 2 
    -----------
     O | X | X 
    -----------
     O | X | O 
 
Turn 8

     X | O | 2 
    -----------
     O | X | X 
    -----------
     O | X

'-'

In [1070]:
# Test trained agent_nn_X and agent_nn_O agents against each other
# (1000 demo games by default).
test_agents(agent_nn_X, agent_nn_O)

agent_nn_X(X) vs agent_nn_O(O): {'X': '0.0%', 'O': '0.0%', ' ': '100.0%'}


In [1071]:
# Test trained agent_dict_X and agent_nn_O agents: tabular 'X' against
# network 'O' (1000 demo games by default).
test_agents(agent_dict_X, agent_nn_O)

agent_dict_X(X) vs agent_nn_O(O): {'X': '47.9%', 'O': '52.1%', ' ': '0.0%'}


In [1072]:
# Test trained agent_nn_X and agent_dict_O agents: network 'X' against
# tabular 'O' (1000 demo games by default).
test_agents(agent_nn_X, agent_dict_O)

agent_nn_X(X) vs agent_dict_O(O): {'X': '0.0%', 'O': '0.0%', ' ': '100.0%'}
