In [821]:
class Game:
    def allowed_moves(self):
        raise NotImplementedError()
        
    def make_move(self, next_state):
        raise NotImplementedError()
        
    def playable(self):
        raise NotImplementedError()

    def predict_winner(self, state):
        raise NotImplementedError()

In [822]:
# Copied from https://github.com/neilslater/game_playing_scripts

'''
   Copyright 2017 Neil Slater

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
'''

import numpy as np
import csv
import random
from itertools import groupby

class TicTacToeGame(Game):
    # 1 is for 'X', -1 is for 'O', 0 is for empty
    @staticmethod
    def output_mark(num):
        return 'X' if num == 1 else 'O' if num == -1 else ' '

    def __init__(self):
        self.state = (0,) * 9
        self.player = 1
        self.winner = 0

    def allowed_moves(self):
        moves = []
        for i in range(len(self.state)):
            if self.state[i] == 0:
                moves.append(i)
        return moves

    def move_to_state(self, move):
        return self.state[:move] + (self.player,) + self.state[move+1:]
    
    def make_move(self, move):
        if self.winner:
            raise(Exception("Game already completed, cannot make another move!"))
        if not self.__valid_move(move):
            raise(Exception("Cannot make move from position '{}' to '{}' for player {}".format(
                    self.state, move, self.player)))

        next_state = self.move_to_state(move)
        self.state = next_state
        self.winner = self.predict_winner()
        if self.winner:
            self.player = 0
        else:
            self.player = -self.player

    def playable(self):
        return ( (not self.winner) and any(self.allowed_moves()) )

    def predict_winner(self):
        lines = [(0,1,2), (3,4,5), (6,7,8), (0,3,6), (1,4,7), (2,5,8), (0,4,8), (2,4,6)]
        winner = 0
        for line in lines:
            line_state = self.state[line[0]] + self.state[line[1]] + self.state[line[2]]
            if line_state == 3:
                winner = 1
            elif line_state == -3:
                winner = -1
        return winner

    def __valid_move(self, move):
        return move in self.allowed_moves()

    def print_board(self):
        s = self.state
        print('     {} | {} | {} '.format(TicTacToeGame.output_mark(s[0]), TicTacToeGame.output_mark(s[1]), TicTacToeGame.output_mark(s[2])))
        print('    -----------')
        print('     {} | {} | {} '.format(TicTacToeGame.output_mark(s[3]), TicTacToeGame.output_mark(s[4]), TicTacToeGame.output_mark(s[5])))
        print('    -----------')
        print('     {} | {} | {} '.format(TicTacToeGame.output_mark(s[6]), TicTacToeGame.output_mark(s[7]), TicTacToeGame.output_mark(s[8])))


In [823]:
class Agent():
    def __init__(self, name, epsilon=0.1, alpha=1.0, player_mark=None):
        self.name = name
        self.alpha = alpha
        self.epsilon = epsilon
        self.player_mark = player_mark
        self.total_games_learned = 0
    
    def games_learned(self):
        return self.total_games_learned
    
    def get_full_name(self):
        return "{}({})".format(self.name, TicTacToeGame.output_mark(self.player_mark))
        
    def learn_game(self, num_episodes=1000):
        for episode in range(num_episodes):
            train_data = self.learn_from_episode()
            self.train(train_data)
            
        self.total_games_learned += num_episodes
            
    def learn_from_episode(self):
        game = TicTacToeGame()
        random_player_mark = False
        if not self.player_mark:
            random_player_mark = True
            if random.random() < 0.5:
                self.player_mark = 1
            else:
                self.player_mark = -1
            
        opponent = AgentDict("opponent", player_mark=-self.player_mark)       
        train_data = []
        
        while game.playable():
            our_turn = game.player == self.player_mark
            # save train data only for current player
            if our_turn:
                self.learn_from_move(game, train_data)
            else:
                move = opponent.play_select_move(game)
                game.make_move(move)

        train_data.append({'state' : None, 'move' : None, 'new_state' : None, 'reward' : self.__reward(game)})
        if random_player_mark:
            self.player_mark = 0
        return train_data

    def learn_from_move(self, game, train_data):
        best_next_move, selected_next_move = self.learn_select_move(game)
        
        next_state_value = self.predict_state_value(game.move_to_state(best_next_move), game.player)
        state = game.state
        
        game.make_move(selected_next_move)
        
        train_data.append({'state' : state, 'move' : selected_next_move, 'new_state' : game.state,
                           'reward' : next_state_value})
        
        return selected_next_move

    def learn_select_move(self, game):
        assert game.player == self.player_mark
        
        allowed_state_values = self.predict_state_values(game)
        best_move = self.__argmax_V(allowed_state_values)

        selected_move = best_move
        if random.random() < self.epsilon:
            selected_move = random.choice(game.allowed_moves())

        return best_move, selected_move

    def play_select_move(self, game):
        assert game.player == self.player_mark
        allowed_state_values = self.predict_state_values(game)
        return self.__argmax_V(allowed_state_values)
        
    def __argmax_V(self, state_values):
        max_V = max(state_values.values())
        chosen_state = random.choice([move for move, v in state_values.items() if v == max_V])
        return chosen_state

    def __reward(self, game):
        return game.winner * self.player_mark

    def __request_human_move(self, game):
        allowed_moves = [i for i in range(9) if game.state[i] == 0]
        human_move = None
        while not human_move:
            human_move = int(input('Choose move for {}, from {} : '.format(game.player, allowed_moves)))
        return human_move
    
    def train(self, game, train_data):
        raise NotImplementedError()
        
    def predict_state_value(self, state, player_mark):
        raise NotImplementedError()
        
    def predict_state_values(self, game):
        raise NotImplementedError()

In [824]:
def demo_game(agent1, agent2, verbose=False):
    t = 0
    random_player_marks = False
    if not agent1.player_mark and not agent2.player_mark:
        random_player_marks = True
        if random.random() < 0.5:
            agent1.player_mark = 1
            agent2.player_mark = -1
        else:
            agent1.player_mark = -1
            agent2.player_mark = 1
    assert agent1.player_mark != agent2.player_mark
    
    game = TicTacToeGame()
    agent_to_move = agent1 if agent1.player_mark == 1 else agent2
    while game.playable():
        if verbose:
            print(" \nTurn {}\n".format(t))
            game.print_board()
        move = agent_to_move.play_select_move(game)
        if agent_to_move == agent1:
            agent_to_move = agent2
        else:
            agent_to_move = agent1
        game.make_move(move)
        t += 1
    if verbose:
        print(" \nTurn {}\n".format(t))
        game.print_board()
    if random_player_marks:
        agent1.player_mark = 0
        agent2.player_mark = 0
    if game.winner:
        if verbose:
            print("\n{} is the winner!".format(game.winner))
        return game.winner
    else:
        if verbose:
            print("\nIt's a draw!")
        return 0

def interactive_game(game, agent):
    t = 0
    while game.playable():
        print(" \nTurn {}\n".format(t))
        game.print_board()
        if game.player == agent.agent_player:
            move = agent.play_select_move(game)
        else:
            move = agent.__request_human_move()
        game.make_move(move)
        t += 1

    print(" \nTurn {}\n".format(t))
    game.print_board()

    if game.winner:
        winner = TicTacToeGame.output_mark(game.winner)
        print("\n{} is the winner!".format(winner))
        return winner
    print("\nIt's a draw!")
    return '-'

In [825]:
def demo_game_stats(agent1, agent2, num_games=1000):
    results = [demo_game(agent1, agent2) for i in range(num_games)]
    game_stats = {TicTacToeGame.output_mark(k): "{:.1f}%".format(0 if num_games == 0 else results.count(k)/num_games*100) for k in [1, -1, 0]}
    return game_stats

In [826]:
def train_agent(agent, learn_games=10000):
    print ('Training {} by {} games'.format(agent.get_full_name(), learn_games))
    agent.learn_game(learn_games)
    print ("{}: {} games learned".format(agent.get_full_name(), agent.games_learned()))

In [827]:
def test_agents(agent1, agent2, demo_games=1000):
    game_stats = demo_game_stats(agent1, agent2, demo_games)
    print ("{} vs {}: {}".format(agent1.get_full_name(), agent2.get_full_name(), game_stats))

In [828]:
class AgentDict(Agent):
    def __init__(self, name, epsilon=0.1, alpha=1.0, player_mark=None):
        super(AgentDict, self).__init__(name, epsilon, alpha, player_mark)
        self.V = {}

    def train(self, train_data):
        for i in range(len(train_data)-1):
            new_state = train_data[i]['new_state']
            td_target = train_data[i+1]['reward']
            #print (new_state, td_target)
            current_state_value = self.predict_state_value(new_state, self.player_mark)
            value = current_state_value + self.alpha * (td_target - current_state_value)
            self.V[new_state] = value

    def predict_state_value(self, state, player_mark):
        return self.V.get(state, 0.0)
    
    def predict_state_values(self, game):
        return dict((move, self.predict_state_value(game.move_to_state(move), game.player)) for move in game.allowed_moves())

In [829]:
# Testing agents with dictionary
# Train agent_dict_1 to play for 'X'
agent_dict_1 = AgentDict("agent_dict_1", epsilon=0.8, alpha=0.9, player_mark=1)
agent_dict_2 = AgentDict("agent_dict_2", epsilon=0.8, alpha=0.9, player_mark=-1)

rounds = 10
test_agents(agent_dict_1, agent_dict_2)
for round in range(rounds):
    train_agent(agent_dict_1)
    test_agents(agent_dict_1, agent_dict_2)

agent_dict_1(X) vs agent_dict_2(O): {'X': '54.7%', 'O': '30.1%', ' ': '15.2%'}
Training agent_dict_1(X) by 10000 games
agent_dict_1(X): 10000 games learned
agent_dict_1(X) vs agent_dict_2(O): {'X': '98.4%', 'O': '0.5%', ' ': '1.1%'}
Training agent_dict_1(X) by 10000 games
agent_dict_1(X): 20000 games learned
agent_dict_1(X) vs agent_dict_2(O): {'X': '99.0%', 'O': '0.9%', ' ': '0.1%'}
Training agent_dict_1(X) by 10000 games
agent_dict_1(X): 30000 games learned
agent_dict_1(X) vs agent_dict_2(O): {'X': '99.2%', 'O': '0.8%', ' ': '0.0%'}
Training agent_dict_1(X) by 10000 games
agent_dict_1(X): 40000 games learned
agent_dict_1(X) vs agent_dict_2(O): {'X': '97.5%', 'O': '1.9%', ' ': '0.6%'}
Training agent_dict_1(X) by 10000 games
agent_dict_1(X): 50000 games learned
agent_dict_1(X) vs agent_dict_2(O): {'X': '98.4%', 'O': '0.0%', ' ': '1.6%'}
Training agent_dict_1(X) by 10000 games
agent_dict_1(X): 60000 games learned
agent_dict_1(X) vs agent_dict_2(O): {'X': '98.5%', 'O': '0.8%', ' ': '0.7%

In [830]:
# Train agent_dict_3 to play for 'O'
agent_dict_3 = AgentDict("agent_dict_3", epsilon=0.8, alpha=0.9, player_mark=-1)
agent_dict_4 = AgentDict("agent_dict_4", epsilon=0.8, alpha=0.9, player_mark=1)

rounds = 10
test_agents(agent_dict_3, agent_dict_4)
for round in range(rounds):
    train_agent(agent_dict_3)
    test_agents(agent_dict_3, agent_dict_4)

agent_dict_3(O) vs agent_dict_4(X): {'X': '54.7%', 'O': '27.7%', ' ': '17.6%'}
Training agent_dict_3(O) by 10000 games
agent_dict_3(O): 10000 games learned
agent_dict_3(O) vs agent_dict_4(X): {'X': '9.2%', 'O': '88.2%', ' ': '2.6%'}
Training agent_dict_3(O) by 10000 games
agent_dict_3(O): 20000 games learned
agent_dict_3(O) vs agent_dict_4(X): {'X': '9.9%', 'O': '85.5%', ' ': '4.6%'}
Training agent_dict_3(O) by 10000 games
agent_dict_3(O): 30000 games learned
agent_dict_3(O) vs agent_dict_4(X): {'X': '8.2%', 'O': '87.0%', ' ': '4.8%'}
Training agent_dict_3(O) by 10000 games
agent_dict_3(O): 40000 games learned
agent_dict_3(O) vs agent_dict_4(X): {'X': '13.4%', 'O': '81.2%', ' ': '5.4%'}
Training agent_dict_3(O) by 10000 games
agent_dict_3(O): 50000 games learned
agent_dict_3(O) vs agent_dict_4(X): {'X': '8.5%', 'O': '88.7%', ' ': '2.8%'}
Training agent_dict_3(O) by 10000 games
agent_dict_3(O): 60000 games learned
agent_dict_3(O) vs agent_dict_4(X): {'X': '9.4%', 'O': '86.1%', ' ': '4.5

In [831]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [832]:
class Net(nn.Module):
    def __init__(self, lr=0.0001):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(27, 54)
        self.fc2 = nn.Linear(54, 36)        
        self.fc3 = nn.Linear(36, 9)

        self.optim = torch.optim.SGD(self.parameters(), lr=lr)
        self.loss = F.torch.nn.MSELoss()        

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        x = torch.tanh(x)
        return x

In [835]:
class AgentNN(Agent):
    def __init__(self, name, epsilon=0.1, alpha=1.0, player_mark=1):
        super(AgentNN, self).__init__(name, epsilon, alpha, player_mark)
        self.model = Net(lr=0.05)
        import copy
        self.model_copy = copy.deepcopy(self.model)
    
    def train(self, train_data):
        #print (train_data)
        states = []
        new_states = []
        rewards = []
        moves = []
        for i in range(len(train_data) - 1):
            new_states.append(train_data[i]['new_state'])
            moves.append([train_data[i]['move']])
            states.append(train_data[i]['state'])
            rewards.append(train_data[i+1]['reward'])
        moves = torch.tensor(moves)
        rewards = torch.tensor(rewards).type(torch.float32)
        y_pred = self.model(self.__tensor_from_states(states).view(-1, 27))
        y = y_pred.clone().detach()
        new_state_values = self.model_copy(self.__tensor_from_states(new_states).view(-1, 27)).detach()

        value = torch.max(new_state_values, axis=1).values
        rewards[:-1] = self.alpha * value[:-1] # propagate rewards to all moves
        y[range(len(train_data) - 1), moves] = rewards

        loss = self.model.loss(y_pred, y)
        self.model.optim.zero_grad()
        loss.backward()
        self.model.optim.step()
        self.model_copy.load_state_dict(self.model.state_dict())
        
    def __tensor_from_states(self, states):    
        states_tensor = []
        for state in states:
            assert len(state) == 9
            state_tensor = []
            if state is None:
                for i in range(27):
                    state_tensor += 0.0
            else:
                for sym in state:
                    state_tensor.append(1 if sym == 1 else 0)
                for sym in state:
                    state_tensor.append(1 if sym == -1 else 0)
                for sym in state:
                    state_tensor.append(1 if sym == 0 else 0)
            states_tensor.append(state_tensor)
        return torch.tensor(states_tensor).type(torch.float32)

    def learn_from_move(self, game, train_data):
        state = game.state
        best_next_move, selected_next_move = self.learn_select_move(game)
        game.make_move(selected_next_move)
        train_data.append({'state' : state, 'move' : selected_next_move, 'new_state' : game.state, 'reward' : 0})

    def predict_state_value(self, state, player):
        raise NotImplementedError # this method is not used
    
    def predict_state_values(self, game):
        x = game.player * self.__tensor_from_states([game.state])
        state_values = self.model(x).detach().view(-1).numpy()
        predicted_state_values = {}
        for i in range(9):
            if game.state[i] == 0:
                predicted_state_values[i] = state_values[i]
        return predicted_state_values

In [836]:
# Testing agents with NN
# Train agent_nn_1 to play for 'X'
agent_nn_1 = AgentNN("agent_nn_1", epsilon=0.7, alpha=0.9, player_mark=1)
agent_dict = AgentDict("agent_dict", epsilon=0.7, alpha=0.9, player_mark=-1)
rounds = 10
test_agents(agent_nn_1, agent_dict)
for round in range(rounds):
    train_agent(agent_nn_1)
    test_agents(agent_nn_1, agent_dict)

agent_nn_1(X) vs agent_dict(O): {'X': '35.8%', 'O': '40.3%', ' ': '23.9%'}
Training agent_nn_1(X) by 10000 games
agent_nn_1(X): 10000 games learned
agent_nn_1(X) vs agent_dict(O): {'X': '72.1%', 'O': '18.6%', ' ': '9.3%'}
Training agent_nn_1(X) by 10000 games
agent_nn_1(X): 20000 games learned
agent_nn_1(X) vs agent_dict(O): {'X': '87.6%', 'O': '11.0%', ' ': '1.4%'}
Training agent_nn_1(X) by 10000 games
agent_nn_1(X): 30000 games learned
agent_nn_1(X) vs agent_dict(O): {'X': '94.9%', 'O': '3.4%', ' ': '1.7%'}
Training agent_nn_1(X) by 10000 games
agent_nn_1(X): 40000 games learned
agent_nn_1(X) vs agent_dict(O): {'X': '84.6%', 'O': '5.8%', ' ': '9.6%'}
Training agent_nn_1(X) by 10000 games
agent_nn_1(X): 50000 games learned
agent_nn_1(X) vs agent_dict(O): {'X': '96.0%', 'O': '2.0%', ' ': '2.0%'}
Training agent_nn_1(X) by 10000 games
agent_nn_1(X): 60000 games learned
agent_nn_1(X) vs agent_dict(O): {'X': '99.1%', 'O': '0.0%', ' ': '0.9%'}
Training agent_nn_1(X) by 10000 games
agent_nn_

In [837]:
# Train agent_nn_2 to play for 'O'
agent_nn_2 = AgentNN("agent_nn_2", epsilon=0.7, alpha=0.9, player_mark=-1)
agent_dict = AgentDict("agent_dict", epsilon=0.7, alpha=0.9, player_mark=1)
rounds = 10
test_agents(agent_nn_2, agent_dict)
for round in range(rounds):
    train_agent(agent_nn_2)
    test_agents(agent_nn_2, agent_dict)

agent_nn_2(O) vs agent_dict(X): {'X': '53.3%', 'O': '29.0%', ' ': '17.7%'}
Training agent_nn_2(O) by 10000 games
agent_nn_2(O): 10000 games learned
agent_nn_2(O) vs agent_dict(X): {'X': '47.5%', 'O': '25.1%', ' ': '27.4%'}
Training agent_nn_2(O) by 10000 games
agent_nn_2(O): 20000 games learned
agent_nn_2(O) vs agent_dict(X): {'X': '38.5%', 'O': '38.4%', ' ': '23.1%'}
Training agent_nn_2(O) by 10000 games
agent_nn_2(O): 30000 games learned
agent_nn_2(O) vs agent_dict(X): {'X': '32.4%', 'O': '36.1%', ' ': '31.5%'}
Training agent_nn_2(O) by 10000 games
agent_nn_2(O): 40000 games learned
agent_nn_2(O) vs agent_dict(X): {'X': '28.1%', 'O': '45.4%', ' ': '26.5%'}
Training agent_nn_2(O) by 10000 games
agent_nn_2(O): 50000 games learned
agent_nn_2(O) vs agent_dict(X): {'X': '26.9%', 'O': '44.4%', ' ': '28.7%'}
Training agent_nn_2(O) by 10000 games
agent_nn_2(O): 60000 games learned
agent_nn_2(O) vs agent_dict(X): {'X': '27.4%', 'O': '45.7%', ' ': '26.9%'}
Training agent_nn_2(O) by 10000 games