In [1]:
# Copied from https://github.com/neilslater/game_playing_scripts

'''
   Copyright 2017 Neil Slater

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
'''

import numpy as np
import csv
import random
from itertools import groupby

class TicTacToeGame():
    def __init__(self):
        self.state = '         '
        self.player = 'X'
        self.winner = None

    def allowed_moves(self):
        states = []
        for i in range(len(self.state)):
            if self.state[i] == ' ':
                states.append(self.state[:i] + self.player + self.state[i+1:])
        return states

    def make_move(self, next_state):
        if self.winner:
            raise(Exception("Game already completed, cannot make another move!"))
        if not self.__valid_move(next_state):
            raise(Exception("Cannot make move {} to {} for player {}".format(
                    self.state, next_state, self.player)))

        self.state = next_state
        self.winner = self.predict_winner(self.state)
        if self.winner:
            self.player = None
        elif self.player == 'X':
            self.player = 'O'
        else:
            self.player = 'X'

    def playable(self):
        return ( (not self.winner) and any(self.allowed_moves()) )

    def predict_winner(self, state):
        lines = [(0,1,2), (3,4,5), (6,7,8), (0,3,6), (1,4,7), (2,5,8), (0,4,8), (2,4,6)]
        winner = None
        for line in lines:
            line_state = state[line[0]] + state[line[1]] + state[line[2]]
            if line_state == 'XXX':
                winner = 'X'
            elif line_state == 'OOO':
                winner = 'O'
        return winner

    def __valid_move(self, next_state):
        allowed_moves = self.allowed_moves()
        if any(state == next_state for state in allowed_moves):
            return True
        return False

    def print_board(self):
        s = self.state
        print('     {} | {} | {} '.format(s[0],s[1],s[2]))
        print('    -----------')
        print('     {} | {} | {} '.format(s[3],s[4],s[5]))
        print('    -----------')
        print('     {} | {} | {} '.format(s[6],s[7],s[8]))


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [3]:
class Net(nn.Module):
    def __init__(self, lr=0.00001):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(18, 24)
        self.fc2 = nn.Linear(24, 1)

        self.fc1.weight.data.fill_(0.0)
        self.fc2.weight.data.fill_(0.0)
        
        self.optim = torch.optim.SGD(self.parameters(), lr=lr)
        self.loss = F.torch.nn.MSELoss()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [4]:
class Agent():
    def __init__(self, game_class, epsilon=0.1, alpha=1.0, player_mark='X'):
        self.alpha = alpha
        self.model = Net()
        self.NewGame = game_class
        self.epsilon = epsilon
        self.player_mark = player_mark

    def state_value(self, game_state, V):
        return V.get(game_state, 0.0)
        
    def learn_game(self, num_episodes=1000):
        teach_data = []
        batch_size = 100
        for episode in range(num_episodes):
            teach_data.extend(self.learn_from_episode())
            if (episode % batch_size) == 0 and episode > 0:
                for data in teach_data:
                    self.teach_nn(data)
                teach_data.clear()

    def learn_from_episode(self):
        game = self.NewGame()
        prev_state = game.state
        V = dict()
        _, move = self.learn_select_move(game, V)
        teach_data = []
        while move:
            prev_state = move
            move = self.learn_from_move(game, move, V)
            teach_data.append({'state' : prev_state, 'action' : move, 'value' : self.state_value(move, V)})
        return teach_data

    def teach_nn(self, data):
        x = self.__tensor_from_states(data['state'], data['action'])
        y = data['value']
        y_pred = self.model(x)
        loss = self.model.loss(y_pred, torch.tensor([y]))
        self.model.optim.zero_grad()
        loss.backward()
        self.model.optim.step()
        
    def learn_from_move(self, game, move, V):
        game.make_move(move)
        r = self.__reward(game)
        next_state_value = 0.0
        selected_next_move = None
        if game.playable():
            best_next_move, selected_next_move = self.learn_select_move(game, V)
            next_state_value = self.state_value(best_next_move, V)
        current_state_value = self.state_value(move, V)
        td_target = r + next_state_value
        V[move] = current_state_value + self.alpha * (td_target - current_state_value)
        
        return selected_next_move

    def learn_select_move(self, game, V):
        allowed_state_values = self.__state_values(game.allowed_moves(), V)
        if game.player == self.player_mark:
            best_move = self.__argmax_V(allowed_state_values)
        else:
            best_move = self.__argmin_V(allowed_state_values)

        selected_move = best_move
        if random.random() < self.epsilon:
            selected_move = self.__random_V(allowed_state_values)

        return (best_move, selected_move)

    def play_select_move(self, game):
        allowed_state_values = self.__predict_state_values(game.state, game.allowed_moves())
        if game.player == self.player_mark:
            return self.__argmax_V(allowed_state_values)
        else:
            return self.__argmin_V(allowed_state_values)

    def demo_game(self, verbose=False):
        game = self.NewGame()
        t = 0
        while game.playable():
            if verbose:
                print(" \nTurn {}\n".format(t))
                game.print_board()
            move = self.play_select_move(game)
            game.make_move(move)
            t += 1
        if verbose:
            print(" \nTurn {}\n".format(t))
            game.print_board()
        if game.winner:
            if verbose:
                print("\n{} is the winner!".format(game.winner))
            return game.winner
        else:
            if verbose:
                print("\nIt's a draw!")
            return '-'

    def interactive_game(self, agent_player='X'):
        game = self.NewGame()
        t = 0
        while game.playable():
            print(" \nTurn {}\n".format(t))
            game.print_board()
            if game.player == agent_player:
                move = self.play_select_move(game)
                game.make_move(move)
            else:
                move = self.__request_human_move(game)
                game.make_move(move)
            t += 1

        print(" \nTurn {}\n".format(t))
        game.print_board()

        if game.winner:
            print("\n{} is the winner!".format(game.winner))
            return game.winner
        print("\nIt's a draw!")
        return '-'

    def __tensor_from_states(self, cur_state, state):    
        state_tensor = []
        def append_state(s):
            if s is None:
                for i in range(9):
                    state_tensor.append(0.0)
            else:
                for sym in s:
                    if sym == ' ':
                        state_tensor.append(0.0)
                    elif sym == 'X':
                        state_tensor.append(1.0)
                    elif sym == 'O':
                        state_tensor.append(-1.0)
                    else:
                        assert False, "Incorrect sym"
        append_state(cur_state)
        append_state(state)

        return torch.tensor(state_tensor)

    def __predict_state_values(self, cur_state, game_states):
        return dict((state, self.model(self.__tensor_from_states(cur_state, state))) for state in game_states)

    def __state_values(self, game_states, V):
        return dict((state, self.state_value(state, V)) for state in game_states)

    def __argmax_V(self, state_values):
        max_V = max(state_values.values())
        chosen_state = random.choice([state for state, v in state_values.items() if v == max_V])
        return chosen_state

    def __argmin_V(self, state_values):
        min_V = min(state_values.values())
        chosen_state = random.choice([state for state, v in state_values.items() if v == min_V])
        return chosen_state

    def __random_V(self, state_values):
        return random.choice(list(state_values.keys()))

    def __reward(self, game):
        if game.winner == self.player_mark:
            return 1.0
        elif game.winner:
            return -1.0
        else:
            return 0.0

    def __request_human_move(self, game):
        allowed_moves = [i+1 for i in range(9) if game.state[i] == ' ']
        human_move = None
        while not human_move:
            idx = int(input('Choose move for {}, from {} : '.format(game.player, allowed_moves)))
            if any([i==idx for i in allowed_moves]):
                human_move = game.state[:idx-1] + game.player + game.state[idx:]
        return human_move

In [5]:
def demo_game_stats(agent):
    results = [agent.demo_game() for i in range(10000)]
    game_stats = {k: results.count(k)/100 for k in ['X', 'O', '-']}
    print("    percentage results: {}".format(game_stats))

In [6]:
agent = Agent(TicTacToeGame, epsilon = 0.1, alpha = 1.0)
print("Before learning:")
demo_game_stats(agent)

agent.learn_game(1000)
print("After 1000 learning games:")
demo_game_stats(agent)

agent.learn_game(4000)
print("After 5000 learning games:")
demo_game_stats(agent)

agent.learn_game(5000)
print("After 10000 learning games:")
demo_game_stats(agent)

agent.learn_game(10000)
print("After 20000 learning games:")
demo_game_stats(agent)

agent.learn_game(10000)
print("After 30000 learning games:")
demo_game_stats(agent)


Before learning:
    percentage results: {'X': 58.78, 'O': 28.47, '-': 12.75}
After 1000 learning games:
    percentage results: {'X': 59.61, 'O': 27.55, '-': 12.84}
After 5000 learning games:
    percentage results: {'X': 75.06, 'O': 0.0, '-': 24.94}
After 10000 learning games:
    percentage results: {'X': 49.43, 'O': 50.57, '-': 0.0}
After 20000 learning games:
    percentage results: {'X': 0.0, 'O': 0.0, '-': 100.0}
After 30000 learning games:
    percentage results: {'X': 0.0, 'O': 0.0, '-': 100.0}


In [7]:
agent.demo_game(True)

 
Turn 0

       |   |   
    -----------
       |   |   
    -----------
       |   |   
 
Turn 1

       |   |   
    -----------
     X |   |   
    -----------
       |   |   
 
Turn 2

       |   |   
    -----------
     X |   |   
    -----------
     O |   |   
 
Turn 3

       |   |   
    -----------
     X |   |   
    -----------
     O |   | X 
 
Turn 4

     O |   |   
    -----------
     X |   |   
    -----------
     O |   | X 
 
Turn 5

     O |   |   
    -----------
     X | X |   
    -----------
     O |   | X 
 
Turn 6

     O |   |   
    -----------
     X | X | O 
    -----------
     O |   | X 
 
Turn 7

     O |   | X 
    -----------
     X | X | O 
    -----------
     O |   | X 
 
Turn 8

     O |   | X 
    -----------
     X | X | O 
    -----------
     O | O | X 
 
Turn 9

     O | X | X 
    -----------
     X | X | O 
    -----------
     O | O | X 

It's a draw!


'-'