In [469]:
# A bit of setup
import numpy as np
import copy
import random as rnd
from sets import Set
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [470]:
# Scratch check: np.fliplr on a 3x3 array of zeros
a = np.zeros(shape=(3, 3))
np.fliplr(a)

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [471]:
class Board:
    '''
    Square tic-tac-toe board.

    state is n x n ndarray, 0: unknown, 1: X (first), -1: O (later)
    unassigned is the set of (x, y) index tuples that are still empty
    size is the side length n; a full line of n equal marks wins
    '''
    def __init__(self, size=3):
        self.reset(size)

    def reset(self, size=None):
        '''
        Empty the board and mark every cell as unassigned.

        size: new side length. None keeps the current side length
              (3 if the board has never been sized), so a plain reset()
              no longer shrinks a larger board back to 3 x 3.
        '''
        if size is None:
            size = getattr(self, 'size', 3)
        self.size = size
        self.state = np.zeros( (self.size, self.size) ) # board state
        # available position tuples, e.g. (1, 2)
        self.unassigned = set( (x, y) for x in range(self.size)
                                      for y in range(self.size) )

    def print_board(self):
        '''Print the grid, one tab-separated row per line, between two rulers.'''
        print('------------------')
        for x in range( self.size ):
            cells = []
            for y in range( self.size ):
                if self.state[x, y] == 1:
                    cells.append('X')
                elif self.state[x, y] == -1:
                    cells.append('O')
                else:
                    cells.append('()')
            # every cell is followed by a tab, exactly as before
            print( ''.join( [c + '\t' for c in cells] ) )
        print('------------------')

    def _place(self, x, y, mark):
        # Claim the cell first: remove() raises KeyError for a position that
        # is not free, before the existing mark could be overwritten.
        self.unassigned.remove( (x, y) )
        self.state[x, y] = mark

    # x: up-down, y: left-right
    def add_X(self, x, y):
        '''Put an X (1) on (x, y). Raises KeyError if (x, y) is not unassigned.'''
        self._place(x, y, 1)

    def add_O(self, x, y):
        '''Put an O (-1) on (x, y). Raises KeyError if (x, y) is not unassigned.'''
        self._place(x, y, -1)

    # clear a position to be unassigned
    def clear_pos(self, x, y):
        self.state[x, y] = 0
        self.unassigned.add( (x, y) )

    # 0: ongoing, 1: agent (first) wins, -1: agent (later) wins,  2: draw  (game over, !=0)
    def if_win(self):
        '''
        Return the game status for the current state.

        A line (column, row, main diagonal or anti-diagonal) is won when its
        sum reaches +size (all X) or -size (all O). X is checked before O.
        '''
        n = self.size
        if n > 0:
            line_sums = np.concatenate((
                self.state.sum(axis=0),                        # columns
                self.state.sum(axis=1),                        # rows
                [ self.state.diagonal().sum(),                 # main diagonal
                  np.fliplr(self.state).diagonal().sum() ],    # anti-diagonal
            ))
            if (line_sums == n).any():
                return 1
            if (line_sums == -n).any():
                return -1
        if len(self.unassigned) == 0:
            return 2  # draw
        return 0  # ongoing

In [472]:
class Agent:
    '''
    A tic-tac-toe player.

    role:   'first' places X (board.add_X); any other value places O (board.add_O)
    policy: 'state_value' picks the move whose resulting state has the highest
            stored value; any other value picks a random unassigned position
    state_values: dict mapping the raw bytes of a board state to its value
    '''
    def __init__(self, role='first', policy='random'):
        self.role = role
        self.policy = policy
        self.state_values = {}

    def _state_key(self, board):
        # Raw bytes of the state array serve as the dict key. tobytes() is the
        # current name; tostring() is the older alias that returns the same
        # bytes and is only used when tobytes() is not available.
        state = board.state
        to_bytes = getattr(state, 'tobytes', None)
        if to_bytes is None:
            to_bytes = state.tostring
        return to_bytes()

    def move(self, board):
        '''Choose a position according to the policy and place this agent's mark there.'''
        if self.policy == 'state_value':
            x, y = self.best_move_by_state_value( board )
        else: #if self.policy == 'random':
            x, y = rnd.choice( tuple(board.unassigned) ) # return position
        if self.role=='first':
            board.add_X( x, y )
        else: # role == 'later'
            board.add_O( x, y )

    def set_state_values(self, board, value):
        '''Store value for the board's current state.'''
        self.state_values[ self._state_key(board) ] = value

    def get_state_value(self, board):
        '''Return the stored value of the board's current state, 0.0 if unseen.'''
        return self.state_values.get( self._state_key(board), 0.0 )

    # simulate next move
    def best_move_by_state_value(self, board):
        '''
        Try every unassigned position, look up the value of the resulting
        state, undo the move, and return one of the best positions
        (ties are broken at random).
        '''
        cand = [] # tuple ((x,y), state_value)
        # Iterate over a snapshot: the simulated move removes the position from
        # board.unassigned and clear_pos puts it back, so looping over the live
        # set would mutate it during iteration.
        for x, y in list(board.unassigned):
            # simulate move
            if self.role == 'first':
                board.add_X(x, y)
            else: # role == 'later'
                board.add_O(x, y)
            cand.append( ((x, y), self.get_state_value(board)) )
            board.clear_pos(x, y) # revert !!
        best_value = max( value for (pos, value) in cand )
        best_positions = [ pos for (pos, value) in cand if value == best_value ]
        return rnd.choice( best_positions )

In [473]:
class Game:
    '''
    Runs a match between two agents on a board.

    The board and both agents must be supplied through set_board,
    set_agent_first and set_agent_later before auto_play is called.
    '''
    def __init__(self):
        pass

    def set_board(self, board):
        self.board = board

    def set_agent_first(self, agent):
        self.agent_first = agent

    def set_agent_later(self, agent):
        self.agent_later = agent

    # return: 0: ongoing, 1: first wins, -1, later wins, 2: draw
    def check_finish(self):
        return self.board.if_win()

    def print_result(self, if_win):
        '''Print the message for a status code; any code other than 0, 1, -1 prints "Draw!".'''
        messages = {
            0: 'game is ongoing...',
            1: 'Player First Wins!',
            -1: 'Player Later Wins!',
        }
        print( messages.get(if_win, 'Draw!') )

    # play a game (an episode)
    # return: game result
    def auto_play(self, verbose=True):
        '''
        Reset the board and let the agents alternate moves, first agent first,
        until check_finish() reports a non-zero status; return that status.
        '''
        # pass the current size explicitly: reset() without arguments would
        # fall back to its default size instead of keeping this board's size
        self.board.reset( self.board.size )
        if_win = 0
        while if_win == 0:
            for agent in (self.agent_first, self.agent_later):
                agent.move( self.board )
                if_win = self.check_finish()
                if verbose:
                    self.board.print_board()
                    self.print_result( if_win )
                if if_win != 0: # game over
                    break
        return if_win

In [569]:
class QTrainer(Game):
    '''
    Trains the first agent's state values by playing episodes against the
    later agent. Only agent_first's state_values are written.

    alpha: step size of the update
    gamma: discount applied to the value of the following state
    '''
    def __init__(self):
        Game.__init__(self)
        self.alpha = 1.0
        self.gamma = 0.9

    # a TD learning method
    # V(s) = V(s) + a * (R + g * V(s') - V(s))
    def train_episode(self, verbose=False):
        '''
        Play one episode and update the first agent's state values on the way.

        States are the boards seen right after the first agent's move:
          - a win is stored as 1.0, a draw as -1.0, any other state keeps its
            current stored value;
          - the previous such state is then pulled towards gamma * the new
            value (reward 0);
          - if the later agent ends the game, the previous state is set to -1.0.
        Returns the final status code from check_finish().
        '''
        # keep the board's own size; a bare reset() falls back to its default
        self.board.reset( self.board.size )
        if_win = 0
        value_old = None   # value of the previous first-agent state, None before the first move
        board_old = None   # snapshot of the board for that state
        while 1:
            ## first agent
            self.agent_first.move( self.board )
            if_win = self.check_finish()
            if verbose:
                self.board.print_board()
                self.print_result( if_win )
            if if_win == 1: # win
                value_new = 1.0
            elif if_win == 2: # draw
                value_new = -1.0
            else:
                value_new = self.agent_first.get_state_value( self.board )
            self.agent_first.set_state_values( self.board, value_new )
            # update last state value
            if value_old is not None:
                value_old += self.alpha * ( 0 + self.gamma * value_new - value_old )
                self.agent_first.set_state_values( board_old, value_old )
            # save current board state as last state
            # (a copy is required: self.board keeps changing as the game goes on)
            board_old = copy.deepcopy( self.board )
            value_old = value_new
            if if_win != 0: # game over
                break

            ## later agent
            self.agent_later.move( self.board )
            if_win = self.check_finish()
            if verbose:
                self.board.print_board()
                self.print_result( if_win )
            if if_win != 0: # game ended on the later agent's move
                # update last state value (for first agent)
                value_old = -1.0
                self.agent_first.set_state_values( board_old, value_old )
                break
        return if_win

    # Can be called repeatedly to update agent's "state_values" function
    def train(self, n_episode):
        '''Run n_episode training episodes and print how many ended as first-win, later-win and draw.'''
        win_first = 0
        win_later = 0
        draw = 0
        for i in range(n_episode):
            rslt = self.train_episode(verbose=False)
            if rslt == 1:
                win_first += 1
            elif rslt == -1:
                win_later += 1
            elif rslt == 2:
                draw += 1
        print( 'f-win %d l-win %d draw %d' % (win_first, win_later, draw) )

# Initialize Agents

In [570]:
# X moves first and picks moves from its stored state values;
# O moves second and picks a random free position.
agent_first = Agent(role='first', policy='state_value')
agent_later = Agent(role='later', policy='random')

In [571]:
# Show both value tables; the single-argument print() form gives the same
# output under Python 2 and Python 3.
print(agent_first.state_values)
print(agent_later.state_values)

{}
{}


# Train

In [584]:
# Wire a 3x3 board and the two agents into the trainer
trainer = QTrainer()
trainer.set_board(Board(size=3))
trainer.set_agent_first(agent=agent_first)
trainer.set_agent_later(agent=agent_later)

In [623]:
# Run one million training episodes, then save the first agent's value table
trainer.train(n_episode=1000000)
np.save('agent_first_state_values_1M.npy', agent_first.state_values)

f-win 987121 l-win 53 draw 12826


In [624]:
# list() keeps the printed form a plain list on Python 3 as well,
# where dict.values() is a view
print(list(agent_first.state_values.values()))


[-1.0, -0.81, -1.0, 0.9, 1.0, 0.9, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -0.7290000000000001, -1.0, -0.7290000000000002, 1.0, 1.0, -1.0, -1.0, 0.81, 1.0, -0.9, 0.9, -1.0, -0.9, 1.0, -0.8099999999999999, 0.0, -0.81, 0.9, 1.0, -1.0, -1.0, -0.81, 0.0, -1.0, -0.81, 0.9, -1.0, -1.0, 0.9, -0.81, -0.9, -0.9, -1.0, -0.7290000000000002, -0.81, 0.9, 0.9, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 0.0, 1.0, -0.81, 1.0, 0.9, -1.0, -1.0, -1.0, -1.0, 0.9, 1.0, -1.0, -1.0, 0.9, -0.81, -1.0, -1.0, 1.0, -0.81, 0.0, -0.81, -0.81, 0.9, -1.0, -1.0, -1.0, -0.9, 0.0, 1.0, -0.81, 0.9, -1.0, 1.0, -0.9, -0.7290000000000001, 1.0, -0.7290000000000002, -1.0, -1.0, -0.8099999999999999, -0.9, -1.0, -0.9, 0.9, 1.0, 1.0, -0.8099999999999999, 0.9, 0.7290000000000001, 1.0, 1.0, 1.0, -1.0, -1.0, 0.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 0.81, -1.0, 1.0, 0.8999999999999999, 0.0, -1.0, 0.9, -0.7290000000000002, -1.0, 0.9, 0.81, 1.0, -0.9, 0.8999999999999999, -0.9, -1.0, -1.0, -1.0, 0.81, 0.0, -0.9, 0.81, -1.0, -0

# Test

In [625]:
# Fresh game on a 3x3 board for evaluation
game = Game()
game.set_board(Board(size=3))

In [626]:
# Evaluate with the same agent objects that were used in training
game.set_agent_first( agent_first )
game.set_agent_later( agent_later )

In [627]:
# Disabled alternative: two random agents playing each other.
# The code sits inside a string literal, so running this cell only echoes the string.
'''
game.set_agent_first( Agent('first', 'random') )
game.set_agent_later( Agent('later', 'random') )
'''

"\ngame.set_agent_first( Agent('first', 'random') )\ngame.set_agent_later( Agent('later', 'random') )\n"

In [631]:
# Play 10000 games without printing the boards and tally the outcomes
win_first = 0
win_later = 0
draw = 0
for i in range(10000):
    rslt = game.auto_play(verbose=False)
    if rslt == 1:
        win_first += 1
    elif rslt == -1:
        win_later += 1
    elif rslt == 2:
        draw += 1
# first wins, later wins, draws - same text on Python 2 and Python 3
print('%d %d %d' % (win_first, win_later, draw))

9939 0 61
