In [1]:
from tictactoe import *

class QPlayer:
    """Tabular Q-learning player.

    Action values live in ``q_table``, a zero-initialised array with ``3**9``
    rows (indexed by an integer state) and 9 columns (indexed by an action).
    Presumably one row per board encoding and one column per square --
    TODO confirm against the ``tictactoe`` module.

    The player also keeps ``wins`` / ``losses`` counters that are driven purely
    by the sign of the rewards handed to :meth:`update`.
    """

    # Defaults for the constructor arguments of the same (lower-case) name.
    EXPLORE = 0.10   # chance of replacing the greedy move with game.sample()
    ALPHA = 0.01     # learning rate: weight given to the new estimate
    EPSILON = 0.90   # despite the name, scales the next state's best value
                     # in update(), i.e. it plays the role of a discount factor

    def __init__(self, explore=EXPLORE, alpha=ALPHA, epsilon=EPSILON):
        """Create a player with an all-zero Q table and cleared counters.

        Args:
            explore: probability, per move, of taking ``game.sample()``
                instead of the greedy action.
            alpha: learning rate used by :meth:`update`.
            epsilon: multiplier applied to the best value of the next state
                in :meth:`update`.
        """
        self.explore = explore
        self.a = alpha
        self.e = epsilon
        self.q_table = np.zeros([3**9, 9])
        self.wins = 0
        self.losses = 0

    def move(self, game, state):
        """Pick an action for ``state`` and remember the (state, action) pair.

        The greedy action is the arg-max of the state's row; no filtering of
        the 9 columns happens here.  With probability ``self.explore`` it is
        replaced by ``game.sample()`` (presumably a random legal move --
        verify against ``Game``).

        Returns:
            The chosen action, also stored in ``self.action``.
        """
        self.state = state
        # Greedy choice first, so self.action is always set before sampling.
        self.action = self.q_table[state].argmax()
        wants_to_explore = np.random.random() < self.explore
        if wants_to_explore:
            self.action = game.sample()
        return self.action

    def update(self, game, state, reward):
        """Blend the last move's Q value with the observed outcome.

        Uses the pair stored by the most recent :meth:`move` call, so it must
        be called after ``move``.  ``game`` is accepted but not used.

        Args:
            game: unused.
            state: state whose best Q value is used as the look-ahead term.
            reward: reward received; a positive value counts as a win and a
                negative value as a loss, zero changes neither counter.
        """
        row = self.q_table[self.state]
        previous = row[self.action]
        target = reward + self.e * self.q_table[state].max()
        row[self.action] = (1 - self.a) * previous + self.a * target

        if reward > 0:
            self.wins += 1
        elif reward < 0:
            self.losses += 1

    def reset_metrics(self):
        """Zero the win and loss counters; the Q table is left untouched."""
        self.wins = self.losses = 0

    def __str__(self):
        """Summarise the table sum, win/loss rates and exploration rate.

        Returns a fixed placeholder string while no win or loss has been
        recorded since the last reset.
        """
        decided = self.wins + self.losses
        if not decided > 0:
            return 'inexperienced Q player'

        table_sum = np.sum(self.q_table)
        win_rate = self.wins / decided
        loss_rate = self.losses / decided
        return (f'{table_sum!s}'
                f' win rate = {win_rate!s}'
                f' loss rate = {loss_rate!s}'
                f' explore = {self.explore!s}')

In [2]:
# Train a fresh QPlayer by playing it repeatedly against a ProceduralPlayer.
# Games are grouped into batches; after each batch the Q player's loss rate is
# recorded in `loss_rate`, printed with the game's counter `g.i` and the
# exploration rate, and the win/loss counters are cleared for the next batch.
BATCHES = 100
GAMES_PER_BATCH = 10000

p = ProceduralPlayer()
q = QPlayer()
g = Game(p, q)

loss_rate = []
for m in range(BATCHES):
    for n in range(GAMES_PER_BATCH):
        g.play()
    batch_loss_rate = q.losses / GAMES_PER_BATCH
    loss_rate.append(batch_loss_rate)
    print(g.i, batch_loss_rate, q.explore)
    q.reset_metrics()


10000 0.5482 0.1
20000 0.4791 0.1
30000 0.4657 0.1
40000 0.4627 0.1
50000 0.4531 0.1
60000 0.4444 0.1
70000 0.4568 0.1
80000 0.4491 0.1
90000 0.4528 0.1
100000 0.4512 0.1
110000 0.4448 0.1
120000 0.4469 0.1
130000 0.4469 0.1
140000 0.45 0.1
150000 0.4456 0.1
160000 0.4386 0.1
170000 0.4424 0.1
180000 0.447 0.1
190000 0.4459 0.1
200000 0.4422 0.1
210000 0.4507 0.1
220000 0.4444 0.1
230000 0.4509 0.1
240000 0.4378 0.1
250000 0.4417 0.1
260000 0.4522 0.1
270000 0.4361 0.1
280000 0.4484 0.1
290000 0.4572 0.1
300000 0.4511 0.1
310000 0.4484 0.1
320000 0.4382 0.1
330000 0.4391 0.1
340000 0.4407 0.1
350000 0.4485 0.1
360000 0.4492 0.1
370000 0.4563 0.1
380000 0.451 0.1
390000 0.4495 0.1
400000 0.4492 0.1
410000 0.4411 0.1
420000 0.452 0.1
430000 0.437 0.1
440000 0.4416 0.1
450000 0.4556 0.1
460000 0.4481 0.1
470000 0.4399 0.1
480000 0.4346 0.1
490000 0.4415 0.1
500000 0.4429 0.1
510000 0.45 0.1
520000 0.4409 0.1
530000 0.4422 0.1
540000 0.4438 0.1
550000 0.4462 0.1
560000 0.439 0.1
570000 0.4

In [None]:
# Pit the Q player from the training cell against a human, game after game.
# The loop has no natural end, so a kernel interrupt (KeyboardInterrupt) is
# treated as the normal way to stop and is caught instead of surfacing as a
# traceback.
h = HumanPlayer()
b = Game(q, h)
try:
    while True:
        b.play(verbose=True)
except KeyboardInterrupt:
    print('Stopped playing.')