In [6]:
from tictactoe import *

class QPlayer:
    """Tabular Q-learning player.

    Action values live in ``q_table``, a ``3**9`` x 9 array indexed as
    ``q_table[state][action]``. ``move`` picks the highest-valued action for
    the given state, or, with probability ``explore``, whatever
    ``game.sample()`` returns. ``update`` applies the one-step Q-learning
    rule to the most recent (state, action) pair and tallies wins/losses
    from the sign of the reward.
    """

    EXPLORE = 0.10   # default probability of replacing the greedy action with game.sample()
    ALPHA = 0.01     # default learning rate
    EPSILON = 0.90   # default discount applied to the next state's best value (despite the name)
    DECAY = 0.9999   # default multiplier applied to `explore` after each positive reward

    def __init__(self, explore=EXPLORE, alpha=ALPHA, epsilon=EPSILON, decay=DECAY):
        """Create a player with an all-zero Q-table.

        Args:
            explore: probability of taking a sampled action instead of the greedy one.
            alpha: learning rate used by ``update``.
            epsilon: discount factor on the next state's maximum Q-value.
            decay: factor ``explore`` is multiplied by whenever a positive
                reward is received.
        """
        self.q_table = np.zeros([3**9, 9])
        self.explore = explore
        self.decay = decay
        self.wins = 0
        self.losses = 0
        self.a = alpha
        self.e = epsilon
        # Most recent (state, action) pair; None until move() has been called,
        # so update() can tell there is nothing to learn from yet.
        self.state = None
        self.action = None

    def move(self, game, state):
        """Choose an action for ``state`` and remember the pair for ``update``.

        Args:
            game: object providing ``sample()``, used for exploratory moves.
            state: row index into ``q_table``.

        Returns:
            The chosen action: ``argmax`` of the state's row, or the value of
            ``game.sample()`` when the exploration draw succeeds.
        """
        self.state = state
        # NOTE(review): the greedy choice ranges over all 9 columns; whether an
        # already-occupied square is rejected is up to Game -- confirm.
        greedy = np.argmax(self.q_table[state])
        exploring = np.random.random() < self.explore
        self.action = game.sample() if exploring else greedy
        return self.action

    def update(self, game, state, reward):
        """Learn from the reward that followed the last move.

        Args:
            game: unused; kept so the signature matches what callers pass.
            state: index of the state reached after the last move.
            reward: numeric reward; positive counts as a win, negative as a loss.

        If ``move`` has not been called yet there is no pending
        (state, action) pair, so the table is left untouched; the win/loss
        tallies and exploration decay are still applied.
        """
        if self.state is not None and self.action is not None:
            previous = self.q_table[self.state][self.action]
            best_next = np.amax(self.q_table[state])
            self.q_table[self.state][self.action] = ((1 - self.a) * previous
                                                      + self.a * (reward + self.e * best_next))
        if reward > 0:
            # Exploration shrinks only when a positive reward arrives.
            self.explore *= self.decay
            self.wins += 1
        if reward < 0:
            self.losses += 1

    def reset_metrics(self):
        """Zero the win and loss counters (the Q-table and explore rate are kept)."""
        self.wins = 0
        self.losses = 0

    def __str__(self):
        """Summarise the table sum, win/loss rates and current explore rate.

        Rates are taken over wins + losses only. When both counters are zero
        a fixed placeholder string is returned instead.
        """
        decided = self.wins + self.losses
        if decided > 0:
            return (str(np.sum(self.q_table))
                    + ' win rate = ' + str(self.wins / decided)
                    + ' loss rate = ' + str(self.losses / decided)
                    + ' explore = ' + str(self.explore))
        else:
            return 'inexperienced Q player'

In [None]:
# Train a fresh QPlayer against a ProceduralPlayer, printing the game's
# counter and the learner's summary after each batch, then clearing the
# learner's win/loss tallies so every printed line covers one batch only.
p = ProceduralPlayer()
q = QPlayer()
g = Game(p, q)

batches, games_per_batch = 100, 1000
for _batch in range(batches):
    for _game in range(games_per_batch):
        g.play()
    print(g.i, q)
    q.reset_metrics()

1000 -48.546771259333745 win rate = 0.07061503416856492 loss rate = 0.929384965831435 explore = 0.09938188722357204
2000 -72.14829301941187 win rate = 0.17409470752089137 loss rate = 0.8259052924791086 explore = 0.09814728424702479
3000 -89.50158702798637 win rate = 0.2197962154294032 loss rate = 0.7802037845705968 explore = 0.09667632043310306
4000 -101.42141635524372 win rate = 0.2398753894080997 loss rate = 0.7601246105919003 explore = 0.09519883704645137
5000 -114.24348569377287 win rate = 0.21954887218045113 loss rate = 0.7804511278195488 explore = 0.09381896262627748
6000 -123.12664834224664 win rate = 0.24769230769230768 loss rate = 0.7523076923076923 explore = 0.09232049741798797
7000 -132.65714555064343 win rate = 0.23311897106109325 loss rate = 0.7668810289389068 explore = 0.09099144268562193
8000 -141.23401409167724 win rate = 0.2370486656200942 loss rate = 0.7629513343799058 explore = 0.08962772569035374
9000 -152.18572994062976 win rate = 0.25 loss rate = 0.75 explore = 0.

In [None]:
# Play the trained Q player (`q`, from the training cell) against a human.
# Games repeat until the kernel is interrupted; the interrupt is caught so
# the cell finishes cleanly instead of leaving a traceback in the notebook.
h = HumanPlayer()
b = Game(q, h)
try:
    while True:
        b.play(verbose=True)
except KeyboardInterrupt:
    print('Stopped playing.')