In [1]:
import numpy as np
import random
import pandas as pd
import pickle
import os

In [2]:
# Agent
class Agent(object):
    """Tabular Q-learning player for the 3x3 board game in this notebook.

    The agent owns a Q-table (one row per board state registered so far, one
    column per board cell), the trajectory of the game in progress, and
    running win/draw/loss counters.
    """

    def __init__(self, lr, gamma, reward_player):
        """Create an agent with an empty Q-table.

        Args:
            lr: Learning rate (0 to 1) used by ``update_Q``.
            gamma: Discount factor used by ``update_Q``.
            reward_player: Reward configuration; it is only stored here and
                read by the caller (this notebook passes a dict with the
                keys 'win', 'lost' and 'draw').
        """
        # Rewards
        self.reward_player = reward_player
        # Learning rate - 0 to 1
        self.lr = lr
        # Discount factor
        self.gamma = gamma
        # Side to move ( player_1 = 1 and player_2 = -1 )
        self.player = 1

        # Number of finished games
        self.number_match = 0

        self.results = {
            'win': 0,
            'draw': 0,
            'lost': 0}

        # 'states' and 'Q' are parallel lists: Q[i] holds the nine action
        # values of states[i], ordered like 'actions'.
        self.Q_table = {
            'states': [],
            'actions': ['(0, 0)', '(0, 1)', '(0, 2)', '(1, 0)', '(1, 1)', '(1, 2)', '(2, 0)', '(2, 1)', '(2, 2)'],
            'Q': []
        }

        # Trajectory of the current game
        self.path = {
            'states': [],   # boards
            'actions': [],  # positions played on the board
        }

    def reset_game(self):
        """Forget the current trajectory and give the move back to player 1."""
        self.player = 1
        self.path = {
            'states': [],
            'actions': [],
        }

    def reset_historic_game(self):
        """Zero the win/draw/loss counters."""
        self.results = {
            'win': 0,
            'draw': 0,
            'lost': 0}

    def save_result(self, resultado):
        """Count one finished game.

        Args:
            resultado: 1 counts as a win, -1 as a loss, anything else as a draw.
        """
        if resultado == 1:
            self.results['win'] += 1
        elif resultado == -1:
            self.results['lost'] += 1
        else:
            self.results['draw'] += 1

    def Q_table_df(self):
        """Return the Q-table as a DataFrame (rows: states, columns: actions)."""
        df = pd.DataFrame(
            index=self.Q_table['states'],
            columns=self.Q_table['actions'],
            data=self.Q_table['Q']
            )
        return df

    def update_Q(self, reward):
        """Propagate the final reward backwards through the recorded path.

        The reward is first divided by the number of recorded actions, so a
        shorter game yields a reward of larger magnitude. The last
        (state, action) pair is set to ``lr * reward``. Every earlier pair is
        then moved towards ``gamma * best(Q[later state])``, where ``best`` is
        ``np.max`` for a non-negative reward and ``np.min`` for a negative one:

            Q(s, a) += lr * (gamma * best(Q(s_next, :)) - Q(s, a))

        An empty path is a no-op.

        Args:
            reward: Final reward of the game from this agent's point of view.

        Raises:
            ValueError: If a recorded state or action is not in the Q-table.
        """
        n_moves = len(self.path['actions'])
        if n_moves == 0:
            # Nothing was played, so there is nothing to credit (and the
            # division below would fail).
            return

        reward = reward / n_moves

        lr = self.lr
        gamma = self.gamma

        # A bad outcome is propagated through the worst value of the later
        # state, a good (or neutral) one through the best value.
        best = np.max if reward >= 0 else np.min

        known_states = self.Q_table['states']
        known_actions = self.Q_table['actions']
        q_values = self.Q_table['Q']

        # Walk from the end of the game towards its beginning.
        steps = list(zip(reversed(self.path['states']),
                         reversed(self.path['actions'])))

        for position, (state, action) in enumerate(steps):
            later_row = known_states.index(str(state))

            if position == 0:
                # The move that ended the game receives the reward directly.
                later_col = known_actions.index(str(action))
                q_values[later_row][later_col] = lr * reward

            if position + 1 == len(steps):
                # No earlier move left to update.
                break

            earlier_state, earlier_action = steps[position + 1]
            earlier_row = known_states.index(str(earlier_state))
            earlier_col = known_actions.index(str(earlier_action))

            # Intermediate moves carry no immediate reward, only the
            # discounted value of the state they led to.
            q_values[earlier_row][earlier_col] += lr * (
                gamma * best(q_values[later_row]) - q_values[earlier_row][earlier_col]
            )

In [3]:
# Enviroment
class Enviroment(object):
    """3x3 board plus the move-selection policies used by the players.

    Board cells hold 1 (player 1, drawn as 'X'), -1 (player 2, drawn as 'O')
    or 0 (empty).
    """

    def __init__(self, epsilon):
        """Create an empty board.

        Args:
            epsilon: Randomness factor; ``select_pos_by_Q`` plays a random
                move whenever a uniform draw in [0, 1) falls below it.
        """
        # Randomness factor
        self.epsilon = epsilon

        # Board (this is the CURRENT STATE)
        self.board = np.zeros((3, 3))

        # Last position played, stored as a (row, col) tuple once a move is made
        self.pos = 0

    def reset_game(self):
        """Replace the board with an empty one."""
        self.board = np.zeros((3, 3))

    # Print the board
    def draw_board(self):
        """Print the board as text, one '|'-separated row per line."""
        symbols = {1: 'X', -1: 'O'}  # anything else is drawn as a blank

        draw = ''
        for i in range(3):
            for j in range(3):
                draw += '|' + symbols.get(self.board[i][j], ' ')
            draw += '|\n-------\n'
        print(draw)

    # Available positions
    def available_moves(self):
        """Return an (n, 2) array with the [row, col] of every empty cell."""
        return np.argwhere(self.board == 0)

    # Pick one available position
    def available_move_choice(self):
        """Return one empty cell, chosen uniformly at random, as a [row, col] array.

        Raises:
            IndexError: If the board has no empty cell.
        """
        moves = self.available_moves()
        if len(moves) == 0:
            raise IndexError('Cannot choose from an empty sequence')
        # Index the array directly instead of handing it to random.choice,
        # so nothing depends on how that function tests an array for emptiness.
        return moves[random.randrange(len(moves))]

    # Check the result
    def check_result(self):
        """Classify the current board.

        Returns:
            1 if some line sums to 3, -1 if some line sums to -3, 0 if the
            board is full without such a line, 2 if the game goes on.
            Rows are checked first, then columns, then diagonals.
        """
        board = self.board
        line_groups = (
            [board[0], board[1], board[2]],                   # rows
            [board[:, 0], board[:, 1], board[:, 2]],          # columns
            [board.diagonal(), np.fliplr(board).diagonal()],  # diagonals
        )

        for group in line_groups:
            totals = [sum(line) for line in group]
            if 3 in totals:
                return 1
            if -3 in totals:
                return -1

        # Draw: no empty cell left
        if 0 not in board:
            return 0

        #########################################################
        ## continue = 2, draw = 0, win = 1, loss = -1           ##
        #########################################################
        return 2

    # Give the reward
    def reward(self, result, reward_player):
        """Translate a game result into a reward.

        Args:
            result: 1 (win), -1 (loss) or 0 (draw).
            reward_player: Mapping with the keys 'win', 'lost' and 'draw'.

        Returns:
            The matching reward, or None for any other ``result``.
        """
        if result == 1:   # Win
            return reward_player['win']

        if result == -1:  # Loss
            return reward_player['lost']

        if result == 0:   # Draw
            return reward_player['draw']

    # Move - random
    def select_pos_by_random(self, player, name):
        """Play ``player`` on a random empty cell and record it in ``self.pos``.

        Args:
            player: Value written to the board (1 or -1).
            name: Unused; kept for a uniform signature across the policies.
        """
        row_col = self.available_move_choice()

        # Plain Python ints: str(self.pos) must read like '(0, 1)' to match
        # the Q-table action labels, and a tuple of NumPy scalars prints as
        # '(np.int64(0), np.int64(1))' on NumPy >= 2.
        row = int(row_col[0])
        col = int(row_col[1])

        self.board[row][col] = player
        self.pos = row, col

    # Move - human
    def select_pos_by_input(self, player, name):
        """Ask on stdin for a row and a column until an empty cell is given.

        Args:
            player: Value written to the board (1 or -1).
            name: Unused; kept for a uniform signature across the policies.
        """
        while True:
            try:
                row = int(input('Row: '))
                col = int(input('Col: '))
            except ValueError:
                # Non-numeric input is handled like an invalid position.
                input('try other position...')
                continue

            # available_moves() is an array; compare against a plain list
            if [row, col] in self.available_moves().tolist():
                self.board[row][col] = player
                self.pos = row, col
                break

            input('try other position...')

    # From the current state (the board), pick the action with the highest Q
    def select_pos_by_Q(self, player, name, Q_table):
        """Play epsilon-greedily from ``Q_table``.

        With probability ``epsilon``, or when the current board is not in the
        table, a random empty cell is played. Otherwise the empty cell with
        the highest Q value is played; among equal values the one listed
        first in ``Q_table['actions']`` wins.

        Args:
            player: Value written to the board (1 or -1).
            name: Unused; kept for a uniform signature across the policies.
            Q_table: Dict with the parallel lists 'states' and 'Q' and the
                action labels 'actions' (formatted like '(0, 1)').
        """
        # Random move ( exploring )
        if np.random.uniform(0, 1) < self.epsilon:
            self.select_pos_by_random(player, name='player ' + str(player))
            return

        state_key = str(self.board)

        # Unknown state: nothing learned yet, so play at random
        if state_key not in Q_table['states']:
            self.select_pos_by_random(player, name='player ' + str(player))
            return

        # Look the state up and play ( exploiting )
        q_row = Q_table['Q'][Q_table['states'].index(state_key)]
        free_cells = self.available_moves().tolist()

        # Rank the action indices, not the values: looking a value up again
        # with list.index() always lands on the first of several equal
        # values, so a free cell tied with an occupied one was never tried.
        ranked = sorted(range(len(q_row)), key=lambda i: q_row[i], reverse=True)

        for action_index in ranked:
            action = Q_table['actions'][action_index]

            # Labels look like '(r, c)'
            row = int(action[1:2])
            col = int(action[4:5])

            # Occupied cells are skipped, falling through to the next best
            if [row, col] in free_cells:
                self.board[row][col] = player
                self.pos = row, col  # update the last played position
                break

In [4]:
# Function that plays one complete game between agent_1 and agent_2
def start():
    """Play one complete game between ``agent_1`` and ``agent_2`` on ``env``.

    Every turn the current board is registered in both Q-tables (if new) and
    appended to both paths, the side to move plays through
    ``env.select_pos_by_Q`` with its own Q-table, and the move is appended to
    both paths. When the game ends both agents store the result, update
    their Q-tables, and everything is reset for the next game.
    """
    both_agents = (agent_1, agent_2)

    while True:
        board_key = str(env.board)

        # Register the state in each Q-table the first time it is seen
        for agent in both_agents:
            if board_key not in agent.Q_table['states']:
                agent.Q_table['states'].append(board_key)
                # Initial Q row for a new state (a fresh list per state)
                agent.Q_table['Q'].append([99998, 11111, 99995, 11112, 99999, 11113, 99996, 11114, 99997])

        # Both players log the state the move starts from
        for agent in both_agents:
            agent.path['states'].append(board_key)

        # The side to move acts on the environment with its own table
        mover = agent_1 if agent_1.player == 1 else agent_2
        env.select_pos_by_Q(mover.player, name='player ' + str(mover.player), Q_table=mover.Q_table)

        # Both players log the move that was just made
        move_key = str(env.pos)
        for agent in both_agents:
            agent.path['actions'].append(move_key)

        # continue = 2, draw = 0, win = 1, loss = -1 (from player 1's side)
        outcome = env.check_result()
        if outcome != 2:
            # Player 2 sees the mirrored result
            agent_1.save_result(outcome)
            agent_2.save_result(-1 * outcome)

            reward_1 = env.reward(result=outcome, reward_player=agent_1.reward_player)
            reward_2 = env.reward(result=-1 * outcome, reward_player=agent_2.reward_player)

            agent_1.update_Q(reward_1)
            agent_2.update_Q(reward_2)

            env.reset_game()
            agent_1.reset_game()
            agent_2.reset_game()

            # One more game played
            agent_1.number_match += 1
            agent_2.number_match += 1
            return

        # Hand the move to the other side
        agent_1.player *= -1
        agent_2.player *= -1

In [5]:
# SAVE
def save_Q_table(directory="./trained_QxQ"):
    """Pickle both agents' Q-tables and game counters into ``directory``.

    Writes Q_table_1.pkl, partidas_1.pkl, Q_table_2.pkl and partidas_2.pkl.
    The directory is created when it does not exist yet.

    Args:
        directory: Target folder; defaults to the notebook's original
            location.
    """
    os.makedirs(directory, exist_ok=True)

    payloads = (
        # Player 1
        ("Q_table_1.pkl", agent_1.Q_table),
        ("partidas_1.pkl", agent_1.number_match),
        # Player 2
        ("Q_table_2.pkl", agent_2.Q_table),
        ("partidas_2.pkl", agent_2.number_match),
    )

    for filename, payload in payloads:
        with open(os.path.join(directory, filename), "wb") as tf:
            pickle.dump(payload, tf)

# LOAD
def load_Q_table(directory='./trained_QxQ'):
    """Restore both agents' Q-tables and game counters from ``directory``.

    Reads the four files written by ``save_Q_table``. All of them are loaded
    before any agent is modified, so a missing file leaves both agents
    untouched.

    Args:
        directory: Folder holding the pickle files; defaults to the
            notebook's original location.
    """
    def _read(filename):
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were produced by this notebook.
        with open(os.path.join(directory, filename), 'rb') as handle:
            return pickle.load(handle)

    # Player 1
    Q_table_1 = _read('Q_table_1.pkl')
    number_match_1 = _read('partidas_1.pkl')

    # Player 2
    Q_table_2 = _read('Q_table_2.pkl')
    number_match_2 = _read('partidas_2.pkl')

    agent_1.number_match = number_match_1
    agent_1.Q_table = Q_table_1

    # Player 2 gets its own counter back (it used to receive player 1's)
    agent_2.number_match = number_match_2
    agent_2.Q_table = Q_table_2

    print(f"número de partidas {agent_1.number_match}")


In [6]:
## Player 1
# Each agent receives its own reward dict so one can be tuned independently
agent_1 = Agent(lr=0.7, gamma=0.7, reward_player={'win': 1, 'lost': -1, 'draw': 0.0})


# Player 2
agent_2 = Agent(lr=0.7, gamma=0.7, reward_player={'win': 1, 'lost': -1, 'draw': 0.0})

# Load Q Table
#load_Q_table()

In [7]:
# Object Enviroment
# epsilon = 0 means select_pos_by_Q never takes its random (exploring) branch
env = Enviroment(epsilon=0.0)

In [17]:
# Execution Random Env epsilon

def exec_random(matches=10, inter=0.1):
    """Play batches of games while lowering ``env.epsilon`` from 1.0 to 0.0.

    For every epsilon value ``matches`` games are played, each player's
    win/draw/loss counts are printed, and the counters are reset.

    Args:
        matches: Games played per epsilon value.
        inter: Approximate spacing between epsilon values; the sweep uses
            ``round(1 / inter)`` equal steps so it ends exactly at 0.0.

    Raises:
        ValueError: If ``inter`` is not positive.
    """
    if inter <= 0:
        raise ValueError("inter must be positive")

    # Build the grid from integers: a float-step np.arange accumulates
    # rounding error (e.g. 0.7000000000000001) and its length is not
    # guaranteed for non-integer steps.
    n_steps = max(1, int(round(1.0 / inter)))

    for k in range(n_steps, -1, -1):
        e = k / n_steps
        env.epsilon = e
        print(f'---------------- Epsilon = {e} -----------------------------')

        for i in range(matches):
            start()

        for label, agent in (("Player 1", agent_1), ("Player 2", agent_2)):
            escrever = (label + " - Win: %3i  Draw: %3i  Loss: %3i ") % (
                agent.results['win'], agent.results['draw'], agent.results['lost'])
            print( escrever + " -> epoch : " + str(agent.number_match) )
            # Counters are per batch, so start the next epsilon from zero
            agent.reset_historic_game()

        print('------------------------------------------------------------')
            
    
# Run the epsilon sweep; the per-epsilon win/draw/loss counts are printed below.
exec_random()

---------------- Epsilon = 1.0 -----------------------------
Player 1 - Win:   5  Draw:   1  Loss:   4  -> epoch : 11121
Player 2 - Win:   4  Draw:   1  Loss:   5  -> epoch : 11121
------------------------------------------------------------
---------------- Epsilon = 0.9 -----------------------------
Player 1 - Win:   8  Draw:   0  Loss:   2  -> epoch : 11131
Player 2 - Win:   2  Draw:   0  Loss:   8  -> epoch : 11131
------------------------------------------------------------
---------------- Epsilon = 0.8 -----------------------------
Player 1 - Win:   6  Draw:   1  Loss:   3  -> epoch : 11141
Player 2 - Win:   3  Draw:   1  Loss:   6  -> epoch : 11141
------------------------------------------------------------
---------------- Epsilon = 0.7000000000000001 -----------------------------
Player 1 - Win:   7  Draw:   2  Loss:   1  -> epoch : 11151
Player 2 - Win:   1  Draw:   2  Loss:   7  -> epoch : 11151
------------------------------------------------------------
---------------- 

In [16]:
# Plotting test (disabled: the whole cell below is wrapped in a string literal, so none of it runs)
"""# Execution Random Env epsilon

graf_1 = {
    'epsilon':[],
    'win':[],
    'draw':[],
    'lost':[],
}


def exec_random():
    # Random Variando
    inter = 0.1
     
    for e in reversed( np.arange(0.0, 1.0 + inter, inter ) ):
        env.epsilon = e
        print(f'---------------- Epsilon = {e} -----------------------------')
        
        for epoch in range(1):
            matches = 10
            for i in range(matches):
                start()
                

            graf_1['win'].append(agent_1.results['win'])
            graf_1['draw'].append(agent_1.results['draw'])
            graf_1['lost'].append(agent_1.results['lost'])

            escrever = "Player 1 - Win: %3i  Draw: %3i  Loss: %3i "%(agent_1.results['win'],agent_1.results['draw'],agent_1.results['lost'])
            print( escrever + " -> epoch : " + str(agent_1.number_match) )
            agent_1.reset_historic_game()

            escrever = "Player 2 - Win: %3i  Draw: %3i  Loss: %3i "%(agent_2.results['win'],agent_2.results['draw'],agent_2.results['lost'])
            print( escrever + " -> epoch : " + str(agent_2.number_match) )
            agent_2.reset_historic_game()

            print('------------------------------------------------------------')
            #print(agent_1.Q_table_df().shape)
            
        
        graf_1['epsilon'].append(e)

    
exec_random()

import matplotlib.pyplot as plt

plt.plot(graf_1['epsilon'],graf_1['win'],'.-',label = 'win')
plt.plot(graf_1['epsilon'],graf_1['draw'],'.-',label='draw')
plt.plot(graf_1['epsilon'],graf_1['lost'],'.-',label='lost')
plt.title(" 100 matches played by epsilon")
plt.xlim( graf_1['epsilon'][0], graf_1['epsilon'][len(graf_1['epsilon']) -1] )
plt.xlabel('Epsilon (random factor)')
plt.ylabel('count of win,lost,draw')
plt.grid(ls = 'dashdot')
plt.legend()
plt.plot()""";

In [9]:
# Normal Execution

def exec_normal(epochs=10, partidas=1000):
    """Train with the current ``env.epsilon`` and report after every epoch.

    Args:
        epochs: Number of epochs to run.
        partidas: Games played per epoch.
    """
    # k = epoch
    for k in range(epochs):

        # Play `partidas` games per epoch
        for i in range(partidas):
            start()

        # Player 1's line is printed first, then player 2's
        for agent in (agent_1, agent_2):
            escrever = " Win: %3i  Draw: %3i  Lost: %3i "%(agent.results['win'],agent.results['draw'],agent.results['lost'])
            print( escrever + " -> epoch : " + str(agent.number_match) )
            # Counters are per epoch
            agent.reset_historic_game()

        print('\n')

#exec_normal()

In [10]:
# TEST AREA: play a single game and draw the board after every move

def start_test():
    """Play one complete game like ``start`` does, printing the board after every move.

    Uses the global ``env``, ``agent_1`` and ``agent_2``; both Q-tables are
    updated and everything is reset when the game ends.
    """
    initial_q = (99998, 11111, 99995, 11112, 99999, 11113, 99996, 11114, 99997)

    game_over = False
    while not game_over:
        state = str(env.board)

        # Player 1, then player 2: add the state to the Q-table if it is new
        for learner in (agent_1, agent_2):
            table = learner.Q_table
            if state not in table['states']:
                table['states'].append(state)
                table['Q'].append(list(initial_q))

        # Log the state the move starts from on both paths
        agent_1.path['states'].append(state)
        agent_2.path['states'].append(state)

        # The side to move plays from its own Q-table
        if agent_1.player == 1:
            current = agent_1   # PLAYER 1
        else:
            current = agent_2   # PLAYER 2
        env.select_pos_by_Q(current.player, name='player ' + str(current.player), Q_table=current.Q_table)

        # Show the board after the move
        env.draw_board()

        # Log the move on both paths
        played = str(env.pos)
        agent_1.path['actions'].append(played)
        agent_2.path['actions'].append(played)

        # continue = 2, draw = 0, win = 1, loss = -1
        result = env.check_result()
        game_over = result != 2

        if game_over:
            mirrored = -1 * result

            agent_1.save_result(result)
            agent_2.save_result(mirrored)

            reward_1 = env.reward(result=result, reward_player=agent_1.reward_player)
            reward_2 = env.reward(result=mirrored, reward_player=agent_2.reward_player)

            agent_1.update_Q(reward_1)
            agent_2.update_Q(reward_2)

            env.reset_game()
            agent_1.reset_game()
            agent_2.reset_game()

            agent_1.number_match += 1
            agent_2.number_match += 1
        else:
            # Switch players
            agent_1.player *= -1
            agent_2.player *= -1
        
#agent_1.reward_player['draw'] = -0.1
        
# Play one game with the current Q-tables; the board is printed after each move.
start_test()

| | | |
-------
| |X| |
-------
| | | |
-------

| | | |
-------
| |X| |
-------
| |O| |
-------

|X| | |
-------
| |X| |
-------
| |O| |
-------

|X| | |
-------
| |X| |
-------
| |O|O|
-------

|X| | |
-------
| |X| |
-------
|X|O|O|
-------

|X| | |
-------
|O|X| |
-------
|X|O|O|
-------

|X| | |
-------
|O|X|X|
-------
|X|O|O|
-------

|X| |O|
-------
|O|X|X|
-------
|X|O|O|
-------

|X|X|O|
-------
|O|X|X|
-------
|X|O|O|
-------



In [11]:
# Q_TABLE ---> States X Actions
# Preview the top of player 1's Q-table (rows: board states, columns: board cells).
agent_1.Q_table_df().head()

Unnamed: 0,"(0, 0)","(0, 1)","(0, 2)","(1, 0)","(1, 1)","(1, 2)","(2, 0)","(2, 1)","(2, 2)"
[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]],59072.009204,48999.494653,60253.684251,48999.48759,69999.3,48998.53,32716.413524,48998.475992,66689.792832
[[1. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]],99998.0,69997.742714,69687.565837,66059.960782,69877.098536,69998.342814,69887.149938,69998.522166,56699.255111
[[ 1. 0. 0.]\n [ 0. 0. -1.]\n [ 0. 0. 0.]],99998.0,68822.336519,56932.24782,69997.742712,25932.995492,11113.0,56929.663387,64902.820811,26444.95
[[ 1. 1. 0.]\n [ 0. 0. -1.]\n [ 0. 0. 0.]],99998.0,11111.0,78998.01,69521.61854,21242.85617,11113.0,65661.588056,52333.71,19543.777199
[[ 1. 1. -1.]\n [ 0. 0. -1.]\n [ 0. 0. 0.]],99998.0,11111.0,99995.0,11112.0,99999.0,11113.0,78998.31,11114.0,78998.12


In [12]:
# Q_TABLE ---> States X Actions
# Preview the top of player 2's Q-table (rows: board states, columns: board cells).
agent_2.Q_table_df().head()

Unnamed: 0,"(0, 0)","(0, 1)","(0, 2)","(1, 0)","(1, 1)","(1, 2)","(2, 0)","(2, 1)","(2, 2)"
[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]],69995.610921,34284.258007,69996.5,47323.279462,69999.3,48997.549986,68687.596413,34299.313929,69994.582312
[[1. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]],99998.0,7021.938539,24728.924976,54789.66701,19466.182523,49938.557776,11202.740473,21124.642444,69998.59995
[[ 1. 0. 0.]\n [ 0. 0. -1.]\n [ 0. 0. 0.]],99998.0,4676.824097,69449.936919,49470.498505,49552.914126,11113.0,17789.136701,8314.605304,50332.891034
[[ 1. 1. 0.]\n [ 0. 0. -1.]\n [ 0. 0. 0.]],99998.0,11111.0,35442.89,4964.090751,49809.473294,11113.0,4652.810492,8778.59,51180.930656
[[ 1. 1. -1.]\n [ 0. 0. -1.]\n [ 0. 0. 0.]],99998.0,11111.0,99995.0,11112.0,99999.0,11113.0,34300.3091,11114.0,31632.53999


In [13]:
# SAVE Q Table
#save_Q_table()

In [14]:
#agent_1.number_match