In [1]:
import numpy as np
import random
import pandas as pd
import pickle
import os

In [2]:
# Agent
class Agent(object):
    """Tabular Q-learning player for tic-tac-toe.

    Attributes:
        reward_player: mapping stored as given at construction; it is not read
            inside this class.
        lr: learning rate (alpha) used by ``update_Q``.
        gamma: discount factor used by ``update_Q``.
        player: 1 for player 1 and -1 for player 2.
        number_match: match counter, initialised to 0 (never changed in this class).
        results: 'win' / 'draw' / 'lost' tally since the last ``reset_historic_game``.
        Q_table: dict of parallel lists -- 'states' (state keys), 'actions'
            (the nine "(row, col)" labels) and 'Q' (one row of values per state).
        path: 'states' and 'actions' recorded for the match in progress.
    """

    def __init__(self, lr, gamma, reward_player, ):
        """Store the hyper-parameters and start with empty Q-table, path and tally."""
        self.reward_player = reward_player
        self.lr = lr
        self.gamma = gamma

        self.player = 1  # player 1 = 1 and player 2 = -1
        self.number_match = 0

        self.results = {
            'win': 0,
            'draw': 0,
            'lost': 0}

        self.Q_table = {
            'states': [],
            'actions': ['(0, 0)', '(0, 1)', '(0, 2)', '(1, 0)', '(1, 1)', '(1, 2)', '(2, 0)', '(2, 1)', '(2, 2)'],
            'Q': []
        }

        self.path = {
            'states': [],   # boards
            'actions': [],  # board positions
        }

    def reset_game(self):
        """Prepare for a new match: player back to 1 and an empty path."""
        self.player = 1
        self.path = {
            'states': [],
            'actions': [],
        }

    def reset_historic_game(self):
        """Zero the win/draw/lost tally."""
        self.results = {
            'win': 0,
            'draw': 0,
            'lost': 0}

    def save_result(self, resultado):
        """Add one finished match to the tally.

        Args:
            resultado: 1 counts as a win, -1 as a loss, anything else as a draw.
        """
        if resultado == 1:
            self.results['win'] += 1
        elif resultado == -1:
            self.results['lost'] += 1
        else:
            self.results['draw'] += 1

    def Q_table_df(self):
        """Return the Q-table as a DataFrame (rows = states, columns = actions)."""
        df = pd.DataFrame(
            index=self.Q_table['states'],
            columns=self.Q_table['actions'],
            data=self.Q_table['Q']
            )
        return df

    def _q_indices(self, state, action):
        """Translate a (state, action) pair into (row, column) positions of Q.

        Raises:
            ValueError: if the state or the action label is not in the Q-table.
        """
        return (
            self.Q_table['states'].index(str(state)),
            self.Q_table['actions'].index(str(action)),
        )

    def update_Q(self, reward):
        """Propagate the final reward backwards along the recorded path.

        The last (state, action) of the path is set to ``lr * reward``. Every
        earlier pair is then updated, walking from the end to the start, with

            Q(s, a) += lr * (gamma * agg(Q(s_next, :)) - Q(s, a))

        where ``agg`` is the maximum of the following state's row when
        ``reward >= 0`` and the minimum when ``reward < 0``.

        The walk stops at an explicit bound instead of relying on a caught
        IndexError, so a genuinely misaligned Q-table is no longer silenced.

        Args:
            reward: final reward of the match for this agent.
        """
        if reward >= 0:
            aggregate = np.max
        elif reward < 0:
            # A negative outcome is propagated through the minimum of each row
            aggregate = np.min
        else:
            # Neither comparison holds (e.g. NaN): nothing to update
            return

        # Go from the FUTURE to the PAST: last move first
        steps = list(zip(reversed(self.path['states']), reversed(self.path['actions'])))
        if not steps:
            return

        q_values = self.Q_table['Q']

        # The final move receives the (scaled) reward directly
        later_state, later_action = self._q_indices(*steps[0])
        q_values[later_state][later_action] = self.lr * reward

        # Each earlier move is pulled towards the discounted value of the state after it
        for state, action in steps[1:]:
            s_idx, a_idx = self._q_indices(state, action)
            target = self.gamma * aggregate(q_values[later_state])
            q_values[s_idx][a_idx] += self.lr * (target - q_values[s_idx][a_idx])
            later_state = s_idx

In [3]:
# Enviroment
class Enviroment(object):
    """Tic-tac-toe board together with the move-selection strategies.

    Board cells hold 0 (empty), 1 (player 1, drawn as X) or -1 (player 2,
    drawn as O). ``pos`` stores the (row, col) of the latest move.
    """

    def __init__(self, epsilon):
        """Create an empty board.

        Args:
            epsilon: threshold compared against a uniform draw from [0, 1) in
                ``select_pos_by_Q``; below it a random move is played.
        """
        self.epsilon = epsilon

        # Board (this is the CURRENT STATE)
        self.board = np.zeros((3, 3))

        # Last move played: 0 until the first move, afterwards a (row, col) tuple
        self.pos = 0

    def reset_game(self):
        """Clear the board for a new match (``pos`` is left untouched)."""
        self.board = np.zeros((3, 3))

    # Print the board
    def draw_board(self):
        """Print the board as text, X for 1, O for -1 and a blank for empty cells."""
        draw = ''

        for i in range(3):
            for j in range(3):
                # symbol X (p1 = 1) or O (p2 = -1)
                if self.board[i][j] == 1:
                    symbol = 'X'
                elif self.board[i][j] == -1:
                    symbol = 'O'
                else:
                    symbol = ' '

                draw += '|' + symbol

            # close the row and add the separator line
            draw += '|\n-------\n'

        print(draw)

    # Available positions
    def available_moves(self):
        """Return an (n, 2) array with the [row, col] of every empty cell."""
        return np.argwhere(self.board == 0)

    # Pick one available position
    def available_move_choice(self):
        """Return one empty cell chosen uniformly at random as a [row, col] array.

        Uses ``random.randrange`` on the number of moves rather than
        ``random.choice`` on the array, because ``choice`` evaluates the truth
        value of its argument on some Python releases and that fails for
        multi-element NumPy arrays.

        Raises:
            IndexError: if the board has no empty cell.
        """
        moves = self.available_moves()
        if len(moves) == 0:
            raise IndexError('Cannot choose from an empty sequence')
        return moves[random.randrange(len(moves))]

    # Check result
    def check_result(self):
        """Evaluate the board.

        Returns:
            1 if a line sums to 3 (player 1 wins), -1 if a line sums to -3
            (player 2 wins), 0 if the board is full without a winner, and 2
            if the game continues. Rows are checked first, then columns, then
            the two diagonals.
        """
        row_sums = self.board.sum(axis=1)
        col_sums = self.board.sum(axis=0)
        diag_sums = np.array([self.board.trace(), np.fliplr(self.board).trace()])

        for line_sums in (row_sums, col_sums, diag_sums):
            if np.any(line_sums == 3):
                return 1
            if np.any(line_sums == -3):
                return -1

        # Draw: no empty cell left
        if not np.any(self.board == 0):
            return 0

        #########################################################
        ## continue = 2, draw = 0, win = 1, loss = -1           ##
        #########################################################
        return 2

    # Give reward
    def reward(self, result, reward_player):
        """Map a result code to the matching entry of ``reward_player``.

        Args:
            result: 1 (win), -1 (loss) or 0 (draw).
            reward_player: mapping with the keys 'win', 'lost' and 'draw'.

        Returns:
            The configured reward, or None for any other result code.
        """
        if result == 1:   # Win
            return reward_player['win']

        if result == -1:  # Loss
            return reward_player['lost']

        if result == 0:   # Draw
            return reward_player['draw']

    # move - random
    def select_pos_by_random(self, player, name):
        """Play ``player`` on a random empty cell and record it in ``pos``.

        The coordinates are converted to built-in ints so that ``str(self.pos)``
        always reads like "(0, 1)"; NumPy integers would print as
        "(np.int64(0), np.int64(1))" on NumPy >= 2 and stop matching the
        action labels of a Q-table.

        Args:
            player: value written to the board (1 or -1).
            name: unused; kept for interface compatibility.
        """
        row_col = self.available_move_choice()

        row = int(row_col[0])  # Row
        col = int(row_col[1])  # Column

        self.board[row][col] = player
        self.pos = row, col

    # move - human
    def select_pos_by_input(self, player, name):
        """Ask for a row and a column on stdin until an empty cell is given.

        Non-numeric input is treated like an invalid position instead of
        aborting with ValueError.

        Args:
            player: value written to the board (1 or -1).
            name: unused; kept for interface compatibility.
        """
        while True:
            try:
                row = int(input('Row: '))
                col = int(input('Col: '))
            except ValueError:
                input('try other position...')
                continue

            # compare as lists: membership tests against the array are ambiguous
            if [row, col] in self.available_moves().tolist():
                self.board[row][col] = player
                self.pos = row, col
                break
            else:
                input('try other position...')

    def select_pos_by_Q(self, player, name, Q_table):
        """Play ``player`` using an epsilon-greedy policy over ``Q_table``.

        A random move is played when the uniform draw falls below ``epsilon``,
        when the current board is not a known state, when the best remaining
        Q-value is exactly 0 (treated as "not trained"), or when no ranked
        action points at an empty cell. Otherwise the empty cell with the
        highest Q-value is played.

        Actions are ranked by index, so actions that share the same Q-value
        are each tried in table order. Looking the value up with ``list.index``
        would return the first of them every time and could end the turn
        without any move being made.

        Args:
            player: value written to the board (1 or -1).
            name: unused; kept for interface compatibility.
            Q_table: dict with the parallel lists 'states', 'actions' and 'Q'.
        """
        random_name = 'player ' + str(player)

        # Random move (exploring)
        if np.random.uniform(0, 1) < self.epsilon:
            self.select_pos_by_random(player, name=random_name)
            return

        # Unknown state: nothing to exploit yet
        state_key = str(self.board)
        if state_key not in Q_table['states']:
            self.select_pos_by_random(player, name=random_name)
            return

        # Look the state up in the table and play (exploiting)
        q_row = Q_table['Q'][Q_table['states'].index(state_key)]
        free_cells = self.available_moves().tolist()

        # Action indices from highest to lowest Q; the sort is stable, so ties
        # keep their table order
        ranking = sorted(range(len(q_row)), key=lambda idx: q_row[idx], reverse=True)

        for idx in ranking:
            # a value of zero means this state has no training beyond this point
            if q_row[idx] == 0:
                break

            action = Q_table['actions'][idx]  # label such as "(1, 2)"
            row = int(action[1:2])
            col = int(action[4:5])

            if [row, col] in free_cells:
                self.board[row][col] = player
                self.pos = row, col
                return

        # Untrained value reached, or every ranked cell is occupied
        self.select_pos_by_random(player, name=random_name)

In [4]:
# funct to start the game
def start():
    """Play one complete match between ``agent_1`` and ``agent_2`` on ``env``.

    Works on the module-level ``env``, ``agent_1`` and ``agent_2``. Every turn:
    the current board is registered in both Q-tables (if new) and in both
    paths, the agent whose turn it is plays through ``env.select_pos_by_Q``,
    and the move is appended to both paths. When the match ends, both agents
    store the result, update their Q-tables with their own reward, everything
    is reset and both match counters are incremented.
    """
    both_agents = (agent_1, agent_2)

    while True:
        board_key = str(env.board)

        # Q-table creation: add the current state (with nine zero values) to
        # any agent that has not seen it yet -- player 1 first, then player 2
        for agent in both_agents:
            if board_key not in agent.Q_table['states']:
                agent.Q_table['states'].append(board_key)
                agent.Q_table['Q'].append([0, 0, 0, 0, 0, 0, 0, 0, 0])

        # Record the state in the PATH of both players
        for agent in both_agents:
            agent.path['states'].append(board_key)

        # The agent acts on the environment; agent_1.player says whose turn it is
        mover = agent_1 if agent_1.player == 1 else agent_2
        env.select_pos_by_Q(
            mover.player,
            name='player ' + str(mover.player),
            Q_table=mover.Q_table,
        )

        # Record the action taken in the PATH of both players
        move_key = str(env.pos)
        for agent in both_agents:
            agent.path['actions'].append(move_key)

        # The environment answers: continue = 2, draw = 0, win = 1, loss = -1
        outcome = env.check_result()
        if outcome != 2:

            # Match result, mirrored for player 2
            agent_1.save_result(outcome)
            agent_2.save_result(-1 * outcome)

            # Reward value of each player
            reward_1 = env.reward(result=outcome, reward_player=agent_1.reward_player)
            reward_2 = env.reward(result=-1 * outcome, reward_player=agent_2.reward_player)

            # Update the Q-tables
            agent_1.update_Q(reward_1)
            agent_2.update_Q(reward_2)

            # Reset the game
            env.reset_game()
            agent_1.reset_game()
            agent_2.reset_game()

            # Count the finished match
            agent_1.number_match += 1
            agent_2.number_match += 1

            break

        # Switch players
        agent_1.player *= -1
        agent_2.player *= -1

In [5]:
# SAVE
def save_Q_table():
    """Pickle both agents' Q-tables and match counters under ./trained_QxQ/.

    Reads the module-level ``agent_1`` and ``agent_2``. The target directory
    is created when it does not exist, so the first save on a fresh checkout
    no longer fails with FileNotFoundError.
    """
    os.makedirs("./trained_QxQ", exist_ok=True)

    # (target file, object to store): player 1 first, then player 2
    targets = (
        ("./trained_QxQ/Q_table_1.pkl", agent_1.Q_table),
        ("./trained_QxQ/partidas_1.pkl", agent_1.number_match),
        ("./trained_QxQ/Q_table_2.pkl", agent_2.Q_table),
        ("./trained_QxQ/partidas_2.pkl", agent_2.number_match),
    )

    for file_path, payload in targets:
        with open(file_path, "wb") as tf:
            pickle.dump(payload, tf)

# LOAD
def load_Q_table():
    """Restore both agents' Q-tables and match counters from ./trained_QxQ/.

    Writes into the module-level ``agent_1`` and ``agent_2`` and prints the
    number of matches player 1 has played.

    Raises:
        FileNotFoundError: if any of the four pickle files is missing.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load files that
    # were produced by save_Q_table on a trusted machine.

    # Player 1
    with open('./trained_QxQ/Q_table_1.pkl', 'rb') as handle:
        Q_table_1 = pickle.load(handle)
    with open('./trained_QxQ/partidas_1.pkl', 'rb') as handle:
        number_match_1 = pickle.load(handle)

    # Player 2
    with open('./trained_QxQ/Q_table_2.pkl', 'rb') as handle:
        Q_table_2 = pickle.load(handle)
    with open('./trained_QxQ/partidas_2.pkl', 'rb') as handle:
        number_match_2 = pickle.load(handle)

    agent_1.number_match = number_match_1
    agent_1.Q_table = Q_table_1

    # Player 2 gets its own counter (it used to receive player 1's)
    agent_2.number_match = number_match_2
    agent_2.Q_table = Q_table_2

    print(f"número de partidas {agent_1.number_match}")


In [6]:
## Player 1
agent_1 = Agent(
    lr=0.9,
    gamma=0.1,
    # Author's note on the draw reward: positive values push the agent to look
    # for draws (after a lot of training it settles for a draw instead of a
    # win); zero makes the system complacent (which hurts player 2).
    reward_player=dict(win=1, lost=-1, draw=-0.1),
)


"""
############################# EPSILON = 0 ###################################

 Valores para o Draw (AMBOS IGUAIS)

Negativos: (melhor) Você força ele sempre a buscar outra alternativa quando empata, isso é bom para usar modelos
já treinados contra Players, pois ele busca sempre outros caminhos! e o jogo não fica monótono

Positivos: Você força ele a buscar empates quando treina muito (não treina bem player 1)

Zero: Você acomoda o sistema (ferra o player 2 durante o treino, pois ele mais empata do que vence... se ele não ganha recompensa por
empatar, logo ele sai perdendo...)
O player 1 começa a vencer... logo o player 2 começa a perder... até que em um determinado momento o player 1
não perde mais para o player 2, e ainda existem alguns empates


Valores para Draw (AMBOS DIFERENTES)

Play 1 ZERO e Play 2 POSITIVO: Irão empatar até o fim, pois o Player 2 vai tentar de tudo para empatar o jogo
e o player 1 não sairá da zona de conforto... pois o Draw dele é ZERO, logo ele não entende que é algo
ruim!!! então ficam elas por elas !!!

Play 1 NEGATIVO e Play 2 POSITIVO: O player 1 começa bem, começa a vencer todas as partidas, mas depois o
player 2 começa a aprender a empatar... logo ele vai pontuando positivamente e o player 1 negativamente, pois
empate para o player 1 é algo ruim... então o player 1 é forçado a procurar outras alternativas e com isso
ele abre brecha na defesa e o player 2 começa a vencer!!! chega um momento em que o player 1 começa a perder
75% das partidas e empatar 25%... nesse ponto o player 1 não vence mais... Logo o Player 1 não aprende
a jogar a longo prazo !!! e o Player 2 aprende muito bem!

Play 1 NEGATIVO e player 2 ZERO: 



....
#####################################################################################
Conclusão... O jogo tem que ser equilibrado!!!!

DRAW = 0 para ambos!!! 

Quem faz o papel da variação é o Epsilon!!!!!

Se você treina com DRAW = NEGATIVO... a longo prazo ele começa a errar!!! pois no jogo da velha
não tem como não empatar quando você joga com quem entende... Então o melhor é mostrar ao pc que não houve
melhor jogador....

Depois de TREINAR deste modo... tira o Aleatório e joga DRAW = -1... para ele mudar as jogadas
quando empatar! (contra player)

##################################################################################

"""

# Player 2 -- same hyper-parameters as player 1, but a draw is rewarded (+0.1)
agent_2 = Agent(
    lr=0.9,
    gamma=0.1,
    reward_player=dict(win=1, lost=-1, draw=0.1),
)


# Load Q Table
#load_Q_table()


In [7]:
# Environment object; epsilon = 0.0 means select_pos_by_Q never takes its exploring branch
env = Enviroment(epsilon=0.0)

In [14]:
# Execution: 20 epochs (k) of self-play

for k in range(20):

    # Train: 100 matches per epoch
    partidas = 100
    for i in range(partidas):
        start()

    # Report this epoch's tally for each agent, then clear it for the next epoch
    for agent in (agent_1, agent_2):
        escrever = " Win: %3i  Draw: %3i  Lost: %3i " % (
            agent.results['win'],
            agent.results['draw'],
            agent.results['lost'],
        )
        print(escrever + " -> epoch : " + str(agent.number_match))
        agent.reset_historic_game()

    print('\n')

 Win:  61  Draw:  13  Lost:  26  -> epoch : 5100
 Win:  26  Draw:  13  Lost:  61  -> epoch : 5100


 Win:  57  Draw:  20  Lost:  23  -> epoch : 5200
 Win:  23  Draw:  20  Lost:  57  -> epoch : 5200


 Win:  73  Draw:   5  Lost:  22  -> epoch : 5300
 Win:  22  Draw:   5  Lost:  73  -> epoch : 5300


 Win:  38  Draw:  27  Lost:  35  -> epoch : 5400
 Win:  35  Draw:  27  Lost:  38  -> epoch : 5400


 Win:  18  Draw:  17  Lost:  65  -> epoch : 5500
 Win:  65  Draw:  17  Lost:  18  -> epoch : 5500


 Win:  86  Draw:   5  Lost:   9  -> epoch : 5600
 Win:   9  Draw:   5  Lost:  86  -> epoch : 5600


 Win:  86  Draw:   8  Lost:   6  -> epoch : 5700
 Win:   6  Draw:   8  Lost:  86  -> epoch : 5700


 Win:  72  Draw:  15  Lost:  13  -> epoch : 5800
 Win:  13  Draw:  15  Lost:  72  -> epoch : 5800


 Win:  41  Draw:  18  Lost:  41  -> epoch : 5900
 Win:  41  Draw:  18  Lost:  41  -> epoch : 5900


 Win:  32  Draw:  20  Lost:  48  -> epoch : 6000
 Win:  48  Draw:  20  Lost:  32  -> epoch : 6000




In [17]:
# Q-table of player 1: rows are states, columns are actions (first 5 rows)
agent_1.Q_table_df().head(n=5)

Unnamed: 0,"(0, 0)","(0, 1)","(0, 2)","(1, 0)","(1, 1)","(1, 2)","(2, 0)","(2, 1)","(2, 2)"
[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]],-0.000009,-0.000009,-8.999982e-06,-0.000009,0.000090,-0.000009,-0.000009,-8.999979e-06,-0.000009
[[0. 0. 0.]\n [0. 0. 0.]\n [0. 1. 0.]],0.000012,0.000900,7.888283e-04,0.000641,0.000806,0.000900,-0.000090,0.000000e+00,0.000720
[[ 0. 0. 0.]\n [ 0. 0. -1.]\n [ 0. 1. 0.]],0.000000,-0.000005,-5.731060e-07,-0.000006,0.000000,0.000000,0.000000,0.000000e+00,0.009000
[[ 0. 0. 0.]\n [ 1. 0. -1.]\n [ 0. 1. 0.]],0.000006,0.000656,8.531532e-04,0.000000,0.000656,0.000000,0.000888,0.000000e+00,0.000005
[[ 0. 0. 0.]\n [ 1. 0. -1.]\n [ 0. 1. -1.]],0.000000,0.000000,5.904900e-05,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000
[[ 0. 0. 1.]\n [ 1. 0. -1.]\n [ 0. 1. -1.]],0.000656,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000
[[-1. 0. 1.]\n [ 1. 0. -1.]\n [ 0. 1. -1.]],0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,-0.080271,0.000000e+00,0.000000
[[-1. 0. 1.]\n [ 1. 0. -1.]\n [ 1. 1. -1.]],0.000000,0.081000,0.000000e+00,0.000000,-0.900000,0.000000,0.000000,0.000000e+00,0.000000
[[-1. -1. 1.]\n [ 1. 0. -1.]\n [ 1. 1. -1.]],0.000000,0.000000,0.000000e+00,0.000000,0.900000,0.000000,0.000000,0.000000e+00,0.000000
[[-1. 0. 0.]\n [ 0. 0. 0.]\n [ 0. 1. 0.]],0.000000,0.000235,-7.210488e-04,0.000000,-0.000633,-0.000656,-0.000657,0.000000e+00,-0.000852


In [16]:
# Q-table of player 2: rows are states, columns are actions (first 5 rows)
agent_2.Q_table_df().head(n=5)

Unnamed: 0,"(0, 0)","(0, 1)","(0, 2)","(1, 0)","(1, 1)","(1, 2)","(2, 0)","(2, 1)","(2, 2)"
[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]],9e-06,9e-06,8.999982e-06,9e-06,-9e-05,9e-06,9e-06,9e-06,9e-06
[[0. 0. 0.]\n [0. 0. 0.]\n [0. 1. 0.]],-1.2e-05,-0.0009,-0.0007888283,-0.000641,-0.000806,-0.0009,9e-05,0.0,-0.00072
[[ 0. 0. 0.]\n [ 0. 0. -1.]\n [ 0. 1. 0.]],0.0,5e-06,5.73106e-07,6e-06,0.0,0.0,0.0,0.0,-0.009
[[ 0. 0. 0.]\n [ 1. 0. -1.]\n [ 0. 1. 0.]],-6e-06,-0.000656,-0.0008531532,0.0,-0.000656,0.0,-0.000888,0.0,-5e-06
[[ 0. 0. 0.]\n [ 1. 0. -1.]\n [ 0. 1. -1.]],0.0,0.0,-5.9049e-05,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# SAVE Q Table
#save_Q_table()