In [1]:
import tic_tac_toe as game
import random
import numpy as np
from collections import deque

from keras.initializations import normal, identity
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import SGD , Adam
import tensorflow as tf

Using TensorFlow backend.


In [2]:
N_ACTIONS = 36
GAMMA = 0.99 # decay rate of past observations
OBSERVATION = 3200. # timesteps to observe before training
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.1 # starting value of epsilon
REPLAY_MEMORY = 5000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
LEARNING_RATE = 1e-4

In [3]:
def build_model():
    model = Sequential()
    model.add(Dense(36, input_dim=36, activation='relu'))
    model.add(Dense(36, activation='relu'))
    model.add(Dense(36, activation='relu'))
    model.add(Dense(36, activation='relu'))
    model.add(Dense(36, activation='relu'))
    
    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse',optimizer=adam)
    return model

In [64]:
def train_network(model, args):
    game_state = game.ofttt()
    RM = deque()
    (x_t, r_0, terminal) = (game_state.initial, 0, False)
    s_t = np.array(x_t[1])
    #player2 = game.random_player(game_state, game_state.initial)
    if args == 'run':
        OBSERVE = 999999999    #Keep observe, never train
        epsilon = FINAL_EPSILON    #Use a small epsilon to choose mainly policy actions
        #Load model
        print ("Now we load weight")  
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse',optimizer=adam)
        print ("Weight load successfully")
    else:
        #Assign an observation variable and max epsilon to train
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON
        
    t = 0
    
    while(True):
        #Initialize variables
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([N_ACTIONS])    #Output vector of actions a_[t] = 1 for action to take
        
        if random.random() <= epsilon:    #At the first move, choose randomly
            print("----------Random Action----------")
            action_index = random.randrange(N_ACTIONS)
            a_t[action_index] = 1
        else:
            q = model.predict(s_t.reshape(1,36))       #input the state at time t
            max_Q = np.argmax(q)         #Take the max q value predicted from network
            action_index = 1 + max_Q         #Assign action to the argmax Q
            a_t[max_Q] = 1               #Output vector a_t = 1 for max_Q
        
        #Decrease epsilon by a smalll factor
        if epsilon > FINAL_EPSILON and t > OBSERVE:               
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000
            
        x_t1, r_t, terminal = game_state.next_state(action_index, x_t) #run the selected action and observed next state and reward
        s_t1 = np.array(x_t1[1])
        RM.append((s_t, action_index, r_t, s_t1, terminal))    # store the transition in the Replay Memory
        if len(RM) > REPLAY_MEMORY:
            RM.popleft()

        #only train if done observing
        if t > OBSERVE:
            #sample a minibatch to train on
            minibatch = random.sample(RM, BATCH)
            inputs = np.zeros((BATCH, N_ACTIONS))  
            print (inputs.shape)
            targets = np.zeros((inputs.shape[0], N_ACTIONS)) 

            #Now we do the experience replay
            for i in range(0, len(minibatch)):
                x0_t = np.array(minibatch[i][0])
                action_t = minibatch[i][1]   #This is action index
                reward_t = minibatch[i][2]
                state_t1 = minibatch[i][3]
                terminal = minibatch[i][4]
                # if terminated, only equals reward
                state_t = x0_t.reshape(1,N_ACTIONS)

                inputs[i:i + 1] = state_t    #I saved down s_t

                targets[i] = model.predict(state_t)  # Hitting each buttom probability
                Q_sa = model.predict(state_t1.reshape(1,N_ACTIONS))

                if terminal:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            # targets2 = normalize(targets)
            loss += model.train_on_batch(inputs, targets)

        s_t = s_t1
        t = t + 1

        # save progress every 10000 iterations
        if t % 1000 == 0:
            print("Now we save model")
            model.save_weights("model.h5", overwrite=True)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

    print("Episode finished!")
    print("************************")

In [65]:
def playGame(args):
    model = build_model()
    train_network(model,args)

In [66]:
def main():
    #parser = argparse.ArgumentParser(description='Description of your program')
    #parser.add_argument('-m','--mode', help='Train / Run', required=True)
    #args = vars(parser.parse_args())
    playGame(args="train")

In [67]:
if __name__ == "__main__":
    main()

('TIMESTEP', 1, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 1, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 5, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 3, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 5, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 4, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 34, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 5, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 26, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 6, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 20, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 7, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 25, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 8, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 25, '/ REWARD', 0, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMES

('TIMESTEP', 409, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 410, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 411, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 412, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 413, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 414, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 415, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 416, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 417, '/ STATE', 'observe', '/ EPSILON', 0.1

('TIMESTEP', 596, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 597, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 598, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 599, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 600, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 16, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 601, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 602, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 603, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 604, '/ S

('TIMESTEP', 777, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 778, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 35, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 779, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 780, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 781, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 782, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 783, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 784, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 785, '/ S

('TIMESTEP', 1093, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1094, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1095, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1096, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1097, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1098, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1099, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1100, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1101, '/ STATE', 'observe', '/ EPSI

('TIMESTEP', 1209, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1210, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1211, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1212, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1213, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1214, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 1215, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 20, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1216, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 32, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Rand

('TIMESTEP', 1351, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1352, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1353, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1354, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1355, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1356, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1357, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1358, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1359, '/ STATE', 'observe', '/ EPSI

('TIMESTEP', 1714, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1715, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1716, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1717, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1718, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1719, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1720, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1721, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 1722, '/ STATE', 'observe', '/ EPSI

('TIMESTEP', 2028, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2029, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2030, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2031, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2032, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2033, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2034, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2035, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2036, '/ STATE', 'observe', '/ EPSI

('TIMESTEP', 2216, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2217, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2218, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2219, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 2220, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 25, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2221, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2222, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2223, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2

('TIMESTEP', 2540, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2541, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2542, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2543, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2544, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2545, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2546, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2547, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2548, '/ STATE', 'observe', '/ EPSI

('TIMESTEP', 2788, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2789, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2790, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2791, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2792, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 2793, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 13, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2794, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2795, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2

('TIMESTEP', 2982, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2983, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2984, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
----------Random Action----------
('TIMESTEP', 2985, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 35, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2986, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2987, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2988, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2989, '/ STATE', 'observe', '/ EPSILON', 0.1, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0, '/ Loss ', 0)
('TIMESTEP', 2

('TIMESTEP', 3202, '/ STATE', 'train', '/ EPSILON', 0.09999001, '/ ACTION', 29, '/ REWARD', 1, '/ Q_MAX ', 0.32906988, '/ Loss ', 0.027355512604117393)
(32, 36)


IndexError: index 36 is out of bounds for axis 1 with size 36

In [6]:
print(game_state.next_state(22,(1,[[1, -1, -1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]])))

((-1, [[1, -1, -1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]), 1, True)


In [7]:
print(game_state.initial)

(1, [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]])


In [41]:
random.randrange?