In [1]:
import gym
from gym.envs.registration import register
import joblib

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Flatten, Input
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

Using TensorFlow backend.


In [2]:
register(id='TicTacToe-v0', entry_point='envs:TicTacToeEnv')
env = gym.make('TicTacToe-v0')

In [3]:
def build_model(state_size, num_actions):
  input = Input(shape=(1, state_size))
  x = Flatten()(input)
  x = Dense(256, activation='relu')(x)
  x = Dense(64, activation='relu')(x)
  x = Dense(32, activation='relu')(x)
  output = Dense(num_actions, activation='linear')(x)
  model = Model(inputs=input, outputs=output)
  model.summary()
  return model

memory = SequentialMemory(limit=50000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.0, value_min=-1.0, value_test=.05, nb_steps=1000)

In [3]:
model = build_model((env.observation_space).n, (env.action_space).n)
dqn = DQNAgent(model=model, policy=policy, nb_actions=(env.action_space).n, memory=memory)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1, 9)              0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               2560      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 9)                 297       
Total params: 21,385
Trainable params: 21,385
Non-tra

In [None]:
dqn.fit(
    env, 
    nb_steps=10000,
    visualize=False
)

In [None]:
res = dqn.test(env, nb_episodes=100, visualize=True)
res?

In [12]:
joblib.dump(model, 'model.bin')

['model.bin']

In [10]:
_model = joblib.load('model.bin')

env.reset_info()
_dqn = DQNAgent(model=_model, policy=policy, nb_actions=(env.action_space).n, memory=memory)
_dqn.compile(Adam(lr=1e-3), metrics=['mae'])
_dqn.test(env, nb_episodes=100, visualize=True)

print(env.info)

Testing for 100 episodes ...


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _

   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  O  _  _
2  _  X  O

   0  1  2
0  _  X  _
1  O  _  _
2  _  X  O



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  X  _
2  _  X  O

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  _  X  _
1  O  X  _
2  _  X  O

Episode 1: reward: 30.000, steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  O
2  _  X  _

   0  1  2
0  _  _  _
1  _  _  O
2  _  X  _



choice: 1
Learner (X's) move
--

2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  O  _  O
1  _  _  _
2  _  X  X

   0  1  2
0  O  _  O
1  _  _  _
2  _  X  X



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  O  _  O
1  _  _  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  O  _  O
1  _  _  _
2  X  X  X

Episode 14: reward: 30.000, steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _

   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  O
1  O  _  _
2  _  X  _

   0  1  2
0  _  X  O
1  O  _  _
2  _  X  _



choice: 4
Learner (X's) move
------------------

2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  O
2  _  X  _

   0  1  2
0  _  _  _
1  _  _  O
2  _  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  O
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  O
2  O  X  _

   0  1  2
0  _  X  _
1  _  _  O
2  O  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  X  O
2  O  X  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  _  X  _
1  _  X  O
2  O  X  _

Episode 30: reward: 30.000, steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _

   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1 

2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  _
2  _  X  _

   0  1  2
0  O  _  _
1  _  _  _
2  _  X  _



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  _
2  X  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  _

   0  1  2
0  O  _  _
1  _  O  _
2  X  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  X

Episode 47: reward: 30.000, steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _

   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1 

2  _  X  _

   0  1  2
0  _  _  _
1  _  O  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  O  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  _  X  X

   0  1  2
0  O  _  _
1  _  O  _
2  _  X  X



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  X

Episode 62: reward: 30.000, steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  O

   0  1  2
0  _  _  _
1  _  _  _
2  _  X  O



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  _  X  O

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  O  X  O

   0  1  2
0  _  X  _
1  _  _  _
2  O  X  O



ch

2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _

   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  X  _
1  O  _  _
2  _  X  _

   0  1  2
0  O  X  _
1  O  _  _
2  _  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  O  X  _
1  O  X  _
2  _  X  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  O  X  _
1  O  X  _
2  _  X  _

Episode 79: reward: 30.000, steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  O  _
1  _  _  _
2  _  X  _

   0  1  2
0  _  O  _
1  _  _  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  O  _
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1 

2  X  X  O



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  O  X  O
1  _  X  _
2  X  X  O

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 2 | steps: 4
   0  1  2
0  O  X  O
1  _  X  _
2  X  X  O

Episode 92: reward: 20.000, steps: 4


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  O  X  _

   0  1  2
0  _  _  _
1  _  _  _
2  O  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  O  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  O
1  _  _  _
2  O  X  _

   0  1  2
0  _  X  O
1  _  _  _
2  O  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  X  O
1  _  X  _
2  O  X  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
   0  1  2
0  _  X  O
1  _  X  _
2  O  X  _

Episode 93: reward: 30.000, steps: 3


choice: 7
Learner (X

In [7]:
_dqn.fit(
    env, 
    nb_steps=10000,
    visualize=False
)

Training for 10000 steps ...
Interval 1 (0 steps performed)


choice: 2
Learner (X's) move
--------------------
   0  1  2
0  _  _  X
1  _  _  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  X
1  _  _  _
2  O  _  _

    1/10000 [..............................] - ETA: 2:03 - reward: 0.0000e+00

choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  X
1  _  _  _
2  O  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  X
1  _  _  O
2  O  _  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  X  X
1  _  _  O
2  O  _  X

Computer (O's) move
--------------------
   0  1  2
0  _  X  X
1  O  _  O
2  O  _  X



choice: 5
Learner (X's) move
--------------------
Learner made invalid move


choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  _  X

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  _  X



choice: 4
Learner (X's) move
--------------------
 

   50/10000 [..............................] - ETA: 55s - reward: -25.6000

choice: 4
Learner (X's) move
--------------------
   0  1  2
0  O  X  _
1  _  X  O
2  O  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 2 | steps: 4


choice: 3
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  X  _  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  X  O  _
2  _  _  _



choice: 5
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  X  O  X
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  X  O  X
2  _  _  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  O  X  _
1  X  O  X
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  O  X  O
1  X  O  X
2  _  _  _



choice: 4
Learner (X's) move
--------------------
Learner made invalid move


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
---------

   97/10000 [..............................] - ETA: 1:02 - reward: -20.5670

choice: 0
Learner (X's) move
--------------------
Learner made invalid move


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _



choice: 2
Learner (X's) move
--------------------
Learner made invalid move


choice: 2
Learner (X's) move
--------------------
   0  1  2
0  _  _  X
1  _  _  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  X
1  _  _  _
2  _  O  _



choice: 7
Learner (X's) move
--------------------
Learner made invalid move


choice: 6
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  X  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  X  _  O



choice: 8
Learner (X's) move
--------------------
Learner made invalid move


choice: 7
Learner (X's) move
--------------------
   0  1  2
0

   0  1  2
0  O  O  X
1  _  _  X
2  X  O  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  O  O  X
1  _  X  X
2  X  O  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 2 | steps: 4


choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  _  O  _
2  _  _  _

  151/10000 [..............................] - ETA: 1:04 - reward: -22.0861

choice: 4
Learner (X's) move
--------------------
Learner made invalid move


choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  _  O  _
2  _  _  _



choice: 0
Learner (X's) move
--------------------
   0  1  2
0  X  X  _
1  _  O  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  X  X  O
1  _  O  _
2  _  _  _



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  X  X  O
1  _  



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  X  O
1  _  X  _
2  _  X  O

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  O  _  _
2  _  X  X

  198/10000 [..............................] - ETA: 1:10 - reward: -19.5707

choice: 5
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  O  _  X
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  O  _  O
1  O  _  X
2  _  X  X



choice: 8
Learner (X's) move
--------------------
Learner made invalid move


choice: 5
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  X
2  _  _  _

Computer (O's) move
--------

2  _  _  _



choice: 2
Learner (X's) move
--------------------
   0  1  2
0  X  X  X
1  _  O  O
2  _  _  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  _
2  _  X  _



choice: 5
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  X
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  O
1  _  _  X
2  _  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  O  X  O
1  _  _  X
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  X  O
1  O  _  X
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  O  X  O
1  O  _  X
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  O  X  O
1  O  O  X
2  _  X  X



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  O  X  O
1  O  O  


Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  _  X  X



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
  288/10000 [..............................] - ETA: 1:11 - reward: -14.7917

choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  O  _
1  _  _  _
2  _  X  _



choice: 1
Learner (X's) move
--------------------
Learner made invalid move


choice: 3
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  X  _  _
2  _  _  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  X  _  _
2  _  O  _



choice: 6
Learner (X's) move
-------

  331/10000 [..............................] - ETA: 1:11 - reward: -13.7764

choice: 2
Learner (X's) move
--------------------
   0  1  2
0  O  _  X
1  _  _  _
2  X  _  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  X
1  _  _  O
2  X  _  _



choice: 7
Learner (X's) move
--------------------
   0  1  2
0  O  _  X
1  _  _  O
2  X  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  X
1  O  _  O
2  X  X  _



choice: 6
Learner (X's) move
--------------------
Learner made invalid move


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  O  X  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  X  _
2  _  X  O



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  X  _
2  _  X  O

Learner ('X') wins

2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  O  O
1  _  _  _
2  _  X  X



choice: 5
Learner (X's) move
--------------------
   0  1  2
0  _  O  O
1  _  _  X
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  O  O  O
1  _  _  X
2  _  X  X

Computer ('O') wins 😔

reward: -5 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  _

  379/10000 [>.............................] - ETA: 1:18 - reward: -13.6807

choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  _  O  _
2  _  X  X



choice: 2
Learner (X's) move
--------------------
Learner made invalid move


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
------




choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  _  O
1  O  _  _
2  _  X  X

  420/10000 [>.............................] - ETA: 1:25 - reward: -12.7738

choice: 6
Learner (X's) move
--------------------
   0  1  2
0  _  _  O
1  O  _  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  O  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  O  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  O  _  _
2  O  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  X  _
2  O  X  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
  424/10000 [>...............

  461/10000 [>.............................] - ETA: 1:29 - reward: -11.2798

choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  O  _
2  _  X  _



choice: 3
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  X  O  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  X  O  _
2  O  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  X  O  _
2  O  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  X  _
1  X  O  _
2  O  X  _



choice: 2
Learner (X's) move
--------------------
   0  1  2
0  O  X  X
1  X  O  _
2  O  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  X  X
1  X  O  _
2  O  X  O

Computer ('O') wins 😔

reward: -5 | max_steps: 5 | urgency_inv: 2 | steps: 4


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move


Computer (O's) move
--------------------
   0  1  2
0  O  O  _
1  _  _  _
2  X  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  O  O  _
1  _  _  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
  505/10000 [>.............................] - ETA: 1:29 - reward: -9.8218 

choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  O



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  _
2  _  X  O

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  _  O  _
2  _  X  O



choice: 0
Learner (X's) move
--------------------
   0  1  2
0  X  X  _
1  _  O  _
2  _  X  O

Computer (O's) move
--------------------
   0  1  2
0  X  X  _
1  _  O  O
2  _  X  O



choice: 2
Learner (X's) move
--------------------
   0  1  2
0  X  X  X
1  _  O  O
2  _  X  O

Learner ('X') wins 

choice: 6
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  _  _
2  X  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  _

  547/10000 [>.............................] - ETA: 1:30 - reward: -8.3729

choice: 8
Learner (X's) move
--------------------
   0  1  2
0  O  _  _
1  _  O  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  O  _
2  _  X  _



choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  O  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  O  _
2  _  X  X



choice: 6
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  O  O  _
2  X  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--

choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  O  _
1  _  _  _
2  _  X  _

  590/10000 [>.............................] - ETA: 1:30 - reward: -7.0847

choice: 8
Learner (X's) move
--------------------
   0  1  2
0  _  O  _
1  _  _  _
2  _  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  O  _
1  _  _  _
2  O  X  X



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  O  _
1  _  X  _
2  O  X  X

Computer (O's) move
--------------------
   0  1  2
0  _  O  O
1  _  X  _
2  O  X  X



choice: 0
Learner (X's) move
--------------------
   0  1  2
0  X  O  O
1  _  X  _
2  O  X  X

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 2 | steps: 4


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  O  _  _
2  _  X  _



choice: 1
Learner (X'

2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  O  X  _
1  _  _  O
2  _  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  O  X  _
1  _  X  O
2  _  X  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3
  633/10000 [>.............................] - ETA: 1:30 - reward: -5.9716

choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  O
2  _  X  _



choice: 1
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  _  _  O
2  _  X  _

Computer (O's) move
--------------------
   0  1  2
0  _  X  _
1  O  _  O
2  _  X  _



choice: 4
Learner (X's) move
--------------------
   0  1  2
0  _  X  _
1  O  X  O
2  _  X  _

Learner ('X') wins 💪

reward: 10 | max_steps: 5 | urgency_inv: 3 | steps: 3


choice: 7
Learner (X's) move
--------------------
   0  1  2
0  _  _  _
1  _  _  _
2  _  X  _

Computer (O's) move

<keras.callbacks.callbacks.History at 0x633b3f908>

In [18]:
_dqn.forward?
[0] * 9

[0, 0, 0, 0, 0, 0, 0, 0, 0]

In [23]:
_dqn.forward([0, 0, -1, 0, 0, -1, 0, 0, -1])

8