In [7]:
from kaggle_environments import make
from keras import Model
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor
from keras.layers import Dense, Dropout, Input, Reshape
import numpy as np

In [31]:
class ConnectXProcessor(Processor):
    """Translate between the ConnectX environment and the keras-rl agent."""

    def process_observation(self, observation):
        """Return the ``'board'`` entry of ``observation`` as a NumPy array."""
        board = observation['board']
        return np.array(board)

    def process_state_batch(self, batch):
        """Hand the state batch back untouched."""
        return batch

    def process_reward(self, _reward):
        """Rescale the raw reward.

        ``1`` becomes ``10``, ``0`` stays ``0``, and every other value
        (including ``None``) becomes ``-10``.
        """
        if _reward is None:
            return -10
        if _reward == 1:
            return 10
        if _reward == 0:
            return 0
        return -10

    def process_action(self, action):
        """Coerce the selected action to a built-in ``int``."""
        return int(action)

In [32]:
# Q-network: the input is a window of one 42-value board, the output has one
# value per action (7 in total).
i = Input(shape=(1, 42))
# Drop the window axis so the dense layers see a flat 42-vector.
x = Reshape((42,))(i)
# Two 42-unit leaky-ReLU layers, each followed by 10% dropout.
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
# Two 24-unit layers with linear activation.
x = Dense(24, activation='linear')(x)
x = Dense(24, activation='linear')(x)
# NOTE(review): the agent is created with enable_dueling_network=True; keras-rl
# appears to rebuild the head on top of model.layers[-2], in which case this
# final layer only supplies the action count — confirm against the library.
o = Dense(7, activation='leaky_relu')(x)
model = Model(inputs=i, outputs=o)
# Sanity check: show the penultimate layer and the output shape.
print(model.layers[-2])
print(model.output.shape)

<keras.layers.core.dense.Dense object at 0x0000025E4A088A30>
(None, 7)


In [33]:
# Epsilon-greedy exploration with eps = 0.2.
policy = EpsGreedyQPolicy(0.2)
processor = ConnectXProcessor()
# window_length=1 matches the leading axis of the model's (1, 42) input.
memory = SequentialMemory(limit=50000, window_length=1)
# This statement must start at column 0: the stray leading indent it used to
# carry made the whole cell fail with an IndentationError.
agent = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=7,
    nb_steps_warmup=100,
    target_model_update=1e-2,
    processor=processor,
    enable_double_dqn=True,
    enable_dueling_network=True,
)
agent.compile(optimizer=Adam(), metrics=['mae'])
# Uncomment to resume from a previously saved checkpoint:
# agent.load_weights('dqn_weights_23.h5f')

In [20]:
# First training phase: play against the built-in "random" opponent.
env = make("connectx", debug=False)
# The None entry is the seat driven by the agent being trained.
trainer = env.train([None, "random"])
agent.fit(
    trainer,
    nb_steps=50000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=1000,
)
# Checkpoint the weights reached at the end of this run.
agent.save_weights('dqn_weights.h5f', overwrite=True)

Training for 50000 steps ...
Interval 1 (0 steps performed)
1160 episodes - episode_reward: 4.599 [-8.000, 12.000] - loss: 0.920 - mae: 2.547 - mean_q: 3.379

Interval 2 (10000 steps performed)
1133 episodes - episode_reward: 5.046 [-6.500, 12.000] - loss: 0.779 - mae: 3.146 - mean_q: 3.909

Interval 3 (20000 steps performed)
1130 episodes - episode_reward: 4.815 [-7.500, 12.500] - loss: 0.754 - mae: 3.319 - mean_q: 4.060

Interval 4 (30000 steps performed)
1131 episodes - episode_reward: 4.824 [-6.500, 11.500] - loss: 0.748 - mae: 3.309 - mean_q: 4.005

Interval 5 (40000 steps performed)
done, took 650.001 seconds


In [22]:
# Second 50k-step run on the `trainer` already in scope, saved under a new
# file name so the earlier checkpoint is kept.
agent.fit(
    trainer,
    nb_steps=50000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=1000,
)
agent.save_weights('dqn_weights_2.h5f', overwrite=True)

Training for 50000 steps ...
Interval 1 (0 steps performed)
1172 episodes - episode_reward: 4.725 [-6.500, 12.000] - loss: 0.776 - mae: 3.367 - mean_q: 4.028

Interval 2 (10000 steps performed)
1135 episodes - episode_reward: 4.705 [-7.500, 11.500] - loss: 0.793 - mae: 3.362 - mean_q: 4.013

Interval 3 (20000 steps performed)
1068 episodes - episode_reward: 4.709 [-8.000, 12.000] - loss: 0.815 - mae: 3.389 - mean_q: 4.047

Interval 4 (30000 steps performed)
1095 episodes - episode_reward: 4.695 [-7.000, 12.000] - loss: 0.831 - mae: 3.384 - mean_q: 4.046

Interval 5 (40000 steps performed)
done, took 658.886 seconds


In [28]:
# Restore the round-2 checkpoint, train for another 50k steps on the `trainer`
# already in scope, then save the result as round 3.
agent.load_weights('dqn_weights_2.h5f')
agent.fit(
    trainer,
    nb_steps=50000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=1000,
)
agent.save_weights('dqn_weights_3.h5f', overwrite=True)

Training for 50000 steps ...
Interval 1 (0 steps performed)
1131 episodes - episode_reward: 4.293 [-7.500, 11.500] - loss: 0.841 - mae: 3.287 - mean_q: 4.080

Interval 2 (10000 steps performed)
1200 episodes - episode_reward: 4.090 [-7.000, 12.500] - loss: 0.895 - mae: 3.204 - mean_q: 3.930

Interval 3 (20000 steps performed)
1188 episodes - episode_reward: 4.070 [-7.500, 11.500] - loss: 0.897 - mae: 3.091 - mean_q: 3.772

Interval 4 (30000 steps performed)
1188 episodes - episode_reward: 4.095 [-7.500, 12.000] - loss: 0.863 - mae: 3.031 - mean_q: 3.690

Interval 5 (40000 steps performed)
done, took 675.314 seconds


In [None]:
# Train against the built-in "negamax" opponent in 10k-step rounds.
# Round c resumes from dqn_weights_{c}.h5f and writes dqn_weights_{c+1}.h5f,
# so dqn_weights_3.h5f must already exist before the first round.
for c in range(3, 23):
    # A fresh environment and trainer are built for every round.
    env = make("connectx", debug=False)
    trainer = env.train([None, "negamax"])
    # Loaded unconditionally: c is never <= 0 in this range, so no guard
    # around the load is needed.
    agent.load_weights(f'dqn_weights_{c}.h5f')
    agent.fit(trainer, nb_steps=10000, visualize=False, verbose=1, nb_max_episode_steps=1000)
    agent.save_weights(f'dqn_weights_{c+1}.h5f', overwrite=True)

Training for 10000 steps ...
Interval 1 (0 steps performed)


  updates=self.state_updates,


done, took 632.274 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 596.335 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 578.387 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 577.228 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 579.499 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 598.367 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 592.345 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 583.538 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 599.906 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 592.904 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 593.691 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 599.556 seconds
Training for 100

In [6]:
def kaggle_agent(observation, _):
    """Pick a ConnectX column greedily from the trained network's Q-values.

    Args:
        observation: Environment observation; its ``'board'`` entry is read
            through ``processor.process_observation``.
        _: Unused second argument supplied by the environment (presumably
            the game configuration).

    Returns:
        int: Index of the playable column with the highest Q-value.
    """
    board = processor.process_observation(observation)
    # agent.forward() already returns the selected action (a scalar), so taking
    # np.argmax of its result always yielded column 0. It also applies the
    # agent's exploration policy and overwrites its recent-step bookkeeping.
    # Ranking the raw Q-values avoids all three problems.
    q_values = np.array(agent.compute_q_values([board]), dtype=float)
    # The board is row-major with the top row first, so a column is full
    # exactly when its cell in the first row is non-zero. Full columns are
    # masked out so the agent never forfeits with an invalid move.
    column_is_full = board[:q_values.shape[0]] != 0
    q_values[column_is_full] = -np.inf
    return processor.process_action(np.argmax(q_values))

In [None]:
# Interactive match with the trained agent in the first seat.
# NOTE(review): the None seat is presumably left for a human player — confirm
# against the kaggle-environments documentation.
env = make("connectx", debug=True)
env.play([kaggle_agent, None], width=500, height=450, fps=10)
# Alternative: render an already-played episode inline instead.
# env.render(mode="ipython", width=500, height=450)

In [36]:
# Checkpoint prefix for this series of runs.
save_name = "dqn_weights_dd"
# 5k-step rounds against "negamax". Round c resumes from {save_name}_{c}.h5f
# and writes {save_name}_{c+1}.h5f, so {save_name}_1.h5f must already exist.
for c in range(1, 20):
    env = make("connectx", debug=False)
    trainer = env.train([None, "negamax"])
    # Loaded unconditionally: c starts at 1, so a `c > 0` guard is never false.
    agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=5000, visualize=False, verbose=1, nb_max_episode_steps=1000)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 5000 steps ...
Interval 1 (0 steps performed)
Training for 5000 steps ...
Interval 1 (0 steps performed)
Training for 5000 steps ...
Interval 1 (0 steps performed)
Training for 5000 steps ...
Interval 1 (0 steps performed)
Training for 5000 steps ...
Interval 1 (0 steps performed)
Training for 5000 steps ...
Interval 1 (0 steps performed)
  135/10000 [..............................] - ETA: 9:59 - reward: -0.7407 done, took 8.299 seconds
Training for 5000 steps ...
Interval 1 (0 steps performed)
    3/10000 [..............................] - ETA: 15:57 - reward: 0.0000e+00done, took 0.379 seconds
Training for 5000 steps ...
Interval 1 (0 steps performed)
    2/10000 [..............................] - ETA: 20:25 - reward: 0.0000e+00done, took 0.290 seconds
Training for 5000 steps ...
Interval 1 (0 steps performed)
    3/10000 [..............................] - ETA: 16:36 - reward: 0.0000e+00done, took 0.378 seconds
Training for 5000 steps ...
Interval 1 (0 steps performed)
 

In [37]:
# Checkpoint prefix for this series of runs.
save_name = "dqn_weights_dd"
# Lower the exploration rate to eps = 0.05 for these rounds.
agent.policy = EpsGreedyQPolicy(0.05)
# 2.5k-step rounds against "negamax", resuming from checkpoint 6.
# Round c reads {save_name}_{c}.h5f and writes {save_name}_{c+1}.h5f.
for c in range(6, 20):
    env = make("connectx", debug=False)
    trainer = env.train([None, "negamax"])
    # Loaded unconditionally: c starts at 6, so a `c > 0` guard is never false.
    agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=2500, visualize=False, verbose=1, nb_max_episode_steps=1000)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
   63/10000 [..............................] - ETA: 7:24 - reward: 0.1587done, took 2.980 seconds
Training for 2500 steps ...
Interval 1 (0 steps performed)
    2/10000 [..............................] - ETA: 21:07 - reward: 0.0000e+00done, took 0.314 seconds
Training for 2500 steps ...
Interval 1 (0 steps performed)
done, took 0.365 seconds
Training for 2500 steps ...
Interval 1 (0 steps performed)
done, t

In [38]:
# Checkpoint prefix for this series of runs.
save_name = "dqn_weights_dd"
# Lower the exploration rate to eps = 0.025 for these rounds.
agent.policy = EpsGreedyQPolicy(0.025)
# 2.5k-step rounds against "negamax", resuming from checkpoint 15.
# Round c reads {save_name}_{c}.h5f and writes {save_name}_{c+1}.h5f.
for c in range(15, 20):
    env = make("connectx", debug=False)
    trainer = env.train([None, "negamax"])
    # Loaded unconditionally: c starts at 15, so a `c > 0` guard is never false.
    agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=2500, visualize=False, verbose=1, nb_max_episode_steps=1000)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)


In [39]:
# Checkpoint prefix for this series of runs.
save_name = "dqn_weights_dd"
# Lower the exploration rate to eps = 0.0125 for these rounds.
agent.policy = EpsGreedyQPolicy(0.0125)
# 2k-step rounds against "negamax", resuming from checkpoint 20.
# Round c reads {save_name}_{c}.h5f and writes {save_name}_{c+1}.h5f.
for c in range(20, 25):
    env = make("connectx", debug=False)
    trainer = env.train([None, "negamax"])
    # Loaded unconditionally: c starts at 20, so a `c > 0` guard is never false.
    agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=2000, visualize=False, verbose=1, nb_max_episode_steps=1000)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 2000 steps ...
Interval 1 (0 steps performed)
 1999/10000 [====>.........................] - ETA: 7:40 - reward: 0.0850done, took 115.193 seconds
Training for 2000 steps ...
Interval 1 (0 steps performed)
 2000/10000 [=====>........................] - ETA: 7:37 - reward: -0.0300done, took 114.517 seconds
Training for 2000 steps ...
Interval 1 (0 steps performed)
 2000/10000 [=====>........................] - ETA: 7:43 - reward: -0.0350done, took 115.953 seconds
Training for 2000 steps ...
Interval 1 (0 steps performed)
 1999/10000 [====>.........................] - ETA: 7:22 - reward: -0.0500done, took 110.760 seconds
Training for 2000 steps ...
Interval 1 (0 steps performed)
 2000/10000 [=====>........................] - ETA: 7:29 - reward: -0.0050done, took 112.417 seconds


In [40]:
# Checkpoint prefix for this series of runs.
save_name = "dqn_weights_dd"
# Raise the exploration rate back to eps = 0.1 for these rounds.
agent.policy = EpsGreedyQPolicy(0.1)
# 2.5k-step rounds against "negamax": round c reads {save_name}_{c}.h5f
# and writes {save_name}_{c+1}.h5f.
for c in range(25, 30):
    env = make("connectx", debug=False)
    trainer = env.train([None, "negamax"])
    agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(
        trainer,
        nb_steps=2500,
        visualize=False,
        verbose=1,
        nb_max_episode_steps=1000,
    )
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)
Training for 2500 steps ...
Interval 1 (0 steps performed)


In [1]:
from kaggle_environments import make

Loading environment lux_ai_s2 failed: No module named 'pettingzoo'


In [2]:
# Fresh ConnectX environment with debug output enabled, used for the manual
# inspection cells that follow.
env = make("connectx", debug=True)

In [3]:
# Training wrapper around `env`: the None entry is the seat we control, and
# the built-in "negamax" agent fills the other seat.
trainer = env.train([None, "negamax"])

In [4]:
# Reset the trainer and display the initial observation it returns
# (the 'board', 'mark', 'step' and 'remainingOverageTime' entries).
trainer.reset()

{'remainingOverageTime': 60,
 'step': 0,
 'board': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'mark': 1}

In [5]:
# Reset the raw environment and display the per-agent state list it returns
# (action, reward, info, observation and status for each seat).
env.reset()

[{'action': 0,
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60,
   'step': 0,
   'board': [0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0],
   'mark': 1},
  'status': 'ACTIVE'},
 {'action': 0,
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'mark': 2},
  'status': 'INACTIVE'}]

In [81]:
# Play one complete episode: the first seat always answers column 0, the
# second seat is the built-in "random" agent.
ep = env.run([lambda _, __: 0, "random"])
# Display the rewards recorded in the final step as (first seat, second seat).
(ep[-1][0]['reward'], ep[-1][1]['reward'])

Invalid Action: Invalid column: 0


(None, 0)

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, -1)]