In [2]:
from kaggle_environments import make
from keras import Model
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor
from keras.layers import Dense, Dropout, Input, Reshape
import numpy as np

In [3]:
class ConnectXProcessor(Processor):
    def process_observation(self, observation):
        return np.array(observation['board'])

    def process_state_batch(self, batch):
        return batch

    def process_reward(self, _reward):
        return (3 if _reward == 1 else -0.1 if _reward == 0 else -7) if _reward is not None else -21

    def process_action(self, action):
        return int(action)

In [4]:
i = Input(shape=(1, 42))
x = Reshape((42,))(i)
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
x = Dense(42, activation='leaky_relu')(x)
x = Dropout(0.1)(x)
x = Dense(24, activation='linear')(x)
o = Dense(7, activation='leaky_relu')(x)
model = Model(inputs=i, outputs=o)

In [5]:
policy = EpsGreedyQPolicy(0.15)
processor = ConnectXProcessor()
memory = SequentialMemory(limit=50000, window_length=1)
agent = DQNAgent(model=model, policy=policy, memory=memory, nb_actions=7, nb_steps_warmup=100, target_model_update=1e-2,
                 processor=processor, enable_double_dqn=True, enable_dueling_network=True)
agent.compile(optimizer=Adam(), metrics=['mae'])
# agent.load_weights('dqn_weights_23.h5f')

In [6]:
save_name = "dqn_weights_dd2"
for c in range(4,10):
    env = make("connectx", debug=False)
    trainer = env.train(["negamax", None])
    agent.policy = EpsGreedyQPolicy(0.2/(c+1))
    if c>0:
        agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=10000, visualize=False, verbose=1)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 10000 steps ...
Interval 1 (0 steps performed)


  updates=self.state_updates,


done, took 1003.623 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 1005.525 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 953.171 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 899.107 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 903.041 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 950.241 seconds


In [7]:
save_name = "dqn_weights_dd2"
for c in range(10,20):
    env = make("connectx", debug=False)
    trainer = env.train(["random", None])
    agent.policy = EpsGreedyQPolicy(0.2/(c+1))
    if c>0:
        agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=10000, visualize=False, verbose=1)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 162.196 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 165.024 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 169.146 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 169.575 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 167.997 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 170.581 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 170.293 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 172.321 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 171.714 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 171.316 seconds


In [8]:
save_name = "dqn_weights_dd2"
agent.load_weights(f'{save_name}_{20}.h5f')
def kaggle_agent(observation, _):
    return processor.process_action(np.argmax(agent.forward(processor.process_observation(observation))))


In [9]:
from kaggle_environments import evaluate
env = make("connectx", debug=True)
evaluate("connectx", [kaggle_agent, "random"], num_episodes=10)

[[1, -1],
 [-1, 1],
 [1, -1],
 [None, 0],
 [1, -1],
 [1, -1],
 [None, 0],
 [1, -1],
 [1, -1],
 [1, -1]]