In [30]:
from kaggle_environments import make
from keras import Model
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor
from keras.layers import Dense, Input, Reshape, Lambda, Concatenate
import numpy as np

In [31]:
class ConnectXProcessor(Processor):
    def process_observation(self, observation):
        return np.array( tuple(-1 if e == 2 else int(e) for e in observation['board']))

    def process_state_batch(self, batch):
        return batch

    def process_reward(self, _reward):
        return (0.5 if _reward == 1 else 0 if _reward == 0 else -0.3) if _reward is not None else -30

    def process_action(self, action):
        return int(action)

In [32]:
i = Input(shape=(1, 42))
r = Reshape((42,))(i)
y = Lambda(lambda e: e[:, :7])(r)
x = Dense(42*4, activation='sigmoid')(r)
x = Dense(42*4, activation='sigmoid')(x)
x = Dense(42*4, activation='sigmoid')(x)
x = Dense(42*4, activation='sigmoid')(x)
x = Dense(7, activation='sigmoid')(x)
o = Concatenate()([x, y])
o = Dense(7, activation='sigmoid')(o)
o = Dense(7, activation='linear')(o)
model = Model(inputs=i, outputs=o)

In [33]:
policy = EpsGreedyQPolicy(0.1)
policy = LinearAnnealedPolicy(policy, attr='eps', value_max=0.1, value_min=0.01, value_test=0.001, nb_steps=100000)
processor = ConnectXProcessor()
memory = SequentialMemory(limit=50000, window_length=1)
agent = DQNAgent(model=model, policy=policy, memory=memory, nb_actions=7, nb_steps_warmup=100, target_model_update=1e-2,
                 processor=processor, enable_double_dqn=True, enable_dueling_network=True)
agent.compile(optimizer=Adam(learning_rate=0.01), metrics=['mae'])
# agent.load_weights('dqn_weights_23.h5f')

In [34]:
def mean_reward(_rewards):
    _rewards = [r[0] for r in _rewards]
    wins = sum(1 for r in _rewards if r == 1)
    losses = sum(1 for r in _rewards if r == -1)
    mistakes = sum(1 for r in _rewards if r is None)
    opponent_mistakes = sum(1 for r in _rewards if r == 0)
    return "W: " + str(wins) + "; L: " + str(losses) + "; M: " + str(mistakes) + "; O: " + str(opponent_mistakes)
from kaggle_environments import evaluate

In [35]:
save_name = "bigger"
env = make("connectx", debug=False)
trainer = env.train(["random", None])
agent.fit(trainer, nb_steps=100000, visualize=False, verbose=1)
agent.save_weights(f'{save_name}.h5f', overwrite=True)

Training for 100000 steps ...
Interval 1 (0 steps performed)
1728 episodes - episode_reward: -1.437 [-30.000, 0.500] - loss: 3.978 - mae: 2.073 - mean_q: -0.870 - mean_eps: 0.095

Interval 2 (10000 steps performed)
1714 episodes - episode_reward: -0.876 [-30.000, 0.500] - loss: 1.674 - mae: 1.521 - mean_q: -0.083 - mean_eps: 0.087

Interval 3 (20000 steps performed)
1690 episodes - episode_reward: -0.932 [-30.000, 0.500] - loss: 1.751 - mae: 1.722 - mean_q: 0.345 - mean_eps: 0.078

Interval 4 (30000 steps performed)
1711 episodes - episode_reward: -0.843 [-30.000, 0.500] - loss: 1.738 - mae: 1.811 - mean_q: 0.340 - mean_eps: 0.069

Interval 5 (40000 steps performed)
1708 episodes - episode_reward: -0.254 [-30.000, 0.500] - loss: 1.725 - mae: 1.785 - mean_q: 0.386 - mean_eps: 0.060

Interval 6 (50000 steps performed)
1737 episodes - episode_reward: -0.473 [-30.000, 0.500] - loss: 1.657 - mae: 1.693 - mean_q: 0.415 - mean_eps: 0.051

Interval 7 (60000 steps performed)
1752 episodes - epi

In [38]:
def kaggle_agent(observation, _):
    return processor.process_action(np.argmax(agent.forward(processor.process_observation(observation))))
print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=1000)))


W: 691; L: 13; M: 296; O: 0
