In [21]:
from kaggle_environments import make
from keras import Model
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor
from keras.layers import Dense, Input, Reshape, Lambda, Concatenate
import numpy as np

In [22]:
class ConnectXProcessor(Processor):
    def process_observation(self, observation):
        return np.array( tuple(-1 if e == 2 else int(e) for e in observation['board']))

    def process_state_batch(self, batch):
        return batch

    def process_reward(self, _reward):
        return (1 if _reward == 1 else 0 if _reward == 0 else -0.3) if _reward is not None else -35

    def process_action(self, action):
        return int(action)

In [23]:
i = Input(shape=(1, 42))
r = Reshape((42,))(i)
y = Lambda(lambda e: e[:, :7])(r)
x = Dense(42*7, activation='leaky_relu')(r)
x = Dense(42*7, activation='leaky_relu')(x)
x = Dense(42*7, activation='leaky_relu')(x)
x = Dense(42*7, activation='leaky_relu')(x)
x = Dense(7, activation='leaky_relu')(x)
o = Concatenate()([x, y])
o = Dense(7, activation='leaky_relu')(o)
o = Dense(7, activation='linear')(o)
model = Model(inputs=i, outputs=o)

In [24]:
policy = EpsGreedyQPolicy(0.1)
policy = LinearAnnealedPolicy(policy, attr='eps', value_max=0.5, value_min=0.005, value_test=0.001, nb_steps=100000)
processor = ConnectXProcessor()
memory = SequentialMemory(limit=50000, window_length=1)
agent = DQNAgent(model=model, policy=policy, memory=memory, nb_actions=7, nb_steps_warmup=100, target_model_update=1e-2,
                 processor=processor, enable_double_dqn=True, enable_dueling_network=True)
agent.compile(optimizer=Adam(learning_rate=0.02), metrics=['mae'])
# agent.load_weights('dqn_weights_23.h5f')

In [25]:
def mean_reward(_rewards):
    _rewards = [r[0] for r in _rewards]
    wins = sum(1 for r in _rewards if r == 1)
    losses = sum(1 for r in _rewards if r == -1)
    mistakes = sum(1 for r in _rewards if r is None)
    opponent_mistakes = sum(1 for r in _rewards if r == 0)
    return "W: " + str(wins) + "; L: " + str(losses) + "; M: " + str(mistakes) + "; O: " + str(opponent_mistakes)
from kaggle_environments import evaluate

In [None]:
save_name = "bigger2"
env = make("connectx", debug=False)
trainer = env.train(["random", None])
agent.fit(trainer, nb_steps=100000, visualize=False, verbose=1)
agent.save_weights(f'{save_name}.h5f', overwrite=True)

Training for 100000 steps ...
Interval 1 (0 steps performed)
1165 episodes - episode_reward: -8.337 [-35.000, 1.000] - loss: 405236677.089 - mae: 7188.017 - mean_q: 10851.977 - mean_eps: 0.475

Interval 2 (10000 steps performed)
1127 episodes - episode_reward: -8.450 [-35.000, 1.000] - loss: 423229.491 - mae: 1176.023 - mean_q: 1680.019 - mean_eps: 0.426

Interval 3 (20000 steps performed)
1175 episodes - episode_reward: -8.553 [-35.000, 1.000] - loss: 40756296.130 - mae: 2553.676 - mean_q: 3706.718 - mean_eps: 0.376

Interval 4 (30000 steps performed)
1240 episodes - episode_reward: -8.611 [-35.000, 1.000] - loss: 264670446.787 - mae: 5847.688 - mean_q: 8708.483 - mean_eps: 0.327

Interval 5 (40000 steps performed)
1208 episodes - episode_reward: -9.562 [-35.000, 1.000] - loss: 17660875.176 - mae: 3388.619 - mean_q: 4561.621 - mean_eps: 0.277

Interval 6 (50000 steps performed)
1241 episodes - episode_reward: -8.367 [-35.000, 1.000] - loss: 1765132285.148 - mae: 19504.667 - mean_q: 17

In [29]:
def kaggle_agent(observation, _):
    return processor.process_action(np.argmax(agent.forward(processor.process_observation(observation))))
print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=1000)))


W: 730; L: 10; M: 260; O: 0


In [30]:
def kaggle_agent(observation, _):
    return 1
print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=1000)))


W: 712; L: 8; M: 280; O: 0
