In [14]:
from kaggle_environments import make
from keras import Model
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor
from keras.layers import Dense, Input, Reshape, Lambda, Concatenate
import numpy as np

In [15]:
class ConnectXProcessor(Processor):
    def process_observation(self, observation):
        return np.array( tuple(-1 if e == 2 else int(e) for e in observation['board']))

    def process_state_batch(self, batch):
        return batch

    def process_reward(self, _reward):
        return (1 if _reward == 1 else 0 if _reward == 0 else -1) if _reward is not None else -5

    def process_action(self, action):
        return int(action)

In [16]:
i = Input(shape=(1, 42))
r = Reshape((42,))(i)
x = Dense(42, activation='leaky_relu')(r)
x = Dense(7, activation='leaky_relu')(x)
o = Dense(7, activation='linear')(x)
model = Model(inputs=i, outputs=o)

In [23]:
policy = EpsGreedyQPolicy()
policy = LinearAnnealedPolicy(policy, attr='eps', value_max=0.1, value_min=0.005, value_test=0.001, nb_steps=50000)
processor = ConnectXProcessor()
memory = SequentialMemory(limit=1000, window_length=1)
agent = DQNAgent(model=model, policy=policy, memory=memory, nb_actions=7, nb_steps_warmup=100, target_model_update=3e2,
                 processor=processor, enable_double_dqn=True, enable_dueling_network=True)
agent.compile(optimizer=Adam(learning_rate=0.01), metrics=['mae'])
# agent.load_weights('dqn_weights_23.h5f')

In [24]:
def mean_reward(_rewards):
    _rewards = [r[0] for r in _rewards]
    wins = sum(1 for r in _rewards if r == 1)
    losses = sum(1 for r in _rewards if r == -1)
    mistakes = sum(1 for r in _rewards if r is None)
    opponent_mistakes = sum(1 for r in _rewards if r == 0)
    return "W: " + str(wins) + "; L: " + str(losses) + "; M: " + str(mistakes) + "; O: " + str(opponent_mistakes)
from kaggle_environments import evaluate

In [25]:
save_name = "smaller"
env = make("connectx", debug=False)
trainer = env.train(["random", None])
agent.fit(trainer, nb_steps=50000, visualize=False, verbose=1)
agent.save_weights(f'{save_name}.h5f', overwrite=True)

Training for 50000 steps ...
Interval 1 (0 steps performed)
1493 episodes - episode_reward: -0.112 [-5.000, 1.000] - loss: 0.119 - mae: 0.586 - mean_q: 0.472 - mean_eps: 0.090

Interval 2 (10000 steps performed)
1565 episodes - episode_reward: -0.035 [-5.000, 1.000] - loss: 0.114 - mae: 0.663 - mean_q: 0.420 - mean_eps: 0.072

Interval 3 (20000 steps performed)
1602 episodes - episode_reward: 0.189 [-5.000, 1.000] - loss: 0.103 - mae: 0.675 - mean_q: 0.495 - mean_eps: 0.053

Interval 4 (30000 steps performed)
1603 episodes - episode_reward: 0.218 [-5.000, 1.000] - loss: 0.093 - mae: 0.651 - mean_q: 0.568 - mean_eps: 0.034

Interval 5 (40000 steps performed)
done, took 752.460 seconds


In [27]:
def kaggle_agent(observation, _):
    return processor.process_action(np.argmax(agent.forward(processor.process_observation(observation))))
print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=1000)))


W: 734; L: 11; M: 255; O: 0


In [30]:
def kaggle_agent(observation, _):
    return 1
print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=1000)))


W: 712; L: 8; M: 280; O: 0
