In [10]:
from kaggle_environments import make
from keras import Model
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor
from keras.layers import Dense, Input, Reshape, Lambda, Concatenate
import numpy as np

In [19]:
class ConnectXProcessor(Processor):
    def process_observation(self, observation):
        return np.array(observation['board'])

    def process_state_batch(self, batch):
        return batch

    def process_reward(self, _reward):
        return (1 if _reward == 1 else 0.5 if _reward == 0 else 0) if _reward is not None else -84

    def process_action(self, action):
        return int(action)

In [12]:
i = Input(shape=(1, 42))
r = Reshape((42,))(i)
y = Lambda(lambda e: e[:, :7])(r)
x = Dense(42, activation='leaky_relu')(r)
x = Dense(42, activation='leaky_relu')(x)
x = Dense(42, activation='leaky_relu')(x)
x = Dense(7, activation='linear')(x)
o = Concatenate()([x, y])
o = Dense(7, activation='leaky_relu')(o)
o = Dense(7, activation='linear')(o)
model = Model(inputs=i, outputs=o)

In [13]:
policy = EpsGreedyQPolicy(0.1)
processor = ConnectXProcessor()
memory = SequentialMemory(limit=50000, window_length=1)
agent = DQNAgent(model=model, policy=policy, memory=memory, nb_actions=7, nb_steps_warmup=100, target_model_update=1e-2,
                 processor=processor, enable_double_dqn=True, enable_dueling_network=True)
agent.compile(optimizer=Adam(learning_rate=0.01), metrics=['mae'])
# agent.load_weights('dqn_weights_23.h5f')

In [15]:
def mean_reward(_rewards):
    _rewards = [r[0] for r in _rewards]
    wins = sum(1 for r in _rewards if r == 1)
    losses = sum(1 for r in _rewards if r == -1)
    mistakes = sum(1 for r in _rewards if r is None)
    opponent_mistakes = sum(1 for r in _rewards if r == 0)
    return "W: " + str(wins) + "; L: " + str(losses) + "; M: " + str(mistakes) + "; O: " + str(opponent_mistakes)
from kaggle_environments import evaluate

In [14]:
save_name = "first_layer"
for c in range(0,10):
    env = make("connectx", debug=False)
    trainer = env.train(["random", None])
    agent.policy = EpsGreedyQPolicy(0.2-c*0.02)
    if c>0:
        agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=10000, visualize=False, verbose=1)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 140.125 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 141.563 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 141.023 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 143.055 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 143.669 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 144.535 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 141.491 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 142.560 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 142.547 seconds
Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 142.566 seconds


In [18]:
for i in range(0, 10):
    save_name = "first_layer"
    agent.load_weights(f'{save_name}_{i+1}.h5f')
    def kaggle_agent(observation, _):
        return processor.process_action(np.argmax(agent.forward(processor.process_observation(observation))))
    env = make("connectx", debug=True)
    print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=100)))
    # print(mean_reward(evaluate("connectx", [kaggle_agent, "negamax"], num_episodes=20)))

W: 73; L: 2; M: 25; O: 0
W: 77; L: 0; M: 23; O: 0
W: 67; L: 5; M: 28; O: 0
W: 77; L: 1; M: 22; O: 0
W: 81; L: 0; M: 19; O: 0
W: 71; L: 2; M: 27; O: 0
W: 72; L: 1; M: 27; O: 0
W: 73; L: 0; M: 27; O: 0
W: 78; L: 1; M: 21; O: 0
W: 69; L: 1; M: 30; O: 0


Didn't execute below

In [None]:
save_name = "first_layer"
for c in range(10,20):
    env = make("connectx", debug=False)
    trainer = env.train(["negamax", None])
    agent.policy = EpsGreedyQPolicy(0.2-c*0.02)
    agent.load_weights(f'{save_name}_{c}.h5f')
    agent.fit(trainer, nb_steps=2000, visualize=False, verbose=1)
    agent.save_weights(f'{save_name}_{c+1}.h5f', overwrite=True)

In [None]:
for i in range(10, 20):
    save_name = "first_layer"
    agent.load_weights(f'{save_name}_{20}.h5f')
    def kaggle_agent(observation, _):
        return processor.process_action(np.argmax(agent.forward(processor.process_observation(observation))))
    env = make("connectx", debug=True)
    print(mean_reward(evaluate("connectx", [kaggle_agent, "random"], num_episodes=20)))
    print(mean_reward(evaluate("connectx", [kaggle_agent, "negamax"], num_episodes=20)))
