In [2]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import logging

logging.getLogger("tensorflow").setLevel(logging.ERROR)  # this goes *before* tf import

import numpy as np
import random
from collections import deque
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, MSE
from tensorflow import reduce_mean, convert_to_tensor, squeeze, float32, GradientTape
from tqdm import tqdm

import tensorflow as tf

os.environ['AUTOGRAPH_VERBOSITY'] = '1'
tf.autograph.set_verbosity(0)


In [3]:
from kaggle_environments import make

Loading environment lux_ai_s2 failed: No module named 'pettingzoo'


In [4]:
from deep_q_learning import DQN

In [5]:
env = make("connectx", debug=False)

state_space_shape = tuple([len(env.state[0].observation.board)])
num_actions = env.configuration.columns
num_episodes = 1000
learning_rate = 0.01
discount_factor = 0.99
batch_size = 256
memory_size = 1024

epsilon = 1
epsilon_min = 0.01
epsilon_decay = 1.3 * (epsilon - epsilon_min) / num_episodes

In [6]:
def build_model(state_space_shape, num_actions, learning_rate):
    input_layer = Input(shape=state_space_shape)
    x = Dense(42)(input_layer)
    x = Dense(42)(x)
    x = Dense(num_actions)(x)

    m = Model(inputs=input_layer, outputs=x)
    m.compile(Adam(learning_rate=learning_rate), loss=MeanSquaredError())

    return m

In [7]:
model = build_model(state_space_shape, num_actions, learning_rate)
target_model = build_model(state_space_shape, num_actions, learning_rate)
agent = DQN(state_space_shape, num_actions, model, target_model,
            learning_rate, discount_factor, batch_size, memory_size)

In [8]:
def process_state(_state):
    return np.asarray(_state.board)


def preprocess_reward(_reward):
    return (20 if _reward == 1 else 5 if _reward == 0 else -1) if _reward is not None else -20


def postprocess_action(_action):
    return round(_action)

In [9]:
from my_utils import my_train, my_evaluate, mean_reward

In [10]:
configuration = {
    "num_episodes": num_episodes,
    "process_state": process_state, "process_reward": preprocess_reward, "process_action": postprocess_action,
    "epsilon": epsilon, "epsilon_decay": epsilon_decay, "epsilon_min": epsilon_min
}

my_train(agent, opponent="negamax", **configuration)

100%|██████████| 1000/1000 [20:56<00:00,  1.26s/it]


In [11]:
agent.save('connectx-flat', 1000)

In [12]:
agent.load('connectx-flat', 1000)

In [13]:
def my_agent(observation, configuration):
    _state = process_state(observation)
    _action = agent.get_action(_state, 0)
    _action = postprocess_action(_action)
    return _action


def fixed_agent(observation, configuration):
    return 3

In [14]:
print("My Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [my_agent, "random"])))
print("My Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [my_agent, "negamax"])))
print("Random Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", ["random", "negamax"])))
print("Fixed Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [fixed_agent, "random"])))
print("Fixed Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [fixed_agent, "negamax"])))
print("My Agent vs Fixed Agent", mean_reward(my_evaluate("connectx", [my_agent, fixed_agent])))

100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


My Agent vs Random Agent: W: 72; L: 23; M: 5; O: 0


100%|██████████| 100/100 [02:22<00:00,  1.43s/it]


My Agent vs Negamax Agent: W: 4; L: 83; M: 13; O: 0


100%|██████████| 100/100 [01:22<00:00,  1.21it/s]


Random Agent vs Negamax Agent: W: 6; L: 92; M: 0; O: 2


100%|██████████| 100/100 [00:04<00:00, 22.94it/s]


Fixed Agent vs Random Agent: W: 61; L: 0; M: 39; O: 0


100%|██████████| 100/100 [00:55<00:00,  1.80it/s]


Fixed Agent vs Negamax Agent: W: 0; L: 18; M: 82; O: 0


100%|██████████| 100/100 [00:20<00:00,  4.78it/s]

My Agent vs Fixed Agent W: 100; L: 0; M: 0; O: 0





In [15]:
configuration = {
    "num_episodes": 1000,
    "process_state": process_state, "process_reward": preprocess_reward, "process_action": postprocess_action,
    "epsilon": epsilon, "epsilon_decay": epsilon_decay, "epsilon_min": epsilon_min
}

my_train(agent, opponent="negamax", **configuration)
agent.save('connectx-flat', 2000)
agent.load('connectx-flat', 2000)

print("My Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [my_agent, "random"])))
print("My Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [my_agent, "negamax"])))
print("Random Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", ["random", "negamax"])))
print("Fixed Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [fixed_agent, "random"])))
print("Fixed Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [fixed_agent, "negamax"])))
print("My Agent vs Fixed Agent", mean_reward(my_evaluate("connectx", [my_agent, fixed_agent])))

100%|██████████| 1000/1000 [23:30<00:00,  1.41s/it]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]


My Agent vs Random Agent: W: 79; L: 9; M: 12; O: 0


100%|██████████| 100/100 [02:41<00:00,  1.61s/it]


My Agent vs Negamax Agent: W: 5; L: 67; M: 28; O: 0


100%|██████████| 100/100 [01:37<00:00,  1.02it/s]


Random Agent vs Negamax Agent: W: 1; L: 97; M: 0; O: 2


100%|██████████| 100/100 [00:04<00:00, 22.40it/s]


Fixed Agent vs Random Agent: W: 72; L: 0; M: 28; O: 0


100%|██████████| 100/100 [01:00<00:00,  1.65it/s]


Fixed Agent vs Negamax Agent: W: 0; L: 22; M: 78; O: 0


100%|██████████| 100/100 [00:25<00:00,  3.85it/s]

My Agent vs Fixed Agent W: 0; L: 100; M: 0; O: 0





In [16]:
configuration = {
    "num_episodes": 1000,
    "process_state": process_state, "process_reward": preprocess_reward, "process_action": postprocess_action,
    "epsilon": epsilon, "epsilon_decay": epsilon_decay, "epsilon_min": epsilon_min
}

my_train(agent, opponent="negamax", **configuration)
agent.save('connectx-flat', 3000)
agent.load('connectx-flat', 3000)

print("My Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [my_agent, "random"])))
print("My Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [my_agent, "negamax"])))
print("My Agent vs Fixed Agent", mean_reward(my_evaluate("connectx", [my_agent, fixed_agent])))

100%|██████████| 1000/1000 [23:13<00:00,  1.39s/it]
100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


My Agent vs Random Agent: W: 76; L: 23; M: 1; O: 0


100%|██████████| 100/100 [02:26<00:00,  1.47s/it]


My Agent vs Negamax Agent: W: 6; L: 83; M: 11; O: 0


100%|██████████| 100/100 [00:24<00:00,  4.08it/s]

My Agent vs Fixed Agent W: 100; L: 0; M: 0; O: 0





In [17]:
configuration = {
    "num_episodes": 1000,
    "process_state": process_state, "process_reward": preprocess_reward, "process_action": postprocess_action,
    "epsilon": epsilon, "epsilon_decay": epsilon_decay, "epsilon_min": epsilon_min
}

my_train(agent, opponent="negamax", **configuration)
agent.save('connectx-flat', 4000)
agent.load('connectx-flat', 4000)

print("My Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [my_agent, "random"])))
print("My Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [my_agent, "negamax"])))
print("My Agent vs Fixed Agent", mean_reward(my_evaluate("connectx", [my_agent, fixed_agent])))

100%|██████████| 1000/1000 [27:30<00:00,  1.65s/it]
100%|██████████| 100/100 [00:56<00:00,  1.77it/s]


My Agent vs Random Agent: W: 80; L: 20; M: 0; O: 0


100%|██████████| 100/100 [02:41<00:00,  1.61s/it]


My Agent vs Negamax Agent: W: 9; L: 88; M: 3; O: 0


100%|██████████| 100/100 [00:29<00:00,  3.40it/s]

My Agent vs Fixed Agent W: 0; L: 100; M: 0; O: 0





In [18]:
configuration = {
    "num_episodes": 1000,
    "process_state": process_state, "process_reward": preprocess_reward, "process_action": postprocess_action,
    "epsilon": epsilon, "epsilon_decay": epsilon_decay, "epsilon_min": epsilon_min
}

my_train(agent, opponent="negamax", **configuration)
agent.save('connectx-flat', 5000)
agent.load('connectx-flat', 5000)

print("My Agent vs Random Agent:", mean_reward(my_evaluate("connectx", [my_agent, "random"])))
print("My Agent vs Negamax Agent:", mean_reward(my_evaluate("connectx", [my_agent, "negamax"])))
print("My Agent vs Fixed Agent", mean_reward(my_evaluate("connectx", [my_agent, fixed_agent])))

100%|██████████| 1000/1000 [26:07<00:00,  1.57s/it]
100%|██████████| 100/100 [00:52<00:00,  1.90it/s]


My Agent vs Random Agent: W: 82; L: 18; M: 0; O: 0


100%|██████████| 100/100 [02:35<00:00,  1.56s/it]


My Agent vs Negamax Agent: W: 5; L: 91; M: 4; O: 0


100%|██████████| 100/100 [00:28<00:00,  3.57it/s]

My Agent vs Fixed Agent W: 0; L: 100; M: 0; O: 0



