In [4]:
from envs.connect4.Connect4Env import Connect4Env

from ai.dqn.DeepNetworkModel import DeepNetworkModel
from ai.dqn.DQN import DQN

from agents.AgentRandom import AgentRandom
from agents.AgentDQN import AgentDQN

from tools import *

import gym
from gym.envs.registration import register
import numpy as np
import tensorflow as tf
import tqdm



In [None]:
# Register the project's Connect4 environment with gym under the id
# 'Connect4Env-v0'; the entry point names the Connect4Env class inside the
# envs.connect4.Connect4Env module. The id is what gym.make() is called with.
register(
    id='Connect4Env-v0',
    entry_point='envs.connect4.Connect4Env:Connect4Env',
)

In [10]:
# --- Environment -------------------------------------------------------------
env = gym.make('Connect4Env-v0')

# --- Optimisation settings ---------------------------------------------------
episodes = 10_000
learning_rate = 0.0001  # best so far 0.0001
gamma = 0.99            # handed to DQN as its `gamma` argument

# --- Network layers ----------------------------------------------------------
# The declared input shape is (1, 3, 6, 7): three 6x7 planes with the channel
# axis in front of the spatial axes (channels_first).
input_layer = tf.keras.layers.Conv2D(
    256, (3, 3),
    activation='relu',
    input_shape=(1, 3, 6, 7),
    data_format="channels_first",
    kernel_regularizer='l1_l2',
    padding="same",
)
hidden_layers = [
    # data_format must match the input layer. Without it this convolution
    # falls back to the Keras default (channels_last unless configured
    # otherwise) and would convolve over the (filters, rows) axes while using
    # the 7 board columns as channels.
    tf.keras.layers.Conv2D(
        256, (3, 3),
        activation='relu',
        data_format="channels_first",
        kernel_initializer='RandomNormal',
        kernel_regularizer='l1_l2',
        padding="same",
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu', kernel_initializer='RandomNormal'),
    tf.keras.layers.Dense(128, activation='relu', kernel_initializer='RandomNormal'),
]

# --- Replay / exploration settings -------------------------------------------
batch_size = 256
epsilon = 0.9999               # starting exploration rate, decayed once per episode
min_epsilon = 0.05             # floor applied to epsilon by the training loop
epsilon_multiplier = 0.99976   # 10k
max_memory = 10000             # handed to DQN as `max_memory`
min_memory = 260               # handed to DQN as `min_memory`
copy_step = 20                 # TargetNet is synced every `copy_step` episodes
copy_iter = 0                  # episode counter driving that sync

# --- Per-episode statistics, filled in by the training loop ------------------
total_turns = np.zeros(episodes)
total_rewards = np.zeros(episodes)

In [None]:
def preprocess_state(self, state):
    """Turn raw board state(s) into one-hot plane tensors.

    Each board holds 42 cells with the values 0 (empty), 1 (player 1) and
    2 (player 2). A board is expanded into three 6x7 binary planes, ordered
    [player 1, player 2, empty], and returned as a uint8 array of shape
    (1, 3, 6, 7).

    Args:
        self: Unused; kept so the signature stays unchanged.
        state: ``None``, a single board, or a batch of boards. Anything whose
            ``len()`` exceeds 3 is treated as a batch and processed element
            by element; otherwise it is treated as one board.

    Returns:
        ``None`` if ``state`` is ``None``; a (1, 3, 6, 7) uint8 array for a
        single board; an array stacking one such tensor per element for a
        batch, i.e. shape (N, 1, 3, 6, 7).
    """
    # `state == None` is an elementwise comparison on NumPy arrays and makes
    # the `if` raise "truth value of an array is ambiguous"; identity is the
    # correct check.
    if state is None:
        return None

    def _to_planes(board):
        # One binary plane per cell value, in the order player1, player2, empty.
        board = np.asarray(board)
        planes = [np.where(board == value, 1, 0).reshape((6, 7)) for value in (1, 2, 0)]
        return np.array(planes, dtype=np.uint8).reshape((1, 3, 6, 7))

    # Batch heuristic: a leading dimension larger than 3 means "many boards".
    if len(state) > 3:
        return np.asarray([_to_planes(board) for board in state])
    return _to_planes(state)

In [None]:
n_inputs = 42   # 6 x 7 board cells
n_outputs = 7   # size of the network output; presumably one action per column


def _clone_layer(layer):
    """Return a new, unbuilt layer with the same configuration as `layer`."""
    return layer.__class__.from_config(layer.get_config())


# A Keras layer instance owns its weights, so handing the very same layer
# objects to two models makes those models share weights. Unless
# DeepNetworkModel copies the layers itself (not visible from this notebook),
# that would turn the target network into an alias of the online network.
# The target model is therefore built from freshly cloned layers.
TrainNet = DQN(
    DeepNetworkModel(input_layer, hidden_layers, n_outputs),
    gamma, learning_rate, batch_size, max_memory, min_memory,
)
TargetNet = DQN(
    DeepNetworkModel(
        _clone_layer(input_layer),
        [_clone_layer(layer) for layer in hidden_layers],
        n_outputs,
    ),
    gamma, learning_rate, batch_size, max_memory, min_memory,
)


In [None]:
# Train TrainNet against a random opponent and periodically measure its win
# rate. The loop relies on the configuration cell for epsilon, min_epsilon,
# epsilon_multiplier, copy_step, copy_iter, total_turns and total_rewards.

# Evaluate every 0.5 % of the run (50 episodes for a 10k run). An integer
# interval avoids the float modulo `n % (episodes * 0.005)`, which degenerates
# when `episodes` is small (e.g. 100 -> 0.5, true for every n).
eval_interval = max(1, episodes // 200)

pbar = tqdm.tqdm(range(episodes))
# Evaluation history; each list starts with a placeholder 0 so the progress
# bar has something to show before the first evaluation.
p1, p2, illigal_p1, illigal_p2 = [0], [0], [0], [0]
random_agent = AgentRandom()
agents = [TrainNet, random_agent]
best_winrate = 0
for n in pbar:
    # Decay exploration once per episode, never dropping below the floor.
    epsilon = max(min_epsilon, epsilon * epsilon_multiplier)
    turn, reward = play_epiode_one_sided(env, agents, TargetNet, epsilon)

    if n != 0 and n % eval_interval == 0:
        dqn_agent = AgentDQN(TrainNet, False)
        test_agents = [dqn_agent, random_agent]
        p1_win, p2_win, illigal_p1_move, _ = test_winrate(env, test_agents, 100)
        p1.append(p1_win)
        p2.append(p2_win)
        # Record the illegal-move rate as well; otherwise 'illegal_ai' is
        # stuck at the placeholder 0 and 'draw' below is overstated.
        illigal_p1.append(illigal_p1_move)

    total_rewards[n] = reward
    total_turns[n] = turn
    pbar.set_postfix({
        'epsilon': epsilon,
        'turns' : turn,
        'ai_winrate': p1[-1],
        'random_winrate': p2[-1],
        'draw' : 1-p1[-1]-p2[-1]- illigal_p1[-1],
        'illegal_ai': illigal_p1[-1]
    })

    # Sync the target network with the online network every `copy_step` episodes.
    copy_iter += 1
    if copy_iter % copy_step == 0:
        TargetNet.copy_weights(TrainNet)

In [None]:
import matplotlib.pyplot as plt

# Evaluation history collected by the training loop; index 0 of each list is
# the initial placeholder value.
wins_p1 = np.array(p1)
wins_p2 = np.array(p2)
# Same draw formula as the training progress bar: what is left after both win
# rates and the AI's illegal-move rate. `illigal_p1` broadcasts correctly
# whether it holds a single placeholder or one entry per evaluation.
draws = 1 - wins_p1 - wins_p2 - np.array(illigal_p1)

fig, ax = plt.subplots()
ax.plot(wins_p1, label="player1_wins")
ax.plot(wins_p2, label="player2_wins")
ax.plot(draws, label="draws")

# Unlabelled red reference lines at 80 %, 90 % and 100 %.
for level, line_style in ((0.8, 'r-'), (0.9, 'r--'), (1.0, 'r-.')):
    ax.plot(np.ones(len(p1)) * level, line_style)

ax.set_xlabel("evaluation round")
ax.set_ylabel("rate")
ax.set_title("Evaluation results during training")
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

mean_window = 1000   # trailing window length used by both running means
loss_stride = 100    # the loss mean is only evaluated at every 100th step

# Trailing mean of the training loss, evaluated at every `loss_stride`-th
# step; all other entries of `mean_loss` stay 0.
n_losses = len(TrainNet.loss_list)
mean_loss = np.zeros(n_losses)
for i in range(0, n_losses, loss_stride):
    mean_loss[i] = np.mean(TrainNet.loss_list[max(0, i-mean_window):(i+1)])

# Trailing mean of the number of turns per episode.
total_turns_ = np.zeros(len(total_turns))
for i in range(len(total_turns_)):
    total_turns_[i] = np.mean(total_turns[max(0, i-mean_window):(i+1)])

mloss = mean_loss
mloss = np.where(mloss == 0, 0.00001, mloss)

# Plot only the steps where the mean was actually computed. Plotting the full
# array would alternate between real values and the 1e-5 filler, producing a
# comb of spikes. The first sample (a mean over a single loss) is skipped.
loss_steps = np.arange(0, n_losses, loss_stride)
fig, ax = plt.subplots()
ax.plot(loss_steps[1:], mloss[loss_steps][1:])
ax.set_xlabel("training step")
ax.set_ylabel("mean loss (trailing window)")
ax.set_title("Training loss")
plt.show()

# The first 200 episodes are left out of the turns plot.
fig, ax = plt.subplots()
ax.plot(np.arange(200, len(total_turns_)), total_turns_[200:])
ax.set_xlabel("episode")
ax.set_ylabel("mean turns per episode (trailing window)")
ax.set_title("Episode length")
plt.show()

In [None]:
# Final evaluation with the trained network listed first and the random agent
# second. AgentDQN is created with its default options here.
# Note: this rebinds the global `agents` that the training cell also defines.
# NOTE(review): 1000 is presumably the number of evaluation games, and the
# first list position presumably the first player — confirm in tools.test_winrate.
random_agent = AgentRandom()
dqn_agent = AgentDQN(TrainNet)
agents = [dqn_agent, random_agent]
# Last expression of the cell, so the returned statistics are displayed.
test_winrate(env, agents, 1000, swap_sides=False, info=True)

In [None]:
# Same evaluation with the agent order reversed: the random agent is listed
# first and the trained network second. AgentDQN is created with illigal=False.
# NOTE(review): the positional `False, True` presumably map to `swap_sides`
# and `info`, the keywords test_winrate accepts — confirm the parameter order.
random_agent = AgentRandom()
dqn_agent = AgentDQN(TrainNet, illigal=False)
agents = [random_agent, dqn_agent]
# Last expression of the cell, so the returned statistics are displayed.
test_winrate(env, agents, 1000, False, True)