In [3]:
from tensorflow import keras
# The training cell calls tf.* directly; import it explicitly instead of
# relying on one of the star imports below to leak it into the namespace.
import tensorflow as tf
# NOTE: star-import order is kept as-is because later imports shadow earlier ones.
from env import *
from agents.DQN import *
import numpy as np

In [4]:
# Environment the agent plays in (ChessEnv is presumably provided by
# `from env import *` -- confirm).
env = ChessEnv()

# Online network: chooses moves and is the one updated by the optimizer.
model = DQN()
# Target network: supplies the bootstrap Q-values for the TD target and is
# only refreshed periodically from the online network.
model_target = DQN()
# Start the target as an exact copy of the online network. Two separately
# constructed DQN() instances have unrelated random weights, so without this
# the TD targets computed before the first periodic sync would come from a
# network that has nothing to do with the one being trained.
model_target.model.set_weights(model.model.get_weights())

In [5]:
# ---------------------------------------------------------------------------
# Replay memory: parallel lists, index i across all five describes one
# transition (state, action, reward, next state, done flag).
# ---------------------------------------------------------------------------
state_history = []
action_history = []
rewards_history = []
state_next_history = []
done_history = []
max_memory_length = 10000  # oldest transition is dropped once this is exceeded

# Total reward of the most recent episodes; averaged into `running_reward`.
episode_reward_history = []

# NOTE(review): the next three names are not referenced by the training cell
# in this notebook -- they look like leftovers; confirm before removing.
critic_value_history = []
action_probs_history = []
eps = np.finfo(np.float32).eps.item()  # float32 machine epsilon

# ---------------------------------------------------------------------------
# Discounting and epsilon-greedy exploration schedule.
# ---------------------------------------------------------------------------
gamma = 0.99  # discount applied to the bootstrapped next-state value
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min  # total amount epsilon can decay
epsilon = 1  # current exploration probability, decayed every frame
# Frames during which the agent always explores, regardless of epsilon.
epsilon_random_frames = 50_000
# Epsilon shrinks by epsilon_interval / epsilon_greedy_frames per frame.
epsilon_greedy_frames = 1_000_000.0
# NOTE(review): iterations * max_steps_per_episode caps a run at roughly 60k
# frames, so with these settings almost the whole run is forced exploration
# and epsilon barely moves -- confirm this is intended.

# ---------------------------------------------------------------------------
# Optimisation settings.
# ---------------------------------------------------------------------------
batch_size = 32  # transitions sampled per gradient step
num_actions = 4096  # width of the one-hot action mask; presumably 64 x 64 from/to squares -- confirm
update_after_actions = 4  # take a gradient step every N frames
update_target_network = 100  # copy online weights into the target every N frames
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)
loss_function = keras.losses.Huber()

# ---------------------------------------------------------------------------
# Run length and progress counters.
# ---------------------------------------------------------------------------
iterations = 300  # number of episodes to play
max_steps_per_episode = 200
running_reward = 0  # mean of episode_reward_history
episode_count = 0  # incremented at the end of every episode
len_episodes = 0  # incremented at the start of every episode
frame_count = 0  # environment steps taken across all episodes


In [6]:
# DQN training loop.
#
# For each episode: act epsilon-greedily, append every transition to the
# replay lists, every `update_after_actions` frames fit the online network on
# a random minibatch, and every `update_target_network` frames copy the online
# weights into the target network and log progress.
for _ in range(iterations):
    state = np.array(env.reset())
    episode_reward = 0
    len_episodes += 1

    # Allow the full `max_steps_per_episode` steps (starting the range at 1
    # would cut every episode one step short of the configured limit).
    for timestep in range(max_steps_per_episode):
        frame_count += 1

        # Always explore during the warm-up frames; afterwards explore with
        # probability `epsilon`, otherwise let the online network pick the move.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand():
            move, action = model.explore(env)
        else:
            move, action = model.predict(env)

        # Linear per-frame decay of the exploration rate, floored at epsilon_min.
        epsilon = max(epsilon - epsilon_interval / epsilon_greedy_frames, epsilon_min)

        # Apply the move; the fourth return value is not used here.
        state_next, reward, done, _info = env.step(move)
        state_next = np.array(state_next)
        episode_reward += reward

        # Record the transition in the replay memory.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Gradient step every `update_after_actions` frames once the memory
        # holds more than one batch worth of transitions.
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Uniform sample (with replacement) of transition indices.
            indices = np.random.choice(len(done_history), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            action_sample = [action_history[i] for i in indices]
            # Explicit float32 tensors: adding a plain Python list of rewards to
            # a float tensor relies on implicit conversion and breaks when the
            # environment hands out integer rewards.
            rewards_sample = tf.convert_to_tensor(
                np.asarray([rewards_history[i] for i in indices], dtype=np.float32))
            done_sample = tf.convert_to_tensor(
                np.asarray([done_history[i] for i in indices], dtype=np.float32))

            # Bootstrap values from the target network. Calling the model
            # directly (as is done for the online network below) avoids the
            # per-call overhead and progress output of Model.predict on a
            # small batch inside a hot loop.
            future_rewards = model_target.model(state_next_sample, training=False)
            max_future_q = tf.cast(tf.reduce_max(future_rewards, axis=1), tf.float32)

            # TD target: r + gamma * max_a' Q_target(s', a'). For terminal
            # transitions the bootstrap term is dropped, so the target is the
            # reward the environment actually returned (it must not be
            # overwritten with a fixed value, or winning and losing endings
            # become indistinguishable).
            updated_q_values = rewards_sample + gamma * max_future_q * (1.0 - done_sample)

            # One-hot mask so the loss only touches the action that was taken.
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Q-values of the online network for the sampled states.
                q_values = model.model(state_sample)
                # Keep only the Q-value of the action actually played.
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                # Huber loss between the TD target and the current estimate.
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation on the online network only.
            grads = tape.gradient(loss, model.model.trainable_variables)
            optimizer.apply_gradients(
                zip(grads, model.model.trainable_variables))

        if frame_count % update_target_network == 0:
            # Refresh the target network with the current online weights.
            model_target.model.set_weights(model.model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
            print(template.format(running_reward, episode_count, frame_count))

        # Cap the replay memory: one transition is added per step, so dropping
        # the oldest entry from each list keeps them aligned and bounded.
        if len(rewards_history) > max_memory_length:
            for replay_list in (rewards_history, state_history, state_next_history,
                                action_history, done_history):
                del replay_list[:1]

        if done:
            break

    # Running reward = mean episode reward over (at most) the last 100 episodes.
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

running reward: -89.50 at episode 8, frame count 100
running reward: -87.50 at episode 14, frame count 200
running reward: -87.64 at episode 22, frame count 300
running reward: -86.96 at episode 28, frame count 400
running reward: -87.11 at episode 35, frame count 500
running reward: -87.33 at episode 43, frame count 600
running reward: -87.20 at episode 50, frame count 700
running reward: -86.47 at episode 55, frame count 800
running reward: -86.22 at episode 60, frame count 900
running reward: -86.36 at episode 67, frame count 1000
running reward: -86.43 at episode 75, frame count 1100
running reward: -86.38 at episode 82, frame count 1200
running reward: -86.33 at episode 88, frame count 1300
running reward: -86.41 at episode 95, frame count 1400
running reward: -86.31 at episode 101, frame count 1500
running reward: -86.04 at episode 108, frame count 1600
running reward: -86.27 at episode 117, frame count 1700
running reward: -86.29 at episode 123, frame count 1800
running reward: 