In [5]:
from tensorflow import keras
import tensorflow as tf
from env import *
import numpy as np
from agents.A2C import *


In [6]:
# Print NumPy arrays with three digits of precision.
np.set_printoptions(precision=3)

# Agent wrapper used by the training cell, which calls `model.model(state)` and
# optimises `model.model.trainable_variables`.
# NOTE(review): `A2C` is presumably provided by the wildcard import of
# `agents.A2C` -- confirm.
model = A2C()

In [7]:
def change_agent(agent_num):
    """Return the index of the other agent.

    Agent 0 maps to 1; every other value maps to 0.
    """
    return 1 if agent_num == 0 else 0
    

In [8]:
# --- Per-episode buffers -----------------------------------------------------
# The training cell fills and clears `rewards_history`, `critic_value_history`
# and `action_probs_history`. The remaining lists are not referenced by the
# cells visible in this notebook section.
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
critic_value_history = []
action_probs_history = []

# Discount factor applied when accumulating returns in the training cell.
gamma = 0.99
# Epsilon-greedy settings; not referenced by the training cell visible here.
epsilon = 1
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = (
    epsilon_max - epsilon_min
)
batch_size = 32  # not referenced by the training cell visible here
# Upper bound for the per-episode step loop, which runs range(1, max_steps_per_episode).
max_steps_per_episode = 200
num_actions = 4096  # not referenced by the training cell visible here
# SGD with Nesterov momentum; `clipvalue` clips each gradient element.
# NOTE(review): `decay` is a legacy optimizer argument -- confirm the installed
# Keras version still accepts it.
optimizer = keras.optimizers.SGD(learning_rate=0.000001, decay=1e-6, momentum=0.9, nesterov=True, clipvalue=0.5)

# Counters updated by the training cell. They live at module level, so
# re-running the training cell continues from their current values.
running_reward = 0
episode_count = 0
frame_count = 0  # not referenced by the training cell visible here

# Replay / target-network style settings; none of these are referenced by the
# training cell visible here.
epsilon_random_frames = 50000
epsilon_greedy_frames = 1000000.0
max_memory_length = 10000
update_after_actions = 4
update_target_network = 100
# Loss applied between each critic value and its normalised return.
loss_function = keras.losses.Huber()
len_episodes = 0  # incremented once per episode by the training cell
iterations = 300  # number of episodes run by one execution of the training cell
# float32 machine epsilon; added to the std of the returns to avoid dividing by zero.
eps = np.finfo(np.float32).eps.item()


In [10]:
env = ChessEnv()

# Actor-critic training: one gradient step per episode.
# `running_reward`, `episode_count` and `len_episodes` are module-level counters
# defined in the hyperparameter cell, so re-running this cell continues from
# their current values instead of starting from zero.
for _ in range(iterations):
    agent_num = 0

    # Start each episode with empty buffers. They are normally emptied at the
    # end of the previous episode, but an interrupted run would otherwise leave
    # tensors behind that were recorded on a tape that no longer exists.
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    state = np.array(env.reset())
    episode_reward = 0
    len_episodes += 1
    loss_value = None  # stays None when the episode produced no transitions

    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            action_probs, critic_value = model.model(state)

            legal_moves_probs = np.array(
                filter_legal_moves(env.board, action_probs[0], agent_num, env.translated_board))

            # An argmax of 0 is used as the signal to retry the filter for the
            # other agent.
            # NOTE(review): this also triggers when action 0 really is the
            # highest-scoring move -- confirm against filter_legal_moves.
            if np.argmax(legal_moves_probs, axis=None) == 0:
                legal_moves_probs = np.array(filter_legal_moves(
                    env.board, action_probs[0], change_agent(agent_num), env.translated_board))

            # Five highest-scoring candidate actions.
            top_actions = (-legal_moves_probs).argsort()[:5]

            # Keep only candidates that are actually playable. Sampling
            # uniformly from this list is equivalent to the rejection loop it
            # replaces, but cannot spin forever when no candidate is legal.
            playable_actions = [
                candidate for candidate in top_actions
                if check_legal_move(env.board, num2move[candidate])
            ]
            if not playable_actions:
                # Nothing playable among the candidates: end the episode here.
                break

            action = np.random.choice(playable_actions)
            move = num2move[action]

            # Record critic value and log-probability together so both lists
            # always have one entry per executed move.
            critic_value_history.append(critic_value[0, 0])
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            state, reward, done, _ = env.step(move)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Exponential moving average of the episode reward.
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        if rewards_history:
            # Discounted returns, accumulated back-to-front and then flipped
            # into chronological order.
            returns = []
            discounted_sum = 0
            for r in reversed(rewards_history):
                discounted_sum = r + gamma * discounted_sum
                returns.append(discounted_sum)
            returns.reverse()

            # Normalize
            returns = np.array(returns)
            returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
            returns = returns.tolist()

            # Calculating loss values to update our network
            actor_losses = []
            critic_losses = []
            for log_prob, value, ret in zip(
                    action_probs_history, critic_value_history, returns):
                # Difference between the normalised return and the critic's estimate.
                diff = ret - value
                actor_losses.append(-log_prob * diff)
                critic_losses.append(
                    loss_function(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                )

            loss_value = sum(actor_losses) + sum(critic_losses)

    # Backpropagation, done after leaving the tape context so the gradient
    # computation itself is not recorded. Skipped for empty episodes.
    if loss_value is not None:
        grads = tape.gradient(loss_value, model.model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.model.trainable_variables))

    # Clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))


running reward: -84.65 at episode 130
running reward: -84.67 at episode 140
running reward: -85.46 at episode 150
running reward: -85.06 at episode 160
running reward: -85.92 at episode 170
running reward: -86.30 at episode 180
running reward: -85.99 at episode 190
running reward: -85.70 at episode 200
running reward: -85.99 at episode 210
running reward: -86.27 at episode 220
running reward: -86.35 at episode 230
running reward: -86.58 at episode 240
running reward: -86.30 at episode 250
running reward: -86.21 at episode 260
running reward: -86.36 at episode 270
running reward: -86.53 at episode 280
running reward: -87.17 at episode 290
running reward: -87.02 at episode 300
running reward: -86.95 at episode 310
running reward: -87.47 at episode 320
running reward: -87.59 at episode 330
running reward: -86.96 at episode 340
running reward: -87.09 at episode 350
running reward: -86.93 at episode 360
running reward: -87.22 at episode 370
running reward: -87.90 at episode 380
running rewa