In [1]:
# NOTE: the original import order is kept on purpose: the star-imports may
# rebind names, so reordering around them could change which binding wins.
from tensorflow import keras
from env import *
from agents.A2C import *
from scipy.special import softmax
import numpy as np
# `tf` is used directly by the training cell; import it explicitly instead of
# relying on it leaking out of one of the star-imports above.
import tensorflow as tf

In [2]:
# Print NumPy floats with 3 digits of precision.
np.set_printoptions(precision=3)

# Agent wrapper; `A2C` is presumably provided by `from agents.A2C import *`
# (TODO confirm). The training cell calls `model.model(state)` and updates
# `model.model.trainable_variables`.
model = A2C()

In [3]:
# --- Rollout buffers -------------------------------------------------------
# Each name is bound to its own, independent empty list.
# The training cell in this notebook appends to and clears these three:
rewards_history, critic_value_history, action_probs_history = [], [], []
# These are not referenced by the training cell visible in this notebook:
action_history, state_history, state_next_history = [], [], []
done_history, episode_reward_history = [], []

# --- Counters --------------------------------------------------------------
# Integers are immutable, so a chained assignment is safe here.
running_reward = episode_count = frame_count = len_episodes = 0

# --- Return computation ----------------------------------------------------
gamma = 0.99  # discount factor applied when accumulating episode returns
# float32 machine epsilon as a plain Python float; keeps the return
# normalisation from dividing by zero.
eps = float(np.finfo(np.float32).eps)

# --- Episode / run sizing --------------------------------------------------
iterations = 300              # number of episodes the training cell runs
max_steps_per_episode = 200   # the training cell iterates range(1, this)
batch_size = 32
num_actions = 4096

# --- Epsilon-greedy / replay settings --------------------------------------
# Not referenced by the training cell visible in this notebook.
epsilon = 1
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
epsilon_random_frames = 50000
epsilon_greedy_frames = 1e6
max_memory_length = 10000
update_after_actions = 4
update_target_network = 100

# --- Optimiser and critic loss ---------------------------------------------
optimizer = keras.optimizers.SGD(
    learning_rate=1e-6,
    decay=1e-6,
    momentum=0.9,
    nesterov=True,
    clipvalue=0.5,
)
loss_function = keras.losses.Huber()



In [4]:
# Actor-critic training loop: play one episode per iteration while recording
# the forward passes on a GradientTape, then apply a single gradient update
# built from the whole episode.
#
# `ChessEnv`, `filter_legal_moves`, `check_legal_move` and `num2move` are not
# defined in the visible cells; presumably they come from the star-imports
# (TODO confirm).
env = ChessEnv()

# A move is drawn uniformly from the legal moves among this many top-ranked
# actions.
top_k_actions = 5

for episode in range(iterations):
    print(episode)

    # Start every episode from empty buffers. If a previous execution of this
    # cell was interrupted mid-episode, stale (and possibly misaligned)
    # entries recorded on an old tape would otherwise leak into this update.
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    state = np.array(env.reset())
    episode_reward = 0
    len_episodes += 1
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            action_probs, critic_value = model.model(state)

            legal_moves_probs = np.array(
                filter_legal_moves(env.board, action_probs[0]))
            top_actions = (-legal_moves_probs).argsort()[:top_k_actions]

            # Check each candidate once and sample uniformly from the legal
            # ones. This yields the same distribution as resampling until a
            # legal move shows up, but cannot spin forever when none of the
            # candidates is legal.
            legal_actions = [
                candidate for candidate in top_actions
                if check_legal_move(env.board, num2move[candidate])
            ]
            if not legal_actions:
                raise RuntimeError(
                    "No legal move among the top-{} ranked actions at "
                    "episode {}, step {}".format(
                        top_k_actions, episode, timestep)
                )
            action = np.random.choice(legal_actions)
            move = num2move[action]

            # Both buffers are appended together so they stay aligned.
            critic_value_history.append(critic_value[0, 0])
            # `eps` keeps the log finite if the probability underflows to 0,
            # which would otherwise turn the loss (and gradients) into NaN.
            action_probs_history.append(
                tf.math.log(action_probs[0, action] + eps))

            state, reward, done, _ = env.step(move)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Exponential moving average of the episode reward (logging only).
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Discounted return for every step, accumulated back-to-front and then
        # put back in chronological order (avoids quadratic insert(0, ...)).
        returns = []
        discounted_sum = 0
        for r in reversed(rewards_history):
            discounted_sum = r + gamma * discounted_sum
            returns.append(discounted_sum)
        returns.reverse()

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in zip(
                action_probs_history, critic_value_history, returns):
            # How much better the return was than the critic's estimate.
            diff = ret - value
            actor_losses.append(-log_prob * diff)
            critic_losses.append(
                loss_function(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        loss_value = sum(actor_losses) + sum(critic_losses)

    # Backpropagation (the tape has recorded everything it needs by now).
    grads = tape.gradient(loss_value, model.model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.model.trainable_variables))

    # Clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))


0
1
2
3
4
5
6
7
8
9
running reward: -34.73 at episode 10
10
11
12
13
14
15
16
17
18
19
running reward: -56.03 at episode 20
20
21
22
23
24
25
26
27
28
29
running reward: -67.91 at episode 30
30
31
32
33
34
35
36
37
38
39
running reward: -75.78 at episode 40
40
41
42
43
44
45
46
47
48
49
running reward: -81.18 at episode 50
50
51
52
53
54
55
56
57
58
59
running reward: -84.07 at episode 60
60
61
62
63
64
65
66
67
68
69
running reward: -84.38 at episode 70
70
71
72
73
74
75
76
77
78
79
running reward: -86.75 at episode 80
80
81
82
83
84
85
86
87
88
89
running reward: -87.58 at episode 90
90
91
92
93
94
95
96
97
98
99
running reward: -88.24 at episode 100
100
101
102
103
104
105
106
107
108
109
running reward: -87.96 at episode 110
110
111
112
113
114
115
116
117
118
119
running reward: -87.63 at episode 120
120
121
122
123
124
125
126
127
128
129
running reward: -87.12 at episode 130
130
131
132
133
134
135
136
137
138
139
running reward: -87.30 at episode 140
140
141
142
143
144
145
146