In [75]:
import tensorflow as tf
import numpy as np

channel_width = 8
n_actions = 5

In [76]:
@tf.custom_gradient
def binarization(x):
    def grad(upstream ):       # STE gradient approximation
        return upstream
    return tf.cast(x >= 0.5, tf.float32), grad

class BinarizationLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        return binarization(inputs)

In [77]:
def get_prompter_nn(channel_width):
    prompter_nn = tf.keras.Sequential()
    prompter_nn.add(tf.keras.layers.InputLayer(shape=[4]))
    prompter_nn.add(tf.keras.layers.Dense(6, activation="elu",
                                          kernel_regularizer=tf.keras.regularizers.l2(0.01)))
    prompter_nn.add(tf.keras.layers.Dense(channel_width, activation="sigmoid",
                                          kernel_regularizer=tf.keras.regularizers.l2(0.01)))
    # prompter_nn.add(BinarizationLayer())
    return prompter_nn

def get_prisoner_nn(channel_width, n_actions):
    prisoner_nn = tf.keras.Sequential()
    prisoner_nn.add(tf.keras.layers.InputLayer(shape=[4+channel_width]))
    prisoner_nn.add(tf.keras.layers.Dense(channel_width, activation="elu",
                                          kernel_regularizer=tf.keras.regularizers.l2(0.01)))
    prisoner_nn.add(tf.keras.layers.Dense(n_actions, activation="sigmoid",
                                          kernel_regularizer=tf.keras.regularizers.l2(0.01)))
    return prisoner_nn
 
class CombinedModel(tf.keras.Model):
    def __init__(self, channel_width, n_actions, **kwargs):
        super().__init__(**kwargs)
        self.prompter_nn = get_prompter_nn(channel_width)
        self.prisoner_nn = get_prisoner_nn(channel_width, n_actions)
        
    def call(self, prompter_inputs, additional_inputs):
        message = self.prompter_nn(prompter_inputs)
        prisoner_input = tf.concat([message, additional_inputs], axis=1)
        return self.prisoner_nn(prisoner_input)

In [78]:
def get_grads(model, prompter_inputs, additional_inputs):
    with tf.GradientTape() as tape:
        output = model(prompter_inputs, additional_inputs)
        action = int(tf.argmax(output, axis=1))
        print(f"{action = }")
        target = tf.constant(1.)
        loss = tf.reduce_mean(target - output[0, action]) ** 2
    grads = tape.gradient(loss, model.trainable_variables)
    return grads

prompter_inputs = np.array([[1.3, 1.1, -0.3, 4.2]], dtype=np.float32)
additional_inputs = np.array([[1.1, -2.4, 2.4, 1.0]], dtype=np.float32)

model = CombinedModel(channel_width, n_actions)
model.call(prompter_inputs, additional_inputs)

grad = get_grads(model, prompter_inputs, additional_inputs)
grad

action = 4


[<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
 array([[ 3.1263661e-04, -1.5301888e-05, -2.2717851e-04, -1.0546765e-05,
          3.5749219e-04,  8.6789987e-05],
        [ 2.6453866e-04, -1.2947752e-05, -1.9222798e-04, -8.9241867e-06,
          3.0249340e-04,  7.3437688e-05],
        [-7.2146911e-05,  3.5312050e-06,  5.2425810e-05,  2.4338690e-06,
         -8.2498205e-05, -2.0028459e-05],
        [ 1.0100567e-03, -4.9436869e-05, -7.3396129e-04, -3.4074164e-05,
          1.1549748e-03,  2.8039841e-04]], dtype=float32)>,
 <tf.Tensor: shape=(6,), dtype=float32, numpy=
 array([ 2.4048969e-04, -1.1770683e-05, -1.7475270e-04, -8.1128965e-06,
         2.7499400e-04,  6.6761531e-05], dtype=float32)>,
 <tf.Tensor: shape=(6, 8), dtype=float32, numpy=
 array([[-2.6073697e-04,  1.9458523e-04, -2.0404528e-04, -2.0504031e-04,
         -4.8614066e-04,  1.2168771e-03, -6.9807297e-06, -6.3336879e-04],
        [ 1.7673551e-04, -1.3189584e-04,  1.3830815e-04,  1.3898262e-04,
          3.2952105e-04, -8

In [79]:
0.3 / 4

0.075

In [80]:
def play_one_step(env, obs, model, loss_fn):
    prompter_inputs = tf.convert_to_tensor([obs["prompter"]])
    prisoner_inputs = tf.convert_to_tensor([obs["prisoner"][:4]])
    with tf.GradientTape() as tape:
        # print(f"{prompter_inputs = }")
        # print(f"{prisoner_inputs = }")
        output = model(prompter_inputs, prisoner_inputs)
        probabilities = np.array([0.075, 0.075, 0.075, 0.075, 0.075])
        predicted_action = int(tf.argmax(output, axis=1))
        # print(f"{predicted_action = }")
        # add randomness for exploration:
        probabilities[predicted_action] = 0.7
        possible_actions = np.array([0, 1, 2, 3, 4])
        prisoner_action = np.random.choice(possible_actions, p=probabilities)
        
        target = tf.constant([1.])
        loss = tf.reduce_mean(loss_fn(target, [output[0, prisoner_action]]))
    grads = tape.gradient(loss, model.trainable_variables)

    actions = {"prompter": np.array(output[0]), # !!!!!!!!!!!!!!!!!!!!!!!!!!! mistake
               "prisoner": prisoner_action}
    # print(f"{actions = }")
    obs, rewards, term, trunc, infos = env.step(actions)
    return obs, rewards, term, trunc, grads

In [81]:
def play_one_episode(env, model, loss_fn):
    obs, info = env.reset(seed=24)
    rewards_list, grads_list = [], []

    while env.agents:
        obs, rew, term, trunc, grads = play_one_step(env, obs, model, loss_fn)
        rewards_list.append(rew["prisoner"])
        grads_list.append(grads)
    return rewards_list, grads_list

In [82]:
def discount_rewards(rewards, discount_factor):
    """Функция возвращает отдачу (return)"""
    discounted = np.array(rewards)
    for step in range(len(rewards)-2, -1 , -1):
        discounted[step] += discounted[step+1] * discount_factor
    return discounted

In [83]:
def get_final_grads(grads, discounted_rewards):
    mean_grads = []
    for layer in range(len(grads[0])):
        layer_grads = []
        for iter in range(len(grads)):
            layer_grads.append(grads[iter][layer] * discounted_rewards[iter])
        mean_grads.append(tf.reduce_mean(layer_grads, axis=0))
    return mean_grads

Training

In [84]:
from prisoner_guard_prompter import env

n_episodes = 1000000
max_cycles = 202
discount_factor = 0.92
lr = 0.03
loss_fn = tf.keras.losses.mse
optimizer = tf.optimizers.Adam(learning_rate=lr)

In [85]:
from IPython.display import clear_output

In [86]:
mean_grads

[<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
 array([[-1.11939715e-14,  3.73603544e-11,  1.43755632e-10,
          4.42829828e-13, -1.41134293e-09, -2.64765265e-10],
        [ 9.10806566e-15, -3.03985587e-11, -1.16967963e-10,
         -3.60312068e-13,  1.14835041e-09,  2.15428425e-10],
        [-1.11341888e-14,  3.71608369e-11,  1.42987899e-10,
          4.40464939e-13, -1.40380563e-09, -2.63351313e-10],
        [ 7.42094374e-15, -2.47677174e-11, -9.53015375e-11,
         -2.93570100e-13,  9.35637234e-10,  1.75523790e-10]], dtype=float32)>,
 <tf.Tensor: shape=(6,), dtype=float32, numpy=
 array([-5.8410450e-16,  1.9494742e-12,  7.5012097e-12,  2.3106986e-14,
        -7.3644271e-11, -1.3815527e-11], dtype=float32)>,
 <tf.Tensor: shape=(6, 8), dtype=float32, numpy=
 array([[ 1.44232460e-27,  2.95613307e-17, -3.22681798e-10,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  8.27560437e-21],
        [ 1.41879243e-27,  2.90790216e-17, -3.17417065e-10,

In [87]:
environment = env(render_mode=None, max_cycles=max_cycles)
for i in range(n_episodes):
    # clear_output()
    # print(f"\r{(100 * i / n_episodes)}%", end='')
    print(f"\r{(i)}", end='')
    rewards, grads = play_one_episode(environment, model, loss_fn)
    dr = discount_rewards(rewards, discount_factor)
    mean_grads = get_final_grads(grads, dr)
    optimizer.apply_gradients(zip(mean_grads, model.trainable_variables))

0

10545

KeyboardInterrupt: 

In [88]:
environment = env(render_mode="human", max_cycles=200)
r, g = play_one_episode(environment, model, loss_fn)

Game Over!


In [None]:
dr = discount_rewards(r, 0.92)

In [None]:
g

[[<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
  array([[ 1.5977242e-21,  0.0000000e+00, -1.6976556e-28,  0.0000000e+00,
           5.4376038e-21,  1.1134527e-21],
         [-1.3000013e-21,  0.0000000e+00,  1.3813113e-28,  0.0000000e+00,
          -4.4243504e-21, -9.0596977e-22],
         [ 1.5891916e-21,  0.0000000e+00, -1.6885894e-28,  0.0000000e+00,
           5.4085645e-21,  1.1075064e-21],
         [-1.0591971e-21,  0.0000000e+00,  1.1254457e-28,  0.0000000e+00,
          -3.6048111e-21, -7.3815358e-22]], dtype=float32)>,
  <tf.Tensor: shape=(6,), dtype=float32, numpy=
  array([ 8.3369698e-23,  0.0000000e+00, -8.8584148e-30, -0.0000000e+00,
          2.8373570e-22,  5.8100275e-23], dtype=float32)>,
  <tf.Tensor: shape=(6, 8), dtype=float32, numpy=
  array([[ 0.0000000e+00,  0.0000000e+00, -8.5607143e-21,  0.0000000e+00,
           0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
         [ 0.0000000e+00,  0.0000000e+00,  3.6546929e-22,  0.0000000e+00,
           

In [None]:
get_final_grads(g, dr)

[<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
 array([[-1.2830597e-03,  5.0238679e-05,  1.0197418e-03,  3.9262659e-07,
         -2.4073037e-07,  1.7307951e-03],
        [ 9.2466793e-04, -3.6205714e-05, -7.3490146e-04, -2.8295582e-07,
          1.7348813e-07, -1.2473393e-03],
        [ 7.1732164e-04, -2.8086990e-05, -5.7010795e-04, -2.1950616e-07,
          1.3458538e-07, -9.6763752e-04],
        [ 1.2648639e-03, -4.9526217e-05, -1.0052802e-03, -3.8705858e-07,
          2.3731647e-07, -1.7062499e-03]], dtype=float32)>,
 <tf.Tensor: shape=(6,), dtype=float32, numpy=
 array([ 1.4385676e-04, -5.6327654e-06, -1.1433348e-04, -4.4021327e-08,
         2.6990703e-08, -1.9405695e-04], dtype=float32)>,
 <tf.Tensor: shape=(6, 8), dtype=float32, numpy=
 array([[-1.29423497e-05, -3.96198273e-04, -6.23300322e-04,
         -3.59989781e-06, -1.75546695e-04, -1.85411365e-03,
         -1.30880746e-08, -1.25926235e-05],
        [ 4.05330593e-06,  1.24082057e-04,  1.95206303e-04,
          1.12742259e-0

In [None]:
dr = discount_rewards(r, 0.92)
print(dr)

[-23 -24 -26 -28 -30 -32 -34 -36 -39 -42 -23 -24 -26 -28 -30 -32 -34 -36
 -39 -42 -23 -24 -26 -28 -30 -32 -34 -36 -39 -42 -23 -24 -26 -28 -30 -32
 -34 -36 -39 -42 -23 -24 -26 -28 -30 -32 -34 -36 -39 -42 -23 -24 -26 -28
 -30 -32 -34 -36 -39 -42 -23 -24 -26 -28 -30 -32 -34 -36 -39 -42 -23 -24
 -26 -28 -30 -32 -34 -36 -39 -42 -23 -24 -26 -28 -30 -32 -34 -36 -39 -42
 -23 -24 -26 -28 -30 -32 -34 -36 -39 -42 -23 -24 -26 -28 -30 -32 -34 -36
 -39 -42 -23 -24 -25 -27 -29 -31 -33 -35 -38 -41 -22 -23 -24 -25 -27 -29
 -31 -33 -35 -38 -19 -20 -21 -22 -23 -24 -26 -28 -30 -32 -12 -13 -14 -15
 -16 -17 -18 -19 -20 -21  -1  -1  -1  -1  -1  -1  -1  -1  -1]


Умножаем градиенты на соответствующие отдачи

In [None]:
for i in range(len(grads)):
    for j in range(len(grads[i])):
        grads[i][j] = grads[i][j] * dr[i]

grads[0][0]

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[ 2.30569586e-09,  1.54859122e-12,  9.13172316e-11,
         7.04336728e-11,  1.01246830e-10,  2.48841225e-10],
       [-7.96938293e-10, -5.35253482e-13, -3.15627940e-11,
        -2.43446218e-11, -3.49948473e-11, -8.60092206e-11],
       [ 3.37101014e-09,  2.26409603e-12,  1.33509079e-10,
         1.02976565e-10,  1.48026508e-10,  3.63814839e-10],
       [-3.24701321e-10, -2.18081480e-13, -1.28598165e-11,
        -9.91887388e-12, -1.42581589e-11, -3.50432461e-11]], dtype=float32)>

In [None]:
mean_grads = []
for layer in range(len(grads[0])):
    layer_grads = []
    for iter in range(len(grads)):
        layer_grads.append(grads[iter][layer])
    mean_grads.append(tf.reduce_mean(layer_grads, axis=0))
        

In [None]:
len(mean_grads)

8

In [None]:
mean_grads

[<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
 array([[ 3.0409288e-05, -3.6841658e-07, -2.7900282e-07, -3.2921366e-06,
         -4.4111501e-05, -1.0284467e-04],
        [ 6.7294008e-05, -6.4309103e-07,  4.5435055e-08, -3.4138859e-05,
         -5.8305024e-05, -8.6672961e-05],
        [ 2.2426181e-04, -2.3306568e-06, -5.3367239e-07, -8.7366127e-05,
         -2.3621433e-04, -4.3559779e-04],
        [-2.1601272e-05,  2.2449275e-07,  5.1404189e-08,  8.4152498e-06,
          2.2752558e-05,  4.1957508e-05]], dtype=float32)>,
 <tf.Tensor: shape=(6,), dtype=float32, numpy=
 array([ 1.5011081e-05, -1.5600372e-07, -3.5721637e-08, -5.8478959e-06,
        -1.5811131e-05, -2.9156952e-05], dtype=float32)>,
 <tf.Tensor: shape=(6, 8), dtype=float32, numpy=
 array([[-3.7275374e-04,  1.9536335e-04, -6.9409261e-06,  2.3865106e-05,
         -5.6443972e-05,  1.3124802e-05, -1.5866302e-04, -4.4128194e-04],
        [ 4.4301865e-05, -2.1059954e-05,  8.5208967e-07, -2.7984820e-06,
          6.5909130e-06, -1

In [None]:
optimizer = tf.optimizers.Adam(learning_rate=0.01)

In [None]:
optimizer.apply_gradients(zip(mean_grads, model.trainable_variables))

<KerasVariable shape=(), dtype=int64, path=adam/iteration>