In [1]:
import tensorflow as tf
import numpy as np

n_actions = 5

model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(shape=[4]),
    tf.keras.layers.Dense(5, activation="sigmoid",
                          kernel_regularizer=tf.keras.regularizers.l2(0.01))
])

In [2]:
def get_grads(model, inputs):
    with tf.GradientTape() as tape:
        output = model(inputs)
        action = int(tf.argmax(output, axis=1))
        print(f"{action = }")
        target = tf.constant(1.)
        loss = tf.reduce_mean(target - output[0, action]) ** 2
    grads = tape.gradient(loss, model.trainable_variables)
    return grads

inputs = np.array([[1.3, 2.1, -0.3, 4.2]], dtype=np.float32)
grad = get_grads(model, inputs)
# grad

action = 3


In [3]:
def play_one_step(env, obs, model, loss_fn):
    prisoner_inputs = tf.convert_to_tensor([obs["prisoner"]])
    with tf.GradientTape() as tape:
        output = model(prisoner_inputs)
        probabilities = np.array([0.075, 0.075, 0.075, 0.075, 0.075])
        predicted_action = int(tf.argmax(output, axis=1))
        # print(f"{predicted_action = }")
        # add randomness for exploration:
        probabilities[predicted_action] = 0.7
        possible_actions = np.array([0, 1, 2, 3, 4])
        prisoner_action = np.random.choice(possible_actions, p=probabilities)
        
        target = tf.constant([1.])
        loss = tf.reduce_mean(loss_fn(target, [output[0, prisoner_action]]))
    grads = tape.gradient(loss, model.trainable_variables)

    actions = {"prisoner": prisoner_action}
    # print(f"{actions = }")
    obs, rewards, term, trunc, infos = env.step(actions)
    return obs, rewards, term, trunc, grads

def play_one_episode(env, model, loss_fn):
    obs, info = env.reset()
    # obs, info = env.reset(seed=24)
    rewards_list, grads_list = [], []

    while env.agents:
        obs, rew, term, trunc, grads = play_one_step(env, obs, model, loss_fn)
        rewards_list.append(rew["prisoner"])
        grads_list.append(grads)
    return rewards_list, grads_list

def discount_rewards(rewards, discount_factor):
    """Функция возвращает отдачу (return)"""
    discounted = np.array(rewards)
    for step in range(len(rewards)-2, -1 , -1):
        discounted[step] += discounted[step+1] * discount_factor
    return discounted

def get_final_grads(grads, discounted_rewards):
    mean_grads = []
    for layer in range(len(grads[0])):
        layer_grads = []
        for iter in range(len(grads)):
            layer_grads.append(grads[iter][layer] * discounted_rewards[iter])
        mean_grads.append(tf.reduce_mean(layer_grads, axis=0))
    return mean_grads

In [4]:
from escape import env

n_episodes = 1000000
max_cycles = 202
discount_factor = 0.92
lr = 0.03
loss_fn = tf.keras.losses.mse
optimizer = tf.optimizers.Adam(learning_rate=lr)

pygame 2.3.0 (SDL 2.24.2, Python 3.11.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [5]:
environment = env(render_mode=None, max_cycles=max_cycles)
for i in range(n_episodes):
    # print(f"\r{(100 * i / n_episodes)}%", end='')
    print(f"\r{(i)}", end='')
    rewards, grads = play_one_episode(environment, model, loss_fn)
    dr = discount_rewards(rewards, discount_factor)
    mean_grads = get_final_grads(grads, dr)
    optimizer.apply_gradients(zip(mean_grads, model.trainable_variables))

0

  warn("You are calling render method without specifying any render mode. "


96

KeyboardInterrupt: 

In [11]:
environment = env(render_mode="human", max_cycles=200)
r, g = play_one_episode(environment, model, loss_fn)

Game Over!


In [14]:
r

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 19,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 19,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 19,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 19,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -21,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,


In [13]:
dr = discount_rewards(r, 0.92)
dr

array([ -1,   0,   2,   4,   6,   8,  10,  12,  15,  18,  -1,   0,   2,
         4,   6,   8,  10,  12,  15,  18,  -1,   0,   1,   3,   5,   7,
         9,  11,  14,  17,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,  -2,
        -2, -23, -24, -26, -28, -30, -32, -34, -36, -39, -42, -23, -24,
       -26, -28, -30, -32, -34, -36, -39, -42, -23, -24, -26, -28, -30,
       -32, -34, -36, -39, -42, -23, -24, -26, -28, -30, -32, -34, -36,
       -39, -42, -23, -24, -26, -28, -30, -32, -34, -36, -39, -42, -23,
       -24, -26, -28, -30, -32, -34, -36, -39, -42, -23, -24, -26, -28,
       -30, -32, -34, -36, -39, -42, -23, -24, -26, -28, -30, -32, -34,
       -36, -39, -42, -23, -24, -26, -28, -30, -32, -34, -36, -39, -42,
       -23, -24, -26, -28, -30, -32, -34, -36, -39, -42, -23, -24, -26,
       -28, -30, -32, -34, -36, -39, -42, -23, -24, -26, -28, -30, -32,
       -34, -36, -39, -42, -23, -24, -25, -27, -29, -31, -33, -35, -38,
       -41, -22, -23, -24, -25, -27, -29, -31, -33, -35, -38, -1