Commit

Merge remote-tracking branch 'origin/master'
markub3327 committed Dec 22, 2023
2 parents a6709da + 33d4349 commit 2ab325a
Showing 2 changed files with 92 additions and 9 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -147,4 +147,7 @@ wandb/
.vscode/

# ignore save folder
save/
save/

# JetBrains IDEs
.idea/
96 changes: 88 additions & 8 deletions rl_toolkit/networks/models/dueling.py
@@ -1,7 +1,7 @@
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.initializers import Orthogonal, TruncatedNormal
from tensorflow.keras.layers import (
from tensorflow.keras.layers import ( # Flatten,; GlobalAveragePooling1D,; GlobalMaxPooling1D,; Lambda,
Add,
Dense,
Dropout,
@@ -47,7 +47,7 @@ def __init__(
num_heads,
dropout_rate,
attention_dropout_rate,
**kwargs
**kwargs,
):
super(Encoder, self).__init__(**kwargs)

@@ -122,7 +122,7 @@ def __init__(
gamma,
tau,
target_dqn_model=None,
**kwargs
**kwargs,
):
super(DuelingDQN, self).__init__(**kwargs)
self._target_dqn_model_wrapper = (
@@ -142,11 +142,21 @@ def __init__(
for _ in range(num_layers)
]

# Reduce
# self.flatten = Lambda(lambda x: x[:, -1])
# self.flatten = GlobalMaxPooling1D()
# self.flatten = GlobalAveragePooling1D()
# self.flatten = Flatten()
# self.drop_out = Dropout(0.05)

# Output
self.V = Dense(
1,
@@ -165,9 +175,14 @@ def call(self, inputs, training=None):
for layer in self.e_layers:
x = layer(x, training=training)

# Reduce block
# x = self.flatten(x, training=training)
# x = self.drop_out(x, training=training)

# compute value & advantage
@@ -180,7 +195,9 @@ def call(self, inputs, training=None):
return V + A # [B, A]

def get_action(self, state, temperature):
return tf.random.categorical(self(state, training=False) / temperature, 1)[0, 0]
return tf.random.categorical(
self(state, training=False)[:, -1] / temperature, 1
)[0, 0]
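
For illustration, a minimal standalone sketch of the sampling rule used above, assuming the model returns Q-values of shape [batch, time, actions]: the last timestep is kept, divided by a temperature, and used as logits for tf.random.categorical. All numbers are made up.

import tensorflow as tf

# Made-up Q-values for 1 state sequence, 3 timesteps, 4 actions: shape [B, T, A].
q_values = tf.constant([[[0.1, 0.5, 0.2, 0.0],
                         [0.3, 0.1, 0.4, 0.2],
                         [1.0, 2.0, 0.5, 0.1]]])
temperature = 0.5

# Keep only the last timestep ([B, A]) and treat Q / temperature as logits.
logits = q_values[:, -1] / temperature
action = tf.random.categorical(logits, 1)[0, 0]  # scalar action index
print(int(action))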

def _update_target(self):
for source_weight, target_weight in zip(
@@ -190,6 +207,53 @@ def _update_target(self):
self.tau * source_weight + (1.0 - self.tau) * target_weight
)
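
The partially collapsed _update_target above performs a soft (Polyak) target update, target <- tau * source + (1 - tau) * target. A minimal sketch with made-up weights:

import tensorflow as tf

tau = 0.01
source_weight = tf.Variable([1.0, 2.0])  # online-network weight (made-up values)
target_weight = tf.Variable([0.0, 0.0])  # target-network weight (made-up values)

# Soft update: move the target a small step toward the online weights.
target_weight.assign(tau * source_weight + (1.0 - tau) * target_weight)
print(target_weight.numpy())  # [0.01 0.02]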

def _compute_n_step_rewards(
self, rewards, discount_factor, next_state_value, is_terminal
):
n = tf.shape(rewards)[1]
rewards = tf.squeeze(rewards, axis=-1)

# Create a discount factor tensor
discounts = discount_factor ** tf.range(n + 1, dtype=rewards.dtype)
# print(f"discounts: {discounts}")

# Pad the rewards tensor to ensure proper handling of the last elements in each sequence
padded_rewards = tf.pad(rewards, [[0, 0], [0, n - 1]])
# print(f"padded_rewards: {padded_rewards}")

# Create a sliding window view of the padded_rewards
windows = tf.TensorArray(
dtype=rewards.dtype,
size=n,
# element_shape=(tf.shape(rewards)[0], tf.shape(rewards)[1]),
)

for i in tf.range(n):
value = tf.slice(padded_rewards, [0, i], [-1, tf.shape(rewards)[1]])
# print(f"value: {value}")
windows = windows.write(i, value)

# Stack the windows into a single tensor
rewards_windows = tf.transpose(windows.stack(), [1, 0, 2])
# print(f"rewards_windows: {rewards_windows}")

# Multiply each window by the corresponding discount factor
discounted_windows = rewards_windows * discounts[:-1]
# print(f"discounted_windows: {discounted_windows}")

# Sum along the time axis to get the n-step rewards
n_step_rewards = tf.reduce_sum(discounted_windows, axis=-1)
# print(f"n_step_rewards: {n_step_rewards}")

# Add the next state value with discount
n_step_rewards += (
(1.0 - is_terminal)
* tf.reverse(discounts[1:], axis=[0])[tf.newaxis, :]
* next_state_value[:, tf.newaxis]
)

return n_step_rewards
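
As a reading aid, a plain-Python sketch of the per-position quantity this method appears to compute for a single window, assuming rewards of shape [batch, n], one bootstrap value per sequence, and a per-position terminal flag: position i gets the discounted reward sum from i to the end of the window, plus gamma**(n - i) times the (non-terminal) next-state value. All numbers below are made up.

# Made-up single window: n = 3 rewards, discount, bootstrap value, terminal flag.
rewards = [1.0, 2.0, 3.0]
gamma = 0.9
next_value = 10.0   # value estimate of the state after the window
terminal = 0.0      # 1.0 would drop the bootstrap term

n = len(rewards)
n_step = []
for i in range(n):
    # Discounted rewards from position i to the end of the window ...
    g = sum(gamma ** k * rewards[i + k] for k in range(n - i))
    # ... plus the bootstrap value discounted over the remaining n - i steps.
    g += (1.0 - terminal) * gamma ** (n - i) * next_value
    n_step.append(g)

print(n_step)  # position 0: 1 + 0.9*2 + 0.81*3 + 0.729*10 = 12.52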

def train_step(self, sample):
# Set dtype
ext_reward = tf.cast(sample.data["ext_reward"], dtype=self.dtype)
@@ -199,13 +263,29 @@ def train_step(self, sample):
next_Q = self._target_dqn_model_wrapper(
sample.data["next_observation"], training=False
)
next_Q = tf.reduce_max(next_Q, axis=-1)
next_Q = tf.reduce_max(next_Q, axis=-1)[:, -1]

# get targets
targets = self(sample.data["observation"])
indices = tf.range(tf.shape(targets)[0], dtype=sample.data["action"].dtype)
indices = tf.transpose([indices, sample.data["action"]])
updates = ext_reward[:, -1] + (1.0 - terminal[:, -1]) * self.gamma * next_Q
# indices = tf.range(tf.shape(targets)[0], dtype=sample.data["action"].dtype)
# indices = tf.transpose([indices, sample.data["action"]])
# updates = ext_reward[:, -1] + (1.0 - terminal[:, -1]) * self.gamma * next_Q
a = tf.repeat(
tf.expand_dims(
tf.range(tf.shape(targets)[0], dtype=sample.data["action"].dtype), 1
),
tf.shape(targets)[1],
axis=-1,
)
b = tf.repeat(
tf.expand_dims(
tf.range(tf.shape(targets)[1], dtype=sample.data["action"].dtype), 0
),
tf.shape(targets)[0],
axis=0,
)
indices = tf.stack([a, b, sample.data["action"]], axis=-1)
updates = self._compute_n_step_rewards(ext_reward, self.gamma, next_Q, terminal)
targets = tf.stop_gradient(
tf.tensor_scatter_nd_update(targets, indices, updates)
)
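
A minimal sketch of the index construction used above, with made-up shapes: batch and time index grids are stacked with the chosen action so that tf.tensor_scatter_nd_update overwrites exactly one Q-value per (batch, time) position of a [batch, time, actions] tensor; a constant stands in for the n-step targets.

import tensorflow as tf

B, T, A = 2, 3, 4                            # made-up batch, time, action sizes
targets = tf.zeros([B, T, A])
actions = tf.constant([[0, 2, 1],
                       [3, 3, 0]])           # chosen action per (batch, time)
updates = tf.fill([B, T], 9.0)               # stand-in for the n-step targets

# Index grids over batch (a) and time (b), stacked with the action -> [B, T, 3].
a = tf.repeat(tf.expand_dims(tf.range(B), 1), T, axis=-1)
b = tf.repeat(tf.expand_dims(tf.range(T), 0), B, axis=0)
indices = tf.stack([a, b, actions], axis=-1)

# Writes updates[i, j] into targets[i, j, actions[i, j]]; other entries unchanged.
print(tf.tensor_scatter_nd_update(targets, indices, updates))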
