Commit

Merge remote-tracking branch 'origin/master'
markub3327 committed Dec 22, 2023
2 parents a6709da + 33d4349 commit 2ab325a
Showing 2 changed files with 92 additions and 9 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -147,4 +147,7 @@ wandb/
.vscode/

# ignore save folder
save/
save/

# JetBrains IDEs
.idea/
96 changes: 88 additions & 8 deletions rl_toolkit/networks/models/dueling.py
@@ -1,7 +1,7 @@
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.initializers import Orthogonal, TruncatedNormal
from tensorflow.keras.layers import (
from tensorflow.keras.layers import ( # Flatten,; GlobalAveragePooling1D,; GlobalMaxPooling1D,; Lambda,
Add,
Dense,
Dropout,
@@ -47,7 +47,7 @@ def __init__(
num_heads,
dropout_rate,
attention_dropout_rate,
**kwargs
**kwargs,
):
super(Encoder, self).__init__(**kwargs)

@@ -122,7 +122,7 @@ def __init__(
gamma,
tau,
target_dqn_model=None,
**kwargs
**kwargs,
):
super(DuelingDQN, self).__init__(**kwargs)
self._target_dqn_model_wrapper = (
@@ -142,11 +142,21 @@ def __init__(
for _ in range(num_layers)
]

# Reduce
# self.flatten = Lambda(lambda x: x[:, -1])
# self.flatten = GlobalMaxPooling1D()
# self.flatten = GlobalAveragePooling1D()
# self.flatten = Flatten()
# self.drop_out = Dropout(0.05)

# Output
self.V = Dense(
1,
@@ -165,9 +175,14 @@ def call(self, inputs, training=None):
for layer in self.e_layers:
x = layer(x, training=training)

# Reduce block
# x = self.flatten(x, training=training)
# x = self.drop_out(x, training=training)

# compute value & advantage
@@ -180,7 +195,9 @@ def call(self, inputs, training=None):
return V + A # [B, A]

def get_action(self, state, temperature):
return tf.random.categorical(self(state, training=False) / temperature, 1)[0, 0]
return tf.random.categorical(
self(state, training=False)[:, -1] / temperature, 1
)[0, 0]
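
For illustration, a minimal standalone sketch of the sampling rule used above, assuming the model returns Q-values of shape [batch, time, actions]: the last timestep is kept, divided by a temperature, and used as logits for tf.random.categorical. All numbers are made up.

import tensorflow as tf

# Made-up Q-values for 1 state sequence, 3 timesteps, 4 actions: shape [B, T, A].
q_values = tf.constant([[[0.1, 0.5, 0.2, 0.0],
                         [0.3, 0.1, 0.4, 0.2],
                         [1.0, 2.0, 0.5, 0.1]]])
temperature = 0.5

# Keep only the last timestep ([B, A]) and treat Q / temperature as logits.
logits = q_values[:, -1] / temperature
action = tf.random.categorical(logits, 1)[0, 0]  # scalar action index
print(int(action))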

def _update_target(self):
for source_weight, target_weight in zip(
@@ -190,6 +207,53 @@ def _update_target(self):
self.tau * source_weight + (1.0 - self.tau) * target_weight
)
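
The partially collapsed _update_target above performs a soft (Polyak) target update, target <- tau * source + (1 - tau) * target. A minimal sketch with made-up weights:

import tensorflow as tf

tau = 0.01
source_weight = tf.Variable([1.0, 2.0])  # online-network weight (made-up values)
target_weight = tf.Variable([0.0, 0.0])  # target-network weight (made-up values)

# Soft update: move the target a small step toward the online weights.
target_weight.assign(tau * source_weight + (1.0 - tau) * target_weight)
print(target_weight.numpy())  # [0.01 0.02]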

def _compute_n_step_rewards(
self, rewards, discount_factor, next_state_value, is_terminal
):
n = tf.shape(rewards)[1]
rewards = tf.squeeze(rewards, axis=-1)

# Create a discount factor tensor
discounts = discount_factor ** tf.range(n + 1, dtype=rewards.dtype)
# print(f"discounts: {discounts}")

# Pad the rewards tensor to ensure proper handling of the last elements in each sequence
padded_rewards = tf.pad(rewards, [[0, 0], [0, n - 1]])
# print(f"padded_rewards: {padded_rewards}")

# Create a sliding window view of the padded_rewards
windows = tf.TensorArray(
dtype=rewards.dtype,
size=n,
# element_shape=(tf.shape(rewards)[0], tf.shape(rewards)[1]),
)

for i in tf.range(n):
value = tf.slice(padded_rewards, [0, i], [-1, tf.shape(rewards)[1]])
# print(f"value: {value}")
windows = windows.write(i, value)

# Stack the windows into a single tensor
rewards_windows = tf.transpose(windows.stack(), [1, 0, 2])
# print(f"rewards_windows: {rewards_windows}")

# Multiply each window by the corresponding discount factor
discounted_windows = rewards_windows * discounts[:-1]
# print(f"discounted_windows: {discounted_windows}")

# Sum along the time axis to get the n-step rewards
n_step_rewards = tf.reduce_sum(discounted_windows, axis=-1)
# print(f"n_step_rewards: {n_step_rewards}")

# Add the next state value with discount
n_step_rewards += (
(1.0 - is_terminal)
* tf.reverse(discounts[1:], axis=[0])[tf.newaxis, :]
* next_state_value[:, tf.newaxis]
)

return n_step_rewards
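
As a reading aid, a plain-Python sketch of the per-position quantity this method appears to compute for a single window, assuming rewards of shape [batch, n], one bootstrap value per sequence, and a per-position terminal flag: position i gets the discounted reward sum from i to the end of the window, plus gamma**(n - i) times the (non-terminal) next-state value. All numbers below are made up.

# Made-up single window: n = 3 rewards, discount, bootstrap value, terminal flag.
rewards = [1.0, 2.0, 3.0]
gamma = 0.9
next_value = 10.0   # value estimate of the state after the window
terminal = 0.0      # 1.0 would drop the bootstrap term

n = len(rewards)
n_step = []
for i in range(n):
    # Discounted rewards from position i to the end of the window ...
    g = sum(gamma ** k * rewards[i + k] for k in range(n - i))
    # ... plus the bootstrap value discounted over the remaining n - i steps.
    g += (1.0 - terminal) * gamma ** (n - i) * next_value
    n_step.append(g)

print(n_step)  # position 0: 1 + 0.9*2 + 0.81*3 + 0.729*10 = 12.52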

def train_step(self, sample):
# Set dtype
ext_reward = tf.cast(sample.data["ext_reward"], dtype=self.dtype)
@@ -199,13 +263,29 @@ def train_step(self, sample):
next_Q = self._target_dqn_model_wrapper(
sample.data["next_observation"], training=False
)
next_Q = tf.reduce_max(next_Q, axis=-1)
next_Q = tf.reduce_max(next_Q, axis=-1)[:, -1]

# get targets
targets = self(sample.data["observation"])
indices = tf.range(tf.shape(targets)[0], dtype=sample.data["action"].dtype)
indices = tf.transpose([indices, sample.data["action"]])
updates = ext_reward[:, -1] + (1.0 - terminal[:, -1]) * self.gamma * next_Q
# indices = tf.range(tf.shape(targets)[0], dtype=sample.data["action"].dtype)
# indices = tf.transpose([indices, sample.data["action"]])
# updates = ext_reward[:, -1] + (1.0 - terminal[:, -1]) * self.gamma * next_Q
a = tf.repeat(
tf.expand_dims(
tf.range(tf.shape(targets)[0], dtype=sample.data["action"].dtype), 1
),
tf.shape(targets)[1],
axis=-1,
)
b = tf.repeat(
tf.expand_dims(
tf.range(tf.shape(targets)[1], dtype=sample.data["action"].dtype), 0
),
tf.shape(targets)[0],
axis=0,
)
indices = tf.stack([a, b, sample.data["action"]], axis=-1)
updates = self._compute_n_step_rewards(ext_reward, self.gamma, next_Q, terminal)
targets = tf.stop_gradient(
tf.tensor_scatter_nd_update(targets, indices, updates)
)
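
A minimal sketch of the index construction used above, with made-up shapes: batch and time index grids are stacked with the chosen action so that tf.tensor_scatter_nd_update overwrites exactly one Q-value per (batch, time) position of a [batch, time, actions] tensor; a constant stands in for the n-step targets.

import tensorflow as tf

B, T, A = 2, 3, 4                            # made-up batch, time, action sizes
targets = tf.zeros([B, T, A])
actions = tf.constant([[0, 2, 1],
                       [3, 3, 0]])           # chosen action per (batch, time)
updates = tf.fill([B, T], 9.0)               # stand-in for the n-step targets

# Index grids over batch (a) and time (b), stacked with the action -> [B, T, 3].
a = tf.repeat(tf.expand_dims(tf.range(B), 1), T, axis=-1)
b = tf.repeat(tf.expand_dims(tf.range(T), 0), B, axis=0)
indices = tf.stack([a, b, actions], axis=-1)

# Writes updates[i, j] into targets[i, j, actions[i, j]]; other entries unchanged.
print(tf.tensor_scatter_nd_update(targets, indices, updates))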
