In [3]:
import numpy as np
import random

SumTree

In [4]:
class SumTree:
    def __init__(self, capacity):
        self.capacity = capacity  # Set the capacity of the SumTree
        self.tree = np.zeros(2 * capacity - 1)   # Initialize the tree with zeros
        self.data = np.zeros(capacity, dtype=object)  # Initialize the data array
        self.write = 0  # Initialize the write pointer
        self.n_entries = 0

    def _propagate(self, idx, change):
        change = np.real(change)
        parent = (idx - 1) // 2  # Calculate the parent index

        self.tree[parent] += change  # Update the parent's value

        if parent != 0:
            self._propagate(parent, change)  # Recursively propagate the change upwards

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        right = left + 1

        if left >= len(self.tree):  # If we're at a leaf node
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]  # Return the total sum (root of the tree)

    def add(self, priority, data):
        priority = abs(np.real(priority))  # to ensure priority is real and non-negative value
        idx = self.write + self.capacity - 1  # Calculate the index to write to

        self.data[self.write] = data
        self.update(idx, priority)

        self.write += 1  # Move the write pointer
        if self.write >= self.capacity:  # If we've reached the end
            self.write = 0  # Reset the write pointer (circular buffer)

        if self.n_entries < self.capacity:
            self.n_entries += 1

    def update(self, idx, priority):
        priority = abs(np.real(priority))  # to ensure priority is real and non-negative value
        change = priority - self.tree[idx]  # Calculate the change in priority

        self.tree[idx] = priority  # Update the priority
        self._propagate(idx, change)  # Propagate the change upwards

    def get(self, s):
        idx = self._retrieve(0, s)  # Get the index for the given sum
        dataIdx = idx - self.capacity + 1  # Calculate the data index

        return idx, self.tree[idx], self.data[dataIdx]  # Return index, priority, and data

Memory

In [5]:
class Memory:
    epsilon = 1e-5
    alpha = 0.8  # Controls how much prioritization is used
    beta = 0.3
    beta_increment_per_sampling = 0.0005

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        priority = (np.abs(error) + self.epsilon) ** self.alpha
        return priority

    def _get_reward_factor(self, reward):
        reward_factor = (reward + 1e-5) ** (self.alpha * 0.5)  # Scale reward influence
        return reward_factor

    def add(self, error, reward, sample):
        # Prioritize based on a combination of TD-error and reward
        priority = self._get_priority(error)
        reward_factor = self._get_reward_factor(reward)
        final_priority = priority * reward_factor

        self.tree.add(final_priority, sample)

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)

            (idx, priority, data) = self.tree.get(s)

            priorities.append(priority)
            batch.append(data)
            idxs.append(idx)

        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error, reward):
        priority = self._get_priority(error)
        reward_factor = self._get_reward_factor(reward)
        final_priority = priority * reward_factor
        self.tree.update(idx, final_priority)


PER-DQN

In [6]:

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.optimizers import Adam

from tensorflow.keras.callbacks import TensorBoard
import datetime
import tensorflow as tf

Huber Loss Function

In [7]:


HUBER_LOSS_DELTA = 1.0


def huber_loss(y_true, y_pred):
    err = y_true - y_pred

    cond = tf.abs(err) < HUBER_LOSS_DELTA

    squared_loss = 0.5 * tf.square(err)
    # quadratic_loss = HUBER_LOSS_DELTA * (tf.abs(err) - 0.5 * HUBER_LOSS_DELTA)
    linear = HUBER_LOSS_DELTA * (tf.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    # Keras does not cover where function in tensorflow :-(
    loss = tf.where(cond, squared_loss, linear)
    loss = tf.reduce_mean(loss)

    return loss


DDQN - PER

In [8]:
class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.dqn_learning_rate = 0.0003

        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

        self.memory = Memory(1000000)  # PER Memory

        self.tau = 0.1  # this is a soft update rate for the target model
        self.batch_size = 32

        self.reward_threshold = 5.0  # Threshold for high rewards
        self.action_rewards = {a: [] for a in range(self.action_size)}  # Store rewards for each action

    def set_reward_threshold(self, reward_threshold):
        self.reward_threshold = reward_threshold

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(64, input_dim=self.state_size, activation='relu'))
        model.add(BatchNormalization()) 
        model.add(Dense(64, activation='relu'))
        model.add(BatchNormalization()) 
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.dqn_learning_rate))
        return model
    
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def soft_update_target_model(self):
        target_weights = self.target_model.get_weights()
        main_weights = self.model.get_weights()

        new_weights = []
        for main_weight, target_weight in zip(main_weights, target_weights):
            updated_w = self.tau * main_weight + (1 - self.tau) * target_weight
            new_weights.append(updated_w)

        self.target_model.set_weights(new_weights)

    def memorize(self, state, action, reward, next_state, done):

        target = self.model.predict(state)

        if done:
            target[0][action] = reward
        else:
            best_next_action = np.argmax(self.model.predict(next_state)[0])
            target[0][action] = reward + self.gamma * self.target_model.predict(next_state)[0][best_next_action]


        # Compute TD-error
        current_q_value = self.model.predict(state)[0][action]
        error = abs(target[0][action] - current_q_value)

        """...............keep adding the new experiences in memory..................."""
        self.memory.add(error, reward, (state, action, reward, next_state, done))

    def act(self, state):
        # Exploration: choose a random action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            # Exploitation: predict the action based on the Q-values
            act_values = self.model.predict(state)

            # Reward-based action bias
            high_reward_actions = self.get_high_reward_actions(state)

            if high_reward_actions:
                # Add some probability to select actions with high past rewards
                action = self.bias_towards_high_reward_actions(act_values, high_reward_actions)
            else:
                action = np.argmax(act_values[0])

            return action

    def past_rewards_for_action(self, action):
        # Calculate the average reward for the given action
        if len(self.action_rewards[action]) == 0:
            return 0
        return np.mean(self.action_rewards[action])

    def get_high_reward_actions(self, state):
        # Identify actions that consistently lead to high rewards
        # This can be stored or computed based on experience
        high_reward_actions = []
        for a in range(self.action_size):
            avg_reward = self.past_rewards_for_action(a)
            if avg_reward > self.reward_threshold:
                high_reward_actions.append(a)
        return high_reward_actions

    def bias_towards_high_reward_actions(self, act_values, high_reward_actions):
        exp_act_values = np.exp(act_values[0] / self.tau)
        boltzman_probabilities = exp_act_values / np.sum(exp_act_values)

        probabilities = boltzman_probabilities.copy()

        if high_reward_actions:
            for a in high_reward_actions:
                probabilities[a] += 0.1  # Increase the probability for high-reward actions

            probabilities /= probabilities.sum()

        return np.random.choice(range(self.action_size), p=probabilities)


    def replay(self, batch_size=32):
        """......................................................................."""
        # if len(self.memory) < batch_size:
        #     return  # Don't replay until there's enough samples
        
        # Sample a batch of experiences from memory
        minibatch, idxs, is_weights = self.memory.sample(batch_size)

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                # Double DQN: Use main model to select action, target model to evaluate it
                best_next_action = np.argmax(self.model.predict(next_state)[0])
                target[0][action] = reward + self.gamma * self.target_model.predict(next_state)[0][best_next_action]

                # next_q_values = self.model.predict(next_state)
                # target[0][action] = reward + self.gamma * np.amax(next_q_values[0])

            # Compute TD-error
            current_q_value = self.model.predict(state)[0][action]
            error = abs(target[0][action] - current_q_value)

            # Update the memory with the new priority that incorporates reward
            self.memory.update(idxs[i], error, reward)

            self.model.train_on_batch(state, target)
        
        # Soft update the target model after every replay step
        self.soft_update_target_model()

        # Reduce epsilon to encourage exploitation over time
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """ Load pre-trained model """
        self.model.load_weights(name)

    def save(self, name):
        """ Save trained model """
        self.model.save_weights(name)


DQN - PER

In [9]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.dqn_learning_rate = 0.001
        self.model = self._build_model()

        self.memory = Memory(1000000)  # PER Memory

        self.reward_threshold = 5.0  # Threshold for high rewards
        self.action_rewards = {a: [] for a in range(self.action_size)}  # Store rewards for each action

    def set_reward_threshold(self, reward_threshold):
        self.reward_threshold = reward_threshold

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.dqn_learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):

        target = self.model.predict(state)
        if done:
            target[0][action] = reward
        else:
            next_q_values = self.model.predict(next_state)
            target[0][action] = reward + self.gamma * np.amax(next_q_values[0])

        # Compute TD-error
        error = abs(np.real(target[0][action] - self.model.predict(state)[0][action]))
        """...............keep adding the new experiences in memory..................."""
        self.memory.add(error, reward, (state, action, reward, next_state, done))

    def act(self, state):
        # Exploration: choose a random action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            # Exploitation: predict the action based on the Q-values
            act_values = self.model.predict(state)

            # Reward-based action bias
            high_reward_actions = self.get_high_reward_actions(state)

            if high_reward_actions:
                # Add some probability to select actions with high past rewards
                action = self.bias_towards_high_reward_actions(act_values, high_reward_actions)
            else:
                action = np.argmax(act_values[0])

            return action

    def past_rewards_for_action(self, action):
        # Calculate the average reward for the given action
        if len(self.action_rewards[action]) == 0:
            return 0
        return np.mean(self.action_rewards[action])

    def get_high_reward_actions(self, state):
        # Identify actions that consistently lead to high rewards
        # This can be stored or computed based on experience
        high_reward_actions = []
        for a in range(self.action_size):
            avg_reward = self.past_rewards_for_action(a)
            if avg_reward > self.reward_threshold:
                high_reward_actions.append(a)
        return high_reward_actions

    def bias_towards_high_reward_actions(self, act_values, high_reward_actions):
        # Introduce a bias to select one of the high-reward actions
        probabilities = np.ones(self.action_size) * 0.1  # Small probability for each action
        for a in high_reward_actions:
            probabilities[a] += 0.2  # Increase probability for high-reward actions
        probabilities /= probabilities.sum()  # Normalize
        return np.random.choice(range(self.action_size), p=probabilities)

    def replay(self, batch_size=32):
        """......................................................................."""

        # Sample a batch of experiences from memory
        minibatch, idxs, is_weights = self.memory.sample(batch_size)

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                next_q_values = self.model.predict(next_state)
                target[0][action] = reward + self.gamma * np.amax(next_q_values[0])

            # Compute TD-error
            error = abs(target[0][action] - self.model.predict(state)[0][action])

            # Update the memory with the new priority that incorporates reward
            self.memory.update(idxs[i], error, reward)
            self.model.train_on_batch(state, target)

        # Reduce epsilon to encourage exploitation over time
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


Main


In [10]:
import gymnasium as gym

In [11]:
if __name__ == "__main__":
    """..............................for tensorboard logs..................................."""
    log_dir = "logs/AdaptiveBehavior_DDQN" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

    episode_rewards = []
    episode_epsilons = []

    writer = tf.summary.create_file_writer(log_dir)

    """..................,,,,,,,.the main training loop....................................."""
    env = gym.make('CartPole-v1')

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DDQNAgent(state_size, action_size)
    agent.set_reward_threshold(1500)

    scores = []
    EPISODES = 2000
    batch_size = 32
    avg_window = 100

    for e in range(EPISODES):
        state, _ = env.reset(seed=42)
        state = np.reshape(state, [1, state_size])
        done = False
        total_reward = 0
        time = 0

        while not done:
            time += 1
            action = agent.act(state)
            next_state, reward, done, _, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])

            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            if done:
                scores.append(total_reward)
                if e % 100 == 0 and e > 1:
                    print("episode: {}/{}, Score Mean: {} / Median: {} ".format(e, EPISODES, int(np.mean(scores)),
                                                                                int(np.median(scores))))
                    print("Beta {:.5f} / Eps: {:.5f}".format(agent.memory.beta, agent.epsilon))
                

        if agent.memory.tree.n_entries > 1000:
            agent.replay(batch_size)

        with writer.as_default():
            tf.summary.scalar('Total Reward', total_reward, step=e)
            tf.summary.scalar('Epsilon', agent.epsilon, step=e)

            # Compute and log the average reward over the last 100 episodes
            if len(scores) >= avg_window:
                avg_reward = np.mean(scores[-avg_window:])
                tf.summary.scalar('Average Reward (last 100 episodes)', avg_reward, step=e)

    env.close()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11m



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/s

In [7]:
%tensorboard --logdir ./Test

UsageError: Line magic function `%tensorboard` not found.
