<a href="https://colab.research.google.com/github/newmantic/DDPG_options_hedge/blob/main/DDPG_options_hedge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import gym

class OUActionNoise:
    def __init__(self, mean, std_dev, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_dev
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x_initial if self.x_initial is not None else np.zeros_like(self.mean)


class Buffer:
    def __init__(self, num_states, num_actions, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0

        # Correct the shape to match the environment's state and action spaces
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    def sample(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        return state_batch, action_batch, reward_batch, next_state_batch







In [8]:
class DDPGAgent:
    def __init__(self, num_states, num_actions, upper_bound, lower_bound):
        self.num_states = num_states
        self.num_actions = num_actions
        self.upper_bound = upper_bound
        self.lower_bound = lower_bound

        self.actor_model = self.create_actor_model()
        self.critic_model = self.create_critic_model()

        self.target_actor = self.create_actor_model()
        self.target_critic = self.create_critic_model()

        self.target_actor.set_weights(self.actor_model.get_weights())
        self.target_critic.set_weights(self.critic_model.get_weights())

        # Initialize the buffer with the correct state and action dimensions
        self.buffer = Buffer(num_states, num_actions)
        self.gamma = 0.99
        self.tau = 0.005
        self.critic_lr = 0.002
        self.actor_lr = 0.001

        self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(self.actor_lr)

        self.noise = OUActionNoise(np.zeros(1), std_dev=0.2)

    def create_actor_model(self):
        inputs = layers.Input(shape=(self.num_states,))
        out = layers.Dense(256, activation="relu")(inputs)
        out = layers.Dense(256, activation="relu")(out)
        outputs = layers.Dense(self.num_actions, activation="tanh")(out)

        model = tf.keras.Model(inputs, outputs)
        return model

    def create_critic_model(self):
        state_input = layers.Input(shape=(self.num_states,))
        action_input = layers.Input(shape=(self.num_actions,))

        concat = layers.Concatenate()([state_input, action_input])

        out = layers.Dense(256, activation="relu")(concat)
        out = layers.Dense(256, activation="relu")(out)
        outputs = layers.Dense(1)(out)

        model = tf.keras.Model([state_input, action_input], outputs)
        return model

    def policy(self, state):
        sampled_actions = tf.squeeze(self.actor_model(state))
        noise = self.noise()
        sampled_actions = sampled_actions.numpy() + noise
        legal_action = np.clip(sampled_actions, self.lower_bound, self.upper_bound)
        return np.squeeze(legal_action)

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch = self.buffer.sample()

        # Ensure the reward_batch is of type float32
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_state_batch)

            # Cast critic's output to float32 to ensure same type for addition
            target_critic_value = self.target_critic([next_state_batch, target_actions], training=True)
            y = reward_batch + self.gamma * tf.cast(target_critic_value, dtype=tf.float32)

            critic_value = self.critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic_model.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grad, self.critic_model.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor_model(state_batch, training=True)
            critic_value = self.critic_model([state_batch, actions], training=True)
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.actor_model.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor_model.trainable_variables))

    def update_target(self, target_weights, weights):
        for (a, b) in zip(target_weights, weights):
            a.assign(b * self.tau + a * (1 - self.tau))

    def update_target_network(self):
        self.update_target(self.target_actor.variables, self.actor_model.variables)
        self.update_target(self.target_critic.variables, self.critic_model.variables)



In [10]:
def train_ddpg():
    env = gym.make("Pendulum-v1")  # Replace with an options trading simulator environment
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    upper_bound = env.action_space.high[0]
    lower_bound = env.action_space.low[0]

    agent = DDPGAgent(num_states, num_actions, upper_bound, lower_bound)

    episodes = 30
    for ep in range(episodes):
        prev_state = env.reset()
        episodic_reward = 0

        while True:
            # The action returned by agent.policy is a single value array, so we need to extract it.
            action = agent.policy(tf.convert_to_tensor([prev_state], dtype=tf.float32))
            action = np.squeeze(action)  # Extract the scalar value

            # Pass the scalar action to env.step
            state, reward, done, _ = env.step([action])  # Ensure action is passed as a list/array

            agent.buffer.record((prev_state, [action], reward, state))
            episodic_reward += reward

            agent.update()
            agent.update_target_network()

            prev_state = state

            if done:
                break

        print(f"Episode {ep}, Reward: {episodic_reward}")

In [11]:

if __name__ == "__main__":
    train_ddpg()

Episode 0, Reward: -1251.0065181114087
Episode 1, Reward: -1340.8269607252926
Episode 2, Reward: -1750.1570692085372
Episode 3, Reward: -1486.8933472192468
Episode 4, Reward: -1616.5251307690714
Episode 5, Reward: -1415.7678569126706
Episode 6, Reward: -1359.1714614929513
Episode 7, Reward: -1212.1008949267286
Episode 8, Reward: -1639.5173224021191
Episode 9, Reward: -1061.4995448931588
Episode 10, Reward: -1170.1961428302375
Episode 11, Reward: -902.983293815275
Episode 12, Reward: -931.3447052498097
Episode 13, Reward: -372.43118919121963
Episode 14, Reward: -1155.7972916245972
Episode 15, Reward: -247.27060395362665
Episode 16, Reward: -241.96959259250056
Episode 17, Reward: -1054.002444442528
Episode 18, Reward: -823.7997258548645
Episode 19, Reward: -244.54180496750422
Episode 20, Reward: -787.9266572693169
Episode 21, Reward: -505.4232198651289
Episode 22, Reward: -127.34765836924323
Episode 23, Reward: -496.762125960873
Episode 24, Reward: -377.56677103874745
Episode 25, Reward: