In [None]:
import numpy as np
import torch
import torch.nn as nn
from deep_hedging_env import HedgingEnv
from logit_normal import LogitNormal
from plot_utils import plot_portfolio_vs_option_price
from torchrl.envs import GymWrapper
from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.modules import ProbabilisticActor, SafeModule
from tensordict.nn import TensorDictModule
from torchrl.objectives import ClipPPOLoss
from torchrl.modules import ProbabilisticActor, SafeModule
from torchrl.modules import (
    ValueOperator,
    ActorValueOperator,
    NormalParamExtractor,
)
from torchrl.objectives.value import GAE
from torchrl.envs.utils import ExplorationType, set_exploration_type

# PPO (Recurrent)

In [2]:
# Market / contract inputs handed to HedgingEnv. Each array has three entries
# along its first axis; K carries two values per entry.
# NOTE(review): presumably S0 = spot prices, K = strikes, sigma = volatilities
# and r = risk-free rate — confirm against deep_hedging_env.HedgingEnv.
S0 = np.array([50.0, 100.0, 200.0])
K = np.array(
    [
        [45.0, 55.0],
        [90.0, 110.0],
        [180.0, 220.0],
    ]
)
sigma = np.array([0.15, 0.2, 0.25])
r = 0.05

# Simulation sizes forwarded to the environment as keyword arguments.
num_simulation, num_step, history_len = 32, 250, 5

# Raw environment, then the TorchRL wrapper that the collector consumes.
base_env = HedgingEnv(
    S0,
    K,
    sigma,
    r,
    num_simulation=num_simulation,
    num_step=num_step,
    history_len=history_len,
)
env = GymWrapper(base_env)

In [3]:
# One collected batch holds `num_step` frames from every parallel environment.
frames_per_batch = env.num_envs * num_step

# Each batch is consumed in `sub_batch_num` equally sized minibatches
# (integer division: any remainder frames are not part of a minibatch).
sub_batch_num = 10
sub_batch_size = frames_per_batch // sub_batch_num

# Displayed as the cell output.
(frames_per_batch, sub_batch_size)

(48000, 4800)

In [4]:
# PPO loss hyperparameters (passed to ClipPPOLoss further down):
# clipping range, critic-loss weight, entropy-bonus weight.
clip_param, value_coef, entropy_coef = 0.2, 0.5, 0.01

# GAE hyperparameters (passed to the advantage module): discount and lambda.
gamma, lmbda = 0.99, 0.95

In [5]:
class FeatureExtractor(nn.Module):
    """GRU encoder that maps an observation history to one feature vector.

    The module runs a batch-first ``nn.GRU`` over the sequence axis and keeps
    only the output of the final time step.

    NOTE: a later cell of this notebook defines another ``FeatureExtractor``
    (built on ``nn.LSTM``). On a top-to-bottom run that later definition is
    the one bound to the name when the model is assembled, so this GRU
    variant is only used if that cell is skipped.

    Args:
        input_size: Number of features per time step (last axis of the input).
        hidden_size: Width of the GRU hidden state and of the returned feature.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability between GRU layers.
    """

    def __init__(self, input_size=11, hidden_size=64, num_layers=2, dropout=0.0):
        # Zero-argument super() keeps working when the cell is re-run and the
        # class object bound to the name `FeatureExtractor` changes.
        super().__init__()
        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Encode ``x`` and return the GRU output at the last time step.

        Args:
            x: Tensor of shape ``(*batch, seq_len, input_size)`` with any
                number of leading batch dimensions, or an unbatched
                ``(seq_len, input_size)`` tensor. It does not need to be
                contiguous.

        Returns:
            Tensor of shape ``(*batch, hidden_size)`` (``(hidden_size,)`` for
            unbatched input).
        """
        if x.dim() < 3:
            # Unbatched sequence: nn.GRU accepts it as is.
            output, _ = self.rnn(x)
            return output[..., -1, :]

        # Collapse every leading batch dimension into one so the GRU sees a
        # plain (N, seq_len, input_size) batch. `reshape` (unlike `view`) also
        # copes with non-contiguous inputs such as transposed tensordicts.
        batch_shape = x.shape[:-2]
        flat = x.reshape(-1, x.shape[-2], x.shape[-1])
        output, _ = self.rnn(flat)

        # Slice the last time step first, then restore the batch dimensions;
        # this avoids reshaping the full sequence output.
        last_step = output[:, -1, :]
        return last_step.reshape(*batch_shape, last_step.shape[-1])

In [6]:
class FeatureExtractor(nn.Module):
    """LSTM encoder that maps an observation history to one feature vector.

    The module runs a batch-first ``nn.LSTM`` over the sequence axis and keeps
    only the output of the final time step.

    NOTE: this definition reuses the name of the GRU-based class from the
    preceding cell and therefore replaces it; the model assembled below is
    built on this LSTM variant.

    Args:
        input_size: Number of features per time step (last axis of the input).
        hidden_size: Width of the LSTM hidden state and of the returned feature.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers.
    """

    def __init__(self, input_size=11, hidden_size=64, num_layers=2, dropout=0.0):
        # Zero-argument super() keeps working when the cell is re-run and the
        # class object bound to the name `FeatureExtractor` changes.
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Encode ``x`` and return the LSTM output at the last time step.

        Args:
            x: Tensor of shape ``(*batch, seq_len, input_size)`` with any
                number of leading batch dimensions, or an unbatched
                ``(seq_len, input_size)`` tensor. It does not need to be
                contiguous.

        Returns:
            Tensor of shape ``(*batch, hidden_size)`` (``(hidden_size,)`` for
            unbatched input).
        """
        if x.dim() < 3:
            # Unbatched sequence: nn.LSTM accepts it as is.
            output, _ = self.rnn(x)
            return output[..., -1, :]

        # Collapse every leading batch dimension into one so the LSTM sees a
        # plain (N, seq_len, input_size) batch. `reshape` (unlike `view`) also
        # copes with non-contiguous inputs such as transposed tensordicts.
        batch_shape = x.shape[:-2]
        flat = x.reshape(-1, x.shape[-2], x.shape[-1])
        output, _ = self.rnn(flat)

        # Slice the last time step first, then restore the batch dimensions;
        # this avoids reshaping the full sequence output.
        last_step = output[:, -1, :]
        return last_step.reshape(*batch_shape, last_step.shape[-1])


# Shared recurrent trunk: reads "observation", writes "feature".
feature_extractor = SafeModule(
    module=FeatureExtractor(),
    in_keys=["observation"],
    out_keys=["feature"],
)

# Policy head: a linear layer producing two numbers that NormalParamExtractor
# splits into a location and a (positive) scale.
policy_head = nn.Sequential(
    nn.Linear(64, 2),
    NormalParamExtractor(),
)
policy_network = TensorDictModule(
    policy_head,
    in_keys=["feature"],
    out_keys=["loc", "scale"],
)

# Stochastic actor: samples "action" from LogitNormal(loc, scale) and also
# records the log-probability of the sample, which the PPO loss needs.
actor = ProbabilisticActor(
    module=policy_network,
    in_keys=["loc", "scale"],
    out_keys=["action"],
    distribution_class=LogitNormal,
    return_log_prob=True,
)

# Value head: small tanh MLP on top of the shared feature.
value_head = nn.Sequential(
    nn.Linear(64, 8),
    nn.Tanh(),
    nn.Linear(8, 1),
)
critic = ValueOperator(
    module=value_head,
    in_keys=["feature"],
    out_keys=["state_value"],
)

# Bundle trunk + actor + critic so both heads share the feature extractor.
model = ActorValueOperator(feature_extractor, actor, critic)

In [7]:
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to that device; as the last expression of the cell this also
# displays the module structure.
model.to(device)

ActorValueOperator(
    module=ModuleList(
      (0): SafeModule(
          module=FeatureExtractor(
            (rnn): LSTM(11, 64, num_layers=2, batch_first=True)
          ),
          device=cuda:0,
          in_keys=['observation'],
          out_keys=['feature'])
      (1): ProbabilisticActor(
          module=ModuleList(
            (0): TensorDictModule(
                module=Sequential(
                  (0): Linear(in_features=64, out_features=2, bias=True)
                  (1): NormalParamExtractor(
                    (scale_mapping): biased_softplus()
                  )
                ),
                device=cuda:0,
                in_keys=['feature'],
                out_keys=['loc', 'scale'])
            (1): SafeProbabilisticModule()
          ),
          device=cuda:0,
          in_keys=['feature'],
          out_keys=['loc', 'scale', 'action', 'sample_log_prob'])
      (2): ValueOperator(
          module=Sequential(
            (0): Linear(in_features=64, out_

In [8]:
# Generalised Advantage Estimation driven by the critic branch of the model.
# `shifted=True` is kept deliberately: the original author marks it as the
# setting to use with the recurrent feature extractor.
advantage_module = GAE(
    value_network=model.get_value_operator(),
    gamma=gamma,
    lmbda=lmbda,
    shifted=True,
)

In [9]:
# Clipped-surrogate PPO loss over the policy and value branches of the model
# (both branches run through the same feature extractor).
# NOTE(review): TorchRL documents the critic-loss weight as `critic_coef`;
# confirm that the installed version really honours `value_coef`.
loss_module = ClipPPOLoss(
    actor_network=model.get_policy_operator(),
    critic_network=model.get_value_operator(),
    clip_epsilon=clip_param,
    value_coef=value_coef,
    entropy_coef=entropy_coef,
)

In [10]:
# Adam over every parameter registered in the PPO loss module.
optim = torch.optim.Adam(
    loss_module.parameters(),
    lr=1e-4,
)

In [11]:
# Training schedule: outer epochs x collect/update episodes per epoch.
num_epochs, num_episodes = 50, 200

In [None]:
# Train
#
# Each episode collects one batch of `frames_per_batch` frames with the current
# policy, computes GAE advantages on it and then takes `sub_batch_num`
# minibatch gradient steps on the clipped PPO loss.
#
# NOTE(review): the reset seed depends only on `epoch`, so every episode within
# an epoch resets the environment with the same seed — confirm this is intended.
# NOTE(review): a new SyncDataCollector is built for every episode on the same
# `env`; confirm the collector's teardown does not interfere with env reuse.

for epoch in range(num_epochs):
    for episode in range(num_episodes):
        env.reset(seed=epoch + 1000)
        # total_frames == frames_per_batch, so iterating this collector is
        # expected to yield a single batch.
        collector = SyncDataCollector(
            env,
            model.get_policy_operator(),
            frames_per_batch=frames_per_batch,
            total_frames=frames_per_batch,
            device=device,
        )
        # Fresh buffer per episode; sampling without replacement means the
        # `sub_batch_num` minibatches below partition the collected batch.
        replay_buffer = ReplayBuffer(
            storage=LazyTensorStorage(max_size=frames_per_batch),
            sampler=SamplerWithoutReplacement(),
        )
        for batch in collector:
            # Writes advantage / value-target entries into `batch` in place.
            advantage_module(batch)
            # Flatten the batch dimensions and park the data on the CPU;
            # minibatches are moved back to `device` one at a time.
            replay_buffer.extend(batch.reshape(-1).cpu())
            for _ in range(sub_batch_num):
                subdata = replay_buffer.sample(sub_batch_size)
                optim.zero_grad()
                # Forward pass PPO loss
                loss = loss_module(subdata.to(device))
                loss_sum = (
                    loss["loss_critic"] + loss["loss_objective"] + loss["loss_entropy"]
                )
                # Backward pass
                loss_sum.backward()
                # Sanitise BEFORE clipping. clip_grad_norm_ rescales every
                # gradient by a factor derived from the global norm, so a
                # single NaN entry would make that factor NaN, turn all
                # gradients into NaN and leave nan_to_num with nothing to do
                # but zero the entire update.
                for param in loss_module.parameters():
                    if param.grad is not None:
                        param.grad = torch.nan_to_num(param.grad)
                torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_norm=1.0)
                # Update the networks
                optim.step()

        # Progress report every 10 episodes: loss of the last minibatch and
        # mean reward of the batch collected in this episode.
        if (episode + 1) % 10 == 0:
            print(
                f"Epoch {epoch+1}/{num_epochs}, Episode {episode + 1}/{num_episodes}, Loss: {loss_sum.item()}, Avg. Reward: {batch['next', 'reward'].mean().item()}"
            )

Epoch 1/50, Episode 10/200, Loss: 150.07315063476562, Avg. Reward: -4.582630157470703
Epoch 1/50, Episode 20/200, Loss: 144.41307067871094, Avg. Reward: -4.500133514404297
Epoch 1/50, Episode 30/200, Loss: 148.0863800048828, Avg. Reward: -4.487645626068115
Epoch 1/50, Episode 40/200, Loss: 130.9796600341797, Avg. Reward: -4.166134834289551
Epoch 1/50, Episode 50/200, Loss: 141.50942993164062, Avg. Reward: -4.3846869468688965
Epoch 1/50, Episode 60/200, Loss: 149.19015502929688, Avg. Reward: -4.507399082183838
Epoch 1/50, Episode 70/200, Loss: 150.46881103515625, Avg. Reward: -4.559741020202637
Epoch 1/50, Episode 80/200, Loss: 168.5126190185547, Avg. Reward: -5.078275680541992
Epoch 1/50, Episode 90/200, Loss: 155.8957977294922, Avg. Reward: -4.807435989379883
Epoch 1/50, Episode 100/200, Loss: 146.28187561035156, Avg. Reward: -4.433305740356445
Epoch 1/50, Episode 110/200, Loss: 156.56600952148438, Avg. Reward: -4.754453182220459
Epoch 1/50, Episode 120/200, Loss: 150.64706420898438, 

In [None]:
# Test
#
# Evaluate the trained policy on a fresh, smaller environment (5 simulations).
# NOTE: this rebinds `base_env` and `env`, replacing the training environment.

base_env = HedgingEnv(S0, K, sigma, r, num_simulation=5, num_step=num_step, history_len=history_len)
env = GymWrapper(base_env, device=device)
env.reset(seed=0)

# Deterministic exploration for evaluation; no_grad avoids building an autograd
# graph through all `num_step` recurrent policy calls, which is never used.
with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
    rollout = env.rollout(max_steps=num_step, policy=model.get_policy_operator())

In [None]:
# Pull the per-step rewards of the evaluation rollout into a NumPy array.
rewards = rollout["next", "reward"].detach().cpu().numpy()

# Cell output: (min, max, mean, std) over every reward in the rollout.
tuple(stat() for stat in (rewards.min, rewards.max, rewards.mean, rewards.std))

In [None]:
# Plot the hedging portfolio against the option price for the evaluation run.
# NOTE(review): `env._env` is a private attribute of the GymWrapper — presumably
# the wrapped HedgingEnv created in the test cell; confirm against torchrl.
plot_portfolio_vs_option_price(env._env)