In [20]:
import sys
import os
sys.path.append('..')  # Adjust the path

# Obtain relevant functions
from src.pricing import black_scholes_price, black_scholes_delta
from src.hedging import initialize_simulation, PnL

In [21]:
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim


In [22]:
# Simulation parameters
# NOTE(review): the training cells further down rebind S0 (to a scalar), K, T,
# r, sigma and steps (to 252) at notebook level, so the values below only hold
# until one of those cells is executed.
n_paths = 100  # number of simulation paths
S0 = np.full(n_paths, 100.0)  # initial stock price for each path
K = 100.0  # strike price
T = 1.0  # time to expiry
r = 0.05  # risk-free rate
sigma = 0.2  # volatility used in the price diffusion
steps = 10_000  # number of time steps over the horizon T (not daily; the training cells use 252 for daily hedging)
dt = T / steps  # length of one time step

In [None]:
class DeltaApproximator(nn.Module):
    """Small MLP policy that maps a market state to a hedge ratio."""

    def __init__(self, input_dim=3, hidden_dim=32, output_dim=1):
        """
        Build a two-hidden-layer ReLU network capped by a sigmoid.

        Input features: current stock price S, time to expiry, risk-free rate r.
        Output: hedge position (delta) for a call option (in [0,1]).

        Args:
            input_dim: Number of input features per state.
            hidden_dim: Width of both hidden layers.
            output_dim: Number of outputs per state.
        """
        super().__init__()
        # Layers are created in forward order so that seeded initialisation
        # and the "net.<index>" parameter names stay stable.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),  # keeps the output in [0,1] for call options.
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the hedge ratio(s) predicted for the state tensor ``x``."""
        hedge_ratio = self.net(x)
        return hedge_ratio


In [29]:
# ---------------------------
# Critic: Estimates the state value
# ---------------------------
class ValueNetwork(nn.Module):
    """Critic MLP that outputs an unbounded value estimate for a state."""

    def __init__(self, input_dim=3, hidden_dim=32, output_dim=1):
        """
        Build a two-hidden-layer ReLU network with a linear output head.

        Args:
            input_dim: Number of input features per state.
            hidden_dim: Width of both hidden layers.
            output_dim: Number of outputs per state.
        """
        super().__init__()
        # Linear layers are instantiated in forward order so seeded
        # initialisation matches a straight nn.Sequential(...) construction.
        first = nn.Linear(input_dim, hidden_dim)
        second = nn.Linear(hidden_dim, hidden_dim)
        head = nn.Linear(hidden_dim, output_dim)
        self.net = nn.Sequential(first, nn.ReLU(), second, nn.ReLU(), head)

    def forward(self, x):
        """Return the value estimate(s) for the state tensor ``x``."""
        state_value = self.net(x)
        return state_value

In [30]:
class HedgingEnv:
    """
    Single-path environment for hedging a short call option.

    Each step the agent picks a new stock position; the environment trades to
    it at the current price, accrues interest on the cash account, then moves
    the stock price one lognormal step. The only non-zero reward is paid at
    expiry: minus the absolute gap between the portfolio and the call payoff.
    """

    def __init__(self, S0, K, T, r, sigma, steps):
        """
        Store the market parameters and start the first episode.

        Args:
            S0: Initial stock price.
            K: Strike used in the terminal call payoff.
            T: Time horizon of the episode.
            r: Risk-free rate applied to the cash account and the drift.
            sigma: Volatility of the simulated stock price.
            steps: Number of hedging steps per episode.
        """
        self.S0, self.K, self.T = S0, K, T
        self.r, self.sigma = r, sigma
        self.steps = steps
        self.dt = T / steps
        self.reset()

    def reset(self):
        """Restart the episode and return the initial observation."""
        self.current_S = self.S0
        self.t = 0
        self.shares_held = 0.0
        # The cash account starts with the option price computed at S0.
        premium = black_scholes_price(self.S0, self.K, self.T, self.r, self.sigma)
        self.initial_option_price = premium
        self.cash_account = premium
        return self._get_obs()

    def _get_obs(self):
        """Return ``[stock price, time to expiry, risk-free rate]`` as float32."""
        elapsed = self.t * self.dt
        features = [self.current_S, self.T - elapsed, self.r]
        return np.array(features, dtype=np.float32)

    def step(self, action):
        """
        Rebalance to the hedge ``action`` and advance one time step.

        Action is the new hedge (delta). The trade is the difference from the
        current hedge.

        Returns:
            Tuple ``(obs, reward, done, info)``; ``reward`` is 0.0 until the
            final step and ``info`` is always an empty dict.
        """
        # Buy/sell the difference at the current price, financed from cash.
        trade_size = action - self.shares_held
        self.cash_account = self.cash_account - trade_size * self.current_S
        self.shares_held = action

        # Cash accrues interest over the step.
        self.cash_account = self.cash_account * np.exp(self.r * self.dt)

        # One lognormal move of the stock price (single normal draw per step).
        shock = np.random.normal(0, 1)
        drift = (self.r - 0.5 * self.sigma**2) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * shock
        self.current_S = self.current_S * np.exp(drift + diffusion)

        self.t += 1
        finished = self.t >= self.steps
        next_obs = self._get_obs()

        if not finished:
            return next_obs, 0.0, finished, {}

        # At expiry, settle against the call payoff; reward is minus the error.
        payoff = max(0.0, self.current_S - self.K)
        wealth = self.cash_account + self.shares_held * self.current_S
        return next_obs, -abs(wealth - payoff), finished, {}

In [31]:
# ---------------------------
# Actor-Critic Training Function
# ---------------------------
def train_actor_critic(env, actor_net, critic_net, actor_optimizer, critic_optimizer, num_episodes=5000, gamma=1.0,
                       exploration_std=0.05, log_every=50):
    """
    Train an actor and a critic with Monte-Carlo advantage policy gradients.

    For every episode the actor's output is used as the mean of a Gaussian
    with fixed standard deviation; an action is sampled, clamped to [0, 1] and
    sent to the environment. After the episode ends, discounted returns are
    computed, the critic is regressed onto them (MSE) and the actor is updated
    with ``-log_prob * (return - value)``.

    Args:
        env: Environment exposing ``reset() -> obs`` and
            ``step(action) -> (obs, reward, done, info)``.
        actor_net: Module mapping a ``(1, obs_dim)`` tensor to the action mean.
        critic_net: Module mapping a ``(1, obs_dim)`` tensor to a value estimate.
        actor_optimizer: Optimizer over the actor's parameters.
        critic_optimizer: Optimizer over the critic's parameters.
        num_episodes: Number of episodes to run.
        gamma: Discount factor used for the returns.
        exploration_std: Standard deviation of the Gaussian exploration noise.
        log_every: Print progress every this many episodes; a falsy value
            (``0`` or ``None``) disables logging.

    Returns:
        None. Both networks are updated in place.
    """
    actor_net.train()
    critic_net.train()

    for episode in range(num_episodes):
        rewards = []
        step_log_probs = []
        step_values = []

        obs = env.reset()
        done = False

        while not done:
            # Convert observation to tensor.
            state_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # adds a leading batch dim

            # Actor: Gaussian policy centred on the network output.
            delta_mean = actor_net(state_tensor)
            dist = torch.distributions.Normal(delta_mean, exploration_std)
            action_tensor = dist.sample()

            # The log-probability is taken at the raw sample; only the value
            # sent to the environment is clamped to [0, 1], so the clamp
            # effectively acts as part of the environment.
            step_log_probs.append(dist.log_prob(action_tensor))
            action = torch.clamp(action_tensor, 0.0, 1.0).item()

            # Critic: estimate the state value.
            step_values.append(critic_net(state_tensor))

            obs, reward, done, _ = env.step(action)
            rewards.append(reward)

        # Returns (cumulative discounted rewards), one row per step.
        returns = torch.tensor(_discounted_returns(rewards, gamma), dtype=torch.float32).unsqueeze(1)
        values = torch.cat(step_values)
        log_probs = torch.cat(step_log_probs)

        # Advantage uses a detached baseline so the actor loss does not
        # back-propagate into the critic.
        advantages = returns - values.detach()

        # Actor loss (policy gradient with advantage).
        actor_loss = -(log_probs * advantages).mean()

        # Critic loss (mean squared error).
        critic_loss = nn.MSELoss()(values, returns)

        total_loss = actor_loss + critic_loss

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        total_loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        if log_every and episode % log_every == 0:
            total_reward = sum(rewards)
            print(f"Episode {episode}: Total Reward = {total_reward:.4f}, Actor Loss = {actor_loss.item():.4f}, Critic Loss = {critic_loss.item():.4f}")


def _discounted_returns(rewards, gamma):
    """Return the discounted reward-to-go for each step of one episode."""
    returns = []
    running = 0
    # Accumulate backwards, then flip once (avoids repeated front insertion).
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns



In [32]:
if __name__ == "__main__":
    # Seed every RNG the experiment draws from so a fresh kernel reproduces
    # the logged numbers: HedgingEnv samples price shocks from NumPy's global
    # generator, while network initialisation and policy sampling use torch.
    SEED = 42
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Environment parameters.
    # NOTE: these rebind the notebook-level S0/K/T/r/sigma/steps defined in
    # the simulation-parameters cell (S0 becomes a scalar, steps becomes 252).
    S0 = 100.0
    K = 100.0
    T = 1.0
    r = 0.05
    sigma = 0.2
    steps = 252  # daily hedging.

    env = HedgingEnv(S0, K, T, r, sigma, steps)

    # Instantiate actor (policy) and critic networks.
    actor_net = DeltaApproximator(input_dim=3, hidden_dim=32, output_dim=1)
    critic_net = ValueNetwork(input_dim=3, hidden_dim=32, output_dim=1)

    # Define separate optimizers.
    actor_optimizer = optim.Adam(actor_net.parameters(), lr=1e-3)
    critic_optimizer = optim.Adam(critic_net.parameters(), lr=1e-3)

    # Train using the actor-critic method.
    train_actor_critic(env, actor_net, critic_net, actor_optimizer, critic_optimizer, num_episodes=5000, gamma=1.0)

Episode 0: Total Reward = -18.0840, Actor Loss = 24.3440, Critic Loss = 245.3714
Episode 50: Total Reward = -11.2909, Actor Loss = 1.0131, Critic Loss = 0.4753
Episode 100: Total Reward = -6.1853, Actor Loss = -14.9981, Critic Loss = 92.5718
Episode 150: Total Reward = -6.4276, Actor Loss = -15.9625, Critic Loss = 106.0560
Episode 200: Total Reward = -9.8045, Actor Loss = -1.7162, Critic Loss = 2.6450
Episode 250: Total Reward = -9.2938, Actor Loss = -8.4733, Critic Loss = 28.6252
Episode 300: Total Reward = -10.3686, Actor Loss = -3.2546, Critic Loss = 4.6861
Episode 350: Total Reward = -10.8989, Actor Loss = -0.6797, Critic Loss = 0.3150
Episode 400: Total Reward = -9.4749, Actor Loss = 1.3298, Critic Loss = 1.1811
Episode 450: Total Reward = -10.4518, Actor Loss = -1.9687, Critic Loss = 1.8083
Episode 500: Total Reward = -10.5832, Actor Loss = -0.9849, Critic Loss = 0.5250
Episode 550: Total Reward = -11.0193, Actor Loss = -2.8628, Critic Loss = 4.2810
Episode 600: Total Reward = -9

In [27]:
# --- Example usage ---
if __name__ == "__main__":
    # NOTE(review): this cell's execution count (27) precedes the class and
    # training cells above it, and `train_policy` is not defined anywhere in
    # the visible notebook — presumably it came from a cell that was since
    # removed or edited. Confirm the cell still runs after Restart & Run All.

    # Environment parameters
    S0 = 100.0
    K = 100.0
    T = 1.0
    r = 0.05
    sigma = 0.2
    steps = 252  # daily hedging
    
    # Create the hedging environment.
    env = HedgingEnv(S0, K, T, r, sigma, steps)
    
    # Instantiate the policy network.
    # Uses a wider hidden layer (64) and a larger learning rate (0.01) than
    # the actor-critic cell (32 and 1e-3).
    policy_net = DeltaApproximator(input_dim=3, hidden_dim=64, output_dim=1)
    optimizer = optim.Adam(policy_net.parameters(), lr=0.01)
    
    # Train the policy using the REINFORCE algorithm.
    # NOTE(review): the saved output below ends in KeyboardInterrupt, so the
    # 100000-episode run was not completed in the recorded session.
    train_policy(env, policy_net, optimizer, num_episodes=100000)
    
    # After training, you can integrate the trained network into your dynamic hedging simulation.
    # For example, pass policy_net as the neural_net argument to dynamic_delta_hedging.

Episode 0: Total Reward = -1.3574
Episode 50: Total Reward = -10.2214
Episode 100: Total Reward = -12.0360
Episode 150: Total Reward = -10.5291
Episode 200: Total Reward = -11.0444
Episode 250: Total Reward = -6.5201
Episode 300: Total Reward = -10.7951
Episode 350: Total Reward = -9.1607
Episode 400: Total Reward = -10.0869
Episode 450: Total Reward = -6.8954
Episode 500: Total Reward = -10.4611
Episode 550: Total Reward = -1.2182
Episode 600: Total Reward = -10.2305
Episode 650: Total Reward = -4.3425
Episode 700: Total Reward = -0.9316
Episode 750: Total Reward = -49.5558
Episode 800: Total Reward = -12.5141
Episode 850: Total Reward = -11.4693
Episode 900: Total Reward = -11.4853
Episode 950: Total Reward = -11.7442
Episode 1000: Total Reward = -34.4901
Episode 1050: Total Reward = -1.8881
Episode 1100: Total Reward = -11.9435
Episode 1150: Total Reward = -6.0925
Episode 1200: Total Reward = -10.5355
Episode 1250: Total Reward = -5.7720
Episode 1300: Total Reward = -10.3901
Episode

KeyboardInterrupt: 