<a href="https://colab.research.google.com/github/newmantic/PPO_options_hedge/blob/main/PPO_options_hedge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gym pandas numpy gymnasium shimmy stable-baselines3[extra]

import gymnasium as gym  # Use gymnasium instead of gym
import numpy as np
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv  # Corrected import from vec_env
from gymnasium import spaces  # Gymnasium's updated spaces


In [16]:

# Define the custom environment
class OptionTradingEnv(gym.Env):  # Use gym.Env from gymnasium
    """Toy trading environment that replays a table of market features.

    Each row of ``data`` is one time step. The observation is the current row
    cast to float32. The action is a single value in [-1, 1] (negative = sell,
    positive = buy) that is multiplied by ``trade_scale`` to get a trade size.
    The reward is ``trade_size * (next_price - current_price)`` minus
    ``risk_penalty * |trade_size|``.

    Args:
        data: DataFrame-like object supporting ``len``, ``.shape`` and
            ``.iloc[i].values``; one row per time step, one column per feature.
        trade_scale: Factor converting the raw action into a trade size.
        risk_penalty: Cost charged per unit of absolute trade size.
        price_col: Positional index of the column holding the traded price.

    Raises:
        ValueError: If ``data`` has fewer than two rows (no transition exists).
    """

    def __init__(self, data, trade_scale=100.0, risk_penalty=0.01, price_col=0):
        super().__init__()

        # A transition needs a current row and a next row.
        if len(data) < 2:
            raise ValueError("data must contain at least two rows")

        # Historical market data (price, volatility, option Greeks, etc.)
        self.data = data
        self.trade_scale = trade_scale
        self.risk_penalty = risk_penalty
        self.price_col = price_col
        self.current_step = 0

        # State space: one unbounded float32 value per feature column.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.data.shape[1],), dtype=np.float32
        )

        # Action space: fraction of the maximum trade size, in [-1, 1].
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

    def _observation(self, step):
        """Return row ``step`` as a float32 vector matching ``observation_space``."""
        return np.asarray(self.data.iloc[step].values, dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Rewind to the first row.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        # Let gymnasium manage ``self.np_random``; a ``None`` seed leaves the
        # existing generator untouched.
        super().reset(seed=seed)
        self.current_step = 0
        return self._observation(self.current_step), {}

    def step(self, action):
        """Apply one trade and advance one row.

        Args:
            action: Array-like whose first element is the trade fraction.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is True on the transition into the last row;
            ``truncated`` is always False; ``info`` is an empty dict.

        Raises:
            IndexError: If called again after the terminal step without
                ``reset()``, because there is no next row to read.
        """
        current_row = self.data.iloc[self.current_step].values
        next_row = self.data.iloc[self.current_step + 1].values
        terminated = self.current_step >= len(self.data) - 2

        # Accept arrays of any shape (or a bare scalar) and take the first value.
        trade_fraction = float(np.asarray(action).reshape(-1)[0])
        trading_size = trade_fraction * self.trade_scale

        # Profit/loss from the price move over this step.
        price_change = float(next_row[self.price_col] - current_row[self.price_col])
        reward = trading_size * price_change

        # Risk penalty: larger positions cost more regardless of direction.
        reward -= self.risk_penalty * abs(trading_size)

        self.current_step += 1

        # ``truncated`` must be a bool, not a dict.
        return np.asarray(next_row, dtype=np.float32), reward, terminated, False, {}

    def render(self, mode='human'):
        """No-op placeholder; rendering is not implemented."""
        return None



In [20]:
# Load historical data (dummy data in this example).
# Everything stochastic is driven by one seed so a fresh "Restart & Run All"
# reproduces the same data and the same training run.
SEED = 42
N_ROWS = 1000
TOTAL_TIMESTEPS = 10000

rng = np.random.default_rng(SEED)
data = pd.DataFrame({
    'option_price': rng.standard_normal(N_ROWS),   # Example option prices
    'delta': rng.standard_normal(N_ROWS),          # Option delta
    'gamma': rng.standard_normal(N_ROWS),          # Option gamma
    'vega': rng.standard_normal(N_ROWS),           # Option vega
    'theta': rng.standard_normal(N_ROWS),          # Option theta
    'volatility': rng.standard_normal(N_ROWS),     # Market volatility (VIX)
})


def make_env():
    """Build a fresh environment over ``data`` for the vectorized wrapper."""
    return OptionTradingEnv(data)


# Create the environment. PPO requires a vectorized environment; a named
# factory avoids a lambda that closes over the very name being rebound.
env = DummyVecEnv([make_env])

# Create and train the PPO agent
model = PPO("MlpPolicy", env, seed=SEED, verbose=1)
model.learn(total_timesteps=TOTAL_TIMESTEPS)





Using cpu device
-----------------------------
| time/              |      |
|    fps             | 966  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 671         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011296996 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.41       |
|    explained_variance   | 5.32e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.07e+04    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0168     |
|    std                  | 0.99        |
|    value_loss           | 9.86e+04    |
-----------------

<stable_baselines3.ppo.ppo.PPO at 0x7edb3b2ba350>

In [21]:
# Test the trained agent

N_EVAL_STEPS = 100  # upper bound on the length of the evaluation rollout

obs = env.reset()
total_reward = 0.0
steps_run = 0
for _ in range(N_EVAL_STEPS):
    action, _states = model.predict(obs)
    # The vectorized env returns arrays with one entry per sub-environment.
    obs, rewards, dones, infos = env.step(action)
    total_reward += float(rewards.sum())
    steps_run += 1
    # Explicit reduction: truthiness of an array is ambiguous for >1 env.
    if dones.any():
        break

print(f"Evaluation: {steps_run} steps, cumulative reward {total_reward:.2f}")

# Save the model
model.save("ppo_option_trading")

# Load the model for future use
model = PPO.load("ppo_option_trading")