In [1]:
import gym
from gym import spaces
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import torch

2025-03-14 10:53:45.169392: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class MarketMakingEnv(gym.Env):
    def __init__(self, t=0, dt=0.001, T=1, eta=0.0, sigma=0.1, sigma_bar=0.1, varsigma=0.1, inventory_max=5, cash_balance=0, S0=50.005, bidprice=50, askprice=50.01,
                 spread=0.01, inventory=0, phi=0.001):
        super(MarketMakingEnv, self).__init__()
        self.initial_parameters = (spread, askprice, bidprice, S0)
        self.t = t
        self.dt = dt
        self.T = T
        self.eta = eta
        self.sigma = sigma
        self.sigma_bar = sigma_bar
        self.varsigma = varsigma
        self.inventory_max = inventory_max        
        self.cash_balance = cash_balance
        self.old_cash = cash_balance
        self.current_price = S0
        self.old_price = S0
        self.spread = spread
        self.inventory = inventory
        self.old_inventory = inventory
        self.phi = phi # Risk aversion parameter 
        self.ask_order = np.inf 
        self.bid_order = -np.inf 
        self.prev_ask_order = np.inf
        self.prev_bid_order = -np.inf
        self.reward = 0 
        self.cumulative_reward = 0
        self.non_adverse_fill_prob = 0.2
        self.prev_action = 0
        
        # Records
        self.time_history = []
        self.cash_balance_history = []
        self.price_history = []  
        self.inventory_history = []
        self.reward_history = []
        self.cumulative_reward_history = []
        self.bid_trade_prices = []
        self.ask_trade_prices = []
        self.bid_trade_times = []
        self.ask_trade_times = []
        
        # State and action space
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  # Buy, Sell, Hold
        self.observation_space = spaces.Box(low=np.array([0, -5], dtype=np.float32), 
                                              high=np.array([100, 5], dtype=np.float32), 
                                             dtype=np.float32)  # Prices, Inventory
        
        self.reset()

    def reset(self):
        self.t = 0
        self.current_price = self.initial_parameters[3]
        self.old_price = self.initial_parameters[3]
        self.inventory = 0.0
        self.old_inventory = 0.0
        self.cash_balance = 0
        self.old_cash = 0
        self.done = False
        self.spread = self.initial_parameters[0]
        self.ask_price = self.initial_parameters[1]
        self.bid_price = self.initial_parameters[2]
        self.ask_order = np.inf
        self.bid_order = -np.inf
        self.prev_ask_order = np.inf
        self.prev_bid_order = -np.inf
        self.reward = 0 # Initialize reward
        self.cumulative_reward = 0
        self.non_adverse_fill_prob = 0.2
        self.prev_action = 0
        
        # Reset records
        self.time_history = [self.t]
        self.price_history = [self.current_price]  
        self.inventory_history = [self.inventory]
        self.cash_balance_history = [self.cash_balance]
        self.reward_history = [0]
        self.cumulative_reward_history = []
        self.bid_trade_prices = []
        self.ask_trade_prices = []
        self.bid_trade_times = []
        self.ask_trade_times = []
        
        return self.normalize_state()
    
    def normalize_state(self):
        
        # Normalize current_price using z-score
        if len(self.price_history) > 0:
            current_price_mean = np.mean(self.price_history)
            current_price_std = np.std(self.price_history)
        else:
            current_price_mean = 1
            current_price_std = 1
        if current_price_std == 0:
            normalized_current_price = 0  
        else:
            normalized_current_price = (self.current_price - current_price_mean) / current_price_std

        # Normalize inventory using min-max normalization
        inventory_min = 0
        inventory_max = self.inventory_max
        if inventory_max == inventory_min:  
            normalized_inventory = 0  
        else:
            normalized_inventory = (self.inventory - inventory_min) / (inventory_max - inventory_min)

        normalized_state = np.array([normalized_current_price, normalized_inventory], dtype=np.float32)
        normalized_state = np.nan_to_num(normalized_state, nan=0.0)

        return normalized_state


    def step(self, action):
               
        self.old_cash, self.old_price, self.old_inventory = self.cash_balance, self.current_price, self.inventory
        self.prev_bid_order = self.bid_order
        self.prev_ask_order = self.ask_order
        self.t += self.dt
        self.time_history.append(self.t) 
        dW = np.random.normal(0, np.sqrt(self.dt))
        self.current_price += self.eta * self.dt + (self.sigma+self.sigma_bar+self.varsigma) * np.sqrt(self.dt) * dW
        self.current_price = max(0, self.current_price)
        self.price_history.append(self.current_price)    
        
        action = action[0] 
        if action > 0:
            action = 1  # Buy
            self.bid_order = np.round(self.current_price - self.spread, 2)
            self.ask_order = np.inf
        elif action < 0:
            action = -1  # Sell
            self.ask_order = np.round(self.current_price + self.spread, 2)
            self.bid_order = -np.inf
        else:
            action = 0  # Hold
            self.ask_order = np.inf
            self.bid_order = -np.inf     
        
        if self.current_price <= 0 or self.t >= self.T:
            self.done = True

        if self.inventory <= -self.inventory_max: # max short
            self.ask_order = np.inf
        if self.inventory >= self.inventory_max: # max long
            self.bid_order = -np.inf
        
        # Check Fills
        if self.prev_action == 1 and self.prev_bid_order != -np.inf:
            if (self.current_price - (self.spread / 2)) < self.prev_bid_order and self.inventory < self.inventory_max:
                self.inventory += 1
                self.cash_balance -= self.prev_bid_order
                self.bid_trade_prices.append(self.prev_bid_order)
                self.bid_trade_times.append(self.t)
                self.ask_order = np.inf
                self.bid_order = -np.inf
        elif action == 1: 
            if (self.current_price - (self.spread / 2)) >= self.prev_bid_order and self.inventory < self.inventory_max and np.random.binomial(1, self.non_adverse_fill_prob) == 1:
                self.inventory += 1
                self.cash_balance -= self.bid_order
                self.bid_trade_prices.append(self.current_price - (self.spread / 2))
                self.bid_trade_times.append(self.t)
                self.ask_order = np.inf
                self.bid_order = -np.inf
        elif self.prev_action == -1 and self.prev_ask_order != np.inf:
            if (self.current_price + (self.spread / 2)) > self.prev_ask_order and self.inventory > -self.inventory_max:
                self.inventory -= 1
                self.cash_balance += self.prev_ask_order
                self.ask_trade_prices.append(self.prev_ask_order)
                self.ask_trade_times.append(self.t)
                self.ask_order = np.inf
                self.bid_order = -np.inf
        elif action == -1: 
            if (self.current_price + (self.spread / 2)) <= self.prev_ask_order and self.inventory > -self.inventory_max and np.random.binomial(1, self.non_adverse_fill_prob) == 1:
                self.inventory -= 1
                self.cash_balance += self.ask_order
                self.ask_trade_prices.append(self.current_price + (self.spread / 2))
                self.ask_trade_times.append(self.t)
                self.ask_order = np.inf
                self.bid_order = -np.inf
        else:  
            pass
        self.prev_action = action
        
        inventory_penalty = self.phi * self.inventory**2
        self.reward = (self.cash_balance + self.inventory * self.current_price - 
                   (self.old_cash + self.old_inventory * self.old_price)) - inventory_penalty
        
        self.cumulative_reward += self.reward
        self.cumulative_reward_history.append(self.cumulative_reward)
        self.inventory_history.append(self.inventory)
        self.cash_balance_history.append(self.cash_balance)
        self.reward_history.append(self.reward)
        
        info = {
            'cash_balance_history': self.cash_balance_history,
            'inventory_history': self.inventory_history,
            'price_history': self.price_history,
            'reward_history': self.reward_history, 
            'cumulative_reward_history': self.cumulative_reward_history,
            'bid_trade_prices': self.bid_trade_prices,
            'ask_trade_prices': self.ask_trade_prices,
            'bid_trade_times': self.bid_trade_times,
            'ask_trade_times': self.ask_trade_times,
            'time_history': self.time_history,
            'done': self.done,
        }
        
        return (self.normalize_state(), self.reward, self.done, info)

    def render(self, mode='human'):
        pass

In [4]:
num_timesteps = 1000000 
env = MarketMakingEnv()
model = SAC("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=num_timesteps)
model.save("market_making_model_sac")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -18      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 125      |
|    time_elapsed    | 31       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -6.55    |
|    critic_loss     | 0.00313  |
|    ent_coef        | 0.311    |
|    ent_coef_loss   | -1.95    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -18      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 124      |
|    time_elapsed    | 64       |
|    total_timesteps | 8000     |
| train/             |          |
|    actor_loss      | -6.78    |
|    critic_loss     | 0.158    |
|    ent_coef 

In [6]:
trained_model = SAC.load("market_making_model_sac")

# Testing the model
obs = env.reset()
episode_info = []
for i in range(200000):
    action, _states = trained_model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
        obs = env.reset()