In [1]:
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding
import pandas as pd
from collections import deque
import itertools
from stable_baselines3.dqn import DQN


def safe_divide(a, b):
    """Divide ``a`` by ``b`` element-wise, leaving 0 wherever ``b`` is 0.

    Both inputs are converted with ``np.asarray`` and broadcast against each
    other, so scalars, lists and arrays of compatible shapes are accepted.

    Args:
        a: Numerator (array-like).
        b: Denominator (array-like).

    Returns:
        np.ndarray: ``a / b`` where ``b != 0`` and 0 elsewhere. The result
        keeps ``a``'s dtype when that is already floating/complex; otherwise
        it is promoted to at least ``float64``.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    # np.divide is true division and produces floats; writing that into an
    # integer `out` buffer is rejected by the ufunc casting rules, so the
    # buffer must be inexact even when both inputs are integers.
    if np.issubdtype(a.dtype, np.inexact):
        out_dtype = a.dtype
    else:
        out_dtype = np.result_type(a.dtype, b.dtype, np.float64)

    # Size the buffer from the broadcast shape rather than from `a` alone so
    # that a scalar/short numerator against a larger denominator works.
    out = np.zeros(np.broadcast(a, b).shape, dtype=out_dtype)
    return np.divide(a, b, out=out, where=(b != 0))

def moving_average(iterable, n=3):
    """Lazily yield the simple moving average over each full ``n``-item window.

    moving_average([40, 30, 50, 46, 39, 44]) --> 40.0 42.0 45.0 43.0
    http://en.wikipedia.org/wiki/Moving_average

    Args:
        iterable: Numeric values, consumed once from left to right.
        n: Window length; must be at least 1.

    Yields:
        float: The mean of the latest ``n`` items, one value per item from
        the ``n``-th onward. Nothing is yielded for inputs shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised on first iteration,
        because this is a generator).
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    it = iter(iterable)
    # Prime the window with the first n-1 items plus a leading 0 placeholder;
    # the placeholder is what gets popped when the n-th item arrives, so the
    # running sum covers exactly n real items from the first yield onward.
    window = deque(itertools.islice(it, n - 1))
    window.appendleft(0)
    running_sum = sum(window)
    for elem in it:
        running_sum += elem - window.popleft()
        window.append(elem)
        yield running_sum / n

In [None]:
class TradingEnv(gym.Env):
    """Trading environment whose discrete action picks a target share holding.

    Each action indexes into ``self.shares`` to select how many shares to
    hold. A step advances one row in ``df``, marks the previously held shares
    to the new price, subtracts spread and impact costs for the change in
    holding, and converts that value change into a reward via
    ``delta_vt - kappa * delta_vt ** 2``.

    ``step`` reduces every price row to a single scalar, so ``df`` is expected
    to have exactly one column; a wider frame raises ``ValueError`` on the
    first ``step``.
    """

    metadata = {'render.modes': ['human']}

    def __init__(
        self,
        df: pd.DataFrame,
        window_size: int = 1,
        n_action: int = 5,
        tick_size: float = 0.1,
        lot_size: int = 100,
        start_nav: float = 1e6,
        kappa: float = 0.02,
    ):
        """Store the configuration and declare the action/observation spaces.

        Args:
            df: Price data, one row per tick.
            window_size: Number of most recent rows kept in the observation.
            n_action: Size of the discrete action space. Must not exceed the
                number of entries in ``self.shares``.
            tick_size: Multiplier used by the spread and impact cost terms.
            lot_size: Divisor used by the impact cost term.
            start_nav: Stored on the instance; not read by any method here.
            kappa: Weight of the quadratic penalty in the reward.

        Raises:
            ValueError: If ``n_action`` is larger than the share table, since
                such actions could not be decoded.
        """
        self.seed()
        self.df = df
        self.window_size = window_size
        self.window = deque(maxlen=self.window_size)
        self.max_asset = self.df.shape[1]
        self.shape = (window_size, self.max_asset)
        self.tick_size = tick_size
        self.lot_size = lot_size
        self.kappa = kappa
        self.start_nav = start_nav
        # Target holdings selectable by an action: -200, -100, 0, 100, 200.
        self.shares = np.arange(-200, 300, 100)

        # Fail at construction time instead of with an IndexError from
        # np.take the first time an out-of-table action is sampled.
        if n_action > len(self.shares):
            raise ValueError(
                f"n_action={n_action} exceeds the {len(self.shares)} "
                "available share levels"
            )

        # spaces
        self.n_action = n_action
        self.action_space = spaces.Discrete(self.n_action)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)

        # episode
        self._start_tick = self.window_size - 1
        self._end_tick = self.df.shape[0] - 1
        self._current_tick = None
        self._last_trade_tick = None
        self._first_rendering = None
        self.done = None
        self.total_reward = None
        self.total_profit = None
        self.history = None

    def seed(self, seed=None):
        """Create the environment's random generator and return the seed used."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        """Rewind to the first tick with a full window and return its observation."""
        self._current_tick = self._start_tick
        self._first_rendering = True
        self.window.clear()
        # Each window entry is one price row (1-D array of length max_asset).
        self.window.extend(self.df.iloc[:self.window_size].to_numpy())
        self.done = False
        self.total_reward = 0
        self.total_profit = 0  # unit

        # One placeholder entry per warm-up tick before the start tick. With
        # window_size == 1 these lists start out empty, which delta_vt
        # accounts for when it looks up the previous holding.
        warmup = self.window_size - 1
        self.history = {
            'actions': warmup * [None],
            'shares': warmup * [0],
            'delta_vt': warmup * [0],
            'total_reward': warmup * [self.total_reward],
            'total_profit': warmup * [self.total_profit],
        }
        return self._get_observation()

    def step(self, action: int):
        """Advance one tick, applying ``action`` as the new target holding.

        Args:
            action: Index into ``self.shares``.

        Returns:
            tuple: ``(observation, reward, done, info)`` where ``info`` holds
            the action, decoded shares, this step's value change and the
            running reward/profit totals.

        Raises:
            ValueError: If the price row has more than one column.
        """
        self._current_tick += 1
        price_row = self.df.iloc[self._current_tick, :].to_numpy()
        # Single-asset only: .item() rejects rows with more than one value.
        new_prices = price_row.item()

        self.done = self._current_tick >= self._end_tick

        # Must run before the window is advanced: delta_vt reads window[-1]
        # as the previous tick's price.
        delta_vt = self.delta_vt(action, new_prices)
        step_reward = self._calculate_reward(delta_vt)
        self.total_reward += step_reward
        self.total_profit += delta_vt

        # always update history last
        # dont move this row
        self.window.append(price_row)
        info = dict(
            actions = action,
            delta_vt = delta_vt,
            total_reward = self.total_reward,
            total_profit = self.total_profit,
            shares = self._decode_action(action))
        self._update_history(info)

        # Same observation format as reset(), matching observation_space.
        return self._get_observation(), step_reward, self.done, info

    def _get_observation(self):
        """Return the price window as a float32 array of one row per tick."""
        return np.array(list(self.window), dtype=np.float32)

    def _update_history(self, info):
        """Append each value in ``info`` to the history list of the same key."""
        if not self.history:
            self.history = {key: [] for key in info.keys()}

        for key, value in info.items():
            self.history[key].append(value)

    def spread_cost(self, dn: np.ndarray) -> float:
        """Spread cost of trading ``dn`` shares: ``|dn| * tick_size``."""
        return abs(dn) * self.tick_size

    def impact_cost(self, dn: np.ndarray) -> float:
        """Impact cost of trading ``dn`` shares: ``dn**2 * tick_size / lot_size``."""
        return dn ** 2 * self.tick_size / self.lot_size

    def total_cost(self, dn: np.ndarray) -> float:
        """Sum of spread and impact cost for trading ``dn`` shares."""
        return self.spread_cost(dn) + self.impact_cost(dn)

    def delta_vt(
        self,
        action: int,
        prices: float,
    ):
        """Value change over this tick, net of the cost of re-positioning.

        The previously held shares earn the price move from the last window
        entry to ``prices``; the cost is charged on the difference between
        the newly selected holding and the previous one.

        Args:
            action: Index into ``self.shares`` giving the new target holding.
            prices: Price at the current tick.

        Returns:
            The mark-to-market gain on the previous holding minus trading cost.
        """
        shares = self._decode_action(action)
        # No recorded holding yet (first step with window_size == 1) means
        # the position is flat.
        recorded = self.history['shares']
        prev_shares = recorded[-1] if recorded else 0
        dn = shares - prev_shares

        prev_price = np.asarray(self.window[-1]).item()
        # A non-positive previous price cannot give a meaningful return.
        rate = prices / prev_price - 1 if prev_price > 0 else 0
        return np.sum(prev_shares * prev_price * rate) - self.total_cost(dn)

    def _decode_action(self, action: int) -> np.ndarray:
        """Map an action index to the share holding it selects."""
        return np.take(self.shares, action)

    def render(self, mode='human'):
        """No-op placeholder; plotting is not implemented for this env."""
        pass

    def render_all(self, mode='human'):
        """No-op placeholder; plotting is not implemented for this env."""
        pass

    def close(self):
        """No-op; the environment holds no resources to release."""
        pass

    def save_rendering(self, filepath):
        """No-op placeholder; nothing is written to ``filepath``."""
        pass

    def pause_rendering(self):
        """No-op placeholder; plotting is not implemented for this env."""
        pass

    def _calculate_reward(self, delta_vt):
        """Penalise the value change quadratically: ``dv - kappa * dv**2``."""
        return delta_vt - self.kappa * (delta_vt ** 2)

    def max_possible_profit(self):  # trade fees are ignored
        """Not implemented for this environment."""
        raise NotImplementedError

In [2]:
from stable_baselines3.common.env_checker import check_env
from stock_env.envs.single_stock_env import SingleStockEnv

# Wrap the saved price series in a frame with a single column 'A', then let
# Stable-Baselines3 validate the environment before it is used.
price = np.load('price.pkl.npy')
df = pd.DataFrame().assign(A=price)
env = SingleStockEnv(df)
check_env(env)

# Play one full episode with randomly sampled actions; only the `done` flag
# (third element of the step result) is needed to drive the loop.
obs = env.reset()
done = False
while not done:
    done = env.step(env.action_space.sample())[2]

# Put the recorded episode history next to the input prices, keeping only
# the index labels both frames share.
data = pd.concat([df, pd.DataFrame(env.history)], axis=1, join='inner')
data.head(10)

Unnamed: 0,A,actions,quantity,delta_vt,total_reward,total_profit,portfolio_value,nav,cash
0,50.0,8,300,-82.532602,-82.873183,-82.532602,49917.467398,15037.467398,34880.0
1,50.124891,5,300,8.511069,-74.365736,-74.021533,49925.978467,15045.978467,34880.0
2,50.153262,7,500,-25.268899,-99.666561,-99.290431,49900.709569,25111.36188,24789.347688
3,50.222724,2,200,-88.119009,-188.173818,-187.40944,49812.59056,10076.425743,39736.164817
4,50.382129,0,-300,-339.968672,-533.921424,-527.378112,49472.621888,-15154.607286,64627.229174
5,50.515358,9,100,-206.97814,-743.041562,-734.356252,49265.643748,5044.557622,44221.086126
6,50.445576,7,300,-39.656318,-782.77651,-774.012569,49225.987431,15154.016549,34071.970882
7,50.513388,5,300,-3.247638,-786.024675,-777.260207,49222.739793,15150.768911,34071.970882
8,50.502563,0,-200,-298.522921,-1089.003393,-1075.783128,48924.216872,-10099.035528,59023.2524
9,50.495178,0,-700,-320.520877,-1414.660951,-1396.304005,48603.695995,-35367.145225,83970.84122


In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# NOTE(review): `model` is only created by the DQN training cell further
# down, so on a fresh kernel that cell has to be run before this one.
# With a single evaluation episode the reported spread carries no information.
mean, std = evaluate_policy(model, env, n_eval_episodes=1, render=True)
# The std format spec previously had a space sign flag (": .2f"), which
# printed a doubled space after "+/-"; both numbers now use the same spec.
print(f"Mean reward: {mean:.2f} +/- {std:.2f}")

In [None]:
from stock_env.envs.single_stock_env import SingleStockEnv

# Training budget in environment steps. Kept as an int because
# Stable-Baselines3 documents `total_timesteps` as an integer (the previous
# literal 1e6 is a float).
TOTAL_TIMESTEPS = 1_000_000

# Rebuild the environment from the saved price series, this time with an
# explicit starting cash balance.
price = np.load('price.pkl.npy')
df = pd.DataFrame()
df['A'] = price
env = SingleStockEnv(df, init_cash=5e4)

model = DQN(
    'MlpPolicy',
    env=env,
    learning_rate=0.001,
    gamma=0.999,
    # Initial and final epsilon are equal, so exploration stays at 0.1
    # for the whole run instead of being annealed.
    exploration_initial_eps=0.1,
    exploration_final_eps=0.1,
    learning_starts=1000,
    target_update_interval=1000,
    tensorboard_log='log',
    verbose=1,
)
model.learn(total_timesteps=TOTAL_TIMESTEPS)