In [20]:
"""
In this example we demonstrate how to implement a DQN agent and
train it to trade optimally on a periodic price signal.
Training time is short and results are unstable.
Do not hesitate to run several times and/or tweak parameters to get better results.
Inspired by https://github.com/keon/deep-q-learning
"""
import random

import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from tgym.envs import SpreadTrading
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [17]:
class DQNAgent:
    """Deep Q-Network agent with a ring-buffer replay memory.

    The agent follows an epsilon-greedy policy whose epsilon starts at 1.0
    and is decreased linearly, once per training step, down to
    ``epsilon_min``. Actions are exchanged as one-hot vectors of length
    ``action_size``.

    Args:
        state_size: Number of features in a flattened observation.
        action_size: Number of discrete actions.
        episodes: Planned number of training episodes (used for the
            epsilon schedule only).
        episode_length: Planned number of steps per episode (used for the
            epsilon schedule only).
        memory_size: Capacity of the replay memory.
        train_interval: A training step is run whenever the write index of
            the replay memory is a multiple of this value.
        gamma: Discount factor applied to the bootstrapped next-state value.
        learning_rate: Learning rate handed to the Adam optimizer.
        batch_size: Maximum number of transitions sampled per training step.
        epsilon_min: Lower bound of the exploration rate.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 episodes,
                 episode_length,
                 memory_size=2000,
                 train_interval=100,
                 gamma=0.95,
                 learning_rate=0.001,
                 batch_size=64,
                 epsilon_min=0.01
                 ):
        self.state_size = state_size
        self.action_size = action_size
        self.memory_size = memory_size
        # Slots hold None until a transition has been written to them.
        self.memory = [None] * memory_size
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_min = epsilon_min
        # Linear decrease rate: epsilon is lowered once per training step,
        # and there is one training step every `train_interval` observations.
        self.epsilon_decrement = (self.epsilon - epsilon_min)\
            * train_interval / (episodes * episode_length)
        self.learning_rate = learning_rate
        self.train_interval = train_interval
        self.batch_size = batch_size
        self.brain = self._build_brain()
        # Write index into `self.memory`; incremented before each write.
        self.i = 0

    def _build_brain(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a state vector of
            length ``state_size`` to ``action_size`` linear outputs.
        """
        brain = Sequential()
        neurons_per_layer = 24
        activation = "relu"
        brain.add(Dense(neurons_per_layer,
                        input_dim=self.state_size,
                        activation=activation))
        brain.add(Dense(neurons_per_layer, activation=activation))
        brain.add(Dense(self.action_size, activation='linear'))
        # Recent Keras releases only accept `learning_rate`; older ones only
        # accept `lr` and raise TypeError on unknown keyword arguments.
        try:
            optimizer = Adam(learning_rate=self.learning_rate)
        except TypeError:
            optimizer = Adam(lr=self.learning_rate)
        brain.compile(loss='mse', optimizer=optimizer)
        return brain

    def act(self, state):
        """Choose an action with the epsilon-greedy policy.

        Args:
            state: Observation that can be reshaped to ``(1, state_size)``.

        Returns:
            A one-hot numpy array of length ``action_size``.
        """
        action = np.zeros(self.action_size)
        if np.random.rand() <= self.epsilon:
            # Explore: uniformly random action.
            action[random.randrange(self.action_size)] = 1
        else:
            # Exploit: action with the highest predicted value.
            state = np.asarray(state).reshape(1, self.state_size)
            act_values = self.brain.predict(state)
            action[np.argmax(act_values[0])] = 1
        return action

    def observe(self, state, action, reward, next_state, done, warming_up=False):
        """Store a transition and, when due, run one training step.

        Args:
            state: Observation the action was taken in.
            action: One-hot action vector that was taken.
            reward: Reward received for the transition.
            next_state: Observation reached after the action.
            done: Whether the transition ended the episode.
            warming_up: When True the transition is only stored; epsilon is
                not decayed and the network is not trained.

        Returns:
            The object returned by ``brain.fit`` when a training step ran,
            otherwise None.
        """
        self.i = (self.i + 1) % self.memory_size
        self.memory[self.i] = (state, action, reward, next_state, done)
        if warming_up or (self.i % self.train_interval) != 0:
            return None

        if self.epsilon > self.epsilon_min:
            # Clamp so the linear schedule never undershoots the floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_decrement)

        state, action, reward, next_state, done = self._get_batches()
        # Bellman target: r + gamma * max_a' Q(s', a'), without the
        # bootstrap term for terminal transitions.
        target = reward + (self.gamma
                           * np.logical_not(done)
                           * np.amax(self.brain.predict(next_state), axis=1))
        # Start from the current predictions so that only the output of the
        # action actually taken contributes to the loss.
        q_target = self.brain.predict(state)
        q_target[action[0], action[1]] = target
        return self.brain.fit(state, q_target,
                              batch_size=self.batch_size,
                              epochs=1,
                              verbose=False)

    def _get_batches(self):
        """Sample a batch of stored transitions and split it by field.

        Only slots that already hold a transition are sampled, and at most
        ``batch_size`` of them.

        Returns:
            A tuple ``(state_batch, action_batch, reward_batch,
            next_state_batch, done_batch)`` where the state batches have
            shape ``(n, state_size)``, the reward and done batches have
            shape ``(n,)`` and ``action_batch`` is the ``(rows, columns)``
            index pair produced by ``np.where`` on the one-hot actions.

        Raises:
            ValueError: If no transition has been stored yet.
        """
        stored = [transition for transition in self.memory
                  if transition is not None]
        if not stored:
            raise ValueError("replay memory is empty")
        n = min(self.batch_size, len(stored))
        # Unzip field by field: packing whole transitions into one array
        # would create a ragged array, which recent NumPy versions reject.
        states, actions, rewards, next_states, dones = \
            zip(*random.sample(stored, n))
        state_batch = np.asarray(states, dtype=float)\
            .reshape(n, self.state_size)
        action_batch = np.asarray(actions)\
            .reshape(n, self.action_size)
        reward_batch = np.asarray(rewards, dtype=float)
        next_state_batch = np.asarray(next_states, dtype=float)\
            .reshape(n, self.state_size)
        done_batch = np.asarray(dones, dtype=bool)
        # action processing: one-hot rows -> (row index, action index)
        action_batch = np.where(action_batch == 1)
        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

In [18]:
import matplotlib.pyplot as plt
import os
import sys

# Location of the Trading-Gym checkout. Set the TGYM_ROOT environment
# variable to override the default path.
TGYM_ROOT = os.environ.get('TGYM_ROOT',
                           '/Users/matthewdixon/Downloads/Trading-Gym/')
# Keep the cell idempotent: re-running it must not grow sys.path.
if TGYM_ROOT not in sys.path:
    sys.path.append(TGYM_ROOT)
from tgym.envs import SpreadTrading
#from tgym.gens.deterministic import WavySignal
#from tgym.gens.random import AR1
from tgym.gens.csvstream import CSVStreamer

# Instantiating the environment
generator = CSVStreamer(filename='../../data/AMZN-L1.csv')
# Synthetic alternatives to the CSV stream:
#generator = AR1(a=0.1, ba_spread=0.1)   #WavySignal(period_1=25, period_2=50, epsilon=-0.5)
episodes = 100
episode_length = 400
trading_fee = .0
time_fee = 0
history_length = 2
environment = SpreadTrading(spread_coefficients=[1],
                            data_generator=generator,
                            trading_fee=trading_fee,
                            time_fee=time_fee,
                            history_length=history_length)

In [5]:
# AR1 is not imported by any earlier cell, so import it here; otherwise this
# cell raises NameError on a fresh kernel.
from tgym.gens.random import AR1

# NOTE: this rebinds the name `generator`. An environment that was already
# constructed keeps the generator object it was given.
generator = AR1(a=0.1, ba_spread=0.1)   #WavySignal(period_1=25, period_2=50, epsilon=-0.5)
# Bare expression: display the generator's attributes.
generator.__dict__

{'_trainable': False,
 'gen_kwargs': {'a': 0.1, 'ba_spread': 0.1},
 'generator': <generator object _generator at 0x115692e10>,
 'n_products': 1}

In [6]:
# Display the environment's instance attributes.
vars(environment)

{'_action': array([1, 0, 0]),
 '_closed_plot': False,
 '_data_generator': <tgym.gens.random.AR1 at 0x11568b490>,
 '_depths_history': [(1000, 1000), (988, 1006)],
 '_entry_price': 0,
 '_episode_length': 1000,
 '_exit_price': 0,
 '_first_render': True,
 '_history_length': 2,
 '_iteration': 0,
 '_position': array([1, 0, 0]),
 '_prices_history': [(100, 100.1), (8.985108107208035, 9.085108107208034)],
 '_spread_coefficients': [1],
 '_time_fee': 0,
 '_total_pnl': 0,
 '_total_reward': 0,
 '_trading_fee': 0.2,
 'n_actions': 3,
 'state_shape': (12,)}

In [7]:
# Reset the environment and keep the observation it returns.
state = environment.reset()
# Bare expression so the notebook displays the observation.
state

array([1.00000000e+02, 1.00100000e+02, 9.73035592e+00, 9.83035592e+00,
       1.00000000e+03, 1.00000000e+03, 1.00000000e+03, 1.00200000e+03,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [19]:
# The length of an initial observation fixes the input size of the network.
state = environment.reset()
# Instantiating the agent
memory_size = 3000
state_size = len(state)
gamma = 0.96
epsilon_min = 0.01
batch_size = 64
action_size = len(SpreadTrading._actions)
train_interval = 10
learning_rate = 0.001
agent = DQNAgent(state_size=state_size,
                 action_size=action_size,
                 memory_size=memory_size,
                 episodes=episodes,
                 episode_length=episode_length,
                 train_interval=train_interval,
                 gamma=gamma,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 epsilon_min=epsilon_min)
# Warming up the agent: fill the replay memory without training.
for _ in range(memory_size):
    action = agent.act(state)
    next_state, reward, done, info = environment.step(action)
    agent.observe(state, action, reward, next_state, done, warming_up=True)
    # Advance to the observation just reached so stored transitions chain
    # together; start a new episode once the current one has ended.
    state = environment.reset() if done else next_state
# Training the agent
for ep in range(episodes):
    state = environment.reset()
    rew = 0
    # Result of the most recent training step in this episode, if any
    # (observe returns None on steps where no training happens).
    loss = None
    for _ in range(episode_length):
        action = agent.act(state)
        next_state, reward, done, info = environment.step(action)
        step_loss = agent.observe(state, action, reward, next_state, done)
        if step_loss is not None:
            loss = step_loss
        state = next_state
        rew += reward
        if done:
            # The environment ended the episode early.
            break
    if loss is None:
        loss_text = "n/a"
    else:
        loss_text = str(round(loss.history["loss"][0], 4))
    print("Ep:" + str(ep)
          + "| rew:" + str(round(rew, 2))
          + "| eps:" + str(round(agent.epsilon, 2))
          + "| loss:" + loss_text)

Ep:0| rew:-10.47| eps:0.99| loss:1378.1431
Ep:1| rew:-0.98| eps:0.98| loss:91.3164
Ep:2| rew:-11.04| eps:0.97| loss:49.2645
Ep:3| rew:-2.39| eps:0.96| loss:57.4468
Ep:4| rew:1.88| eps:0.95| loss:146.9944
Ep:5| rew:-7.58| eps:0.94| loss:140.1393
Ep:6| rew:-14.06| eps:0.93| loss:100.5125
Ep:7| rew:-7.6| eps:0.92| loss:21.5093
Ep:8| rew:2.42| eps:0.91| loss:14.273
Ep:9| rew:-7.62| eps:0.9| loss:13.6398
Ep:10| rew:-12.16| eps:0.89| loss:3.9049
Ep:11| rew:-18.32| eps:0.88| loss:3.8552
Ep:12| rew:-17.57| eps:0.87| loss:2.1659
Ep:13| rew:-24.8| eps:0.86| loss:2.699
Ep:14| rew:-9.16| eps:0.85| loss:1.6074
Ep:15| rew:-1.8| eps:0.84| loss:1.6647
Ep:16| rew:-7.9| eps:0.83| loss:1.6519
Ep:17| rew:15.37| eps:0.82| loss:1.6424
Ep:18| rew:1.57| eps:0.81| loss:1.4861
Ep:19| rew:-33.38| eps:0.8| loss:1.424
Ep:20| rew:-34.96| eps:0.79| loss:1.2017
Ep:21| rew:-11.38| eps:0.78| loss:1.7213
Ep:22| rew:5.02| eps:0.77| loss:1.2855
Ep:23| rew:-21.87| eps:0.76| loss:3.0298
Ep:24| rew:-4.47| eps:0.75| loss:1.85

In [None]:
# Running the agent
done = False
state = environment.reset()
while not done:
    action = agent.act(state)
    state, _, done, info = environment.step(action)
    if 'status' in info and info['status'] == 'Closed plot':
        done = True
    else:
        environment.render()