### Reinforcement Learning for Stock Market Trading

In [1]:
# Imports
import math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as pdr

from tqdm import tqdm_notebook, tqdm
from collections import deque

### Defining the Stock Market Trader

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

class stock_trader():
    def __init__(self, state_size, action_space=3, model_name='mr_stonks'): # stay, buy, or sell
        self.state_size = state_size
        self.action_space = action_space
        self.memory = deque(maxlen=2000) # no. of elements we can store
        self.inventory = []
        self.model_name = model_name

        self.gamma = 0.95 # discount factor
        self.epsilon = 1.0 # probability of random action selection; initialised to random actions
        self.epsilon_final = 0.01 # lower bound of epsilon
        self.epsilon_decay = 0.995 # rate of decay

        self.model = self.model_builder()

    def model_builder(self):
        model = Sequential()
        model.add(Dense(units=32, activation='relu', input_dim=self.state_size))
        model.add(Dense(units=64, activation='relu'))
        model.add(Dense(units=128, activation='relu'))
        # Output layer
        model.add(Dense(units=self.action_space, activation='linear'))

        model.compile(loss='mse', optimizer=Adam(lr=0.001))

        return model

    def trade(self, state):
        if (random.random() <= self.epsilon): # <= epsilon means a random action is taken; exploration
            return random.randrange(self.action_space) # returns a random number between 0-2; stay, buy or sell
        
        actions = self.model.predict(state)

        return np.argmax(actions[0]) # only return action with highest probability

    def batch_train(self, batch_size):
        batch = []

        for i in range(len(self.memory)-batch_size+1, len(self.memory)):
            batch.append(self.memory[i])

        for state, action, reward, next_state, done in batch:
            reward = reward
            
            if (not done): # if not in terminal state (still have actions that can be taken)
                reward = reward + self.gamma * np.amax(self.model.predict(next_state)[0]) # Deep Q-learning Bellman equation

            target = self.model.predict(state)
            target[0][action] = reward

            self.model.fit(state, target, epochs=1, verbose=0)

        # decrease epsilon
        if (self.epsilon > self.epsilon_final):
            self.epsilon *= self.epsilon_decay

#### Preprocess Data

In [3]:
# Define sigmoid function
def sigmoid(x):
    return 1/(1+math.exp(-x))

# Price format
def stocks_price_format(n):
    return f'- $ {abs(n):2f}' if (n < 0) else f'$ {abs(n):2f}'

In [4]:
# Dataset loader
def dataset_loader(stock_name):
  dataset = pdr.DataReader(stock_name, data_source="yahoo")
  
  start_date = str(dataset.index[0]).split()[0]
  end_date = str(dataset.index[-1]).split()[0]
  
  close = dataset['Close']

  return close

In [5]:
# State creator
def state_creator(data, timestep, window_size):
    starting_id = timestep - window_size+1

    if (starting_id >= 0):
        windowed_data = data[starting_id:timestep+1]
    else:
        windowed_data = - starting_id * [data[0]] + list(data[0:timestep+1])

    state = []

    for i in range(window_size - 1):
        state.append(sigmoid(windowed_data[i+1] - windowed_data[i]))

    return np.array([state])

In [6]:
# Stock
stock_name = 'AAPL'
data = dataset_loader(stock_name)

data

Date
2015-06-18    127.879997
2015-06-19    126.599998
2015-06-22    127.610001
2015-06-23    127.029999
2015-06-24    128.110001
                 ...    
2020-06-10    352.839996
2020-06-11    335.899994
2020-06-12    338.799988
2020-06-15    342.989990
2020-06-16    350.480011
Name: Close, Length: 1258, dtype: float64

#### Training the Trader

In [7]:
window_size = 10
episodes = 1000

batch_size = 32
data_samples = len(data) - 1

trader = stock_trader(window_size)
trader.model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                352       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 387       
Total params: 11,171
Trainable params: 11,171
Non-trainable params: 0
_________________________________________________________________


#### Training Loop

In [8]:
for episode in range(1, episodes+1):
    print(f'Episode: {episode}/{episodes}')

    # Initialise
    state = state_creator(data, 0, window_size+1)
    total_profit = 0
    trader.inventory = []

    for t in tqdm(range(data_samples)):
        action = trader.trade(state)
        next_state = state_creator(data, t+1, window_size+1)
        reward = 0

        if (action == 1): # Buying
            trader.inventory.append(data[t])
            print("AI Trader bought: ", stocks_price_format(data[t]))

        elif (action == 2) and len(trader.inventory): # Selling
            buy_price = trader.inventory.pop(0)

            reward = max(data[t]-buy_price, 0)
            total_profit += data[t] - buy_price
            print("AI Trader sold: ", stocks_price_format(data[t]), " Profit: " + stocks_price_format(data[t] - buy_price))

        done = True if (t==data_samples-1) else False

        trader.memory.append((state, action, reward, next_state, done))
        state = next_state

        if (done):
            print('########################')
            print(f'TOTAL PROFIT: {total_profit}')
            print('########################')

        if (len(trader.memory) > batch_size):
            trader.batch_train(batch_size)

    if (episode%10 == 0):
        trader.model.save(f'{trader.model_name}-{episode}')

0%|          | 0/1257 [00:00<?, ?it/s]Episode: 1/1000
AI Trader bought:  $ 126.599998
AI Trader bought:  $ 127.610001
AI Trader sold:  $ 127.029999  Profit: $ 0.430000
AI Trader bought:  $ 127.500000
AI Trader bought:  $ 126.750000
AI Trader bought:  $ 125.430000
AI Trader sold:  $ 126.000000  Profit: - $ 1.610001
AI Trader bought:  $ 125.690002
AI Trader bought:  $ 122.570000
AI Trader bought:  $ 125.660004
AI Trader sold:  $ 125.610001  Profit: - $ 1.889999
AI Trader bought:  $ 128.509995
AI Trader bought:  $ 129.619995
AI Trader sold:  $ 125.220001  Profit: - $ 1.529999
AI Trader sold:  $ 125.160004  Profit: - $ 0.269997
AI Trader bought:  $ 122.769997
AI Trader bought:  $ 123.379997
AI Trader bought:  $ 122.989998
AI Trader bought:  $ 122.370003
AI Trader bought:  $ 118.440002
AI Trader bought:  $ 114.639999
  3%|▎         | 33/1257 [00:03<02:14,  9.11it/s]AI Trader bought:  $ 115.400002
  3%|▎         | 34/1257 [00:06<19:01,  1.07it/s]AI Trader sold:  $ 115.129997  Profit: - $ 10.

KeyboardInterrupt: 

Interrupted model training because it will take forever to complete. STONKS