In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
from datetime import datetime
import itertools
import argparse
import re
import os
import pickle

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
#Let's use AAPL, MSI and SBUX
def get_data():
    #returns a list of T * 3 stock prices
    #each row is a different stock
    # 0 == AAPL
    # 1 == MSI
    # 2 == SBUX
    df = pd.read_csv("C:\\Users\\mohan\\Documents\\GitHub\\machine_learning_examples\\tf2.0\\aapl_msi_sbux.csv")
    return df.values

In [5]:
def get_scaler(env):
    #return scikit-learn scaler object to scale the states
    #Note you could also populate the replay buffer here.
    states = []
    
    for _ in range(env.n_step):
        action = np.random.choice(env.action_space)
        state, reward, done, info = env.step(action)
        states.append(state)
        if done:
            break
    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

In [6]:
def maybe_make_dir(directory):
    '''It checks if a particular directory exists if it doesn't the it creates the directory'''
    if not os.path.exists(directory):
        os.makedirs(directory)

In [7]:
class LinearModel:
    '''A linear regression Model with multiple outputs using stochastic gradient descent'''
    
    def __init__(self, input_dim, n_action):
        self.W = np.random.randn(input_dim, n_action)/np.sqrt(input_dim)
        self.b = np.zeros(n_action)
        
        #momentum terms
        self.vW = 0
        self.vb = 0
        
        self.losses = [] #we will populate this at each step of a gradient decsent.
        
    def predict(self, X):
        #make sure X is N * D
        assert(len(X.shape) == 2)
        return X.dot(self.W) + self.b
    
    def sgd(self, X, Y, learning_rate=0.01, momentum=0.9):
        #make sure X is N * D
        assert(len(X.shape)==2)
        
        #Here the loss values are two dimensinal, normally we would divide by N only, but now we divide by N * K, N=1 in stohastic gradient descent
        num_values = np.prod(Y.shape)
        
        #Do one step of gradient descent
        #We mupltiply by 2 to get the exact gradient.
        Yhat = self.predict(X)
        gW = 2 * X.T.dot(Yhat - Y) / num_values
        gb = 2 * (Yhat - Y).sum() / num_values
        
        #update momentum terms
        self.vW = momentum * self.vW - learning_rate * gW
        self.vb = momentum * self.vb - learning_rate * gb
        
        self.W += self.vW
        self.b += self.vb
        
        mse = np.mean((Yhat - Y) ** 2)
        self.losses.append(mse)
        
    def load_weights(self, filepath):
        npz = np.load(filepath)
        self.W = npz['W']
        self.b = npz['b']

    def save_weights(self, filepath):
        np.savez(filepath, W = self.W, b = self.b)
        

In [8]:
class MultiStockEnv:
    """
    A 3 stock trading Environment
    State: vector of size 7 (n_stock * 2 + 1)
        - # shares of stock 1 owned
        - # shares of stock 2 owned
        - # shares of stock 3 owned
        - # price of stock 1(using daily close price)
        - # price of stock 2
        - # price of stock 3
        - cash owned (can be used to purchase more stocks)
    Action: categorical variable with 27(3**3) possiblites
        -for each stock you can:
        - 0 = sell
        - 1 = hold
        - 2 = buy
    """
    
    def __init__(self, data, initial_investment=20000):
        #data
        self.stock_price_history = data
        self.n_step, self.n_stock = self.stock_price_history.shape
        
        #instance attribute
        self.initial_investment = initial_investment
        self.current_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None
        
        self.action_space = np.arange(3**self.n_stock)
        
        #action permutations
        #returns a nested list with elements like :
        # [0, 0, 0]
        # [0, 0, 1]
        # [0, 0, 2]
        # [0, 1, 0]
        # [0, 1, 1]
        # etc.
        # 0 - Sell
        # 1 - Hold
        # 2 - Buy
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat = self.n_stock)))
        
        # Calculate the size of a state
        self.state_dim = self.n_stock * 2 + 1
        
        self.reset()
        
    def reset(self):
        self.current_step = 0
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self.stock_price_history[self.current_step]
        self.cash_in_hand = self.initial_investment
        return self._get_obs()
    
    def step(self, action):
        assert action in self.action_space
        
        #get current value before performing an action
        prev_val = self._get_val()
        
        #update price i.e go to the next day
        self.current_step += 1
        self.stock_price = self.stock_price_history[self.current_step]
        
        #perform the trade
        self._trade(action)
        
        #get the new value after taking an action
        current_val = self._get_val()
        
        #reward is the increase in portfolio value 
        reward = current_val - prev_val
        
        #done if we run out of data
        done = self.current_step == self.n_step - 1
        
        #store the current value of a portfolio here.
        info = {'current_val': current_val}
        
        #conform to the Gym API
        return self._get_obs(), reward, done, info
    
    def _get_obs(self):
        #Here state and observations are same, but many times states are some transformations of observation.
        obs = np.empty(self.state_dim)
        obs[ : self.n_stock] = self.stock_owned
        obs[self.n_stock : self.n_stock * 2] = self.stock_price
        obs[-1] = self.cash_in_hand
        return obs
    
    def _get_val(self):
        #current portfolio value
        return self.stock_owned.dot(self.stock_price) + self.cash_in_hand
    
    def _trade(self, action):
        #get index of the action we want to perform
        # 0 - sell
        # 1 - hold
        # 2 - buy
        # eg [2, 1, 0] means buy first stock, hold second stock and sell third stock
        
        action_vec = self.action_list[action]
        
        #determine which stock to sell and which stock to buy
        
        sell_index = [] #stores index of the stock we want to sell
        buy_index = [] #stores index of the stock we want to buy
        
        for index, action in enumerate(action_vec):
            if action == 0:
                sell_index.append(index)
            elif action == 2:
                buy_index.append(index)
                
        #sell any stocks we want to sell then buy any stocks we want to buy.
        if sell_index:
            #Note: to simplify the problem, when we sell, we sell all the shares of the stock.
            for index in sell_index:
                self.cash_in_hand += self.stock_owned[index] * self.stock_price[index]
                self.stock_owned[index] = 0
        if buy_index:
            #Note: While buying we will loop through each stock we want to buy and buy one share at a time untill we run out of cash.
            can_buy = True
            while can_buy:
                for index in buy_index:
                    if self.cash_in_hand > self.stock_price[index]:
                        self.stock_owned[index] += 1
                        self.cash_in_hand -= self.stock_price[index]
                    else:
                        can_buy = False
        

In [9]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95 #discount rate
        self.epsilon = 1 #initial value of exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = LinearModel(state_size, action_size)
        
    def act(self, state):
        if np.random.randn() <= self.epsilon:
            return np.random.choice(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0]) #returns action - need to understand a little bit better
    
    def train(self, state, action, reward, next_state, done):
        if done:
            target = reward
        else:
            target = self.gamma * np.amax(self.model.predict(next_state), axis=1)
            
        target_full = self.model.predict(state) #calculating full target
        target_full[0, action] = target #replace prediction value of the actual target with target
        
        #Run one Training step
        self.model.sgd(state, target_full)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [10]:
def play_one_episode(agent, env, is_train, scaler):
    #After transforming states are already 1 * D
    state = env.reset()
    state = scaler.transform([state])
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([next_state])
        if is_train == 'train':
            agent.train(state, action, reward, next_state, done)
        state = next_state
    return info['current_val']

In [11]:
def main(mode):
    #config
    models_folder = 'linear_rl_trader_models'
    rewards_folder = 'linear_rl_trader_rewards'
    num_episodes = 2000
    batch_size = 32
    initial_investment = 20000
    
    maybe_make_dir(models_folder)
    maybe_make_dir(rewards_folder)
    
    data = get_data()
    n_timesteps, n_stocks = data.shape
    
    n_train = n_timesteps//2
    train_data = data[: n_train]
    test_data = data[n_train: ]
    
    env = MultiStockEnv(train_data, initial_investment)
    state_size = env.state_dim
    action_size = len(env.action_space)
    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)
    
    #For storing the final portfolio value at the end of each episode.
    portfolio_value = []
    
    if mode == 'test':
        #then load the previous scaler
        with open(f'{models_folder}/scaler.pkl', 'rb') as f:
            scaler = pickle.load(f)
        
        #remake the environment with test_data 
        env = MultiStockEnv(test_data, initial_investment)
        
        #make sure that epsilon is not 1.
        #no need to run multiple episodes if epsilon = 0, it is deterministic
        
        agent.epsilon = 0.01
        
        #load trained weights
        agent.load(f'{models_folder}/linear.npz')
        
    #play the game n number of times.
    for e in range(num_episodes):
        t0 = datetime.now()
        val = play_one_episode(agent, env, mode, scaler)
        dt = datetime.now() - t0
        print(f"episode: {e + 1}/{num_episodes}, epsiode_end_value: {val}, duration: {dt}")
        portfolio_value.append(val) #append episode end portfolio

    #save the weights when we are done.
    if mode == "train":
        #save the DQN.
        agent.save(f'{models_folder}/linear.npz')
        
        #save the scaler
        with open(f'{models_folder}/scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)
            
        #plot lessons
        plt.plot(agent.model.losses)
        plt.show()
        
    #saving portfolio value for each episode
    np.save(f'{rewards_folder}/{mode}.npy', portfolio_value)

In [13]:
if __name__ == "__main__":
    #main("train")
    main("test")

episode: 1/2000, epsiode_end_value: 28785.180000000568, duration: 0:00:00.222178
episode: 2/2000, epsiode_end_value: 37120.790000000175, duration: 0:00:00.190606
episode: 3/2000, epsiode_end_value: 27298.40500000099, duration: 0:00:00.169778
episode: 4/2000, epsiode_end_value: 37636.16999999876, duration: 0:00:00.226520
episode: 5/2000, epsiode_end_value: 24884.079999999376, duration: 0:00:00.242911
episode: 6/2000, epsiode_end_value: 22882.40999999978, duration: 0:00:00.193022
episode: 7/2000, epsiode_end_value: 18784.309999999554, duration: 0:00:00.209017
episode: 8/2000, epsiode_end_value: 15420.290000000092, duration: 0:00:00.167427
episode: 9/2000, epsiode_end_value: 29054.680000000193, duration: 0:00:00.179450
episode: 10/2000, epsiode_end_value: 29924.204999999984, duration: 0:00:00.184842
episode: 11/2000, epsiode_end_value: 20619.02499999896, duration: 0:00:00.193372
episode: 12/2000, epsiode_end_value: 28729.45000000058, duration: 0:00:00.166584
episode: 13/2000, epsiode_end_