In [143]:
#!pip install torch pandas sklearn yfinance boto3


In [147]:
import numpy as np
import json, boto3, datetime, os
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta

import torch
import torch.nn as nn
from torch.nn import Module, GRU, Linear, LSTM
from torch.utils.data import DataLoader, TensorDataset

now = datetime.now() + timedelta(hours=8)
dt_string_file = now.strftime("%d_%m_%y")
dt_string_yf = now.strftime("%Y-%m-%d")


In [148]:
class Config_QQQ:

    etf = "QQQ"

    dataset_type = "qqq"

    feature_columns = list(range(1,7))
    label_columns = [2,3]
    label_in_feature_index = (lambda x,y: [x.index(i) for i in y])(feature_columns, label_columns)

    predict_day = 1

    model_type = "gru"
    train_data_rate = 0.95

    do_train = True
    do_predict = True
    time_step = 20
    valid_data_rate = 0.15
    random_seed = 42
    shuffle_train_data = True

    input_size = len(feature_columns)
    output_size = len(label_columns)
    hidden_size = 128       
    layers = 2
    dropout_rate = 0.2
    batch_size = 64
    learning_rate = 0.001
    epoch = 100
    do_continue_train = False
    do_train_visualized = False
    patience = 5
    
    model_save_path = "models/"
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)

    data_save_path = "dataset/"
    if not os.path.exists(data_save_path):
        os.makedirs(data_save_path)

class Data_QQQ:
    def __init__(self, config):
        self.config = config
        self.data, self.data_column_name = self.read_data()

        self.data_num = self.data.shape[0]
        self.train_num = int(self.data_num * self.config.train_data_rate)

        # Normalization
        self.mean = np.mean(self.data, axis=0) 
        self.std = np.std(self.data, axis=0)
        self.norm_data = (self.data - self.mean)/self.std 

    def read_data(self):
        df = yf.download(self.config.etf, end=dt_string_yf)
        df.to_csv(self.config.data_save_path+self.config.etf+".csv")
        df = pd.read_csv(self.config.data_save_path+self.config.etf+".csv", usecols=self.config.feature_columns)
        return df.values, df.columns.tolist()

    def get_train_and_valid_data(self):
        feature_data = self.norm_data[:self.train_num]
        label_data = self.norm_data[self.config.predict_day : self.config.predict_day + self.train_num,
                                    self.config.label_in_feature_index]

        train_x = [feature_data[start_index + i*self.config.time_step : start_index + (i+1)*self.config.time_step]
                        for start_index in range(self.config.time_step)
                        for i in range((self.train_num - start_index) // self.config.time_step)]
        train_y = [label_data[start_index + i*self.config.time_step : start_index + (i+1)*self.config.time_step]
                    for start_index in range(self.config.time_step)
                    for i in range((self.train_num - start_index) // self.config.time_step)]

        train_x, train_y = np.array(train_x), np.array(train_y)

        train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, 
        test_size=self.config.valid_data_rate, random_state=self.config.random_seed, 
        shuffle=self.config.shuffle_train_data)
        
        return train_x, valid_x, train_y, valid_y

    def get_test_data(self):
        feature_data = self.norm_data[self.train_num:]
        sample_interval = min(feature_data.shape[0], self.config.time_step)
        self.start_num_in_test = feature_data.shape[0] % sample_interval
        time_step_size = feature_data.shape[0] // sample_interval

        test_x = [feature_data[self.start_num_in_test+i*sample_interval : self.start_num_in_test+(i+1)*sample_interval]
                   for i in range(time_step_size)]

        return np.array(test_x)

In [149]:
class Config_top10:

    etf = "QQQ"
    tickers = ["AAPL", "MSFT", "AMZN", "TSLA", "GOOG", "FB", "GOOGL", "NVDA", "PYPL", "ADBE"]

    dataset_type = "top10"    

    feature_columns = list(range(0,22))
    label_columns = [20,21]
    label_in_feature_index = (lambda x,y: [x.index(i) for i in y])(feature_columns, label_columns)

    predict_day = 1

    model_type = "gru"
    train_data_rate = 0.95

    do_train = True
    do_predict = True
    time_step = 20
    valid_data_rate = 0.15
    random_seed = 42
    shuffle_train_data = True

    input_size = len(feature_columns)
    output_size = len(label_columns)
    hidden_size = 128       
    layers = 2
    dropout_rate = 0.2
    batch_size = 64
    learning_rate = 0.001
    epoch = 100
    do_continue_train = False
    do_train_visualized = False
    patience = 5

    model_save_path = "models/"
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)

    data_save_path = "dataset/"
    if not os.path.exists(data_save_path):
        os.makedirs(data_save_path)

class Data_top10:
    def __init__(self, config):
        self.config = config
        self.data, self.data_column_name = self.read_data()

        self.data_num = self.data.shape[0]
        self.train_num = int(self.data_num * self.config.train_data_rate)

        # Normalization
        self.mean = np.mean(self.data, axis=0) 
        self.std = np.std(self.data, axis=0)
        self.norm_data = (self.data - self.mean)/self.std 

    def read_data(self):
        tickers = self.config.tickers
        tickers.append(self.config.etf)
        ticker_list = yf.Tickers(tickers)
        df = {}
        for ticker in tickers:
            if ticker != "QQQ":
                df[ticker+"_High"] = ticker_list.tickers[ticker].history(start="2016-01-01", end=dt_string_yf)['High']
                df[ticker+"_Low"] = ticker_list.tickers[ticker].history(start="2016-01-01", end=dt_string_yf)['Low']
            else:
                df["High"] = ticker_list.tickers[ticker].history(start="2016-01-01", end=dt_string_yf)['High']
                df["Low"] = ticker_list.tickers[ticker].history(start="2016-01-01", end=dt_string_yf)['Low']
        df = pd.DataFrame(df)
        df.to_csv(self.config.data_save_path+"top10"+".csv")
        return df.values, df.columns.tolist()

    def get_train_and_valid_data(self):
        feature_data = self.norm_data[:self.train_num]
        label_data = self.norm_data[self.config.predict_day : self.config.predict_day + self.train_num,
                                    self.config.label_in_feature_index]

        train_x = [feature_data[start_index + i*self.config.time_step : start_index + (i+1)*self.config.time_step]
                        for start_index in range(self.config.time_step)
                        for i in range((self.train_num - start_index) // self.config.time_step)]
        train_y = [label_data[start_index + i*self.config.time_step : start_index + (i+1)*self.config.time_step]
                    for start_index in range(self.config.time_step)
                    for i in range((self.train_num - start_index) // self.config.time_step)]

        train_x, train_y = np.array(train_x), np.array(train_y)

        train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, 
        test_size=self.config.valid_data_rate, random_state=self.config.random_seed, 
        shuffle=self.config.shuffle_train_data)
        
        return train_x, valid_x, train_y, valid_y

    def get_test_data(self):
        feature_data = self.norm_data[self.train_num:]
        sample_interval = min(feature_data.shape[0], self.config.time_step)
        self.start_num_in_test = feature_data.shape[0] % sample_interval
        time_step_size = feature_data.shape[0] // sample_interval

        test_x = [feature_data[self.start_num_in_test+i*sample_interval : self.start_num_in_test+(i+1)*sample_interval]
                   for i in range(time_step_size)]

        return np.array(test_x)

In [150]:
class LSTM(Module):
    def __init__(self, input_size, hidden_size, layers, output_size, dropout_rate):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                         num_layers=layers, batch_first=True, dropout=dropout_rate)
        self.linear = Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden=None):
        lstm_out, hidden = self.lstm(x, hidden)
        linear_out = self.linear(lstm_out)
        return linear_out, hidden

def lstm_train(config, train_and_valid_data):
    
    train_X, train_Y, valid_X, valid_Y = train_and_valid_data
    
    train_X, train_Y = torch.from_numpy(train_X).float(), torch.from_numpy(train_Y).float() # To Tensor
    train_loader = DataLoader(TensorDataset(train_X, train_Y), batch_size=config.batch_size, drop_last=True)
    
    valid_X, valid_Y = torch.from_numpy(valid_X).float(), torch.from_numpy(valid_Y).float() #To Tensor
    valid_loader = DataLoader(TensorDataset(valid_X, valid_Y), batch_size=config.batch_size, drop_last=True)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = LSTM(config.input_size, config.hidden_size, config.layers, config.output_size, config.dropout_rate).to(device)
    
    optimizer =  torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = torch.nn.MSELoss(reduction='mean')
    
    valid_loss_min = float("inf")
    bad_epoch = 0
    global_step = 0
    
    for epoch in range(config.epoch):
        print("Epoch {}/{}".format(epoch, config.epoch))
        model.train()
        train_loss_array = []
        hidden_train = None
        for i, _data in enumerate(train_loader):
            _train_X, _train_Y = _data[0].to(device),_data[1].to(device)
            optimizer.zero_grad()               
            pred_Y, hidden_train = model(_train_X, hidden_train)    
            hidden_train = None
            #if not config.do_continue_train:
            #    hidden_train = None
            #else:
            #    h_0, c_0 = hidden_train
            #    h_0.detach_(), c_0.detach_()    
            #    hidden_train = (h_0, c_0)
            loss = criterion(pred_Y, _train_Y)  
            loss.backward()                     
            optimizer.step()                    
            train_loss_array.append(loss.item())
            global_step += 1
            if config.do_train_visualized and global_step % 100 == 0: 
                vis.line(X=np.array([global_step]), Y=np.array([loss.item()]), win='Train_Loss',
                         update='append' if global_step > 0 else None, name='Train', opts=dict(showlegend=True))

        model.eval()                    
        valid_loss_array = []
        hidden_valid = None
        for _valid_X, _valid_Y in valid_loader:
            _valid_X, _valid_Y = _valid_X.to(device), _valid_Y.to(device)
            pred_Y, hidden_valid = model(_valid_X, hidden_valid)
            #if not config.do_continue_train: hidden_valid = None
            loss = criterion(pred_Y, _valid_Y)  
            valid_loss_array.append(loss.item())

        train_loss_cur = np.mean(train_loss_array)
        valid_loss_cur = np.mean(valid_loss_array)
        print("The train loss is {:.6f}. ".format(train_loss_cur) +
              "The valid loss is {:.6f}.".format(valid_loss_cur))
        if config.do_train_visualized:      
            vis.line(X=np.array([epoch]), Y=np.array([train_loss_cur]), win='Epoch_Loss',
                     update='append' if epoch > 0 else None, name='Train', opts=dict(showlegend=True))
            vis.line(X=np.array([epoch]), Y=np.array([valid_loss_cur]), win='Epoch_Loss',
                     update='append' if epoch > 0 else None, name='Eval', opts=dict(showlegend=True))

        if valid_loss_cur < valid_loss_min:
            valid_loss_min = valid_loss_cur
            bad_epoch = 0
            torch.save(model.state_dict(), config.model_save_path + config.model_type + "_" + config.dataset_type + "_model_"+ dt_string_file+ ".pth")
        else:
            bad_epoch += 1
            if bad_epoch >= config.patience:    # Stop training if bad epoch
                print(" The training stops early in epoch {}".format(epoch))
                break

def lstm_predict(config, test_X):

    test_X = torch.from_numpy(test_X).float()
    test_set = TensorDataset(test_X)
    test_loader = DataLoader(test_set, batch_size=1)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = LSTM(config.input_size, config.hidden_size, config.layers, config.output_size, config.dropout_rate).to(device)
    model.load_state_dict(torch.load(config.model_save_path + config.model_type + "_" + config.dataset_type + "_model_" + dt_string_file + ".pth"))

    result = torch.Tensor().to(device)

    model.eval()
    hidden_predict = None
    for _data in test_loader:
        data_X = _data[0].to(device)
        pred_X, hidden_predict = model(data_X, hidden_predict)
        # if not config.do_continue_train: hidden_predict = None 
        cur_pred = torch.squeeze(pred_X, dim=0)
        result = torch.cat((result, cur_pred), dim=0)

    return result.detach().cpu().numpy()


In [151]:
class GRU(Module):
    def __init__(self, input_size, hidden_size, layers, output_size, dropout_rate):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.layers = layers
        
        self.gru = nn.GRU(input_size, hidden_size, layers, batch_first=True, dropout=dropout_rate)
        self.fc = Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        #h0 = torch.zeros(self.layers, x.size(0), self.hidden_size).requires_grad_()
        gru_out, hidden = self.gru(x, hidden)
        out = self.fc(gru_out) 
        return out, hidden

def gru_train(config, train_and_valid_data):
    train_X, train_Y, valid_X, valid_Y = train_and_valid_data
    
    train_X, train_Y = torch.from_numpy(train_X).float(), torch.from_numpy(train_Y).float() # To Tensor
    train_loader = DataLoader(TensorDataset(train_X, train_Y), batch_size=config.batch_size, drop_last=True)
    
    valid_X, valid_Y = torch.from_numpy(valid_X).float(), torch.from_numpy(valid_Y).float() #To Tensor
    valid_loader = DataLoader(TensorDataset(valid_X, valid_Y), batch_size=config.batch_size, drop_last=True)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = GRU(config.input_size, config.hidden_size, config.layers, config.output_size, config.dropout_rate).to(device)
    
    optimizer =  torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = torch.nn.MSELoss()
    
    valid_loss_min = float("inf")
    bad_epoch = 0
    global_step = 0
    
    for epoch in range(config.epoch):
        print("Epoch {}/{}".format(epoch, config.epoch))
        model.train()
        train_loss_array = []
        hidden_train = None
        for i, _data in enumerate(train_loader):
            _train_X, _train_Y = _data[0].to(device),_data[1].to(device)
            optimizer.zero_grad()               
            pred_Y, hidden_train = model(_train_X, hidden_train)    
            hidden_train = None
            #if not config.do_continue_train:
            #    hidden_train = None
            #else:
            #    h_0, c_0 = hidden_train
            #    h_0.detach_(), c_0.detach_()    
            #    hidden_train = (h_0, c_0)
            loss = criterion(pred_Y, _train_Y)  
            loss.backward()                     
            optimizer.step()                    
            train_loss_array.append(loss.item())
            global_step += 1
            if config.do_train_visualized and global_step % 100 == 0: 
                vis.line(X=np.array([global_step]), Y=np.array([loss.item()]), win='Train_Loss',
                         update='append' if global_step > 0 else None, name='Train', opts=dict(showlegend=True))

        model.eval()
        valid_loss_array = []
        hidden_valid = None
        for _valid_X, _valid_Y in valid_loader:
            _valid_X, _valid_Y = _valid_X.to(device), _valid_Y.to(device)
            pred_Y, hidden_valid = model(_valid_X, hidden_valid)
            #if not config.do_continue_train: hidden_valid = None
            loss = criterion(pred_Y, _valid_Y)  
            valid_loss_array.append(loss.item())

        train_loss_cur = np.mean(train_loss_array)
        valid_loss_cur = np.mean(valid_loss_array)
        print("The train loss is {:.6f}. ".format(train_loss_cur) +
              "The valid loss is {:.6f}.".format(valid_loss_cur))
        if config.do_train_visualized:      
            vis.line(X=np.array([epoch]), Y=np.array([train_loss_cur]), win='Epoch_Loss',
                     update='append' if epoch > 0 else None, name='Train', opts=dict(showlegend=True))
            vis.line(X=np.array([epoch]), Y=np.array([valid_loss_cur]), win='Epoch_Loss',
                     update='append' if epoch > 0 else None, name='Eval', opts=dict(showlegend=True))

        if valid_loss_cur < valid_loss_min:
            valid_loss_min = valid_loss_cur
            bad_epoch = 0
            torch.save(model.state_dict(), config.model_save_path + config.model_type + "_" + config.dataset_type + "_model_" + dt_string_file + ".pth")
        else:
            bad_epoch += 1
            if bad_epoch >= config.patience:    # Stop training if bad epoch
                print(" The training stops early in epoch {}".format(epoch))
                break

def gru_predict(config, test_X):

    test_X = torch.from_numpy(test_X).float()
    test_set = TensorDataset(test_X)
    test_loader = DataLoader(test_set, batch_size=1)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = GRU(config.input_size, config.hidden_size, config.layers, config.output_size, config.dropout_rate).to(device)
    model.load_state_dict(torch.load(config.model_save_path + config.model_type + "_" + config.dataset_type + "_model_" + dt_string_file + ".pth"))

    result = torch.Tensor().to(device)

    model.eval()
    hidden_predict = None
    for _data in test_loader:
        data_X = _data[0].to(device)
        pred_X, hidden_predict = model(data_X, hidden_predict)
        # if not config.do_continue_train: hidden_predict = None 
        cur_pred = torch.squeeze(pred_X, dim=0)
        result = torch.cat((result, cur_pred), dim=0)

    return result.detach().cpu().numpy()


In [152]:
def main(config):

    config_dict = {}
    for key in dir(config):
        if not key.startswith("_"):
            config_dict[key] = getattr(config, key)
            
    if config.dataset_type == "qqq":
        dataset = Data_QQQ(config)
    elif config.dataset_type == "top10":
        dataset = Data_top10(config)
        
    if config.do_train:
        train_X, valid_X, train_Y, valid_Y = dataset.get_train_and_valid_data()
        if config.model_type == "gru":
            gru_train(config, [train_X, train_Y, valid_X, valid_Y])
        elif config.model_type == "lstm":
            lstm_train(config, [train_X, train_Y, valid_X, valid_Y])

    if config.do_predict:
        test_X = dataset.get_test_data()
        if config.model_type == "gru":
            pred_result = gru_predict(config, test_X)
        elif config.model_type == "lstm":
            pred_result = lstm_predict(config, test_X)
    
    label_data = dataset.data[dataset.train_num + dataset.start_num_in_test : ,
                                            config.label_in_feature_index]
    predict_data = pred_result * dataset.std[config.label_in_feature_index] + \
                dataset.mean[config.label_in_feature_index]

    label_name = [dataset.data_column_name[i] for i in config.label_in_feature_index]
    label_column_num = len(config.label_columns)

    loss = np.mean((label_data[config.predict_day:] - predict_data[:-config.predict_day] ) ** 2, axis=0)
    loss_norm = loss/(dataset.std[config.label_in_feature_index] ** 2)
    print("The mean squared error of stock {} is ".format(label_name) + str(loss_norm))

    label_X = range(dataset.data_num - dataset.train_num - dataset.start_num_in_test)
    predict_X = [ x + config.predict_day for x in label_X]

    for i in range(label_column_num):
        try:
            config_dict[label_name[i]] = int(np.squeeze(predict_data[-config.predict_day:, i]))
        except:
            a = list(np.squeeze(predict_data[-config.predict_day:, i]))
            a.reverse()
            config_dict[label_name[i]] = a
        print("The predicted stock {} for the next {} day(s) is: ".format(label_name[i],
        config.predict_day) + str(np.squeeze(predict_data[-config.predict_day:, i])))
        
    path_log = "log/" + config.dataset_type + "_" + config.model_type+"_" + dt_string_file + ".json"
    path_model = "models/" + config.model_type+ "_" + config.dataset_type + "_model_" + dt_string_file + ".pth"
    path_dataset = "dataset/"
    with open(path_log, "w") as outfile:
        json.dump(config_dict, outfile, indent = 1)
    
    bucket = "stockpricestorage"
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(path_log, bucket, path_log)
    s3.meta.client.upload_file(path_model, bucket, path_model)
    for root,dirs,files in os.walk(path_dataset):
        for file in files:
            s3.meta.client.upload_file(os.path.join(root,file),bucket, path_dataset+file)
    print("successful")

In [153]:
config_QQQ = Config_QQQ()
config_top10 = Config_top10()

config_top10.predict_day = 1
config_top10.do_train = True
config_top10.model_type = "lstm"
main(config_top10)

config_top10.do_train = False
config_top10.predict_day = 5
main(config_top10)

config_top10.predict_day = 1
config_top10.do_train = True
config_top10.model_type = "gru"
main(config_top10)

config_top10.do_train = False
config_top10.predict_day = 5
main(config_top10)

config_QQQ.predict_day = 1
config_QQQ.do_train = True
config_QQQ.model_type = "lstm"
main(config_QQQ)

config_QQQ.do_train = False
config_QQQ.predict_day = 5
main(config_QQQ)

config_QQQ.predict_day = 1
config_QQQ.do_train = True
config_QQQ.model_type = "gru"
main(config_QQQ)

config_QQQ.do_train = False
config_QQQ.predict_day = 5
main(config_QQQ)

Epoch 0/100
The train loss is 0.303939. The valid loss is 0.142448.
Epoch 1/100
The train loss is 0.083956. The valid loss is 0.086421.
Epoch 2/100
The train loss is 0.038653. The valid loss is 0.040694.
Epoch 3/100
The train loss is 0.024191. The valid loss is 0.031182.
Epoch 4/100
The train loss is 0.017865. The valid loss is 0.023587.
Epoch 5/100
The train loss is 0.013736. The valid loss is 0.019103.
Epoch 6/100
The train loss is 0.010458. The valid loss is 0.014847.
Epoch 7/100
The train loss is 0.007964. The valid loss is 0.011869.
Epoch 8/100
The train loss is 0.006091. The valid loss is 0.009605.
Epoch 9/100
The train loss is 0.004777. The valid loss is 0.007919.
Epoch 10/100
The train loss is 0.003948. The valid loss is 0.007166.
Epoch 11/100
The train loss is 0.003307. The valid loss is 0.006422.
Epoch 12/100
The train loss is 0.002992. The valid loss is 0.006151.
Epoch 13/100
The train loss is 0.002738. The valid loss is 0.005878.
Epoch 14/100
The train loss is 0.002552. The