# Time-series Classification & Prediction w/ LSTM models

NOTE: Some cells will not run as-is because I have removed snippets of private code that are not shared here. If you would like to see the full notebook, you are more than welcome to ask me for it.


Given data in timesteps, predict a future timestep. 

#### Data hyperparameters:
- timestep_length: the time span that each data point aggregates (e.g. one minute, one hour or one day of quotes)
- seq_len: # of timesteps observed before making a prediction
##### Dependent parameters (changing the hyperparameters will change these parameters accordingly):
- target/prediction_timestep: (how far into the future you want to predict) # of timesteps from the last observed timestep
- batch_size: # of sequences

#### LSTM hyperparameters:
- input_size: # of features in each data point
- hidden_size: # of nodes in hidden layer
- num_layers: # of stacked LSTM layers

In [1]:
import copy
import datetime
import math

import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
# import indicators
'''
import csv
import time
from operator import itemgetter
'''

# Run on the currently selected CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device(torch.cuda.current_device())
else:
    device = torch.device('cpu')
print(device)

cuda:0


## LSTM

In [2]:
class LstmModel(nn.Module):
    """Many-to-one LSTM classifier: reads a whole sequence, classifies from the last timestep.

    Args:
        hidden_size: number of units in each LSTM layer.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout probability used between stacked LSTM layers and
            on the last LSTM output before the linear layer.
        input_size: number of features per timestep. When omitted, the
            module-level variable ``input_size`` (set in the preprocessing
            cell) is used, which is what the original implementation relied on.

    Raises:
        NameError: if ``input_size`` is neither passed nor defined globally.
    """

    def __init__(self, hidden_size, num_layers=1, dropout_rate=.1, input_size=None):
        super().__init__()
        if input_size is None:
            # Backward compatibility: fall back to the notebook-level global.
            try:
                input_size = globals()['input_size']
            except KeyError:
                raise NameError(
                    "input_size was not passed and no global 'input_size' is defined") from None
        # Hyperparams
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.num_classes = 2  # Binary Classification
        # Model
        # nn.LSTM only applies dropout *between* layers; with a single layer it has
        # no effect and only triggers a warning, so it is disabled in that case.
        inter_layer_dropout = self.dropout_rate if self.num_layers > 1 else 0.0
        self.rnn = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size,
                           num_layers=self.num_layers, dropout=inter_layer_dropout)
        self.dropout = nn.Dropout(self.dropout_rate)
        self.fc_out = nn.Linear(self.hidden_size, self.num_classes)  # Output layer (FC)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inputs, hidden_state):
        """Classify a batch of sequences.

        Args:
            inputs: tensor laid out as [seq_len, batch_size, input_size].
            hidden_state: initial ``(h_0, c_0)`` for the LSTM, or ``None`` to
                start from zeros.

        Returns:
            ``(probabilities, hidden_state)`` where ``probabilities`` has shape
            [batch_size, num_classes] and each row sums to 1.
        """
        outputs, hidden_state = self.rnn(inputs, hidden_state)
        # Take last output only (many-to-one): [batch_size, hidden_size]
        last_output = self.dropout(outputs[-1])
        logits = self.fc_out(last_output)
        # NOTE(review): nn.CrossEntropyLoss expects raw logits, so training on these
        # probabilities applies softmax twice. Kept because the evaluation code in
        # this notebook reads the outputs as class probabilities.
        return self.softmax(logits), hidden_state

## LSTM-FCN

In [3]:
class LstmFCN(nn.Module):
    """LSTM-FCN classifier: an LSTM branch and a 1-D convolutional (FCN) branch in parallel.

    Both branches read the same input sequence. The last LSTM output and the
    globally average-pooled FCN features are concatenated and fed to one
    linear layer followed by softmax.

    Args:
        hidden_size: number of units in each LSTM layer.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout probability applied to the LSTM branch output.
        dimension_shuffle: if True, the LSTM reads the sequence with the time
            and feature axes swapped (each LSTM step sees one feature across
            all timesteps).
        input_size: number of features per timestep. When omitted, the
            module-level variable ``input_size`` is used (original behaviour).
        sequence_len: number of timesteps; only needed when
            ``dimension_shuffle`` is True. When omitted, the module-level
            variable ``sequence_len`` is used (original behaviour).

    Raises:
        NameError: if a required size is neither passed nor defined globally.
    """

    @staticmethod
    def _notebook_global(name):
        """Return the module-level variable ``name`` or raise a descriptive NameError."""
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"{name} was not passed and no global '{name}' is defined") from None

    def __init__(self, hidden_size=128, num_layers=1, dropout_rate=.8, dimension_shuffle=False,
                 input_size=None, sequence_len=None):
        super().__init__()
        if input_size is None:
            input_size = self._notebook_global('input_size')
        if dimension_shuffle and sequence_len is None:
            sequence_len = self._notebook_global('sequence_len')
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.dimension_shuffle = dimension_shuffle
        self.out_features = 128
        self.kernel_sizes = [8, 5, 3]
        self.num_classes = 2
        # Model
        # With dimension shuffle each LSTM step receives one feature over all timesteps.
        lstm_input_size = sequence_len if dimension_shuffle else input_size
        self.rnn = nn.LSTM(input_size=lstm_input_size, hidden_size=self.hidden_size,
                           num_layers=self.num_layers)
        self.dropout = nn.Dropout(self.dropout_rate)
        # FCN (Conv)
        self.conv1 = nn.Conv1d(in_channels=input_size, out_channels=128, kernel_size=self.kernel_sizes[0])
        self.bn1 = nn.BatchNorm1d(128)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(128, 256, kernel_size=self.kernel_sizes[1])
        self.bn2 = nn.BatchNorm1d(256)
        self.conv3 = nn.Conv1d(256, self.out_features, kernel_size=self.kernel_sizes[2])
        self.bn3 = nn.BatchNorm1d(self.out_features)
        # Concatenate LSTM+FCN: one feature vector of out_features + hidden_size per sample
        self.calc_concat_size = self.out_features + self.hidden_size
        # evaluate all returned features from both nets, & produce a label/classification
        self.fc_out = nn.Linear(self.calc_concat_size, self.num_classes)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inputs, hidden_state):
        """Classify a batch of sequences.

        Args:
            inputs: tensor laid out as [seq_len, batch_size, input_size].
            hidden_state: initial ``(h_0, c_0)`` for the LSTM, or ``None``.

        Returns:
            ``(probabilities, hidden_state)`` where ``probabilities`` has shape
            [batch_size, num_classes]. ``hidden_state`` is returned only for
            interface compatibility with ``LstmModel``.

        Raises:
            ValueError: if the sequence is too short for the three unpadded
                convolutions.
        """
        seq_len = int(inputs.size(0))  # timesteps
        min_seq_len = sum(self.kernel_sizes) - len(self.kernel_sizes) + 1
        if seq_len < min_seq_len:
            raise ValueError(
                f"sequence length {seq_len} is shorter than the minimum {min_seq_len} "
                f"required by kernel sizes {self.kernel_sizes}")

        # inputs -> LSTM
        # LSTM takes [seq_len, batch_size, input_size] (if shuffle: [input_size, batch_size, seq_len]).
        # permute swaps the axes; reshape would only reinterpret memory and scramble the data.
        if self.dimension_shuffle:
            lstm_inputs = inputs.permute(2, 1, 0).contiguous()
        else:
            lstm_inputs = inputs
        lstm_outputs, hidden_state = self.rnn(lstm_inputs, hidden_state)
        # Many-to-one: keep the last output only -> [batch_size, hidden_size]
        lstm_features = self.dropout(lstm_outputs[-1])

        # inputs -> FCN
        # Conv1d takes [batch_size, channels=input_size, seq_len]
        fcn_inputs = inputs.permute(1, 2, 0).contiguous()
        outputs = self.relu(self.bn1(self.conv1(fcn_inputs)))
        outputs = self.relu(self.bn2(self.conv2(outputs)))
        outputs = self.relu(self.bn3(self.conv3(outputs)))
        # Global Average Pooling (helps minimize overfitting): average over the whole
        # remaining time axis, whatever its length -> [batch_size, out_features]
        fcn_features = outputs.mean(dim=2)

        # Concatenate LSTM+FCN features for each sample -> [batch_size, hidden_size + out_features]
        lstm_fcn_outputs = torch.cat([lstm_features, fcn_features], dim=1)

        # Linear + softmax -> [batch_size, num_classes]
        lstm_fcn_outputs = self.softmax(self.fc_out(lstm_fcn_outputs))

        return lstm_fcn_outputs, hidden_state  # hidden_state only returned for compatibility w other model

## Get Raw Data

- Cryptocurrency quotes [timestep, price, volume], from the CryptoCompare API

In [None]:
# Download historical quotes from CryptoCompare and reshape every quote into the
# column layout used by the rest of the notebook:
#   [PRIVATE_CODE0, PRIVATE_CODE1, open, high, low, vol_from, vol_to, (indicators...), close]
period = 128  # hyperparam
api_crypto = 'https://min-api.cryptocompare.com/data/v2/'
timestep_length = 'minute'  # 'hour', 'day'
coin = 'BTC'
url_crypto = api_crypto + 'histo' + timestep_length + '?fsym=' + coin + '&tsym=USD' + '&limit=2000'

''' Get Data (in increments of 2000) '''
raw_field_headers = ['time', 'high', 'low', 'open', 'volumefrom', 'volumeto', 'close']  # , 'conf', 'cont']
# Copy, so that re-ordering field_headers below does not also rewrite raw_field_headers.
field_headers = list(raw_field_headers)  # define desired fields & order
crypto_quotes = []

queries = 5  # Set how many increments  # 2,000 * 5 = 10,000
timestamp = None  # needed to call additional queries
url = url_crypto
for _ in range(queries):
    if timestamp:
        url = url_crypto + '&toTs=' + str(timestamp)
    # A timeout keeps the cell from hanging forever; raise_for_status surfaces HTTP
    # errors here instead of as a confusing KeyError on the JSON below.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    raw_quotes = response.json()['Data']['Data']  # json
    if not raw_quotes:
        break  # no older data available
    timestamp = raw_quotes[0]['time']  # first quote of the page; used as 'toTs' for the next request

    query = [[quote[field] for field in field_headers] for quote in raw_quotes]
    query.reverse()  # newest first, so each older page can simply be appended
    # The newest quote of this page repeats the oldest quote already stored.
    if crypto_quotes and query[0] == crypto_quotes[-1]:
        del query[0]
    crypto_quotes.extend(query)
    del raw_quotes
print(crypto_quotes[:2], crypto_quotes[-2:])

''' Modify data '''
# chronological order
crypto_quotes.reverse()

# TODO
''' ADD PERSONAL 'indicators.py' file to GitHub
# INDICATORS
crypto_quotes = indicators.atr(crypto_quotes, period=period, closeCol=6, lowCol=2, highCol=1)  # period hyperparam
crypto_quotes = indicators.rsi(crypto_quotes, period=period, close=6)
crypto_quotes = indicators.aggregate(crypto_quotes, period=period, lowCol=2, highCol=1)
field_headers.append('atr')
field_headers.append('rsi')
field_headers.append('agg_high')
field_headers.append('agg_low')
'''

num_quotes = len(crypto_quotes)
for i, quote in enumerate(crypto_quotes):
    # print datetime timestamp of the first & last two quotes (to verify order)
    if i < 2 or i >= num_quotes - 2:
        print([str(datetime.datetime.fromtimestamp(quote[0]))] + quote[1:])
    # Move close_price to end of list
    quote.append(quote.pop(6))
    # Replace timestamp with open_price (and drop open from its original spot)
    open_price = quote.pop(3)
    quote[0] = open_price
    ''' Add (2) spots for features that will be added later '''  # observed classes 0 & 1
    quote[:0] = [0, 0]
    # TODO: Add more

# modify corresponding field headers (same moves as applied to every quote above)
field_headers.append(field_headers.pop(6))  # 'close' to the end
open_header = field_headers.pop(3)
field_headers[0] = open_header  # 'open' replaces 'time'
field_headers.insert(0, 'PRIVATE_CODE0')  # private repo: feel free to ask me to see it
field_headers.insert(1, 'PRIVATE_CODE1')
field_headers[5] = 'vol_from'  # shorthand 'volumefrom'
field_headers[6] = 'vol_to'

# delete first quotes w/ missing indicator values
del crypto_quotes[:period+1]
mod = len(crypto_quotes) % 10  # make divisible by 10
if mod > 0:
    del crypto_quotes[:mod]

print(crypto_quotes[-1])
# Lookup tables between column index and column name
features_i = dict(enumerate(field_headers))  # int: 'string'
features = {name: index for index, name in enumerate(field_headers)}  # 'string': int
print(features)
print("# of quotes:", len(crypto_quotes))


### Verify data (Plot/print)

In [None]:
# Visual sanity check of the downloaded data: close price, then pairs of related features.
quotes = np.asarray(crypto_quotes)
# full dataset: ~10,000 (~= 7 days)  # 1 day = 1440
slice_from = -9000  # -9000
slice_to = -1  # -1
plot_window = quotes[slice_from:slice_to]


def _plot_feature_pair(data, top, bottom, title):
    """Plot two named feature columns of `data` in stacked subplots.

    Pairs whose columns are not present in `features` (e.g. the indicator
    columns when the indicator step is disabled) are skipped with a message
    instead of raising KeyError.
    """
    missing = [name for name in (top, bottom) if name not in features]
    if missing:
        print("skipping plot:", title, "- missing feature(s):", missing)
        return
    plt.subplot(2, 1, 1)
    plt.plot(data[:, features[top]])
    plt.title(title)
    plt.subplot(2, 1, 2)
    plt.plot(data[:, features[bottom]])
    plt.show()


plt.plot(plot_window[:, -1])  # close_price is always the last column
plt.title('close_price')
plt.show()

_plot_feature_pair(plot_window, 'high', 'low', 'high, low')
_plot_feature_pair(plot_window, 'vol_from', 'vol_to', 'vol_from, vol_to')

# Both volume series on a common 0-1 scale, overlaid
scaler = MinMaxScaler()
vols = scaler.fit_transform(plot_window[:, [features['vol_from'], features['vol_to']]])
plt.plot(vols[:, 0])
plt.plot(vols[:, 1])
plt.show()

_plot_feature_pair(plot_window, 'atr', 'rsi', 'atr, rsi')
_plot_feature_pair(plot_window, 'agg_high', 'agg_low', 'Aggregate high, low')

## Preprocess

In [None]:
# Scale every feature to a comparable range and define the data hyperparameters.
quotes = np.asarray(crypto_quotes, dtype=float)
# Independent copy: `quotes` is scaled in place below, an alias would be scaled with it.
unscaled_quotes = quotes.copy()
''' Feature Normalization/Scaling '''
# Linearly transform x to y= (x-min)/(max-min)
# quotes = signal.detrend(quotes)
# NOTE(review): the scaling statistics are computed over the whole dataset, including
# the rows that later form the test set.
scaler = MinMaxScaler()  # normalize non-price inputs (volume, indicators) 0-1, per column
# Select columns by name so that `close` is never scaled here when the indicator
# columns are absent (the original hard-coded slice 5:9 assumed they exist).
minmax_cols = [features[name] for name in ('vol_from', 'vol_to', 'atr', 'rsi') if name in features]
quotes[:, minmax_cols] = scaler.fit_transform(quotes[:, minmax_cols])
print(f"features {minmax_cols[0]}-{minmax_cols[-1]} normalized:")
print(quotes[-1])

# All price-like columns share one scale: the lowest low .. the highest high.
min_price = quotes[:, features['low']].min()
max_price = quotes[:, features['high']].max()
print("min, max prices:", min_price, max_price)
price_range = max_price - min_price
if price_range == 0:
    raise ValueError("cannot normalize prices: min and max price are equal")
price_cols = [features[name]
              for name in ('open', 'high', 'low', 'close', 'agg_high', 'agg_low')
              if name in features]
quotes[:, price_cols] = (quotes[:, price_cols] - min_price) / price_range
print("all prices normalized:")
print(quotes[-1])

# HYPERPARAMETERS
target = 8  # 2,4,8  # how far into the future to predict
percent_change = .001
sequence_len = 64
short_sequence_len = 24  # 16
average_of = 1024  # 256  # 512, 1024  # PRIVATE
input_size = quotes.shape[1]  # number of features per timestep
print("input_size:", input_size)
test_size = 1000  # Set size of dataset

In [None]:
''' sequence data
Group the data into contiguous sequences of sequence_length. (and classify, and get last/live sequence)
Input: quotes
Output: 
- sequenced_data, labels
- PRIVATE_CODE0, PRIVATE_CODE1
'''
private_quote_sequences = []  # train
private_labels = []
all_quote_sequences = []  # test
test_labels = []
populating_test = False
unscaled_quote_sequences = []
# private
private_code0 = []
private_code1 = []
private_min = []
private_max = []
close = -1  # close_price column

sequence = []
last_sequence = []
test_start = len(quotes) - test_size  # sequences ending after this index go to the test set
for t in range(sequence_len, len(quotes)):  # fixme?
    # NOTE(review): the label is read sequence_len + target steps after t, not `target`
    # steps after the last observed quote (t - 1). The author marks this as
    # modified/private code, so the index is left unchanged - confirm the intent.
    prediction_index = t + sequence_len + target
    if prediction_index >= len(quotes):
        # No future quote left to label against; every later t would fail as well.
        # (The original wrapped the loop body in a `try:` whose `except` clause was
        # missing, which is a SyntaxError.)
        break

    sequence = quotes[t - sequence_len: t]
    unscaled_sequence = unscaled_quotes[t - sequence_len: t]

    ''' Define labels/classes (price action) '''
    last_observed_timestep = sequence[-1][close]  # sequence[-1] == quotes[t-1]
    ''' Train labels '''
    prediction_timestep = quotes[prediction_index, close]
    # PRIVATE (removed by the author):
    # prediction_timeframe = quotes[ ... private code ... ]  # set timeframe=1 for prediction_timestep = quotes[t+seq_len+target, close]
    # timeframe_labels = []
    # Characterize timeframe by ... private ... at ... private ... timestep
    # for quote in prediction_timeframe:
    #     if quote < last_observed_timestep - (last_observed_timestep * percent_change):  # default: pct_change = 0
    #         ... private ...
    #     elif quote > ... private ... :
    #         ... private ...
    #     else:  # doesn't fall into criteria. will be filtered/skipped
    #         timeframe_labels.append(None)
    # Filtering
    # PRIVATE_CODE
    if prediction_timestep < last_observed_timestep:
        label = 0  # price went down
    elif prediction_timestep > last_observed_timestep:
        label = 1  # price went up
    else:
        label = -1  # unchanged: "classless"
    if t % 512 == 0:
        print(t, ":", label)

    ''' Test labels (may vary from Train) '''
    # PRIVATE_CODE
    if t >= test_start:  # Train set fully populated  # NOTE: modified/private
        populating_test = True
        if t > test_start:
            if label < 0:
                # Test set will not exclude any sequence: try to classify from the
                # direction of the quotes inside the prediction window instead.
                test_prediction_timeframe = quotes[t: (t - 1) + target, close]  # modified/private
                went_down = bool((test_prediction_timeframe < last_observed_timestep).any())
                went_up = bool((test_prediction_timeframe > last_observed_timestep).any())
                if went_down and not went_up:
                    label = 0
                elif went_up and not went_down:
                    label = 1
                else:  # moved both ways, or did not move at all
                    label = -1
            all_quote_sequences.append(sequence)
            test_labels.append(label)
            unscaled_quote_sequences.append(unscaled_sequence)
    elif label >= 0:  # Populate Train; label -1 skipped (undesirable training data)
        private_quote_sequences.append(sequence)
        private_labels.append(label)
        # PRIVATE_CODE.append(...private...)

    ''' CALCULATE ... PRIVATE ... OF DEFINED CLASSES'''
    # PRIVATE_CODE


print("num sequences:", len(private_quote_sequences))
num_class0 = private_labels.count(0)
num_class1 = private_labels.count(1)
print("num_class0:", num_class0)
print("num_class1:", num_class1)
''' class_weights if class imbalance? '''
nominator = min(num_class0, num_class1)
if nominator == 0:
    # A class is missing from the train set: the ratio is undefined (division by zero).
    print("WARNING: a class has no training samples; using equal class weights")
    class0_weight = class1_weight = 1.0
else:
    class0_weight = nominator / num_class0
    class1_weight = nominator / num_class1
class_weights = torch.Tensor([class0_weight, class1_weight]).to(device)
print("class_weights:", class_weights)

print("num sequences (test):", len(all_quote_sequences))
print('"classless":', test_labels.count(-1))
test_data = np.asarray(all_quote_sequences)
test_labels = np.asarray(test_labels)
if len(test_data):
    print(test_data[-1][-1])


### Exploratory Data Analysis (EDA)
Private code

# Main

In [None]:
''' Initialize model(s) '''
# Switches: which architecture to train, and whether to add a second (short-sequence) model
fcn = False
ensemble = False

# Learning rate is shared by both architectures (default Adam lr=1e-3)
lr_init = 1e-3
lr = lr_init

if not fcn:
    # Plain stacked LSTM  (alternatives tried: hidden 128, layers 2-4, dropout .1/.6)
    hidden_size, num_layers, dropout_rate = 256, 3, .8
    model = LstmModel(hidden_size, num_layers, dropout_rate)
else:
    # LSTM-FCN  (alternatives tried: layers 1, dropout .6)
    hidden_size, num_layers, dropout_rate = 128, 2, .8
    model = LstmFCN(hidden_size=hidden_size, num_layers=num_layers, dropout_rate=dropout_rate,
                    dimension_shuffle=False)

model = model.to(device)
loss_function = nn.CrossEntropyLoss()  # weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)  # TODO: warm restart
hidden_state = None

if ensemble:
    # Second model; it is fed only the last short_sequence_len timesteps of each sequence
    hidden_size2, num_layers2, dropout_rate2 = 256, 3, .8
    model2 = LstmModel(hidden_size2, num_layers2, dropout_rate2)
    model2 = model2.to(device)
    optimizer2 = torch.optim.AdamW(model2.parameters(), lr=lr)
    hidden_state2 = None

#### LR Range Test

In [None]:
''' LR Range Test '''
# Train throw-away copies of the model(s) for one step per sequence while the learning
# rate grows geometrically, and plot loss against learning rate.
lr_test = 1e-7
lr_test_max = .1
lr_test_growth = 1.25
# Deep copies: plain assignment would alias the real model, so the sweep (which ends at
# a very large learning rate) would wreck its weights before training starts.
model_test = copy.deepcopy(model)
model_test.train()
optimizer_test = torch.optim.Adam(model_test.parameters(), lr=lr_test)
if ensemble:
    model_test2 = copy.deepcopy(model2)
    model_test2.train()
    optimizer_test2 = torch.optim.Adam(model_test2.parameters(), lr=lr_test)

lrs = []
losses = []
losses2 = []
batch_size = 1
train_sequences = np.asarray(private_quote_sequences)  # todo: all_quote_sequences
train_labels = np.asarray(private_labels)
for i in range(len(train_sequences)):
    # [batch, seq_len, input_size] -> [seq_len, batch, input_size]; permute swaps the
    # axes, which view() only does correctly for batch_size == 1.
    sequence_batch = (torch.from_numpy(train_sequences[i:i+batch_size]).float()
                      .permute(1, 0, 2).contiguous().to(device))  # TODO: [0:i]
    label_batch = torch.from_numpy(train_labels[i:i+batch_size]).long().to(device)

    # "stateless": every batch starts from a fresh (None) hidden state
    optimizer_test.zero_grad()
    outputs, _ = model_test(sequence_batch, None)
    loss = loss_function(outputs, label_batch)
    loss.backward()
    optimizer_test.step()
    if ensemble:
        optimizer_test2.zero_grad()
        outputs2, _ = model_test2(sequence_batch[-short_sequence_len:, :, :], None)
        loss2 = loss_function(outputs2, label_batch)
        loss2.backward()
        optimizer_test2.step()

    lrs.append(lr_test)  # unrounded: rounding to 6 decimals would turn small rates into 0
    losses.append(round(loss.item(), 4))
    if ensemble:
        losses2.append(round(loss2.item(), 4))

    # Raise the learning rate in place so that the optimizer keeps its running statistics.
    lr_test *= lr_test_growth
    for param_group in optimizer_test.param_groups:
        param_group['lr'] = lr_test
    if ensemble:
        for param_group in optimizer_test2.param_groups:
            param_group['lr'] = lr_test
    if lr_test >= lr_test_max:
        break
del lr_test, model_test, optimizer_test
if ensemble:
    del model_test2, optimizer_test2

plt.plot(lrs, losses)
plt.xscale('log')
plt.xlabel('learning rate')
plt.ylabel('loss')
plt.show()
if ensemble:
    plt.plot(lrs, losses2)
    plt.xscale('log')
    plt.xlabel('learning rate')
    plt.ylabel('loss')
    plt.title('model2')
    plt.show()

## Train
w/ LOOCV Test integrated

In [None]:
''' Train '''
# Online training: one optimizer step per sequence. Once `eval_every` steps have been
# taken since the last warm restart, the model also predicts the *next* sequence before
# training on it ("LOOCV"), and those predictions drive the accuracy log and the
# learning-rate schedule.
batch_size = 1
batch_num = 0
eval_every = 40  # 40, 80
restart = False  # change to True to re-initialize the model(s) if performance suddenly falls steeply
predictions_vs_labels = []
loocv = []
# plot_sequence = []
pred_vs_lab1 = []
pred_vs_lab2 = []  # ensemble
max_preds = []
max_preds2 = []  # ensemble
ensemble_weak = []  # ensemble
batch_num_restart = 0
lr_restart = 0
train_sequences = np.asarray(private_quote_sequences)  # todo: all_quote_sequences
train_labels = np.asarray(private_labels)
for i in range(len(train_sequences) - batch_size - 1):
    # The evaluation step below switches to eval mode; switch back for every train step.
    model.train()
    if ensemble:
        model2.train()
    # [batch, seq_len, input_size] -> [seq_len, batch, input_size]; permute swaps the
    # axes, which view() only does correctly for batch_size == 1.
    sequence_batch = (torch.from_numpy(train_sequences[i:i+batch_size]).float()
                      .permute(1, 0, 2).contiguous().to(device))  # TODO: [0:i]
    label_batch = torch.from_numpy(train_labels[i:i+batch_size]).long().to(device)

    # "stateless": every batch starts from a fresh (None) hidden state
    optimizer.zero_grad()
    outputs, _ = model(sequence_batch, None)
    loss = loss_function(outputs, label_batch)
    if ensemble:
        optimizer2.zero_grad()
        outputs2, _ = model2(sequence_batch[-short_sequence_len:, :, :], None)  # ensemble
        loss2 = loss_function(outputs2, label_batch)  # ensemble
    # Backprop. Each loss owns its graph, so retain_graph is not needed.
    try:
        loss.backward()
        if ensemble:
            loss2.backward()  # ensemble
    except RuntimeError as err:  # e.g. out of memory
        print("backprop failed, ending training early:", err)
        torch.cuda.empty_cache()
        break  # end training (early)
    optimizer.step()
    if ensemble:
        optimizer2.step()  # ensemble

    batch_num += 1
    batch_num_restart += 1
    lr_restart += 1
    ''' LOOCV (Test) '''
    if batch_num_restart >= eval_every:
        # Evaluate without dropout and without building an autograd graph.
        model.eval()
        if ensemble:
            model2.eval()
        with torch.no_grad():
            loocv_sequence = torch.from_numpy(train_sequences[i+batch_size]).float().view(sequence_len, 1, input_size).to(device)
            prediction, _ = model(loocv_sequence, None)
            if ensemble:
                prediction2, _ = model2(loocv_sequence[-short_sequence_len:, :, :], None)
        label = train_labels[i+batch_size]
        del loocv_sequence

        ''' Eval '''
        # Class probabilities -> (confidence of the winning class, winning class index)
        class_probs = prediction.squeeze().tolist()
        max_prediction = max(class_probs)
        prediction = class_probs.index(max_prediction)

        selective_guard = False
        max_preds.append(max_prediction)
        avg_max = sum(max_preds) / len(max_preds)
        if not ensemble:
            # Only keep predictions that are at least as confident as the running average
            selective_guard = max_prediction >= avg_max
        else:  # ensemble: only keep predictions on which both models agree
            class_probs2 = prediction2.squeeze().tolist()
            max_prediction2 = max(class_probs2)
            prediction2 = class_probs2.index(max_prediction2)
            max_preds2.append(max_prediction2)
            avg_max2 = sum(max_preds2) / len(max_preds2)
            selective_guard = prediction == prediction2   # and max_prediction >= avg_max and max_prediction2 >= avg_max2

        # selective_guard = True  # COMMENT OUT IF YOU WANT TO FILTER OUT SOME PREDICTIONS. True will eval every prediction
        if selective_guard:  # if deemed worthy prediction
            predictions_vs_labels.append([prediction, label])  # then save for evaluation
            # Evaluate + Print to console after collecting certain #of predictions
            if len(predictions_vs_labels) % eval_every == 0:
                print("Batch", batch_num, "(", batch_num_restart, ")")
                print("loss:", round(float(loss.data),4))
                accuracy = (predictions_vs_labels.count([0,0]) 
                            + predictions_vs_labels.count([1,1])) / len(predictions_vs_labels)
                accuracy_last_eval = (predictions_vs_labels[-eval_every:].count([0,0]) 
                                   + predictions_vs_labels[-eval_every:].count([1,1])) / eval_every
                print("accuracy:", round(accuracy,4), "(last", eval_every, ":", accuracy_last_eval, ")")
                loocv.append(accuracy)

                # Learning-rate schedule: decay by sqrt(2) (floor 3e-4) while recent accuracy
                # is poor; after a quarter of the data without a restart, warm-restart from a
                # slightly lowered initial rate.
                if accuracy_last_eval < .6 and lr_restart >= len(train_sequences)//16:
                    if lr > 3e-4:
                        lr *= (1/math.sqrt(2))
                    if lr < 3e-4:
                        lr = 3e-4
                    if batch_num_restart >= len(train_sequences)//4:  # WARM RESTART
                        lr_init /= 1.1
                        lr = lr_init  # 1e-3
                        batch_num_restart = 0
                    print("lr =>", round(lr,6))
                    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
                    if ensemble:
                        optimizer2 = torch.optim.AdamW(model2.parameters(), lr=lr)
                    lr_restart = 0

                # Optional hard restart, e.g. overfit, ... private ...
                if restart and accuracy_last_eval < .4:  # .4, .45, .5
                    print("restarting model(s)")
                    batch_num_restart = 0
                    # Re-create the model(s) with the same constructor arguments and
                    # optimizer type as the initialization cell.
                    if fcn:
                        model = LstmFCN(hidden_size, num_layers, dropout_rate)
                    else:
                        model = LstmModel(hidden_size, num_layers, dropout_rate)
                    model.to(device)
                    loss_function = nn.CrossEntropyLoss()  # weight=class_weights)
                    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
                    if ensemble:
                        model2 = LstmModel(hidden_size2, num_layers2, dropout_rate2)
                        model2.to(device)
                        optimizer2 = torch.optim.AdamW(model2.parameters(), lr=lr)
        else:
            # Prediction was filtered out; track it separately to see what the filter costs.
            pred_vs_lab1.append([prediction, label])
            if ensemble:
                pred_vs_lab2.append([prediction2, label])
            if len(pred_vs_lab1) % eval_every == 0:
                print("skipped eval", len(pred_vs_lab1), "times. (selective_guard)")
                if len(pred_vs_lab1) % (eval_every * 10) == 0:
                    acc_1 = (pred_vs_lab1.count([0,0]) + pred_vs_lab1.count([1,1])) / len(pred_vs_lab1)
                    print("(filtered out):", round(acc_1, 4), "(", len(pred_vs_lab1), ")")
                    if ensemble:
                        acc_2 = (pred_vs_lab2.count([0,0]) + pred_vs_lab2.count([1,1])) / len(pred_vs_lab2)
                        print("(filtered2):", round(acc_2, 4), "(", len(pred_vs_lab2), ")")

    # Free per-step tensors before the next step to keep GPU memory low
    del sequence_batch, label_batch, outputs, loss
    if ensemble:
        del outputs2, loss2
    torch.cuda.empty_cache()

print("DONE")

#### Results

In [None]:
''' Results (Train) '''


def _pair_accuracy(pairs):
    """Return the share of [prediction, label] pairs that agree on class 0 or 1 (0.0 if empty)."""
    if not pairs:
        return 0.0
    return (pairs.count([0,0]) + pairs.count([1,1])) / len(pairs)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Final print to console
num_predictions = len(predictions_vs_labels)
# Slice by explicit start index: `[-len(x)//2:]` floors the *negative* number, so for odd
# lengths it took one element more than the divisor, and `[-0:]` is the whole list.
accuracy_2nd_half = _pair_accuracy(predictions_vs_labels[num_predictions - num_predictions//2:])
accuracy_4th_q = _pair_accuracy(predictions_vs_labels[num_predictions - num_predictions//4:])
tp = predictions_vs_labels.count([1,1])
tn = predictions_vs_labels.count([0,0])
fp = predictions_vs_labels.count([1,0])
fn = predictions_vs_labels.count([0,1])
accuracy = _safe_ratio(tp + tn, num_predictions)
print("LOOCV accuracy:", round(accuracy,4), "(", len(predictions_vs_labels), ")")
print("accuracy_2nd_half:", round(accuracy_2nd_half,4))
print("accuracy_4th_q:", round(accuracy_4th_q,4))
print("tn, fp:", tn, fp)
print("fn, tp:", fn, tp)
precision = _safe_ratio(tp, tp + fp)
print("Precision (how precise model's prediction of 1 is):", round(precision,4))
recall = _safe_ratio(tp, tp + fn)  # "accuracy 1 only"
print("Recall (accuracy 1 only):", round(recall,4))
f1 = _safe_ratio(2 * (precision*recall), precision + recall)
print("F1 Score:", round(f1,4))
plt.gca().set_ylim([0,1])  # ax = plt.gca()
plt.grid()
plt.plot(loocv)
plt.show()

# Filtered predictions
if pred_vs_lab1:
    print("Performance on predictions that were filtered out: (selective_guard)")
    tp1 = pred_vs_lab1.count([1,1])
    tn1 = pred_vs_lab1.count([0,0])
    fp1 = pred_vs_lab1.count([1,0])
    fn1 = pred_vs_lab1.count([0,1])
    print("tn, fp:", tn1, fp1)
    print("fn, tp:", fn1, tp1)
    acc_1 = (tp1 + tn1) / len(pred_vs_lab1)
    print(acc_1, "(", len(pred_vs_lab1), ")")
    if ensemble:
        acc_2 = _pair_accuracy(pred_vs_lab2)
        print("model2:", acc_2, "(", len(pred_vs_lab2), ")")

## Test (Separate)

In [None]:
# Evaluate the trained model(s) on the held-out test sequences.
test_predictions_vs_labels = []
plot_test = []
plot_sequences = True  # set False to skip the per-sequence figures (one per test sequence)


def _lenient_hits(pairs):
    """Count pairs that are correct, treating any prediction on a classless (-1) label as a hit."""
    return (pairs.count([0,0])
            + pairs.count([1,1])
            + pairs.count([0,-1])  # ... PRIVATE ...
            + pairs.count([1,-1]))


# Evaluate with dropout disabled and without autograd bookkeeping; the previous
# train/eval mode is restored afterwards.
was_training = model.training
model.eval()
if ensemble:
    was_training2 = model2.training
    model2.eval()
with torch.no_grad():
    for sequence, label in zip(test_data, test_labels):
        sequence = torch.from_numpy(sequence).float().view(sequence_len, 1, input_size).to(device)

        prediction, _ = model(sequence, None)
        class_probs = prediction.squeeze().tolist()
        max_prediction = max(class_probs)
        prediction = class_probs.index(max_prediction)
        if ensemble:
            prediction2, _ = model2(sequence[-short_sequence_len:], None)
            class_probs2 = prediction2.squeeze().tolist()
            max_prediction2 = max(class_probs2)
            prediction2 = class_probs2.index(max_prediction2)

        if plot_sequences:
            # plot price sequence
            plt.plot(sequence[:, :, -1].cpu())  # -1: close_price
            # plot other features
            plt.subplot(1,1,1)
            plt.plot(sequence[:, :, 5].cpu())
            plt.show()

        test_predictions_vs_labels.append([prediction, label])
        if len(test_predictions_vs_labels) % eval_every == 0:
            accuracy = _lenient_hits(test_predictions_vs_labels) / len(test_predictions_vs_labels)
            accuracy_last_eval = _lenient_hits(test_predictions_vs_labels[-eval_every:]) / eval_every
            print("accuracy:", round(accuracy,4), "(last", eval_every, ":", accuracy_last_eval, ")")
            plot_test.append(accuracy)
model.train(was_training)
if ensemble:
    model2.train(was_training2)

num_tested = len(test_predictions_vs_labels)
accuracy = _lenient_hits(test_predictions_vs_labels) / num_tested if num_tested else 0.0
print("Test accuracy:", round(accuracy,4), "(", len(test_predictions_vs_labels), ")")
# Strict accuracy over the sequences that actually have a class (label 0 or 1)
num_classed = len(test_labels) - np.count_nonzero(test_labels==-1)
class_hits = test_predictions_vs_labels.count([0,0]) + test_predictions_vs_labels.count([1,1])
accuracy = class_hits / num_classed if num_classed else 0.0
print("accuracy (class only):", round(accuracy,4))

# del test_data, test_labels
plt.gca().set_ylim([0,1])
plt.grid()
plt.plot(plot_test)
plt.show()