In [2]:
# Pytorch Imports
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
import torch.utils.data
import pandas as pd
import datetime
import time

# Generate Data and Dataloader

In [4]:
# Function to generate two csv files for training/testing
def get_stock_data(file_name):
    ''' 
        Will divide a dataset into both test and training sets
        
        Args:
        
        file_name (string): path to the csv file containing all data
        percent_test (float): percentage of dataset to set aside for testing
        
        returns nothing
    '''
    # Open file and place into dataframe
    columns = ['date', 'open', 'high', 'low', 'close', 'volume', '50ma', 'label']
    stock_data = pd.read_csv(file_name)[columns]
    stock_data['date'] = stock_data['date'].apply(lambda x: time.mktime(datetime.datetime.strptime(x, "%Y-%m-%d").timetuple()), convert_dtype=True)
    stock_data = stock_data.dropna()
    return stock_data

# Get stock data from csv
stock_data = get_stock_data('./data/aapl.csv')

In [5]:
# Get input data and labels
signals = stock_data['label'].values
daily_data = stock_data.drop(['date', 'label'], axis=1).values

print(signals.shape)
print(daily_data.shape)

(1203,)
(1203, 6)


In [111]:
def dataloader(daily_data, labels, input_length=6, sequence_length=5, batch_size=10):
    # Get total number of days for which we have data -- only want full batches
    days_per_batch = batch_size * sequence_length
    total_days = (len(daily_data) // days_per_batch) * days_per_batch
    
    # Iterate through daily data, at intervals of batch_size X sequence_length
    for ii in range(0, total_days, days_per_batch):
        
        # Get all days in this batch
        batch_days = daily_data[ii: ii+days_per_batch]
        
        # Create the batch/label tensor of the right shape (seq_len x batch_size x input_features)
        batch = torch.zeros((sequence_length, batch_size, input_length), dtype=torch.float64)
        label_data = []
        
        # Fill out this batch/labels
        for batch_num, jj in enumerate(range(0, len(batch_days), sequence_length)):
            sequence_tensor = torch.tensor(batch_days[jj:jj+sequence_length])
            batch[:, batch_num] = sequence_tensor
            
            # Only want labels for day at the end of sequence
            label_data.append(labels[jj+sequence_length-1])
            
        # Fill out label tensor
        label_tensor = torch.tensor(label_data)
        
        yield batch, label_tensor

# Create test and validation data

In [114]:
# Split data into test and validation sets -- will have testing data be first data
# in dataset
test_prop = 0.2
test_end_idx = int(len(daily_data) * test_prop)

# Create testing data
test_features = daily_data[:test_end_idx]
test_labels = signals[:test_end_idx]

# Create training data
train_features = daily_data[test_end_idx:]
train_labels = signals[test_end_idx:]

In [105]:
# Test out the dataloader
sample_batch, sample_labels = next(iter(dataloader(train_features, train_labels)))
print(sample_batch.shape, sample_labels.shape)

torch.Size([5, 10, 6]) torch.Size([10])


# Model Definition

In [37]:
class StockClassifier(nn.Module):
    
    def __init__(self, input_length = 6,lstm_size = 64, lstm_layers=1, output_size = 3, 
                               drop_prob=0.2):
        super().__init__()
        self.input_length = input_length
        self.output_size = output_size
        self.lstm_size = lstm_size
        self.lstm_layers = lstm_layers
        self.drop_prob = drop_prob
        
        ## TODO: define the LSTM
        self.lstm = nn.LSTM(input_length, lstm_size, lstm_layers, 
                            dropout=drop_prob, batch_first=False)
        
        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(lstm_size, output_size)
      
    
    def forward(self, nn_input, hidden_state):
        '''
            Perform a forward pass through the network
            
            Args:
                nn_input: the batch of input to NN
                hidden_state: The LSTM hidden/cell state tuple
                
            Returns:
                logps: log softmax output
                hidden_state: the updated hidden/cell state tuple
        '''
        # Input -> LSTM
        lstm_out, hidden_state = self.lstm(nn_input, hidden)

        # Stack up LSTM outputs -- this gets the final LSTM output for each sequence in the batch
        lstm_out = lstm_out[-1, :, :]
        
        # LSTM -> Dense Layer
        dense_out = self.dropout(self.fc(lstm_out))
        
        # Apply Log Softmax to dense output -- sum denominator across columns
        logps = F.log_softmax(dense_out, dim=1)
                
        # Return the final output and the hidden state
        return logps, hidden_state
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_(),
              weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_())
        
        return hidden

# Test Model

In [None]:
model = StockClassifier(input_length=6, lstm_size=8, lstm_layers=1, output_size=3, drop_prob=0.1).double()
hidden = model.init_hidden(10)
logps, _ = model.forward(batch, hidden)
print(logps)

# Train Model

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Ensure that our model is set to 'double' as our volume value requires Float64
model = StockClassifier(input_length=6, lstm_size=128, lstm_layers=2, output_size=3, drop_prob=0.2).double()
model.to(device)

StockClassifier(
  (lstm): LSTM(6, 128, num_layers=2, dropout=0.2)
  (dropout): Dropout(p=0.2)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)

In [115]:
epochs = 1
batch_size = 2
learning_rate = 0.003
clip = 5

print_every = 10
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model.train()

for epoch in range(epochs):
    print('Starting Epoch {}'.format(epoch+1))
    steps = 0
    
    for train_batch, labels in dataloader(train_features, train_labels, batch_size=batch_size, input_length=6):
        steps += 1
        
        #print("Train Batch: ", train_batch.shape, "Labels Size: ", labels.shape)
        
        # Initialize Hidden/Cell state -- batch size is dynamic to account for batches that are not full
        hidden = model.init_hidden(train_batch.shape[1])
        hidden = tuple([each.data for each in hidden])
        
        # Set tensors to correct device -- GPU or CPU
        train_batch, train_labels = train_batch.to(device), labels.to(device)
        for each in hidden:
            each.to(device)
            
        # Zero out gradients
        optimizer.zero_grad()
        
        # Run data through model -- output is output and new hidden/cell state
        output, hidden = model(train_batch, hidden)
        
        # Calculate loss and perform back prop -- clip grads if necessary
        loss = criterion(output, train_labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        # Take optimizer step
        optimizer.step()
        
        # VALIDATION OF MODEL#
        if steps % print_every == 0:
            model.eval()
            val_losses = []
            accuracy = []
            with torch.no_grad():
                for val_batch, val_labels in dataloader(test_features, test_labels, batch_size=batch_size, input_length=6):

                    #Init hidden state -- again we have a dynamic batch size here
                    val_hidden = model.init_hidden(val_batch.shape[1])
                    val_hidden = tuple([each.data for each in val_hidden])

                    # Set device for tensors
                    val_batch, val_labels = val_batch.to(device), val_labels.to(device)
                    for each in val_hidden:
                        each.to(device)

                    # Run data through network
                    val_out, val_hidden = model(val_batch, val_hidden)

                    # Calculate and record loss
                    val_loss = criterion(val_out, val_labels)
                    val_losses.append(val_loss.item())

                    # Calculate accuracy of predictions
                    ps = torch.exp(val_out)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == val_labels.view(*top_class.shape)
                    accuracy.append(torch.mean(equals.type(torch.FloatTensor)).item())

            # Print out metrics
            print('Epoch: {}/{}...'.format(epoch+1, epochs),
                  'Step: {}...'.format(steps),
                  'Train Loss: {:.6f}...'.format(loss.item()),
                  'Val Loss: {:.6f}...'.format(np.mean(val_losses)),
                  'Accuracy: {:.6f}%...'.format(np.mean(accuracy) * 100))
            
            # Set back to training mode
            model.train()

Starting Epoch 1
Epoch: 1/1... Step: 10... Train Loss: 0.609900... Val Loss: 0.816253... Accuracy: 2.083333%...
Epoch: 1/1... Step: 20... Train Loss: 0.562413... Val Loss: 0.842489... Accuracy: 0.000000%...
Epoch: 1/1... Step: 30... Train Loss: 0.588911... Val Loss: 0.589054... Accuracy: 97.916667%...
Epoch: 1/1... Step: 40... Train Loss: 0.663687... Val Loss: 0.561035... Accuracy: 85.416667%...
Epoch: 1/1... Step: 50... Train Loss: 0.650729... Val Loss: 0.776855... Accuracy: 35.416667%...
Epoch: 1/1... Step: 60... Train Loss: 0.466971... Val Loss: 0.847237... Accuracy: 27.083333%...
Epoch: 1/1... Step: 70... Train Loss: 0.681019... Val Loss: 1.047770... Accuracy: 0.000000%...
Epoch: 1/1... Step: 80... Train Loss: 0.667817... Val Loss: 0.799426... Accuracy: 27.083333%...
Epoch: 1/1... Step: 90... Train Loss: 0.738087... Val Loss: 0.737807... Accuracy: 37.500000%...


The training loop will error out if you try to run it multiple times. This happens because the state of the dataloaders has not changed since the last run, and therefore you'll run out of data very quickly. When this happens, go back to the 'Create test and validation data' cell and re-run. This will reset the data in the generators and allow you to try and train again.