In [3]:
import pandas as pd
import numpy as np

from pypf.instrument import DataframeInstrument
from pypf.chart import PFChart

import torch
from torch import nn
import torch.nn.functional as F

# Generate PnF Sequences
For a given ticker, generate its PnF chart, and then serialize the Xs and Os into a string to be used as a sequence in a character-level RNN

In [41]:
def generate_pnf_char_sequence(chart_meta_data):
    '''
        In _chart_meta_data, for each date we have a 'move' and a 'direction'. The 
        direction is either X or O, and the 'move' is the number of that character.
        To create sequences, we will iterate through each date in our range, and concat
        'move' number of the 'direction' character to our sequence
        
        Args:
            chart_meta_data (dict): the chart meta data dict from PnF object
            
        Returns:
            sequence (string): sequence of Xs and Os representing PnF chart
    '''
    sequence = ''
    # Iterate through each day
    for k,v in chart_meta_data.items():
        direction = v['direction']
        move = v['move']
        sequence += direction * move
    
    return sequence

In [44]:
# Load ticker data into dataframe
aapl_df = pd.read_csv('./data/aapl_extended.csv')

# Rename columns for use in PnF Library
aapl_df.columns = ['Date','Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']
print(aapl_df.head())

# Create PnF Chart from dataframe
dfi = DataframeInstrument('AAPL', dataframe=aapl_df)
chart = PFChart(dfi, duration=20.0) #Duration = years of data
chart.create_chart()

# Create sequence        
pnf_sequence = generate_pnf_char_sequence(chart._chart_meta_data)

print(len(pnf_sequence), pnf_sequence)

         Date   Open   High    Low  Close  Adj_Close    Volume
0  1998-01-02  13.63  16.25  13.50  16.25     0.5103   6411700
1  1998-01-05  16.50  16.56  15.19  15.88     0.4987   5820300
2  1998-01-06  15.94  20.00  14.75  18.94     0.5948  16182800
3  1998-01-07  18.81  19.00  17.31  17.50     0.5496   9300200
4  1998-01-08  17.44  18.62  16.94  18.19     0.5713   6910900
11545 oooooooxxxxxxooooxxxxxxxxxxxoooxxxxxxooooooxxxoooooooxxxxxxooooooxxxxoooooooxxxoooxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooxxxxxxxxxxxxxxxxxooooooooooooooxxxxxxxxooooooxxxxxxooooooooxxxxxxoooooooxxxoooooooxxxxxxxxxxxxxxxxxoooooooxxxxxxooooooxxxxoooooooooooxxxxxxxxxxoooxxxxxxxxxxxxxxxxxxxxooooxxxxxxoooooooooxxxxxxxxoooooxxxxoooooooxxxxoooooxxxxxxxxoooxxxxxxxoooooooooxxxxxxxxxxxxxxxxooooxxxxoooooxxxxxxxxxxooooxxxxxxxxxxxxxxxxxxxxxxooooxxxxoooxxxxxooooooxxxxxxxooooooooooooooxxxooooooooooooxxxxxoooooooooooooooxxxxxxxxxxxoooooooxxxxxxxxxxxxxxxoooooooooxxxxxxxxxooooooooxxxxxxxxxxxxxxxxoooooooooxxxxxxxxxxxooooxxxooox

# Tokenize
Want to map our input characters to integers and vice versa

In [51]:
chars = tuple(set(pnf_sequence))
int2char = dict(enumerate(chars))
char2int = {ch:ii for ii, ch in int2char.items()}

encoded = np.array([char2int[char] for char in pnf_sequence])

# One-Hot Encode Data

In [52]:
def one_hot_encode(arr, n_labels):
    # Init one-hot array to correct size
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    #Fill appropriate elements with 1s
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Reshape to get back to desired shape
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

# Create Batch Generator

In [53]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

# Create Test and Training Data


In [58]:
test_prop = 0.2
test_end_idx = int(len(encoded) * test_prop)

test_data = encoded[:test_end_idx]
train_data = encoded[test_end_idx:]

print(len(test_data), len(train_data))

2309 9236


# Define Model

In [59]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, lstm_size=256, lstm_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.lstm_layers = lstm_layers
        self.lstm_size = lstm_size
        self.lr = lr
        
        # Create Char Dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## Define LSTM Layer
        self.lstm = nn.LSTM(len(self.chars), lstm_size, lstm_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## Define Dropout Layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## Define Final Desnse Output Layer
        self.fc = nn.Linear(lstm_size, len(self.chars))
      
    
    def forward(self, nn_input, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        # Input -> LSTM
        lstm_out, hidden = self.lstm(nn_input, hidden)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        lstm_out = lstm_out.contiguous().view(-1, self.lstm_size)
        
        # LSTM -> Dense -> Dropout
        dense_out = self.dropout(self.fc(lstm_out))
        
        # return the final output and the hidden state
        return dense_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes lstm_layers x batch_size x lstm_size,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_(),
                      weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_())
        
        return hidden

# Initialize Model

In [62]:
# Determine if we can run on GPU or not
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lstm_size = 512
lstm_layers = 2

model = CharRNN(chars, lstm_size, lstm_layers)
model.to(device)
print(model)

CharRNN(
  (lstm): LSTM(2, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=2, bias=True)
)


# Train Model

In [68]:
epochs = 5
batch_size = 8
sequence_length = 25
clip = 5
learn_rate = 0.003
print_every = 10

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learn_rate)

steps = 0
n_chars = len(model.chars)

for epoch in range(epochs):
    print("Starting Epoch {}".format(epoch+1))
    # Set to training mode
    model.train()
    
    # Iterate through training batches
    for train_batch, train_labels in get_batches(train_data, batch_size, sequence_length):
        steps += 1
        
        # Init hidden state
        hidden = model.init_hidden(batch_size)
        hidden = tuple([each.data for each in hidden])
        
        # One-hot encode data and convert to tensors
        train_batch = one_hot_encode(train_batch, n_chars)
        train_batch, train_labels = torch.from_numpy(train_batch), torch.from_numpy(train_labels)
        
        # Ensure that tensors are on correct devices
        train_batch, train_labels = train_batch.to(device), train_labels.to(device)
        for each in hidden:
            each.to(device)
            
        # Zero-out gradients
        model.zero_grad()
        
        # Run data through model and get output/new hidden state
        output, hidden = model(train_batch, hidden)
        
        # Calculate loss and perform backprop -- clip gradients if necessary
        loss = criterion(output, train_labels.view(batch_size*sequence_length).long())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        #Take step with optimizer
        optimizer.step()
        
        # Validate Model
        if steps % print_every == 0:
            model.eval()
            
            val_losses = []
            
            for val_batch, val_labels in get_batches(test_data, batch_size, sequence_length):
                # Init hidden state
                val_hidden = model.init_hidden(batch_size)
                val_hidden = tuple([each.data for each in val_hidden])
                
                # One-hot encode data and convert to tensors
                val_batch = one_hot_encode(val_batch, n_chars)
                val_batch, val_labels = torch.from_numpy(val_batch), torch.from_numpy(val_labels)
                
                # Ensure tensors are on correct devices
                val_batch, val_labels = val_batch.to(device), val_labels.to(device)
                for each in hidden:
                    each.to(device)
                    
                # Run validation data through network
                val_output, val_hidden = model(val_batch, val_hidden)
                
                # Calculate and record loss
                val_loss = criterion(val_output, val_labels.view(batch_size*sequence_length).long())
                val_losses.append(val_loss.item())
                
                # Set back to training mode
                model.train()
                
            # Print out statistics
            print("Epoch: {}/{}...".format(epoch+1, epochs),
                  "Step: {}...".format(steps),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))

Starting Epoch 1
Epoch: 1/5... Step: 10... Loss: 0.4477... Val Loss: 0.4364
Epoch: 1/5... Step: 20... Loss: 0.5068... Val Loss: 0.4232
Epoch: 1/5... Step: 30... Loss: 0.4721... Val Loss: 0.4756
Epoch: 1/5... Step: 40... Loss: 0.4883... Val Loss: 0.4289
Starting Epoch 2
Epoch: 2/5... Step: 50... Loss: 0.4342... Val Loss: 0.4295
Epoch: 2/5... Step: 60... Loss: 0.4771... Val Loss: 0.4090
Epoch: 2/5... Step: 70... Loss: 0.4494... Val Loss: 0.4253
Epoch: 2/5... Step: 80... Loss: 0.4805... Val Loss: 0.4249
Epoch: 2/5... Step: 90... Loss: 0.4565... Val Loss: 0.4239
Starting Epoch 3
Epoch: 3/5... Step: 100... Loss: 0.4468... Val Loss: 0.4414
Epoch: 3/5... Step: 110... Loss: 0.4776... Val Loss: 0.4590
Epoch: 3/5... Step: 120... Loss: 0.4535... Val Loss: 0.4192
Epoch: 3/5... Step: 130... Loss: 0.4464... Val Loss: 0.4250
Starting Epoch 4
Epoch: 4/5... Step: 140... Loss: 0.4033... Val Loss: 0.4207
Epoch: 4/5... Step: 150... Loss: 0.5081... Val Loss: 0.4285
Epoch: 4/5... Step: 160... Loss: 0.4243..