## GloVe + LSTM model with 8-K forms

In [4]:
import random

import pandas as pd
import torch
import torch.nn as nn
import torchtext
from torchtext import data
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator

In [5]:
# Mount Google Drive at /content/drive so the files referenced below can be read and written
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the pickled DataFrame from Drive, drop every row that contains a
# missing value, and renumber the index from 0.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = (
    pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/wholething.pkl")
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,symbol,cik,form,access_number,filed_date,accepted_date,report_url,filing_url,string,texts,lens,pct_change1,pct_change2,beta1,beta2,mkt_excess1,mkt_excess2
0,NICK,1000045,8-K,0001193125-19-024617,2019-02-01,2019-02-01 06:31:07,https://www.sec.gov/Archives/edgar/data/100004...,https://www.sec.gov/Archives/edgar/data/100004...,8-K 1 d675768d8k.htm FORM 8-K\n UNITED STAT...,Item 2.02\nResults of Operations and Financial...,182,0.039772,0.018824,0.1235,0.121,0.039502,0.017531
1,MFIN,1000209,8-K,0001193125-19-004285,2019-01-08,2019-01-08 09:10:35,https://www.sec.gov/Archives/edgar/data/100020...,https://www.sec.gov/Archives/edgar/data/100020...,8-K 1 d685338d8k.htm 8-K\n UNITED STATES\nS...,Item 2.02 Results of Operations and Financial ...,711,0.005894,0.019647,0.9098,0.91085,-0.003435,0.005013
2,MFIN,1000209,8-K,0001193125-19-007413,2019-01-11,2019-01-11 16:32:03,https://www.sec.gov/Archives/edgar/data/100020...,https://www.sec.gov/Archives/edgar/data/100020...,8-K 1 d682501d8k.htm FORM 8-K\n UNITED STAT...,ITEM 1.01.\nENTRY INTO A MATERIAL DEFINITIVE A...,261,-0.019268,-0.019268,0.8742,0.8742,-0.013888,-0.013888
3,MFIN,1000209,8-K,0001193125-19-024926,2019-02-01,2019-02-01 08:50:35,https://www.sec.gov/Archives/edgar/data/100020...,https://www.sec.gov/Archives/edgar/data/100020...,8-K 1 d671008d8k.htm FORM 8-K\n UNITED STAT...,ITEM 1.01.\nENTRY INTO A MATERIAL DEFINITIVE A...,257,0.077505,0.035918,0.8901,0.88485,0.076161,0.02762
4,MFIN,1000209,8-K,0001193125-19-047009,2019-02-21,2019-02-21 16:13:02,https://www.sec.gov/Archives/edgar/data/100020...,https://www.sec.gov/Archives/edgar/data/100020...,8-K 1 d711389d8k.htm 8-K\n UNITED STATES\nS...,ITEM 1.01.\nENTRY INTO A MATERIAL DEFINITIVE A...,257,0.076013,0.076013,0.9225,0.9225,0.07004,0.07004


In [7]:
def label1(row):
    """Return "pos" when pct_change1 is strictly above beta1 * mkt_excess1, otherwise "neg"."""
    threshold = row["beta1"] * row["mkt_excess1"]
    return "pos" if row["pct_change1"] > threshold else "neg"

In [8]:
def label2(row):
    """Return "pos" when pct_change2 is strictly above beta2 * mkt_excess2, otherwise "neg"."""
    threshold = row["beta2"] * row["mkt_excess2"]
    return "pos" if row["pct_change2"] > threshold else "neg"

In [9]:
# Add one label column per labelling function, evaluated row by row
for column, labeler in (("label1", label1), ("label2", label2)):
    df[column] = df.apply(labeler, axis=1)

In [10]:
# Keep only the filing text and the label2 target
df = df.loc[:, ["texts", "label2"]]
df.head()

Unnamed: 0,texts,label2
0,Item 2.02\nResults of Operations and Financial...,pos
1,Item 2.02 Results of Operations and Financial ...,pos
2,ITEM 1.01.\nENTRY INTO A MATERIAL DEFINITIVE A...,neg
3,ITEM 1.01.\nENTRY INTO A MATERIAL DEFINITIVE A...,pos
4,ITEM 1.01.\nENTRY INTO A MATERIAL DEFINITIVE A...,pos


In [11]:
# Save texts and labels to CSV; the row index is written as the first,
# unnamed column (pandas' default, spelled out here).
df.to_csv('/content/drive/MyDrive/Colab Notebooks/data_lstm.csv', index=True)

In [12]:
# Field definitions used to parse the CSV columns.
# TEXT:  spaCy-tokenised, lower-cased token sequences; batches are produced
#        batch dimension first and are returned together with their lengths.
# LABEL: a single categorical target, numericalised as torch.float.
TEXT = Field(
    sequential=True,
    tokenize="spacy",
    lower=True,
    batch_first=True,
    include_lengths=True,
)
LABEL = LabelField(dtype=torch.float, batch_first=True)

# One entry per CSV column, in file order: the first column is ignored,
# the second is the text, the third is the label.
fields = [(None, None), ('text', TEXT), ('label', LABEL)]

In [13]:
# Read the saved CSV from Drive and process each column with the pipeline
# declared in `fields`; the header row is skipped.
# NOTE(review): this rebinds `data`, hiding the `torchtext.data` module
# imported at the top of the notebook.
data = TabularDataset(
    path="/content/drive/MyDrive/Colab Notebooks/data_lstm.csv",
    format='csv',
    fields=fields,
    skip_header=True,
)

In [14]:
# Show the first example: its token list and its label
data[0].__dict__

{'label': 'pos',
 'text': ['item',
  '2.02',
  '\n',
  'results',
  'of',
  'operations',
  'and',
  'financial',
  'condition',
  '\n',
  'on',
  'february',
  '1',
  ',',
  '2019',
  'nicholas',
  'financial',
  ',',
  'inc.',
  '(',
  'the',
  '“',
  'company',
  '”',
  ')',
  'issued',
  'a',
  'press',
  'release',
  'announcing',
  'the',
  'company',
  '’s',
  'financial',
  'results',
  'for',
  'its',
  'quarter',
  'ended',
  'december',
  '31',
  ',',
  '2018',
  '.',
  'a',
  'copy',
  'of',
  'this',
  'press',
  'release',
  'is',
  'attached',
  'hereto',
  'as',
  'exhibit',
  '99.1',
  '.',
  '\n',
  'the',
  'information',
  'included',
  'in',
  'this',
  'current',
  'report',
  'on',
  'form',
  '8-k',
  '(',
  'including',
  'exhibit',
  '99.1',
  'hereto',
  ')',
  'is',
  'furnished',
  'pursuant',
  'to',
  'this',
  'item',
  '2.02',
  'and',
  'shall',
  'not',
  'be',
  'deemed',
  'to',
  'be',
  '“',
  'filed',
  '”',
  'for',
  'the',
  'purposes',
  'of'

In [15]:
# Split data into training (80%) and validation (20%) sets.
# Python's RNG drives the shuffle behind the split and PyTorch's RNG drives
# weight initialisation; seeding both makes a fresh "Restart & Run All"
# start from the same partition and the same initial weights.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
train_data, valid_data = data.split(split_ratio=0.8, random_state=random.getstate())

In [16]:
# Build the token vocabulary from the training split only: keep tokens that
# occur at least 3 times and attach the 100-dimensional GloVe 6B vectors.
TEXT.build_vocab(
    train_data,
    min_freq=3,
    vectors="glove.6B.100d",
)
# Label vocabulary, also built from the training split
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 399997/400000 [00:14<00:00, 28650.43it/s]

In [17]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Number of examples per batch
BATCH_SIZE = 64

# Build the training and validation iterators. Examples are bucketed by token
# count and each batch is sorted by that key, which is the order the model's
# sequence packing expects.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)

In [18]:
# define classifier

class classifier(nn.Module):
    """Embedding -> (bi)LSTM -> linear -> sigmoid classifier for padded token batches.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads each sequence backwards.
        dropout: dropout probability applied between stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):

        #Constructor
        super().__init__()

        # remembered so forward() knows which final hidden states to use
        self.bidirectional = bidirectional

        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm layer
        # nn.LSTM only applies dropout between layers and warns when a
        # non-zero value is given for a single layer, so disable it there.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        #dense layer: its input is one final hidden state per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: integer token ids, batch dimension first (the LSTM and the
                packing call are both configured with batch_first=True).
            text_lengths: 1-D tensor with the unpadded length of each row.
        """

        # pass sequences through embedding layer
        embedded = self.embedding(text)

        # pack so the LSTM skips padded positions; enforce_sorted=False lets
        # batches arrive in any length order (pre-sorted batches give the same result)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)

        #pass sequences through LSTM
        packed_output, (hidden, cell) = self.lstm(packed_embedded)

        if self.bidirectional:
            # last layer: hidden[-2] is the forward direction, hidden[-1] the backward one
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # unidirectional: only the last layer's final state exists for this purpose
            hidden = hidden[-1, :, :]

        #pass hidden state through fully connected layer
        dense_outputs = self.fc(hidden)

        #Final activation function
        outputs = self.act(dense_outputs)

        return outputs

In [19]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)   # one embedding row per vocabulary entry
embedding_dim = 100               # matches the 100-d GloVe vectors loaded into TEXT.vocab
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
# pass the `bidirection` setting defined above instead of a hard-coded True,
# so changing the hyperparameter actually changes the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)

In [20]:
# Show the model's layer structure
print(model)

# Number of trainable parameters
def count_parameters(model):
    """Return the total element count of the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

# Initialize the embedding layer with the pretrained vectors attached to the
# vocabulary; they are copied in place into the embedding weights.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(22654, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 2,324,857 trainable parameters
torch.Size([22654, 100])


In [None]:
!pip install transformers

In [22]:
# Define the optimizer. PyTorch ships its own AdamW, so the extra
# `transformers` import (whose AdamW is deprecated) is not needed.
# eps and weight_decay are set explicitly to the defaults of the
# transformers implementation so the hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4, eps = 1e-6, weight_decay = 0.0)

#define the loss
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Return the fraction of predictions that equal the labels after rounding.

    Args:
        preds: tensor of predicted scores; each value is rounded to the
            nearest integer before comparison.
        y: tensor of target values, broadcastable against `preds`.

    Returns:
        A 0-d float tensor holding the share of matching elements.
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()
    # mean() averages over every element, so it also handles 0-d inputs
    # (where len() raises) and multi-dimensional inputs (where len() would
    # only count the first dimension)
    return correct.mean()
    
# Move the model and the loss module to the selected device (cuda if available, else cpu)
model = model.to(device)
criterion = criterion.to(device)

100%|█████████▉| 399997/400000 [00:29<00:00, 28650.43it/s]

In [23]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    Args:
        model: module called as model(text, text_lengths); assumed to return
            one row per example, i.e. shape (batch, 1) for a single output unit.
        iterator: sized iterable of batches exposing `.text` (a
            (tokens, lengths) pair) and `.label`.
        optimizer: optimizer updating the model's parameters.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (loss, accuracy), each averaged over the number of batches.
    """

    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:

        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        # drop only the output dimension: (batch, 1) -> (batch,). A bare
        # squeeze() would also remove the batch dimension of a single-example
        # batch and then mismatch the shape of batch.label.
        predictions = model(text, text_lengths).squeeze(1)

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    Args:
        model: module called as model(text, text_lengths); assumed to return
            one row per example, i.e. shape (batch, 1) for a single output unit.
        iterator: sized iterable of batches exposing `.text` (a
            (tokens, lengths) pair) and `.label`.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (loss, accuracy), each averaged over the number of batches.
    """

    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text

            # drop only the output dimension: (batch, 1) -> (batch,). A bare
            # squeeze() would also remove the batch dimension of a
            # single-example batch and then mismatch the shape of batch.label.
            predictions = model(text, text_lengths).squeeze(1)

            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
N_EPOCHS = 15
best_valid_loss = float('inf')
best_valid_accuracy = 0

for epoch in range(N_EPOCHS):

    # one pass over the training data, then score the validation data
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # highest validation accuracy seen so far; tracked separately from the
    # checkpoint above, which is chosen by validation loss
    best_valid_accuracy = max(best_valid_accuracy, valid_acc)

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.689 | Train Acc: 53.64%
	 Val. Loss: 0.688 |  Val. Acc: 55.16%
	Train Loss: 0.687 | Train Acc: 55.47%
	 Val. Loss: 0.687 |  Val. Acc: 55.16%
	Train Loss: 0.687 | Train Acc: 55.48%
	 Val. Loss: 0.688 |  Val. Acc: 55.16%
	Train Loss: 0.687 | Train Acc: 55.49%
	 Val. Loss: 0.687 |  Val. Acc: 55.16%
	Train Loss: 0.686 | Train Acc: 55.48%
	 Val. Loss: 0.687 |  Val. Acc: 55.16%
	Train Loss: 0.685 | Train Acc: 55.48%
	 Val. Loss: 0.686 |  Val. Acc: 55.12%
	Train Loss: 0.681 | Train Acc: 55.93%
	 Val. Loss: 0.685 |  Val. Acc: 55.09%
	Train Loss: 0.678 | Train Acc: 57.05%
	 Val. Loss: 0.684 |  Val. Acc: 55.30%
	Train Loss: 0.673 | Train Acc: 57.99%
	 Val. Loss: 0.684 |  Val. Acc: 54.38%
	Train Loss: 0.671 | Train Acc: 58.88%
	 Val. Loss: 0.684 |  Val. Acc: 55.94%
	Train Loss: 0.667 | Train Acc: 59.44%
	 Val. Loss: 0.685 |  Val. Acc: 55.39%
	Train Loss: 0.664 | Train Acc: 60.22%
	 Val. Loss: 0.685 |  Val. Acc: 56.38%
	Train Loss: 0.661 | Train Acc: 60.45%
	 Val. Loss: 0.685 |  Val

In [26]:
# Highest validation accuracy recorded by the training loop
print(best_valid_accuracy)

0.5642289957579445
