In [None]:
!pip install transformers

In [32]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [4]:
# Load train data
df = pd.read_csv('../datasets/train_data.csv')

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

Number of training sentences: 235,403



In [18]:
# Define some variables
random_state = 42


# Split data into train and validation sets
from sklearn.model_selection import train_test_split

train_text, validation_text, train_labels, validation_labels = train_test_split(df['Sentence'], df['Label'], 
                                                            random_state=random_state, test_size=0.1)


In [30]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize all of the sentences and map the tokens to their word IDs.
train_text_encoded = [tokenizer.encode(sent, 
                                 add_special_tokens=True, # Add '[CLS]' and '[SEP]'
                                 padding='max_length', # Pad & truncate all sentences.
                                 truncation=True,
                                 max_length=512,
                                 ) for sent in train_text]
validation_text_encoded = [tokenizer.encode(sent,
                                    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
                                    padding='max_length', # Pad & truncate all sentences.
                                    truncation=True,
                                    max_length=512,
                                    ) for sent in validation_text]

# Print sentence 0, now as a list of IDs.
print('Original: ', train_text[0])
print('Token IDs:', train_text_encoded[0])

Original:  Our principal sources of liquidity are cash from operations and more recently, proceeds from our debt and equity offerings.
Token IDs: [101, 1996, 10768, 11890, 3843, 1037, 8001, 3343, 4861, 5517, 2008, 2009, 2097, 2053, 2936, 9146, 13117, 2015, 4114, 2004, 19875, 4523, 2000, 8980, 2019, 3318, 4171, 21447, 1999, 2037, 3465, 1011, 1997, 1011, 2326, 6165, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [31]:
# As the data is extremely imbalanced, we will use SMOTE to balance the data
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=random_state, sampling_strategy=0.5)
train_text_resampled, train_labels_resampled = sm.fit_resample(train_text_encoded, train_labels)

In [None]:
# Train and validation data to tensors
train_text_tensor = torch.tensor(train_text_resampled).to(device)
validation_text_tensor = torch.tensor(validation_text_encoded).to(device)

In [34]:
# Define the batch size
batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_text_tensor, train_labels_resampled)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_text_tensor, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

## Train FLS Classifier

In [35]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2,output_attentions=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [36]:
model.to(device) # send the model to GPU
model.train() # switch to train mode i.e. forward, backward, optimization
optimizer = AdamW(model.parameters(), lr=5e-5) # choose an optimizer for the gradient descent
loss_values = [] # accumulate the losses, can be used with a validation set to choose the epochs so as to avoid overfitting

# define number of epochs
epochs = 3
for epoch in range(epochs): #number of epochs i.e. how many times is the whole dataset passed through the architecture
      # =================================
      #              Training
      # =================================
      
      print("epoch: ", epoch+1)
      print("Training...")
      # capture time
      total_t0 = time.time()
      train_total_loss = 0
      for batch in tqdm(train_dataloader): # split into batches to fit into the memory
            input_ids, labels = batch
            input_ids.to(device)
            labels.to(device)
            
            # Always clear any previously calculated gradients before performing a
            # backward pass. 
            optimizer.zero_grad()


            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(input_ids,labels=labels)
            # Calculate the loss i.e. distance between predicted labels and true labels using cross entropy
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            train_total_loss += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters using the optimizer and the gradient values
            optimizer.step()
      
      # print result summaries
      print("")
      print("summary results")
      print("epoch | train loss | train time")
      
      # Calculate the average loss over the training data.
      avg_train_loss = train_total_loss / len(train_dataloader)
      
      # Store the loss value for plotting the learning curve.
      loss_values.append(avg_train_loss)
      
      
      # training time end
      training_time = time.time() - total_t0
      print(f"{epoch+1:5d} | {avg_train_loss:.5f} |  {training_time:}")
      
      # =================================
      #             Validation
      # =================================
      # After the completion of each training epoch, measure our performance on
      # our validation set.
      print("")
      print("Running Validation...")
      # capture time
      total_t0 = time.time()
      # switch to evaluation mode i.e. no backward pass
      model.eval()
      # Tracking variables
      
      # Evaluate data for one epoch
      with torch.no_grad():
            preds_list = []
            accuracy_list = []
            labelsset=[]
            accuracy_sum = 0
            for batch in tqdm(validation_dataloader):
                  input_ids, labels = batch
                  input_ids.to(device)
                  outputs = model(input_ids)
                  logits =outputs.logits.detach().cpu().numpy()   # Taking the softmax of output
                  pred=np.argmax(logits, axis=1).tolist()
                  acc=accuracy_score(labels.detach().cpu().numpy().tolist(), pred)
                  accuracy_sum+=acc
                  preds_list.extend(pred)
                  accuracy_list.append(acc)
                  labelsset.extend(labels.detach().cpu().numpy())
      
      mean_accuracy = accuracy_sum / len(validation_dataloader)
      print("  Accuracy: {0:.2f}".format(mean_accuracy))


## Evaluate model on test set

In [None]:
# Import test set
df_test = pd.read_csv('../datasets/test_data.csv')

In [None]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize all of the sentences and map the tokens to their word IDs.
text_encoded = [tokenizer.encode(sent,
                                add_special_tokens=True, # Add '[CLS]' and '[SEP]'
                                padding='max_length', # Pad & truncate all sentences.
                                truncation=True,
                                max_length=512,
                                ) for sent in df_test['Sentence']]

# Data to tensors
text_tensor = torch.tensor(train_text_resampled).to(device)

# Create the DataLoader for our test set
test_data = TensorDataset(text_tensor)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


In [None]:
def test(test_dataloader):
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.  
  model.eval()
# Telling the model not to compute or store gradients, saving memory and
# speeding up the process
  with torch.no_grad():
    preds_list = []
    accuracy_list = []
    labelsset=[]
    accuracy_sum = 0
    for batch in tqdm(test_dataloader):
      input_ids, labels = batch
      input_ids.to(device)
      outputs = model(input_ids)
      logits =outputs.logits.detach().cpu().numpy()   # Taking the softmax of output
      pred=np.argmax(logits, axis=1).tolist()
      acc=accuracy_score(labels.detach().cpu().numpy().tolist(), pred)
      accuracy_sum+=acc
      preds_list.extend(pred)
      accuracy_list.append(acc)
      labelsset.extend(labels.detach().cpu().numpy())

  mean_accuracy = accuracy_sum / len(test_dataloader)
  return mean_accuracy, preds_list, accuracy_list, labelsset

In [None]:
mean_accuracy, preds_list, accuracy_list, labelsset = test(test_dataloader)

In [None]:
print("Accuracy on test set: ", mean_accuracy)
print("Classification report: ", classification_report(labelsset,preds_list))