In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 4 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-32GB


In [3]:
# Load train data
df = pd.read_csv('../datasets/train.csv')

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

Number of training sentences: 294,202



In [4]:
# Define some variables
random_state = 42


# Split data into train and validation sets
from sklearn.model_selection import train_test_split

train_text, validation_text, train_labels, validation_labels = train_test_split(df['Sentence'], df['Label'], 
                                                            random_state=random_state, test_size=0.1)


In [5]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain', do_lower_case=True)

# Tokenize all of the sentences and map the tokens to their word IDs.
train_text_encoded = [tokenizer.encode(sent, 
                                 add_special_tokens=True, # Add '[CLS]' and '[SEP]'
                                 padding='max_length', # Pad & truncate all sentences.
                                 truncation=True,
                                 max_length=512,
                                 ) for sent in train_text]
validation_text_encoded = [tokenizer.encode(sent,
                                    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
                                    padding='max_length', # Pad & truncate all sentences.
                                    truncation=True,
                                    max_length=512,
                                    ) for sent in validation_text]



In [6]:
# As the data is extremely imbalanced, we will use SMOTE to balance the data
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=random_state, sampling_strategy='auto')
train_text_resampled, train_labels_resampled = sm.fit_resample(train_text_encoded, train_labels)

In [7]:
# Train and validation data to tensors
train_text_tensor = torch.tensor(train_text_resampled).to(device)
train_labels_tensor = torch.tensor(train_labels_resampled).to(device)

validation_text_tensor = torch.tensor(validation_text_encoded).to(device)
validation_labels_tensor = torch.tensor(validation_labels.values).to(device)

In [8]:
# Train and validation data to tensors
train_text_tensor = torch.tensor(train_text_resampled).to(device)
train_labels_tensor = torch.tensor(train_labels_resampled).to(device)

validation_text_tensor = torch.tensor(validation_text_encoded).to(device)
validation_labels_tensor = torch.tensor(validation_labels.values).to(device)

In [9]:
# Define the batch size
batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_text_tensor, train_labels_tensor)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_text_tensor, validation_labels_tensor)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

In [10]:
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=2,output_attentions=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Initialize the best validation accuracy.
best_validation_accuracy = 0.0

model.to(device) # send the model to GPU
model.train() # switch to train mode i.e. forward, backward, optimization
optimizer = AdamW(model.parameters(), lr=5e-5) # choose an optimizer for the gradient descent
loss_values = [] # accumulate the losses, can be used with a validation set to choose the epochs so as to avoid overfitting

# define number of epochs
epochs = 3
for epoch in range(epochs): #number of epochs i.e. how many times is the whole dataset passed through the architecture
      # =================================
      #              Training
      # =================================
      
      print("epoch: ", epoch+1)
      print("Training...")
      # capture time
      total_t0 = time.time()
      train_total_loss = 0
      for batch in tqdm(train_dataloader): # split into batches to fit into the memory
            input_ids, labels = batch
            input_ids.to(device)
            labels.to(device)
            
            # Always clear any previously calculated gradients before performing a
            # backward pass. 
            optimizer.zero_grad()


            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(input_ids,labels=labels)
            # Calculate the loss i.e. distance between predicted labels and true labels using cross entropy
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            train_total_loss += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters using the optimizer and the gradient values
            optimizer.step()
      
      # print result summaries
      print("")
      print("summary results")
      print("epoch | train loss | train time")
      
      # Calculate the average loss over the training data.
      avg_train_loss = train_total_loss / len(train_dataloader)
      
      # Store the loss value for plotting the learning curve.
      loss_values.append(avg_train_loss)
      
      
      # training time end
      training_time = time.time() - total_t0
      print(f"{epoch+1:5d} | {avg_train_loss:.5f} |  {training_time:}")
      
      # =================================
      #             Validation
      # =================================
      # After the completion of each training epoch, measure our performance on
      # our validation set.
      print("")
      print("Running Validation...")
      # capture time
      total_t0 = time.time()
      # switch to evaluation mode i.e. no backward pass
      model.eval()
      # Tracking variables
      
      # Evaluate data for one epoch
      with torch.no_grad():
            preds_list = []
            accuracy_list = []
            labelsset=[]
            accuracy_sum = 0
            for batch in tqdm(validation_dataloader):
                  input_ids, labels = batch
                  input_ids.to(device)
                  outputs = model(input_ids)
                  logits =outputs.logits.detach().cpu().numpy()   # Taking the softmax of output
                  pred=np.argmax(logits, axis=1).tolist()
                  acc=accuracy_score(labels.detach().cpu().numpy().tolist(), pred)
                  accuracy_sum+=acc
                  preds_list.extend(pred)
                  accuracy_list.append(acc)
                  labelsset.extend(labels.detach().cpu().numpy())
      
      mean_accuracy = accuracy_sum / len(validation_dataloader)
      print("  Accuracy: {0:.2f}".format(mean_accuracy))
      
      # Check if the current model is the best one and save it.
      if mean_accuracy > best_validation_accuracy:
            print("  Best model found! Saving it.")
            # Save the best model in the specified location.
            model.save_pretrained('../model/finbert-fls')
            best_validation_accuracy = mean_accuracy



epoch:  1
Training...


  0%|          | 0/14591 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 14591/14591 [3:38:40<00:00,  1.11it/s]  



summary results
epoch | train loss | train time
    1 | 0.03720 |  13120.943731069565

Running Validation...


100%|██████████| 920/920 [04:29<00:00,  3.42it/s]


  Accuracy: 0.99
  Best model found! Saving it.
epoch:  2
Training...


100%|██████████| 14591/14591 [3:31:28<00:00,  1.15it/s]  



summary results
epoch | train loss | train time
    2 | 0.21748 |  12688.572664737701

Running Validation...


100%|██████████| 920/920 [04:27<00:00,  3.44it/s]


  Accuracy: 0.12
epoch:  3
Training...


100%|██████████| 14591/14591 [3:31:16<00:00,  1.15it/s]  



summary results
epoch | train loss | train time
    3 | 0.69448 |  12676.38683795929

Running Validation...


100%|██████████| 920/920 [04:27<00:00,  3.44it/s]

  Accuracy: 0.88





In [12]:
# Save model
#model.save_pretrained('../model/finbert-fls')