<a href="https://colab.research.google.com/github/nishitjain97/NLP_538_Fall_2022_Project_HaND/blob/main/GPT_Predictions_and_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook for BERT predictions and evaluation

## Package Loading

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install tensorflow
!pip install transformers
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 23.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 71.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 36.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import random
import torch
import tensorflow as tf

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import multilabel_confusion_matrix, f1_score, precision_score, recall_score

from torch.utils.data import Dataset, DataLoader
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm

## Utilities

In [4]:
def encode_sentiment(sentiment):
  if sentiment == 'Negative':
    return 0
  elif sentiment == 'Neutral':
    return 1
  else:
    return 2

In [5]:
def process_data(data, tokenizer):
  input_ids = {}
  attention_masks = {}
  labels = {}
  
  for key in data.keys():
    print(key)
    input_ids[key] = []
    attention_masks[key] = []
    
    for sentence in data[key]['MASKED_DOCUMENT']:
      encoded_dict = tokenizer.encode_plus(
                          sentence,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = MAX_LENGTH,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                          truncation=True
                    )
      input_ids[key].append(encoded_dict['input_ids'])
      attention_masks[key].append(encoded_dict['attention_mask'])

    input_ids[key] = torch.cat(input_ids[key], dim=0)
    attention_masks[key] = torch.cat(attention_masks[key], dim=0)

  for key in data.keys():
    labels[key] = data[key]['TRUE_SENTIMENT'].apply(lambda x: encode_sentiment(x))

  dataset = {}
  
  for key in data.keys():
    dataset[key] = TensorDataset(input_ids[key], attention_masks[key], torch.tensor(labels[key]))

  return dataset

In [6]:
def generate_dataloaders(config, dataset):
  dataloaders = {}
  
  for key in dataset.keys():
    dataloaders[key] = DataLoader(
        dataset[key],
        sampler = RandomSampler(dataset[key]),
        batch_size = config[key]
    )

  return dataloaders

In [7]:
# Function to calculate the accuracy of our predictions vs labels
def get_accuracy(preds, labels):
  return np.sum(preds == labels) / len(labels)

def multiclass_confusion_matrix(preds, labels):
  return multilabel_confusion_matrix(labels, preds)

In [32]:
def make_predictions(model, dataloader):
  model.eval()

  # Tracking variables 
  predictions , true_labels = [], []

  # Predict 
  for batch in tqdm(dataloader, total=len(dataloader)):
    # add original labels
    true_labels += batch['labels'].numpy().flatten().tolist()

    # move batch to device
    batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
    
    # Unpack the inputs from our dataloader
    # b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():        

        outputs = model(**batch)

        logits = outputs.logits
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        
        # get predicitons to list
        predict_content = logits.argmax(axis=-1).flatten().tolist()

        # update list
        predictions += predict_content

    
  return predictions, true_labels

def get_precision(cm):
  true_pos = np.diag(cm) 
  return np.sum(true_pos / np.sum(cm, axis=0))

def get_recall(cm):
  true_pos = np.diag(cm)
  return np.sum(true_pos / np.sum(cm, axis=1))

def evaluate_predictions(predictions, true_labels):
  predictions = np.argmax(predictions, axis=1).flatten()
  true_labels = true_labels.flatten()
  
  acc = get_accuracy(predictions, true_labels)
  p = precision_score(true_labels, predictions, average=None)
  r = recall_score(true_labels, predictions, average=None)
  f1 = f1_score(true_labels, predictions, average=None)
  macro_f1 = f1_score(true_labels, predictions, average='macro')
  
  return acc, p, r, f1, macro_f1

## Configurations

In [None]:
MAX_LENGTH = 512

batch_sizes = {
    'train': 8,
    'dev': 4,
    'random_test': 4,
    'fixed_test': 4
}

In [9]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [10]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Load Data

In [39]:
data_path = "/content/gdrive/MyDrive/augmented_data/"
data = {}

for item in ['fixed_test', 'train_deepwd_aug', 'train_fooler_aug', 'train_pwws_aug']:
  data[item] = pd.read_csv(os.path.join(data_path, item + '.csv'))

## Model Loading

In [None]:
tokenizer = BertTokenizer.from_pretrained("/content/gdrive/MyDrive/NLP_Project/model_save")
model = BertForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/NLP_Project/model_save")

if device.type != 'cpu':
  model.cuda()

In [None]:
dataset = process_data(data, tokenizer)

random_test
fixed_test


In [None]:
dataloaders = generate_dataloaders(batch_sizes, dataset)

In [None]:
for key, dataloader in dataloaders.items():
  predictions, true_labels = make_predictions(model, dataloader)
  predictions = np.concatenate(predictions)
  true_labels = np.concatenate(true_labels)
  acc, p, r, f1, macro_f1 = evaluate_predictions(predictions, true_labels)

  _warn_prf(average, modifier, msg_start, len(result))


## GPT2

In [12]:
# Set seed for reproducibility.
set_seed(123)

# Number of training epochs (authors on fine-tuning Bert recommend between 2 and 4).
epochs = 4

# Look for gpu to use. Will use `cpu` by default if no gpu found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Name of transformers model - will use already pretrained model.
# Path of transformer model - will load your own model from local disk.
model_name_or_path = 'gpt2'

# Dictionary of labels and their id - this will be used to convert.
# String labels to number ids.
labels_ids = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

# How many labels are we using in training.
# This is used to decide size of classification head.
n_labels = len(labels_ids)

In [14]:
class CustomDataset(Dataset):
  r"""PyTorch Dataset class for loading data.

  This is where the data parsing happens.

  This class is built with reusability in mind: it can be used as is as.

  Arguments:

    path (:obj:`str`):
        Path to the data partition.

  """

  def __init__(self, data, use_tokenizer):
    self.texts = data['MASKED_DOCUMENT']
    self.labels = data['TRUE_SENTIMENT']

    # Number of exmaples.
    self.n_examples = len(self.labels)
    return

  def __len__(self):
    r"""When used `len` return the number of examples.

    """
    
    return self.n_examples

  def __getitem__(self, item):
    r"""Given an index return an example from the position.
    
    Arguments:

      item (:obj:`int`):
          Index position to pick an example to return.

    Returns:
      :obj:`Dict[str, str]`: Dictionary of inputs that contain text and 
      asociated labels.

    """

    return {'text':self.texts[item],
            'label':self.labels[item]}
     

In [15]:
class Gpt2ClassificationCollator(object):
    r"""
    Data Collator used for GPT2 in a classificaiton rask. 
    
    It uses a given tokenizer and label encoder to convert any text and labels to numbers that 
    can go straight into a GPT2 model.

    This class is built with reusability in mind: it can be used as is as long
    as the `dataloader` outputs a batch in dictionary format that can be passed 
    straight into the model - `model(**batch)`.

    Arguments:

      use_tokenizer (:obj:`transformers.tokenization_?`):
          Transformer type tokenizer used to process raw text into numbers.

      labels_ids (:obj:`dict`):
          Dictionary to encode any labels names into numbers. Keys map to 
          labels names and Values map to number associated to those labels.

      max_sequence_len (:obj:`int`, `optional`)
          Value to indicate the maximum desired sequence to truncate or pad text
          sequences. If no value is passed it will used maximum sequence size
          supported by the tokenizer and model.

    """

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):

        # Tokenizer to be used inside the class.
        self.use_tokenizer = use_tokenizer
        # Check max sequence length.
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        # Label encoder used inside the class.
        self.labels_encoder = labels_encoder

        return

    def __call__(self, sequences):
        r"""
        This function allowes the class objesct to be used as a function call.
        Sine the PyTorch DataLoader needs a collator function, I can use this 
        class as a function.

        Arguments:

          item (:obj:`list`):
              List of texts and labels.

        Returns:
          :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
          It holddes the statement `model(**Returned Dictionary)`.
        """

        # Get all texts from sequences list.
        texts = [sequence['text'] for sequence in sequences]
        # Get all labels from sequences list.
        labels = [sequence['label'] for sequence in sequences]
        # Encode all labels using label encoder.
        labels = [self.labels_encoder[label] for label in labels]
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        # Update the inputs with the associated encoded labels as tensor.
        inputs.update({'labels':torch.tensor(labels)})

        return inputs
     

In [16]:
# Get model configuration.
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path='/content/gdrive/MyDrive/gpt_save/config.json', num_labels=n_labels)

Loading configuraiton...


In [17]:
# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='/content/gdrive/MyDrive/gpt_save')
# default to left padding
tokenizer.padding_side = "right"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

Loading tokenizer...


In [18]:
# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path='/content/gdrive/MyDrive/gpt_save/pytorch_model.bin', config='/content/gdrive/MyDrive/gpt_save/config.json')

Loading model...


In [19]:
# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))
# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id


# Load model to defined device.
model.to(device)
print('Model loaded to `%s`'%device)

Model loaded to `cuda`


In [20]:
MAX_LENGTH = 512

In [21]:
# Create data collator to encode text and labels into numbers.
gpt2_classification_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer, 
                                                          labels_encoder=labels_ids, 
                                                          max_sequence_len=MAX_LENGTH)

In [40]:
batch_sizes = {
    'fixed_test': 4,
    'train_deepwd_aug': 4,
    'train_fooler_aug': 4,
    'train_pwws_aug': 4
}

In [41]:
dataset = {}
dataloaders = {}

for key in data.keys():
  dataset[key] = CustomDataset(data[key], use_tokenizer=tokenizer)
  dataloaders[key] = DataLoader(dataset[key], batch_size=batch_sizes[key], shuffle=True, collate_fn=gpt2_classification_collator)

In [42]:
for key, dataloader in dataloaders.items():
  predictions, true_labels = make_predictions(model, dataloader)
  acc = get_accuracy(predictions, true_labels)
  p = precision_score(true_labels, predictions, average=None)
  r = recall_score(true_labels, predictions, average=None)
  f1 = f1_score(true_labels, predictions, average=None)
  macro_f1 = f1_score(true_labels, predictions, average='macro')
  print(acc, p, r, f1, macro_f1)

  0%|          | 0/207 [00:00<?, ?it/s]

0.0 [0.52173913 0.46017699 0.55965909] [0.08633094 0.65       0.53532609] [0.14814815 0.5388601  0.54722222] 0.41141015799910446


  0%|          | 0/1678 [00:00<?, ?it/s]

0.0 [0.62711864 0.6240049  0.83603491] [0.21082621 0.81781701 0.76279863] [0.31556503 0.70788468 0.79773944] 0.6070630516577659


  0%|          | 0/925 [00:00<?, ?it/s]

0.0 [0.64137931 0.625      0.82858803] [0.22090261 0.82078853 0.75770457] [0.32862191 0.70963743 0.79156259] 0.6099406430032838


  0%|          | 0/1662 [00:00<?, ?it/s]

0.0 [0.63596491 0.62650976 0.8367283 ] [0.21137026 0.81803478 0.76526225] [0.31728665 0.70957559 0.7994012 ] 0.6087544790655728


In [36]:
acc = get_accuracy(predictions, true_labels)
p = precision_score(true_labels, predictions, average=None)
r = recall_score(true_labels, predictions, average=None)
f1 = f1_score(true_labels, predictions, average=None)
macro_f1 = f1_score(true_labels, predictions, average='macro')

In [37]:
print(acc, p, r, f1, macro_f1)

0.0 [0.62711864 0.6240049  0.83603491] [0.21082621 0.81781701 0.76279863] [0.31556503 0.70788468 0.79773944] 0.6070630516577659
