<a href="https://colab.research.google.com/github/mcsantiago/BERT-COVID-StanceDetection/blob/main/BERT_Stance_Detection_DEMO_VIEW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stance BERTection :D

## TODO:
- Regenerate the 0.4 drop model graph
- Figure out how to load weights
- Demo only interface
- Figure out how to incorporate other embeddings from other models

## Download the data

In [None]:
# Mount Google Drive (Colab only); later cells read the vocabulary and the
# train/test folders through the relative "drive/MyDrive/..." paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re


In [None]:
# Fetch the NLTK resources used below by stopwords.words('english') and WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Module-level resources shared by clean_text(): the English stop-word set and a
# WordNet lemmatizer (clean_text() applies it once with the default POS and once as verb 'v')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Function to clean text using RegEx operations, removal of stopwords, and lemmatization.
    @param (str) text: raw text to be cleaned
    @return (str) the lower-cased text without punctuation and stopwords, with every remaining
            token lemmatized (default POS first, then as a verb) and whitespace runs collapsed
    """
    # NOTE: flags must be passed by keyword; the 4th positional argument of re.sub() is `count`,
    # so passing re.UNICODE positionally silently limited the removal to the first 32 matches
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    text = text.lower()
    tokens = [lemmatizer.lemmatize(token) for token in text.split(' ')]
    tokens = [lemmatizer.lemmatize(token, 'v') for token in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens).strip()
    # Removed tokens can leave consecutive spaces behind
    text = re.sub(' +', ' ', text)
    return text

clean_text('the cat is trained for the #lemmatization of idk.')

'cat train lemmatization idk'

In [None]:
import logging

# Silence everything below CRITICAL from the 'pytorch_transformers' logger.
# NOTE: Run once without the line below to check that nothing else is wrong; the message targeted
# here is "Token indices sequence length is longer than the specified maximum sequence length",
# which is harmless because tokenize_and_encode() truncates/pads every sequence to a fixed length
logging.getLogger('pytorch_transformers').setLevel(logging.CRITICAL)

In [None]:
!pip install pytorch_transformers # We will need this for our model



In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.autograd import Variable
from pytorch_transformers import BertConfig, BertTokenizer, BertModel

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("DEVICE FOUND: %s" % DEVICE)

DEVICE FOUND: cuda


In [None]:
# Set seeds for reproducibility
# NOTE(review): only PyTorch's RNG is seeded here; seed `random`/numpy as well if they end up
# being used for anything stochastic
SEED = 42
torch.manual_seed(seed=SEED)
# Ask cuDNN to pick deterministic algorithms
torch.backends.cudnn.deterministic = True

## Hyperparameters

In [None]:
# Define hyperparameters
# Training schedule (NUM_EPOCHS is presumably consumed by a training loop outside this view)
NUM_EPOCHS = 20
BATCH_SIZE = 64  # batch size of both data loaders

# Model architecture -> forwarded to FineTunedBert(...)
PRETRAINED_MODEL_NAME = 'bert-base-cased'
NUM_PRETRAINED_BERT_LAYERS = 1 # number of BERT encoder layers kept (previously 2)
MAX_TOKENIZATION_LENGTH = 512  # fixed length every example is truncated/padded to
NUM_CLASSES = 4 # agree, disagree, no_stance, not_relevant
TOP_DOWN = True  # copy pretrained weights starting from the first encoder layer
NUM_RECURRENT_LAYERS = 1 # number of LSTM layers on top of BERT (previously 2)
HIDDEN_SIZE = 256  # LSTM hidden units per direction
REINITIALIZE_POOLER_PARAMETERS = True
USE_BIDIRECTIONAL = True
DROPOUT_RATE = 0.6
AGGREGATE_ON_CLS_TOKEN = True
CONCATENATE_HIDDEN_STATES = True

# Data preparation -> read by FineTunedBert.predict(); NUM_WORKERS is not used in the visible cells
APPLY_CLEANING = True
TRUNCATION_METHOD = 'head-only'
NUM_WORKERS = 0

# Optimizer settings (two learning rates: one for BERT parameters, one for the custom layers)
BERT_LEARNING_RATE = 1e-5
CUSTOM_LEARNING_RATE = 1e-4
BETAS = (0.9, 0.999)
BERT_WEIGHT_DECAY = 0.01
EPS = 1e-8

# Regularization


## FineTunedBert

In [None]:
class FineTunedBert(nn.Module):
    """
    Finetuning model that utilizes BERT tokenizer, pretrained BERT embedding, pretrained BERT
    encoders, an optional recurrent neural network choice of LSTM, dropout, and finally a dense
    layer for classification.
    @param (str) pretrained_model_name: name of the pretrained BERT model for tokenizing input
           sequences, extracting vector representations for each token, [...]
    @param (int) num_pretrained_bert_layers: number of BERT Encoder layers to be utilized
    @param (int) max_tokenization_length: maximum number of positional embeddings, or the sequence
           length of an example that will be fed to BERT model
    @param (str) vocab_file: path stored on the tokenizer as its `vocab_file` attribute
           (default: '')
    @param (int) num_classes: number of classes to distinct between for classification; specify
           2 for binary classification (default: 1)
    @param (bool) top_down: whether to assign parameters (weights and biases) in order or
           backwards (default: True)
    @param (int) num_recurrent_layers: number of LSTM layers to utilize; 0 disables the LSTM
           (default: 1)
    @param (bool) use_bidirectional: whether to use a bidirectional LSTM or not (default: False)
    @param (int) hidden_size: number of recurrent units in each LSTM cell (default: 128)
    @param (bool) reinitialize_pooler_parameters: whether to use the pretrained pooler parameters
           or initialize weights as ones and biases zeros and train for scratch (default: False)
    @param (float) dropout_rate: possibility of each neuron to be discarded (default: 0.10)
    @param (bool) aggregate_on_cls_token: whether to pool on only the hidden states of the [CLS]
           token for classification or on the hidden states of all (512) tokens (default: True)
    @param (bool) concatenate_hidden_states: whether to concatenate all the available hidden states
           outputted by the embedding and encoder layers (K+1) or only use the latest hidden state
           (default: False)
    @param (bool) use_gpu: whether to utilize GPU (CUDA) or not; stored for reference only, the
           forward pass follows the device of its input tensors (default: False)
    """
    def __init__(self, pretrained_model_name, num_pretrained_bert_layers, max_tokenization_length, vocab_file = '',
                 num_classes=1, top_down=True, num_recurrent_layers=1, use_bidirectional=False,
                 hidden_size=128, reinitialize_pooler_parameters=False, dropout_rate=0.10,
                 aggregate_on_cls_token=True, concatenate_hidden_states=False, use_gpu=False):
        super(FineTunedBert, self).__init__()
        self.num_recurrent_layers = num_recurrent_layers
        self.use_bidirectional = use_bidirectional
        self.hidden_size = hidden_size
        self.aggregate_on_cls_token = aggregate_on_cls_token
        self.concatenate_hidden_states = concatenate_hidden_states
        self.use_gpu = use_gpu

        # Configure tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.tokenizer.max_len = max_tokenization_length
        self.tokenizer.vocab_file = vocab_file

        # Get global BERT config
        self.config = BertConfig.from_pretrained(pretrained_model_name)
        # Depth of the full pretrained encoder (12 for BERT-base); read before the config is
        # customized below since it is needed to align layers in the bottom-up approach
        num_total_bert_layers = self.config.num_hidden_layers
        # Extract all parameters (weights and bias matrices) of the full pretrained model
        all_states_dict = BertModel.from_pretrained(pretrained_model_name,
                                                    config=self.config).state_dict()

        # Get customized BERT config
        self.config.max_position_embeddings = max_tokenization_length
        self.config.num_hidden_layers = num_pretrained_bert_layers
        self.config.output_hidden_states = True
        self.config.output_attentions = True

        # Get pretrained BERT model & all its learnable parameters
        self.bert = BertModel.from_pretrained(pretrained_model_name,
                                              config=self.config)
        current_states_dict = self.bert.state_dict()

        # Assign matching parameters (weights and biases of all kinds of layers)
        # i)  Top-Down Approach: 1st layer takes weights of 1st pretrained BERT layer
        if top_down:
            for param in current_states_dict.keys():
                if 'pooler' not in param or not reinitialize_pooler_parameters:
                    current_states_dict[param] = all_states_dict[param]
                else:
                    current_states_dict[param] = self._reinitialized_pooler_parameter(
                        param, current_states_dict[param]
                    )

        # ii) Bottom-Up Approach: 1st layer takes weights of the last pretrained BERT layers
        else:
            align = 5 + ((num_total_bert_layers - num_pretrained_bert_layers) * 16)
            # Materialize the shifted view once instead of once per parameter
            shifted_states = list(all_states_dict.values())[align:]
            for index, param in enumerate(current_states_dict.keys()):
                # There are 5 initial (shared) parameters from embeddings in each BERT model
                if index < 5 and 'embeddings' in param:
                    current_states_dict[param] = all_states_dict[param]
                # There are 16 parameters for each of the K pretrained BERT layers (16 x K params)
                elif index >= 5 and 'pooler' not in param:
                    current_states_dict[param] = shifted_states[index-5]
                # There are 2 parameters for the pooling layer at the end in each BERT model
                else:
                    if not reinitialize_pooler_parameters:
                        current_states_dict[param] = all_states_dict[param]
                    else:
                        current_states_dict[param] = self._reinitialized_pooler_parameter(
                            param, current_states_dict[param]
                        )

        del all_states_dict
        # Update parameters in extracted BERT model
        self.bert.load_state_dict(current_states_dict)

        logging.info('Loaded %d learnable parameters from pretrained BERT model with %d layer(s)' %
                     (len(list(self.bert.parameters())), num_pretrained_bert_layers))

        # Number of input hidden dimensions from the final BERT layer, as input to other layers
        if concatenate_hidden_states:
            input_hidden_dimension = (num_pretrained_bert_layers + 1) * self.config.hidden_size
        else:
            input_hidden_dimension = self.config.hidden_size

        # Define additional layers & utilities specific to the finetuned task
        # Flatten tensors to (B, P*(H or H')) -> converts tensors to 2D for classification
        self.flatten_sequence_length = lambda t: t.view(-1,
                                                        self.config.max_position_embeddings *
                                                        input_hidden_dimension)

        # Dropout to prevent overfitting
        self.dropout = nn.Dropout(p=dropout_rate)
        if self.num_recurrent_layers > 0:
            # Recurrent Layer
            self.lstm = nn.LSTM(input_size=input_hidden_dimension,
                                hidden_size=hidden_size,
                                num_layers=num_recurrent_layers,
                                bidirectional=use_bidirectional,
                                batch_first=True)
            # Dense Layer for Classification
            self.clf = nn.Linear(in_features=hidden_size*2 if use_bidirectional else hidden_size,
                                 out_features=num_classes)
        else:
            # Dense Layer for Classification
            if aggregate_on_cls_token:
                self.clf = nn.Linear(in_features=input_hidden_dimension,
                                     out_features=num_classes)
            else:
                self.clf = nn.Linear(in_features=max_tokenization_length * input_hidden_dimension,
                                     out_features=num_classes)

    def _reinitialized_pooler_parameter(self, param_name, current_value):
        """
        Function to build the re-initialized value of a pooler parameter: ones of shape (H, H)
        for weights, zeros of shape (H) for biases; any other parameter keeps @current_value.
        """
        if 'weight' in param_name:
            return torch.ones(self.config.hidden_size, self.config.hidden_size)
        if 'bias' in param_name:
            return torch.zeros(self.config.hidden_size)
        return current_value

    def get_tokenizer(self):
        """Function to easily access the BERT tokenizer"""
        return self.tokenizer

    def get_bert_attention(self, raw_sentence, header, device):
        """
        Function for getting the multi-head self-attention output from pretrained BERT
        @param (str) raw_sentence: text to be encoded after the header
        @param (str) header: target statement placed before the text
        @param (torch.device) device: where to store the input tensors
        @return the 4th output of the BERT model (index 3), i.e. the attentions
        """
        # Tokenize & encode raw sentence
        x = tokenize_and_encode(text=raw_sentence,                             # (P)
                                header=header,
                                tokenizer=self.get_tokenizer(),
                                max_tokenization_length=self.config.max_position_embeddings,
                                truncation_method='head-only')
        # Convert the tokenized list to a Tensor
        x = torch.tensor(data=x, device=device)
        # Reshape input for BERT output
        x = x.unsqueeze(dim=1).view(1, -1)                                     # (B=1, P)

        # Get features
        token_type_ids, attention_mask = get_features(input_ids=x,
                                                      tokenizer=self.get_tokenizer(),
                                                      device=device)
        # Pass tokenized sequence through pretrained BERT model
        bert_outputs = self.bert(input_ids=x,                                  # (...) SEE forward()
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask,
                                 position_ids=None,
                                 head_mask=None)
        attention_outputs = bert_outputs[3]                                    # ([K] x (1, N, P, P))
        return attention_outputs

    def predict(self, text, header_id):
        """
        Function to run a single (text, target) pair through the model.
        Relies on the notebook globals `targets`, `target_idx`, APPLY_CLEANING,
        MAX_TOKENIZATION_LENGTH, TRUNCATION_METHOD and DEVICE.
        @param (str) text: raw text to classify
        @param (int) header_id: an entry of `target_idx`; selects the matching `targets` statement
               (raises ValueError when the id is unknown)
        @return (Tensor) logits as returned by forward() for a batch of one example
        """
        # get input_ids (tokenize_and_encode)
        header = targets[target_idx.index(header_id)]
        stance_input_ids = tokenize_and_encode(text,
                                               self.get_tokenizer(),
                                               header,
                                               apply_cleaning=APPLY_CLEANING,
                                               max_tokenization_length=MAX_TOKENIZATION_LENGTH,
                                               truncation_method=TRUNCATION_METHOD)
        # Batch size of 1; built directly with torch so that no numpy import is required
        stance_input_ids = torch.tensor(stance_input_ids,                      # (B=1, P)
                                        dtype=torch.long,
                                        device=DEVICE).view(1, -1)
        # get attention_mask / token_id (get_features)
        stance_token_id, stance_attention_mask = get_features(stance_input_ids,
                                                              tokenizer=self.get_tokenizer(),
                                                              device=DEVICE)
        # forward
        return self.forward(input_ids=stance_input_ids,
                            token_type_ids=stance_token_id,
                            attention_mask=stance_attention_mask)

    def _get_last_timesteps(self, attention_mask, sequence_output):
        """
        Function to find, for each example, the position whose LSTM output gets pooled: the index
        of the first 0 in its attention mask, or the final position when there is no padding (or
        no mask at all).
        @return (Tensor) positions with shape (B), on the device of @sequence_output
        """
        batch_size, sequence_length = sequence_output.shape[0], sequence_output.shape[1]
        final_position = sequence_length - 1
        if attention_mask is None:
            positions = [final_position] * batch_size
        else:
            # NOTE(review): this is the first padding position, i.e. one step past the last real
            # token; kept as is since changing it would alter what trained weights expect
            positions = [mask.index(0) if 0 in mask else final_position
                         for mask in attention_mask.tolist()]
        return torch.tensor(positions, dtype=torch.long, device=sequence_output.device)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,     # input_ids: (B, P)
                position_ids=None, head_mask=None):
        """
        Function implementing a forward pass of the model
        @param (Tensor) input_ids: encoded token indexes with shape (B, P)
        @param (Tensor) token_type_ids: segment ids, forwarded to BERT (default: None)
        @param (Tensor) attention_mask: 1 for real tokens and 0 for padding, with shape (B, P);
               None is treated as "no padding" by the recurrent pooling (default: None)
        @param position_ids: forwarded to BERT (default: None)
        @param head_mask: forwarded to BERT (default: None)
        @return (Tensor) logits with shape (B, num_classes); when recurrent layers are used and
                B == 1 the batch dimension is squeezed away, giving shape (num_classes)
        """
        # Pass tokenized sequence through pretrained BERT model
        bert_outputs = self.bert(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask,
                                 position_ids=position_ids,
                                 head_mask=head_mask)
        sequence_output = bert_outputs[0]                                      # (B, P, H)
        pooled_output = bert_outputs[1]                                        # (B, H)
        hidden_outputs = bert_outputs[2]                                       # ([K+1] x (B, P, H))

        if self.concatenate_hidden_states:
            sequence_output = torch.cat(hidden_outputs, dim=-1)                # (B, P, H' = (K+1) x H)

        if self.num_recurrent_layers > 0:
            # Initial hidden & cell states default to zeros (on the input's device) when omitted
            sequence_output, _ = self.lstm(sequence_output)                    # (B, P, H*)

            # Get last timesteps for each example in the batch; we do this to counteract padding
            last_timesteps = self._get_last_timesteps(attention_mask,          # (B)
                                                      sequence_output)
            relative_hidden_size = self.hidden_size*2 if self.use_bidirectional else self.hidden_size
            # Every example must gather its OWN timestep across all hidden units; repeat() + view()
            # would tile the whole batch vector and mix timesteps of different examples
            gather_index = last_timesteps.view(-1, 1, 1).expand(-1, 1,         # (B, 1, H*)
                                                                relative_hidden_size)
            # NOTE: squeeze() without a dim also drops the batch axis when B == 1 (see @return)
            pooled_sequence_output = sequence_output.gather(                   # (B, H*)
                dim=1,
                index=gather_index
            ).squeeze()

            pooled_sequence_output = self.dropout(pooled_sequence_output)      # (B, H*)
            logits = self.clf(pooled_sequence_output)                          # (B, num_classes)
        else:
            if not self.aggregate_on_cls_token:
                pooled_output = self.flatten_sequence_length(sequence_output)  # (B, P x H)

            pooled_output = self.dropout(pooled_output)                        # (B, P x H OR H)
            logits = self.clf(pooled_output)                                   # (B, num_classes)

        return logits                                                          # (B, num_classes)

    


In [None]:
# Initialize to-be-finetuned Bert model from the hyperparameter constants defined above
# NOTE: vocab_file is a path relative to the working directory, so this relies on the notebook
# running from the directory that contains the mounted "drive" folder
model = FineTunedBert(pretrained_model_name=PRETRAINED_MODEL_NAME,
                      vocab_file="drive/MyDrive/CS 6320 Project/data/vocabulary.txt",
                      num_pretrained_bert_layers=NUM_PRETRAINED_BERT_LAYERS,
                      max_tokenization_length=MAX_TOKENIZATION_LENGTH,
                      num_classes=NUM_CLASSES,
                      top_down=TOP_DOWN,
                      num_recurrent_layers=NUM_RECURRENT_LAYERS,
                      use_bidirectional=USE_BIDIRECTIONAL,
                      hidden_size=HIDDEN_SIZE,
                      reinitialize_pooler_parameters=REINITIALIZE_POOLER_PARAMETERS,
                      dropout_rate=DROPOUT_RATE,
                      aggregate_on_cls_token=AGGREGATE_ON_CLS_TOKEN,
                      concatenate_hidden_states=CONCATENATE_HIDDEN_STATES,
                      use_gpu=True if torch.cuda.is_available() else False)

## tokenize_and_encode

In [None]:
# Target statements used as the "header" segment of every encoded example.
# `targets` and `target_idx` are parallel lists: target_idx[i] is the numeric id of targets[i];
# the id is looked up from the numeric prefix of a data file name (TweetDataset) or from the
# header_id argument (FineTunedBert.predict). Note that the ids are not contiguous (5 and 6 are absent).
targets = ["RNA alters a person's DNA when taking the COVID-19 vaccine.",
            "The COVID-19 vaccine causes infertility or miscarriages in women.",
            "Natural COVID-19 immunity is better than immunity derived from a COVID-19 vaccine.",
            "The COVID-19 vaccine causes Bell's palsy.",
            "The COVID-19 vaccine contains tissue from aborted fetuses.",
            "The COVID-19 vaccine was developed to control the general population either through microchip tracking or nanotransducers in our brains.",
            "More people will die as a result of a negative side effect to the COVID-19 vaccine than would actually die from the coronavirus.",
            "There are severe side effects of the coronavirus vaccines, worse than having the virus."]
target_idx = [1,2,3,4,7,8,9,10]

In [None]:
import os
import re
from torch.utils.data import Dataset
from tqdm import trange

def tokenize_and_encode(text, tokenizer, header='', apply_cleaning=False, max_tokenization_length=512,
                        truncation_method='head-only', split_head_density=0.5):
    """
    Function to tokenize & encode a given text together with its header (target statement) as
    [CLS] header [SEP] text [SEP], truncated and padded to exactly @max_tokenization_length ids.
    @param (str) text: a sequence of words to be tokenized in raw string format
    @param (pytorch_transformers.BertTokenizer) tokenizer: tokenizer with pre-figured mappings
    @param (str) header: statement placed before the text as the first segment; it is only cut
           when it alone would not fit next to the 3 special tokens (default: '')
    @param (bool) apply_cleaning: whether or not to perform common cleaning operations on texts;
           note that enabling only makes sense if language of the task is English (default: False)
    @param (int) max_tokenization_length: maximum number of positional embeddings, or the sequence
           length of an example that will be fed to BERT model (default: 512)
    @param (str) truncation_method: method that will be applied in case the text exceeds
           the available length; currently implemented methods include 'head-only', 'tail-only',
           and 'head+tail' (default: 'head-only')
    @param (float) split_head_density: weight on head when splitting between head and tail, only
           applicable if @truncation_method='head+tail' (default: 0.5)
    @return (list) input_ids: the encoded integer indexes of the given text; note that
            get_data_iterators() function converts this to a Tensor under the hood
    @raise (ValueError) if the text has to be truncated and @truncation_method is unknown
    """
    if apply_cleaning:
        text = clean_text(text=text)

    # Tokenize and encode
    input_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    header_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(header))

    # Subtract 3 (one [CLS] and two [SEP] tokens) and the header to get the length left for text;
    # an oversized header is cut so that the budget can never become negative
    num_special_tokens = 3
    header_ids = header_ids[:max(0, max_tokenization_length - num_special_tokens)]
    text_tokenization_length = max_tokenization_length - num_special_tokens - len(header_ids)

    # Truncate sequences with the specified approach
    # NOTE: tails are sliced with explicit start offsets since `ids[-0:]` returns the whole list
    if len(input_ids) > text_tokenization_length:
        # i)   Head-Only Approach: Keep the first N tokens
        if truncation_method == 'head-only':
            input_ids = input_ids[:text_tokenization_length]
        # ii)  Tail-Only Approach: Keep the last N tokens
        elif truncation_method == 'tail-only':
            input_ids = input_ids[len(input_ids) - text_tokenization_length:]
        # iii) Head+Tail Approach: Keep the first F tokens and last L tokens where F + L = N
        elif truncation_method == 'head+tail':
            head_tokenization_length = int(text_tokenization_length * split_head_density)
            head_tokenization_length = min(max(head_tokenization_length, 0),
                                           text_tokenization_length)
            tail_tokenization_length = text_tokenization_length - head_tokenization_length
            input_head_ids = input_ids[:head_tokenization_length]
            input_tail_ids = input_ids[len(input_ids) - tail_tokenization_length:]
            input_ids = input_head_ids + input_tail_ids
        else:
            raise ValueError("Unknown truncation_method '{}'; expected 'head-only', 'tail-only' "
                             "or 'head+tail'".format(truncation_method))

    # Plug in CLS & SEP special tokens for identification of start & end points of sequences
    cls_id = tokenizer.convert_tokens_to_ids('[CLS]')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    input_ids = [cls_id] + header_ids + [sep_id] + input_ids + [sep_id]

    # Pad sequences up to the fixed length
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')
    if len(input_ids) < max_tokenization_length:
        padding_length = max_tokenization_length - len(input_ids)
        input_ids = input_ids + ([pad_id] * padding_length)

    return input_ids


def get_features(input_ids, tokenizer, device):
    """
    Function to derive the auxiliary BERT inputs that accompany a batch of encoded sequences.
    @param (Tensor) input_ids: the encoded integer indexes of a batch, with shape: (B, P)
    @param (pytorch_transformers.BertTokenizer) tokenizer: tokenizer with pre-figured mappings;
           only used to look up the id of the '[PAD]' token
    @param (torch.device) device: 'cpu' or 'gpu', decides where to store the outputted tensors
    @return (Tensor, Tensor) token_type_ids, attention_mask: both with shape (B, P);
            token_type_ids is all 0s, attention_mask holds a 1 for each of the leading
            non-padding positions of an example and a 0 for the remaining ones
    """
    pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')

    # One row per example
    rows = input_ids.reshape(input_ids.shape[0], -1)                           # (B, P)
    sequence_length = rows.shape[1]

    # Number of non-padding ids in every example
    real_token_counts = sequence_length - (rows == pad_token_id).sum(dim=1)    # (B)

    # Input mask -> 1 for the first `count` positions, 0 for the rest
    positions = torch.arange(sequence_length, device=rows.device)              # (P)
    attention_mask = (positions.unsqueeze(0) < real_token_counts.unsqueeze(1)).long()

    # Segment IDs -> all 0s
    token_type_ids = torch.zeros_like(attention_mask)

    return token_type_ids.to(device), attention_mask.to(device)

## TweetDataset

In [None]:

class TweetDataset(Dataset):
    """
    Tweet Dataset for easily iterating over and performing common operations.
    Examples are read from the 'agree', 'disagree', 'no_stance' and 'not_relevant' sub-folders of
    the input directory (labels 0, 1, 2 and 3 respectively) and are indexed in that order, with
    the files of each folder sorted by name so that the ordering is reproducible.
    @param (str) input_directory: path of directory where the desired data exists
    @param (pytorch_transformers.BertTokenizer) tokenizer: tokenizer with pre-figured mappings
    @param (bool) apply_cleaning: whether or not to perform common cleaning operations on texts;
           note that enabling only makes sense if language of the task is English
    @param (int) max_tokenization_length: maximum number of positional embeddings, or the sequence
           length of an example that will be fed to BERT model
    @param (str) truncation_method: method that will be applied in case the text exceeds
           @max_tokenization_length; currently implemented methods include 'head-only', 'tail-only',
           and 'head+tail' (default: 'head-only')
    @param (float) split_head_density: weight on head when splitting between head and tail, only
           applicable if @truncation_method='head+tail' (default: 0.5)
    @param (torch.device) device: 'cpu' or 'gpu', decides where to store the data tensors
    """
    def __init__(self, input_directory, tokenizer, apply_cleaning, max_tokenization_length,
                 truncation_method='head-only', split_head_density=0.5, device='cpu'):
        super(TweetDataset, self).__init__()
        # agree folder
        self.agree_path, self.agree_files = self._list_class_files(input_directory, 'agree')
        self.num_agree_examples = len(self.agree_files)
        self.agree_label = 0
        # disagree folder
        self.disagree_path, self.disagree_files = self._list_class_files(input_directory,
                                                                         'disagree')
        self.num_disagree_examples = len(self.disagree_files)
        self.disagree_label = 1
        # nostance folder
        self.nostance_path, self.nostance_files = self._list_class_files(input_directory,
                                                                         'no_stance')
        self.num_nostance_examples = len(self.nostance_files)
        self.nostance_label = 2
        # notrelevant folder
        self.notrelevant_path, self.notrelevant_files = self._list_class_files(input_directory,
                                                                               'not_relevant')
        self.num_notrelevant_examples = len(self.notrelevant_files)
        self.notrelevant_label = 3

        self.tokenizer = tokenizer
        self.apply_cleaning = apply_cleaning
        self.max_tokenization_length = max_tokenization_length
        self.truncation_method = truncation_method
        self.split_head_density = split_head_density
        self.device = device

        # Pre-tokenize & encode examples
        self.pre_tokenize_and_encode_examples()

    @staticmethod
    def _list_class_files(input_directory, folder_name):
        """
        Function to list the regular files of a class folder (sub-directories such as the
        'tokenized_and_encoded' cache are skipped); os.listdir() order is arbitrary, hence sorted.
        @return (str, list) path of the folder and its sorted file names
        """
        path = os.path.join(input_directory, folder_name)
        files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))
        return path, files

    def _class_partitions(self):
        """Function to get (path, files, label) of every class in indexing order."""
        return [(self.agree_path, self.agree_files, self.agree_label),
                (self.disagree_path, self.disagree_files, self.disagree_label),
                (self.nostance_path, self.nostance_files, self.nostance_label),
                (self.notrelevant_path, self.notrelevant_files, self.notrelevant_label)]

    def pre_tokenize_and_encode_examples(self):
        """
        Function to tokenize & encode examples and save the tokenized versions to a separate folder.
        This way, we won't have to perform the same tokenization and encoding ops every epoch.
        """
        for path, files, _ in self._class_partitions():
            self.__pre_tokenize_and_encode_examples__(path, files)

    def __pre_tokenize_and_encode_examples__(self, path, files):
        """
        Function to tokenize & encode the examples of one class folder and pickle the results into
        its 'tokenized_and_encoded' sub-folder. Only files without a cached version are processed,
        so a cache left incomplete by an interrupted run gets completed instead of being trusted.
        NOTE: existing cache files are not re-encoded when tokenization settings change; delete
        the 'tokenized_and_encoded' folder to force a rebuild.
        @param (str) path: class folder holding the raw examples
        @param (list) files: names of the raw example files; the number before the first '_' in
               a name must be one of the ids in `target_idx`
        """
        cache_directory = os.path.join(path, 'tokenized_and_encoded')
        cache_existed = os.path.isdir(cache_directory)
        os.makedirs(cache_directory, exist_ok=True)

        pending = [file for file in files
                   if not os.path.isfile(os.path.join(cache_directory, file))]
        if not pending:
            if cache_existed:
                logging.warning('Tokenized {} directory already exists!'.format(path))
            return

        # Clean & tokenize tweets
        for i in trange(len(pending), desc='Tokenizing & Encoding {} Tweets'.format(path),
                        leave=True):
            file = pending[i]
            # File names start with the id of the target statement the tweet refers to
            m_id = int(file.split('_')[0])
            target = targets[target_idx.index(m_id)]
            with open(os.path.join(path, file), mode='r', encoding='utf8') as f:
                example = f.read()
            example = re.sub(r'<br />', '', example)
            example = example.strip()
            example = re.sub(' +', ' ', example)
            example = tokenize_and_encode(text=example,
                                          header=target,
                                          tokenizer=self.tokenizer,
                                          apply_cleaning=self.apply_cleaning,
                                          max_tokenization_length=self.max_tokenization_length,
                                          truncation_method=self.truncation_method,
                                          split_head_density=self.split_head_density)

            # Write to a temporary name first so that an interruption never leaves a truncated
            # pickle behind under the final name
            cache_file = os.path.join(cache_directory, file)
            with open(cache_file + '.tmp', mode='wb') as f:
                pickle.dump(obj=example, file=f)
            os.replace(cache_file + '.tmp', cache_file)

    def __len__(self):
        return len(self.agree_files) + len(self.disagree_files) + len(self.nostance_files) + len(self.notrelevant_files)

    def __getitem__(self, index):
        """
        Function to load the pre-encoded example at @index.
        @return (Tensor, Tensor) the encoded ids and the class label, both of dtype long and
                stored on self.device
        @raise (ValueError) if @index is beyond the last example
        """
        offset = index
        for path, files, class_label in self._class_partitions():
            if offset < len(files):
                # NOTE: only unpickle caches written by this class; pickle is unsafe on
                # untrusted files
                with open(os.path.join(path, 'tokenized_and_encoded', files[offset]),
                          mode='rb') as f:
                    example = pickle.load(file=f)
                label = torch.tensor(data=class_label, dtype=torch.long).to(self.device)
                # Built directly with torch so that no numpy import is required
                return torch.tensor(example, dtype=torch.long).to(self.device), label
            offset -= len(files)

        raise ValueError('Out of range index while accessing dataset')

In [None]:
from torch.utils.data import DataLoader
from pytorch_transformers import AdamW  # Adam's optimization w/ fixed weight decay
import pickle

# Build the train & test datasets; both splits share every setting except their input directory
shared_dataset_options = dict(apply_cleaning=True,
                              max_tokenization_length=MAX_TOKENIZATION_LENGTH,
                              truncation_method='head-only',
                              device=DEVICE)

train_dataset = TweetDataset(input_directory="drive/MyDrive/CS 6320 Project/data/train",
                             tokenizer=model.get_tokenizer(),
                             **shared_dataset_options)

test_dataset = TweetDataset(input_directory="drive/MyDrive/CS 6320 Project/data/test",
                            tokenizer=model.get_tokenizer(),
                            **shared_dataset_options)

print(train_dataset)



<__main__.TweetDataset object at 0x7fb522cd0790>


In [None]:
# Batch iterators over the datasets: training batches are shuffled, test batches are not
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Loss function
criterion = nn.CrossEntropyLoss()

# Learning rates & name fragments used to sort parameters into optimizer groups
# (check README.md for the intuition)
bert_learning_rate, custom_learning_rate = BERT_LEARNING_RATE, CUSTOM_LEARNING_RATE
bert_identifiers = ['embeddings']
no_weight_decay_identifiers = ['bias', 'LayerNorm.weight']


def _mentions_any(parameter_name, identifiers):
    """Return True when at least one of @identifiers is a substring of @parameter_name"""
    return any(identifier in parameter_name for identifier in identifiers)


grouped_model_parameters = [
    # Parameters matching a BERT identifier that do receive weight decay
    dict(params=[param for name, param in model.named_parameters()
                 if _mentions_any(name, bert_identifiers)
                 and not _mentions_any(name, no_weight_decay_identifiers)],
         lr=bert_learning_rate,
         betas=(0.9, 0.999),
         weight_decay=0.01,
         eps=1e-8),
    # Parameters matching a BERT identifier that are exempt from weight decay
    dict(params=[param for name, param in model.named_parameters()
                 if _mentions_any(name, bert_identifiers)
                 and _mentions_any(name, no_weight_decay_identifiers)],
         lr=bert_learning_rate,
         betas=(0.9, 0.999),
         weight_decay=0.0,
         eps=1e-8),
    # Every remaining parameter, trained with the custom learning rate
    dict(params=[param for name, param in model.named_parameters()
                 if not _mentions_any(name, bert_identifiers)],
         lr=custom_learning_rate,
         betas=(0.9, 0.999),
         weight_decay=0.0,
         eps=1e-8),
]

# Optimizer over the grouped parameters
optimizer = AdamW(grouped_model_parameters)


In [None]:
# Move the model and the loss function onto the configured device (DEVICE)
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

def binary_accuracy(y_pred, y_true):
    """
    Function to calculate accuracy per batch
    @param (torch.Tensor) y_pred: per-class scores; the class is picked along the last dimension
    @param (torch.Tensor) y_true: true class indices, compared element-wise to the picked classes
    @return (torch.Tensor) scalar tensor holding the fraction of matching predictions
    """
    predicted_classes = torch.argmax(y_pred, dim=-1)
    hits = torch.eq(predicted_classes, y_true).float()
    return hits.sum() / len(hits)


def train(model, iterator, criterion, optimizer, device, include_bert_masks=True):
    """
    Function to carry out the training process over one pass of @iterator
    @param (torch.nn.Module) model: model object to be trained
    @param (torch.utils.data.DataLoader) iterator: data loader to iterate over batches
    @param (torch.nn.[...]) criterion: loss function to backpropagate on
    @param (torch.optim.[...]) optimizer: optimization algorithm
    @param (torch.device) device: 'cpu' or 'gpu', decides where to store the outputted tensors
    @param (bool) include_bert_masks: whether to include token type IDs & attention masks alongside
           input IDs when passing to model or not (default: True)
    @return (tuple) loss and accuracy, each averaged over the number of batches in @iterator
    """
    # NOTE(review): this function does not call model.train(); presumably the caller sets the
    # mode beforehand -- confirm
    epoch_loss, epoch_acc = 0.0, 0.0

    for batch in iterator:
        # Get training input IDs & labels from the current batch
        input_ids, labels = batch
        # Get corresponding additional features from the current batch
        token_type_ids, attention_mask = get_features(input_ids=input_ids,
                                                      tokenizer=model.get_tokenizer(),
                                                      device=device)
        # Reset the gradients from previous processes
        optimizer.zero_grad()
        # Pass features through the model w/ or w/o BERT masks for attention & token type
        if include_bert_masks:
            predictions = model(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)
        else:
            # Only the input IDs are forwarded, exactly as test() does; no `header_ids` exists
            # in this scope, so passing it raised a NameError
            predictions = model(input_ids=input_ids)

        # Calculate loss and accuracy
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def test(model, iterator, criterion, device, include_bert_masks=True):
    """
    Function to carry out the testing (or validation) process
    @param (torch.nn.Module) model: model object to be trained
    @param (torch.utils.data.DataLoader) iterator: data loader to iterate over batches
    @param (torch.nn.[...]) criterion: loss function to backpropagate on
    @param (torch.device) device: 'cpu' or 'gpu', decides where to store the outputted tensors
    @param (bool) include_bert_masks: whether to include token type IDs & attention masks alongside
           input IDs when passing to model or not (default: True)
    """
    epoch_loss, epoch_acc = 0.0, 0.0
    confusion = [[0 for _ in range(0, 4)] for _ in range(0, 4)]

    with torch.no_grad():
        for batch in iterator:
            # Get testing input IDs & labels from the current batch
            input_ids, labels = batch
            # Get corresponding additional features from the current batch
            token_type_ids, attention_mask = get_features(input_ids=input_ids,
                                                          tokenizer=model.get_tokenizer(),
                                                          device=device)
            # Pass features through the model w/ or w/o BERT masks for attention & token type
            if include_bert_masks:
                predictions = model(input_ids=input_ids,
                                    token_type_ids=token_type_ids,
                                    attention_mask=attention_mask)
            else:
                predictions = model(input_ids=input_ids)

            # Calculate loss and accuracy
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            y_pred_max = torch.argmax(predictions, dim=-1)
            confusion += confusion_matrix(labels.cpu().detach().numpy(), 
                                          y_pred_max.cpu().detach().numpy(), 
                                          labels=[0,1,2,3])
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    
    return epoch_loss / len(iterator), epoch_acc / len(iterator), confusion


def get_attention_nth_layer_mth_head_kth_token(attention_outputs, n, m, k, average_heads=False):
    """
    Function to select attention weights by:
    i)   Concatenating the per-layer attention outputs and picking the nth entry (layer)
    ii)  Picking the row of weights assigned to the kth token
    iii) Either averaging that row over all heads, or keeping only the mth head
    @param (sequence of torch.Tensor) attention_outputs: per-layer attention tensors, joined along
           dimension 0 -- assumed to be one (1, heads, tokens, tokens) tensor per layer, TODO confirm
    @param (int) n: layer no.
    @param (int) m: head no., unused when @average_heads is set
    @param (int) k: token no.
    @param (bool) average_heads: whether to average over heads instead of picking one (default: False)
    @return (torch.Tensor) the selected attention weights over all tokens
    """
    if average_heads is True and m is not None:
        logging.warning("Argument passed for param @m will be ignored because of head averaging.")

    # Join the layers, then narrow down to the requested layer and token
    stacked_layers = torch.cat(attention_outputs, dim=0)
    layer_attention = stacked_layers.data[n, :, :, :]
    token_attention = layer_attention[:, k, :]

    # Single-head case: nothing to aggregate
    if not average_heads:
        return token_attention[m, :]

    # Head-averaged case: sum over heads, then divide by the head count
    head_count = stacked_layers.shape[1]
    averaged = torch.sum(token_attention, dim=0)
    averaged /= head_count
    return averaged


def get_attention_average_first_layer(attention_outputs):
    """
    Function to compute head-averaged attention weights of the first layer, i.e. a shortcut for
    get_attention_nth_layer_mth_head_kth_token() with layer 0, token 0 (CLS) and head averaging
    @param attention_outputs: attention outputs, forwarded unchanged
    @return whatever get_attention_nth_layer_mth_head_kth_token() returns for that selection
    """
    first_layer, cls_position = 0, 0
    return get_attention_nth_layer_mth_head_kth_token(attention_outputs,
                                                      first_layer, None, cls_position,
                                                      average_heads=True)


def get_attention_average_last_layer(attention_outputs):
    """
    Function to compute head-averaged attention weights of the last layer, i.e. a shortcut for
    get_attention_nth_layer_mth_head_kth_token() with layer -1, token 0 (CLS) and head averaging
    @param attention_outputs: attention outputs, forwarded unchanged
    @return whatever get_attention_nth_layer_mth_head_kth_token() returns for that selection
    """
    last_layer, cls_position = -1, 0
    return get_attention_nth_layer_mth_head_kth_token(attention_outputs,
                                                      last_layer, None, cls_position,
                                                      average_heads=True)


def get_normalized_attention(model, raw_sentence, method='last_layer_heads_average',
                             n=None, m=None, k=None, exclude_special_tokens=True,
                             normalization_method='normal', device='cpu'):
    """
    Function to get the normalized version of the attention output of a FineTunedBert() model
    @param (torch.nn.Module) model: FineTunedBert() model to visualize attention weights on
    @param (str) raw_sentence: sentence in string format, preferably from the test distribution
    @param (str) method: method name specifying the attention output configuration, possible values
           are 'first_layer_heads_average', 'last_layer_heads_average', 'nth_layer_heads_average'
           (needs @n), 'nth_layer_mth_head' (needs @n & @m), and 'custom' (needs @n, @m & @k)
           (default: 'last_layer_heads_average')
    @param (int) n: layer no. (default: None)
    @param (int) m: head no. (default: None)
    @param (int) k: token no. (default: None)
    @param (bool) exclude_special_tokens: whether to exclude special tokens such as [CLS] and [SEP]
           from attention weights computation or not (default: True)
    @param (str) normalization_method: the normalization method to be applied on attention weights,
           possible values include 'min-max' and 'normal'; any other value leaves the weights
           unnormalized (default: 'normal')
    @param (torch.device) device: 'cpu' or 'gpu', decides where to store the outputted tensors
    @return (list) one (token, weight) tuple per token, with the weight as a Python float
    @raise (ValueError) if @method is unknown, or if an index required by @method is None
    """
    # Arguments handed to get_attention_nth_layer_mth_head_kth_token() for each method; the
    # head-averaging methods always pass m=None so that no "ignored head" warning is triggered
    method_arguments = {
        'first_layer_heads_average': {'n': 0, 'm': None, 'k': 0, 'average_heads': True},
        'last_layer_heads_average': {'n': -1, 'm': None, 'k': 0, 'average_heads': True},
        'nth_layer_heads_average': {'n': n, 'm': None, 'k': 0, 'average_heads': True},
        'nth_layer_mth_head': {'n': n, 'm': m, 'k': 0, 'average_heads': False},
        'custom': {'n': n, 'm': m, 'k': k, 'average_heads': False},
    }
    # Caller-supplied indices each method actually consumes
    required_indices = {
        'nth_layer_heads_average': ('n',),
        'nth_layer_mth_head': ('n', 'm'),
        'custom': ('n', 'm', 'k'),
    }

    # Validate up front, before any model computation, instead of failing later on a None
    if method not in method_arguments:
        raise ValueError("Unknown method '{}', possible values are: {}".format(
            method, ", ".join("'{}'".format(name) for name in method_arguments)))

    passed_indices = {'n': n, 'm': m, 'k': k}
    needed = required_indices.get(method, ())
    missing = ['@' + name for name in needed if passed_indices[name] is None]
    if missing:
        raise ValueError("Must pass integer argument for param(s) {} if method is '{}'".format(
            ", ".join(missing), method))

    ignored = ['@' + name for name in ('n', 'm', 'k')
               if passed_indices[name] is not None and name not in needed]
    if ignored:
        logging.warning("Argument(s) passed for param(s) {} will be ignored by method '{}'. "
                        "Specify @method as 'custom' to make all of @n, @m, and @k "
                        "effective.".format(", ".join(ignored), method))

    # Plug in CLS & SEP special tokens for identification of start & end points of sequences
    if '[CLS]' not in raw_sentence and '[SEP]' not in raw_sentence:
        tokenized_text = ['[CLS]'] + model.get_tokenizer().tokenize(raw_sentence) + ['[SEP]']
    else:
        tokenized_text = model.get_tokenizer().tokenize(raw_sentence)

    # Switch to evaluation mode and skip gradient tracking, no update is wanted here
    model.eval()
    with torch.no_grad():
        attention_outputs = model.get_bert_attention(raw_sentence=raw_sentence, device=device)

    attention_weights = get_attention_nth_layer_mth_head_kth_token(
        attention_outputs=attention_outputs,
        **method_arguments[method]
    )

    # Remove the beginning [CLS] & ending [SEP] tokens for better intuition
    if exclude_special_tokens:
        tokenized_text, attention_weights = tokenized_text[1:-1], attention_weights[1:-1]

    # Apply normalization methods to attention weights
    # i)  Min-Max Normalization
    if normalization_method == 'min-max':
        max_weight, min_weight = attention_weights.max(), attention_weights.min()
        attention_weights = (attention_weights - min_weight) / (max_weight - min_weight)

    # ii) Z-Score Normalization
    elif normalization_method == 'normal':
        mu, std = attention_weights.mean(), attention_weights.std()
        attention_weights = (attention_weights - mu) / std

    # Take the underlying tensor data (still a tensor; .item() below yields Python floats)
    attention_weights = attention_weights.data

    tokens_and_weights = []
    for index, token in enumerate(tokenized_text):
        tokens_and_weights.append((token, attention_weights[index].item()))

    return tokens_and_weights


def get_delta_attention(tokens_and_weights_pre, tokens_and_weights_post):
    """
    Function to compute the delta (change) in attention weights before & after
    @param (list) tokens_and_weights_pre: (token, weight) tuples from before
    @param (list) tokens_and_weights_post: (token, weight) tuples from after; must carry the
           same token at every position covered by @tokens_and_weights_pre
    @return (list) (token, post weight - pre weight) tuples, one per entry of the pre list
    """
    deltas = []
    for position, pre_entry in enumerate(tokens_and_weights_pre):
        token = pre_entry[0]
        post_entry = tokens_and_weights_post[position]
        # Both lists must describe the same token sequence
        assert token == post_entry[0]

        weight_before = pre_entry[1]
        weight_after = post_entry[1]
        deltas.append((token, weight_after - weight_before))

    return deltas

# Let's Demo!

In [None]:
# Load the fine-tuned weights; map_location remaps the stored tensors onto DEVICE so that a
# checkpoint saved on a GPU can still be loaded on a CPU-only runtime
model.load_state_dict(torch.load('drive/MyDrive/CS 6320 Project/saved_models/BERT-b1-r1-h256-d6-blr1e-5.pt',
                                 map_location=DEVICE))

<All keys matched successfully>

In [None]:
def predict(tweet, header_id, print_results = False):
    """
    Function to classify the stance of a tweet towards a target with the notebook-level model
    @param (str) tweet: tweet text, passed on to model.predict()
    @param (int) header_id: id of the target, passed on to model.predict(); also looked up in
           the notebook-level target_idx / targets lists when printing
    @param (bool) print_results: whether to print the target, tweet and stance (default: False)
    @return (str) one of "agree", "disagree", "no_stance", "not_relevant"
    """
    stance_names = ["agree", "disagree", "no_stance", "not_relevant"]
    scores = model.predict(tweet, header_id)

    if print_results:
        print("Target: " + targets[target_idx.index(header_id)])
        print("Tweet: " + tweet)

    # The highest-scoring position selects the stance name
    stance = stance_names[torch.argmax(scores, dim=-1)]
    if print_results:
        print("Stance: " + stance)
    return stance

In [None]:
# First demo input: a tweet and the id of the target it is judged against
tweet = "@JCope222 Vaccine is untested for safety. People have died in he trials. Natural immunity scores 99.975% recovery from Covid for your age group" 
header_id = 1

# Second demo input, paired with a different target id
tweet_demo = "My moms doesn't want me to continue my education because we have to get a Covid vaccine and she says it's a \"fact\" that it causes infertility"
header_id_demo = 2

# Classify both tweets; print_results=True makes predict() print the target, tweet and stance
predict(tweet, header_id, print_results=True)
print("="*100)
# Last expression of the cell, so its returned stance is also shown as the cell output
predict(tweet_demo, header_id_demo, print_results=True)

Target: RNA alters a person's DNA when taking the COVID-19 vaccine.
Tweet: @JCope222 Vaccine is untested for safety. People have died in he trials. Natural immunity scores 99.975% recovery from Covid for your age group
Stance: not_relevant
Target: The COVID-19 vaccine causes infertility or miscarriages in women.
Tweet: My moms doesn't want me to continue my education because we have to get a Covid vaccine and she says it's a "fact" that it causes infertility
Stance: disagree


'disagree'