# Question Answering on SQuAD 2.0 with BERT
*by Nefeli Tavoulari*

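#### In this notebook I fine-tune a BERT model for extractive question answering on the SQuAD 2.0 dataset and evaluate it with exact-match and F1 scores.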

## Install Dependencies

In [1]:
%%capture
!pip install transformers
!pip install datasets

## Import Packages

In [23]:
%matplotlib inline
import csv
import io
import logging
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score

import transformers
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertForQuestionAnswering
from datasets import load_dataset

# Fixed seed applied to the PyTorch CPU and CUDA random number generators below.
# NOTE(review): numpy and Python's `random` module are not seeded here — confirm
# that nothing stochastic in the notebook relies on them.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True
# INFO-level output for the standard logging module; transformers' own logger
# is restricted to errors.
logging.basicConfig(level=logging.INFO)
transformers.logging.set_verbosity_error()

## Use GPU for faster processing

In [3]:
# Use the CUDA device when one is available, otherwise fall back to the CPU.
# `device` is the target of every later `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Available device:", device)

Available device: cpu


## Load dataset - Create and clean dataframes

In [4]:
# Load the 'train' and 'validation' splits of the 'squad_v2' dataset.
# Despite the `_df` suffix these are not DataFrames yet; the next cell converts them.
train_df, dev_df = load_dataset('squad_v2', split=['train', 'validation'])



  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Convert both loaded splits into pandas DataFrames (same names are reused).
train_df = pd.DataFrame(train_df)
dev_df = pd.DataFrame(dev_df)

In [6]:
# Keep only the first 10 rows of each split.
# NOTE(review): this looks like a debugging shortcut for fast iteration —
# confirm and remove (or parameterize) before a full training run.
train_df = train_df.iloc[0:10,:]
dev_df = dev_df.iloc[0:10,:]

In [None]:
# Remove rows without a question and drop the columns that are not used later.
# The calls return new frames instead of mutating in place: the inputs are
# `.iloc` slices, on which `inplace=True` triggers pandas' SettingWithCopyWarning.
# `errors="ignore"` keeps the cell safe to re-run after the columns are gone.
train_df = (
    train_df
    .dropna(subset=["question"])
    .drop(columns=["id", "title"], errors="ignore")
)
dev_df = (
    dev_df
    .dropna(subset=["question"])
    .drop(columns=["id", "title"], errors="ignore")
)

In [8]:
def get_data(df):
  """Flatten SQuAD-style rows into parallel lists of contexts, questions and answer spans.

  Args:
    df: DataFrame with 'context', 'question' and 'answers' columns. Each
      'answers' value is a mapping with 'text' and 'answer_start' sequences
      (both empty for unanswerable questions).

  Returns:
    A dict with the keys
      'context'     -> list of context strings,
      'question'    -> list of question strings,
      'answer'      -> dict of parallel lists 'text', 'answer_start', 'answer_end'
                       (character offsets into the context, end exclusive;
                       all three are None for unanswerable questions),
      'all_answers' -> list with every plausible answer text per row (None when
                       the question is unanswerable).
    Every list has the same length. Rows whose first answer cannot be located
    in the context (at the given offset or shifted back by one or two
    characters) are dropped entirely, so the lists never go out of step.
  """
  contexts = []
  questions = []
  answer_texts = []
  answer_starts = []
  answer_ends = []
  all_answers = []

  for _, row in df.iterrows():                         # one question/answers pair per row
    context_text = row['context']
    candidates = list(row['answers']['text'])

    if len(candidates) == 0:                           # unanswerable question
      main_answer, plausible, span_start, span_end = None, None, None, None
    else:
      main_answer = candidates[0]                      # first answer is the training target
      plausible = candidates
      given_start = int(row['answers']['answer_start'][0])

      # The annotated offset is sometimes off by one or two characters, so try
      # the exact position first and then the two shifted ones. A matching
      # slice also guarantees that the span lies inside the context.
      span_start = None
      for shift in (0, 1, 2):
        candidate_start = given_start - shift
        candidate_end = candidate_start + len(main_answer)
        if candidate_start >= 0 and context_text[candidate_start:candidate_end] == main_answer:
          span_start, span_end = candidate_start, candidate_end
          break
      if span_start is None:                           # answer not found -> skip the whole row
        continue

    # Append to every list only after validation so they stay aligned.
    contexts.append(context_text)
    questions.append(row['question'])
    answer_texts.append(main_answer)
    answer_starts.append(span_start)
    answer_ends.append(span_end)
    all_answers.append(plausible)

  answer_dict = {'text': answer_texts, 'answer_start': answer_starts, 'answer_end': answer_ends}
  return {'context': contexts, 'question': questions, 'answer': answer_dict, 'all_answers': all_answers}

# Replace both DataFrames with the dictionaries of parallel lists built by get_data.
# The names keep their `_df` suffix although they are plain dicts from here on,
# so this cell cannot be run twice without re-running the cells above.
train_df = get_data(train_df)
dev_df = get_data(dev_df)

## Load Bert tokenizer and model

In [11]:
# Load the tokenizer and a BertForQuestionAnswering model from the
# 'bert-base-uncased' checkpoint and move the model to `device`.
# NOTE(review): presumably this base checkpoint has no fine-tuned
# question-answering head, so predictions before training are arbitrary — confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# Alternative (disabled): a checkpoint whose name says it is already fine-tuned on SQuAD.
#tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [12]:
# Tokenize (context, question) pairs. The context is passed first, so it is the
# first sequence of every encoded pair. Each pair is truncated/padded to
# exactly 500 tokens and an attention mask is returned.

# training data
encoding = tokenizer(train_df["context"], train_df["question"],
                    truncation = True, 
                    padding = "max_length", 
                    max_length = 500,
                    return_attention_mask = True)

# validation data
encoding_dev = tokenizer(dev_df["context"], dev_df["question"],
                    truncation = True, 
                    padding = "max_length", 
                    max_length = 500,
                    return_attention_mask = True)

In [13]:
# Map every encoded example back to its token strings
# (training set first, then the validation set).
tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in encoding['input_ids']]
tokens_dev = [tokenizer.convert_ids_to_tokens(ids) for ids in encoding_dev['input_ids']]

In [15]:
def add_token_positions(encoding, answer, answer_start, answer_end):
    """Convert character-level answer spans into token-level start/end positions.

    Args:
      encoding: tokenizer output exposing `char_to_token(batch_index, char_index)`.
      answer: dict of parallel lists 'text', 'answer_start' and 'answer_end'
        holding character offsets (end exclusive); the offsets are None for
        unanswerable questions.
      answer_start, answer_end: unused; the offsets are read from `answer`.
        Kept so existing calls keep working.

    Returns:
      A `(start, end)` tuple of lists with one entry per example:
      the index of the first and of the LAST token of the answer (inclusive),
      None for unanswerable questions, or `tokenizer.model_max_length` when the
      character cannot be mapped to a token (e.g. it was truncated away).
    """
    start = []
    end = []
    for i in range(len(answer["text"])):
        char_start = answer['answer_start'][i]
        char_end = answer['answer_end'][i]

        if char_start is None:                      # no answer for this example
            start.append(None)
            end.append(None)
            continue

        # `char_end` is exclusive, so the last answer character is char_end - 1.
        # Looking up `char_end` itself would give the *next* token whenever the
        # answer is directly followed by punctuation, but the last answer token
        # when it is followed by a space — i.e. inconsistent labels.
        # `max` guards against degenerate empty spans (char_end == char_start).
        last_char = max(char_end - 1, char_start)

        start_token = encoding.char_to_token(i, char_start)
        end_token = encoding.char_to_token(i, last_char)

        # Unmappable characters get a sentinel position.
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start.append(start_token)
        end.append(end_token)
    return start, end

# Store the token-level answer positions next to the tokenizer output so the
# dataset class below serves them under 'start_positions' / 'end_positions'.
encoding['start_positions'], encoding['end_positions'] = add_token_positions(encoding, train_df["answer"], train_df["answer"]["answer_start"], train_df["answer"]["answer_end"])
encoding_dev['start_positions'], encoding_dev['end_positions']  = add_token_positions(encoding_dev,  dev_df["answer"], dev_df["answer"]["answer_start"], dev_df["answer"]["answer_end"])

In [18]:
class SquadDataset(torch.utils.data.Dataset):
  """Dataset that serves one encoded example as a dict of tensors.

  `encodings` is a mapping from field name (e.g. 'input_ids') to a sequence
  with one entry per example; it must contain the key 'input_ids'.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    """Return every field of example `idx` as a tensor; a None entry becomes -1."""
    item = {}
    for key, values in self.encodings.items():
      value = values[idx]
      # None marks a missing answer position; tensors cannot hold None, so -1
      # is used as the placeholder.
      item[key] = torch.tensor(-1 if value is None else value)
    return item

  def __len__(self):
    # Key lookup (instead of attribute access) also works for plain dicts.
    return len(self.encodings["input_ids"])

# Wrap both encodings in datasets.
train_dataset = SquadDataset(encoding)
validation_dataset = SquadDataset(encoding_dev)

# Batches of 10 examples; only the training loader shuffles, so validation
# batches follow the order of `encoding_dev`.
BATCH_SIZE = 10
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE)

## Configurations

In [19]:
#Define Hyperparameters
# Step size passed to the Adam optimizer below.
learning_rate = 1e-5

#Initialize optimizer
# Adam over every model parameter; weight decay is disabled (see the commented-out argument).
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#, weight_decay=0.001)

# Gradient-norm threshold; the only use in the training cell is a
# commented-out clip_grad_norm_ call, so it currently has no effect.
clip = 2

#model

In [20]:
def normalize_text(s):
    """Normalize an answer string before comparison.

    Steps, in order: lowercase, strip ASCII punctuation, replace the articles
    "a", "an" and "the" with a space, and collapse all runs of whitespace.
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punctuation = "".join(char for char in lowered if char not in punctuation)

    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    without_articles = article_pattern.sub(" ", without_punctuation)

    return " ".join(without_articles.split())
    
def compute_exact_match(prediction, truth):
    """Return 1 when prediction and truth are equal after normalization, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return int(normalized_prediction == normalized_truth)

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a ground-truth answer string.

    Both strings are normalized with `normalize_text` and split on whitespace.
    Returns 1/0 when either side is empty (1 only if both are), 0 when no
    token is shared, otherwise the harmonic mean of precision and recall.
    """
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Multiset overlap: a token repeated in both answers counts as often as it
    # is shared. A plain set intersection would under-count repeats, so that
    # e.g. "cat cat" vs "cat cat" would score 0.5 instead of 1.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [None]:
torch.cuda.empty_cache()

EPOCHS = 1

# Per-epoch history; `epoch_loss` and `epoch_acc_dev` are read by the plotting cells.
epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []

for epoch in range(EPOCHS):

  batch_losses = []     # python floats, one per training batch
  total = 0             # number of training examples seen
  total_dev = 0         # number of validation examples scored
  exact_match = 0       # summed exact-match score over the validation set
  f1_score = 0          # summed F1 score over the validation set

  # sets the mode to train
  model.train()
  for batch in train_dataloader:  # for every batch

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)

    # Gradients are cleared BEFORE the forward/backward pass. Clearing them
    # between backward() and step() throws the fresh gradients away, so the
    # optimizer would never update the model.
    optimizer.zero_grad()

    output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)

    loss = output[0]
    loss.backward()
    #nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    # .item() stores a plain float; keeping the tensor would keep its graph alive.
    batch_losses.append(loss.item())

    # Total number of labels
    total += input_ids.size(0)

  # validation
  # sets the mode to testing
  model.eval()
  with torch.no_grad():
    for batch in validation_dataloader:

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      token_type_ids = batch['token_type_ids'].to(device)

      start_logits, end_logits =  model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)

      start_logits = start_logits.detach().cpu()
      end_logits = end_logits.detach().cpu()

      # score every example of the batch
      for idx, (start, end) in enumerate(zip(start_logits, end_logits)):
        # The validation loader is not shuffled, so the position inside the
        # whole validation set is the running count plus the in-batch index.
        example_idx = total_dev + idx

        ans_start = int(torch.argmax(start))
        ans_end = int(torch.argmax(end))
        num_tokens = int(attention_mask[idx].sum())    # non-padding tokens

        # Position 0 is the first ([CLS]) token; a span starting there, in the
        # padding, or after its own end is treated as "no answer".
        # NOTE(review): assumes no-answer targets end up on position 0 — confirm.
        if ans_start == 0 or ans_start >= num_tokens or ans_start > ans_end:
          pred = ""
        else:
          last_token = min(ans_end, num_tokens - 1)    # never decode padding
          span_ids = input_ids[idx][ans_start:last_token + 1].tolist()   # end index is inclusive
          pred = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))

        # Unanswerable questions have no reference answers: compare against "".
        references = dev_df['all_answers'][example_idx]
        if references is None:
          references = [""]

        exact_match += max(compute_exact_match(pred, answer) for answer in references)
        f1_score += max(compute_f1(pred, answer) for answer in references)

      # number of labels
      total_dev += input_ids.size(0)

  em_scores = exact_match / max(total_dev, 1)
  f1_scores = f1_score / max(total_dev, 1)
  accuracy_dev = em_scores                     # exact match is used as validation accuracy
  train_loss = sum(batch_losses) / max(len(batch_losses), 1)
  epoch_loss.append(train_loss)
  epoch_acc_dev.append(accuracy_dev)

  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation Accuracy = {accuracy_dev:.5f} | Validation F1 = {f1_scores:.5f} ")

### Evaluation

In [22]:
# Score the model on the whole validation set.
# The counters are reset here so the result does not depend on values left
# behind by an earlier cell.
exact_match = 0
f1_score = 0
total_eval = 0

model.eval()
with torch.no_grad():
  for batch in validation_dataloader:

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)

    start_logits, end_logits =  model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)

    start_logits = start_logits.detach().cpu()
    end_logits = end_logits.detach().cpu()

    # score every example of the batch (not only the last one)
    for idx, (start, end) in enumerate(zip(start_logits, end_logits)):
      # The validation loader is not shuffled, so the position inside the
      # whole validation set is the running count plus the in-batch index.
      example_idx = total_eval + idx

      ans_start = int(torch.argmax(start))
      ans_end = int(torch.argmax(end))
      num_tokens = int(attention_mask[idx].sum())      # non-padding tokens

      # Position 0 is the first ([CLS]) token; a span starting there, in the
      # padding, or after its own end is treated as "no answer".
      # NOTE(review): assumes no-answer targets end up on position 0 — confirm.
      if ans_start == 0 or ans_start >= num_tokens or ans_start > ans_end:
        pred = ""
      else:
        last_token = min(ans_end, num_tokens - 1)      # never decode padding
        span_ids = input_ids[idx][ans_start:last_token + 1].tolist()     # end index is inclusive
        pred = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))

      # Unanswerable questions have no reference answers: compare against "".
      references = dev_df['all_answers'][example_idx]
      if references is None:
        references = [""]

      exact_match += max(compute_exact_match(pred, answer) for answer in references)
      f1_score += max(compute_f1(pred, answer) for answer in references)

    total_eval += input_ids.size(0)

# Report averages over the number of scored examples.
num_examples = max(total_eval, 1)
print("f1_score : ", f1_score / num_examples, "exact match : ", exact_match / num_examples)

ValueError: ignored

In [None]:
def plot_graph_loss(epochs):
    """Plot the values of the global `epoch_loss` list against epochs 1..`epochs`."""
    epoch_numbers = list(np.arange(epochs) + 1)

    plt.figure(figsize=(12,12))
    plt.title("Train Loss")
    plt.plot(epoch_numbers, epoch_loss, label='train')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(['train'])

plot_graph_loss(EPOCHS)

In [None]:
def plot_graph_acc(epochs):
    """Plot the values of the global `epoch_acc_dev` list against epochs 1..`epochs`."""
    epoch_numbers = list(np.arange(epochs) + 1)

    plt.figure(figsize=(12,12))
    plt.title("Validation Accuracy")
    plt.plot(epoch_numbers, epoch_acc_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('accuracy', fontsize=12)
    plt.legend(['validation'])

plot_graph_acc(EPOCHS)

In [None]:
# Load the "trivia_qa" dataset; no visible cell reads `dataset` afterwards.
dataset = load_dataset("trivia_qa")

In [None]:
# Load the "natural_questions" dataset; rebinds `dataset`, which no visible cell reads.
dataset = load_dataset("natural_questions")

In [None]:
# Load the "quac" dataset; rebinds `dataset`, which no visible cell reads.
dataset = load_dataset("quac")

In [None]:
# Load the "combined-csv" configuration of the "newsqa" dataset; rebinds
# `dataset`, which no visible cell reads.
dataset = load_dataset("newsqa", "combined-csv")