# Question Answering on SQuAD v2 with BERT
*by Nefeli Tavoulari*

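#### In this notebook I fine-tune a pretrained BERT model (`bert-base-uncased`) for extractive question answering on the SQuAD v2 dataset.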

## Install Dependencies

In [1]:
!pip install transformers
!pip install datasets



## Import Packages

In [2]:
%matplotlib inline
import io
import re
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import re
import csv

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.metrics import roc_curve, accuracy_score

import transformers
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertForQuestionAnswering
from datasets import load_dataset
import logging

# Reproducibility: seed every random number generator this notebook can draw from.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # seeds every visible GPU, not only the current one
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False  # the autotuner may otherwise select different kernels between runs

# Keep our own logs at INFO level but silence everything below errors from transformers.
logging.basicConfig(level=logging.INFO)
transformers.logging.set_verbosity_error()

## Use GPU for faster processing

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Available device:", device)

Available device: cuda


## Upload dataset - Create and Clean dataframes

In [4]:
# Request the train and validation splits of SQuAD v2 in a single call;
# the returned list follows the order of the `split` argument.
squad_splits = load_dataset('squad_v2', split=['train', 'validation'])
train_df, dev_df = squad_splits



  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Convert both splits to pandas DataFrames.
train_df, dev_df = (pd.DataFrame(split) for split in (train_df, dev_df))

In [6]:
# Remove rows that have no question and the columns that are never read afterwards.
# The frames are reassigned instead of mutated in place, and errors="ignore" lets the
# cell be re-run safely: once 'id'/'title' are gone a second drop is a no-op rather
# than a KeyError.
UNUSED_COLUMNS = ['id', 'title']

train_df = train_df.dropna(subset=["question"]).drop(columns=UNUSED_COLUMNS, errors="ignore")
dev_df = dev_df.dropna(subset=["question"]).drop(columns=UNUSED_COLUMNS, errors="ignore")

In [7]:
def get_dataframe(df, keep_unanswerable=False):
  """Flatten SQuAD-style rows into parallel lists with one entry per gold answer.

  Each row of `df` must provide 'context', 'question' and an 'answers' mapping
  holding the parallel lists 'text' and 'answer_start'.

  For every answer the character span is checked against the context. When the
  annotated start does not reproduce the answer text, the span is retried one
  and two characters to the left; if that fails as well, the annotated offsets
  are kept unchanged.

  Args:
    df: DataFrame with one question per row.
    keep_unanswerable: when True, a row without any answer contributes a single
      entry with an empty answer text and a (0, 0) span. The default (False)
      skips such rows.

  Returns:
    A dict with the keys 'context', 'question' and 'answer'; 'answer' is itself
    a dict of the parallel lists 'text', 'answer_start' and 'answer_end', where
    'answer_end' is the exclusive end offset (start + len(text)).
  """
  contexts = []
  questions = []
  answer_texts = []
  answer_starts = []
  answer_ends = []

  def _append(context_text, question_text, text, start_char, end_char):
    # Keep the five parallel lists in lockstep.
    contexts.append(context_text)
    questions.append(question_text)
    answer_texts.append(text)
    answer_starts.append(start_char)
    answer_ends.append(end_char)

  for _, row in df.iterrows():
    row_context = row['context']
    row_question = row['question']
    texts = row['answers']['text']
    starts = row['answers']['answer_start']

    if len(texts) == 0:
      if keep_unanswerable:
        _append(row_context, row_question, "", 0, 0)
      continue

    # Pair every answer with its own start offset (not with the first one of the row).
    for text, start_idx in zip(texts, starts):
      start_idx = int(start_idx)
      end_idx = start_idx + len(text)
      if row_context[start_idx:end_idx] != text:
        for shift in (1, 2):
          shifted_start = start_idx - shift
          # A negative start would silently slice from the end of the string.
          if shifted_start >= 0 and row_context[shifted_start:end_idx - shift] == text:
            start_idx, end_idx = shifted_start, end_idx - shift
            break
      _append(row_context, row_question, text, start_idx, end_idx)

  answer_dict = {'text': answer_texts, 'answer_start': answer_starts, 'answer_end': answer_ends}
  return {'context': contexts, 'question': questions, 'answer': answer_dict}

# Replace each DataFrame with its flattened dict-of-lists form.
# After this cell both names hold plain dicts, so the cell cannot be re-run on its
# own (get_dataframe iterates over DataFrame rows).
train_df, dev_df = (get_dataframe(frame) for frame in (train_df, dev_df))

In [8]:
# remove special characters, urls, emojis and lowercase tweets
# train_df["tweet"] = train_df["tweet"].apply(lambda line: re.sub('[^A-Za-z0-9]+', ' ', re.sub(r'http\S+', ' ',line.lower().strip())))
# dev_df["tweet"] = dev_df["tweet"].apply(lambda line: re.sub('[^A-Za-z0-9]+', ' ', re.sub(r'http\S+', ' ',line.lower().strip())))
# remove empty instances again
# train_df.dropna(subset = ["tweet"], inplace=True)
# dev_df.dropna(subset = ["tweet"], inplace=True)

In [9]:
#print(train_df) # training data

In [10]:
#print(dev_df) # validation data

## Load Bert tokenizer and model

In [11]:
# Tokenizer and question-answering model are loaded from the same pretrained checkpoint;
# the model is then moved to the selected device.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertForQuestionAnswering.from_pretrained(checkpoint)
model = model.to(device)

#tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [12]:
# Every (question, context) pair is truncated / padded to this many tokens.
# NOTE(review): presumably contexts longer than this lose their tail, which can cut
# the answer out of the input — confirm the value is intended.
MAX_LENGTH = 100


def _tokenize_pairs(questions, contexts):
  """Tokenize question/context pairs into fixed-length encodings with attention masks."""
  return tokenizer(questions, contexts,
                   truncation = True,
                   padding = "max_length",
                   max_length = MAX_LENGTH,
                   return_attention_mask = True)


# training data
answer = train_df["answer"]
context, question = train_df["context"], train_df["question"]
answer_start, answer_end = answer["answer_start"], answer["answer_end"]

encoding = _tokenize_pairs(question, context)
input_ids = encoding["input_ids"] # token ids

# validation data
answer_dev = dev_df["answer"]
context_dev, question_dev = dev_df["context"], dev_df["question"]
answer_start_dev, answer_end_dev = answer_dev["answer_start"], answer_dev["answer_end"]

encoding_dev = _tokenize_pairs(question_dev, context_dev)
input_ids_dev = encoding_dev["input_ids"] # token ids

In [13]:
def get_segment_ids(input_ids, sep_token_id=None):
  """Build segment ids (0 for the first sequence, 1 for the rest) for each instance.

  Positions up to and including the first separator token get 0; every position
  after it gets 1 — this includes any later separator and any padding.

  Args:
    input_ids: iterable of token-id lists, one list per instance.
    sep_token_id: id of the separator token. Defaults to the separator id of the
      notebook-level `tokenizer`.

  Returns:
    A list of lists, each the same length as the matching entry of `input_ids`.

  Raises:
    ValueError: if an instance does not contain the separator token.
  """
  if sep_token_id is None:
    sep_token_id = tokenizer.sep_token_id

  segments_ids = []
  for token_ids in input_ids: # for each list / instance
    first_segment_len = token_ids.index(sep_token_id) + 1
    second_segment_len = len(token_ids) - first_segment_len
    segments_ids.append([0] * first_segment_len + [1] * second_segment_len)
  return segments_ids

# Segment ids for the training and the validation token ids, in that order.
segments_ids, segments_ids_dev = map(get_segment_ids, (input_ids, input_ids_dev))

In [14]:
def add_token_positions(encoding, answer, answer_start, answer_end,
                        sequence_index=1, ignore_index=None):
    """Convert character-level answer spans into start / end token positions.

    The answer offsets are character positions inside the context. Because the
    tokenizer is called with (question, context), the context is the second
    sequence of each pair, so the lookup has to be made with `sequence_index=1`;
    the tokenizer's default (0) would resolve the offsets against the question.

    Args:
        encoding: batch encoding exposing
            `char_to_token(batch_index, char_index, sequence_index=...)`.
        answer: dict with the parallel lists 'text', 'answer_start' and
            'answer_end' ('answer_end' is the exclusive end offset).
        answer_start: unused; the offsets are read from `answer`. Kept so that
            existing calls keep working.
        answer_end: unused; see `answer_start`.
        sequence_index: which sequence of the pair the offsets refer to.
        ignore_index: position stored when a character has no token (for example
            because it was truncated away). Defaults to
            `tokenizer.model_max_length` of the notebook-level tokenizer.

    Returns:
        (start_pos, end_pos): two lists of token indices, one entry per answer.
        Answers with an empty text get the span (0, 0).
    """
    if ignore_index is None:
        ignore_index = tokenizer.model_max_length

    start_pos = []
    end_pos = []
    for i, text in enumerate(answer["text"]):
        if text == "":
            start_pos.append(0)
            end_pos.append(0)
            continue

        first_char = answer['answer_start'][i]
        # 'answer_end' is exclusive: the answer's last character sits one position
        # earlier. Looking up the exclusive offset would return the token *after*
        # the answer whenever the answer is directly followed by punctuation.
        last_char = answer['answer_end'][i] - 1

        start_token = encoding.char_to_token(i, first_char, sequence_index=sequence_index)
        end_token = encoding.char_to_token(i, last_char, sequence_index=sequence_index)

        start_pos.append(ignore_index if start_token is None else start_token)
        end_pos.append(ignore_index if end_token is None else end_token)
    return start_pos, end_pos

# Compute the gold token spans of both splits, then store them on the encodings
# under the label names used by the model.
start_pos, end_pos = add_token_positions(encoding, answer, answer_start, answer_end)
start_pos_dev, end_pos_dev = add_token_positions(encoding_dev, answer_dev, answer_start_dev, answer_end_dev)

for target_encoding, span_starts, span_ends in ((encoding, start_pos, end_pos),
                                                (encoding_dev, start_pos_dev, end_pos_dev)):
  target_encoding["start_positions"] = span_starts
  target_encoding["end_positions"] = span_ends

In [15]:
# # convert lists to tensors

# train_inputs = torch.tensor(input_ids)
# dev_inputs = torch.tensor(input_ids_dev)

# train_masks = torch.tensor(segments_ids)
# dev_masks = torch.tensor(segments_ids_dev)

# train_answer_start = torch.tensor(start_pos)
# dev_answer_start = torch.tensor(start_pos_dev)

# train_answer_end = torch.tensor(end_pos)
# dev_answer_end = torch.tensor(end_pos_dev)

# # train_answers = torch.tensor(answer)
# # dev_answers = torch.tensor(answer_dev)

# # create datasets, dataloaders
# BATCH_SIZE = 3
# train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_answer_start, train_answer_end)
# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# validation_dataset = torch.utils.data.TensorDataset(dev_inputs, dev_masks, dev_answer_start, dev_answer_end)
# validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [38]:
class SquadDataset(torch.utils.data.Dataset):
  """Dataset view over a tokenizer encoding: item `idx` is a dict of tensors, one per field."""

  def __init__(self, encodings):
    # Mapping from field name to a per-example sequence; must also expose `.input_ids`.
    self.encodings = encodings

  def __len__(self):
    # One example per row of input ids.
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    example = {}
    for field, values in self.encodings.items():
      example[field] = torch.tensor(values[idx])
    return example

train_dataset = SquadDataset(encoding)
val_dataset = SquadDataset(encoding_dev)

BATCH_SIZE = 2

# Only the training data is shuffled; the validation loader keeps a fixed order so
# that every evaluation pass sees the same batches.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Configurations

In [39]:
# Hyperparameters
learning_rate = 1e-5
# NOTE(review): presumably the maximum gradient norm for clipping — nothing reads it
# unless the clipping call in the training loop is enabled.
clip = 2

# Optimizer over all model parameters
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)#, weight_decay=0.001)

#model

In [None]:
torch.cuda.empty_cache()

NUM_EPOCHS = 1

# Per-epoch history. The accuracy lists are kept because the plotting cell reads
# them, but nothing in this loop fills them.
epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []

# NOTE(review): the tokenizer's token_type_ids are not forwarded, so the model falls
# back to its default segment ids. Forwarding them would require the same change
# everywhere the model is called for evaluation.
MODEL_INPUT_KEYS = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')


def _qa_loss(batch):
  """Move one collated batch to `device` and return the model's span loss (0-dim tensor)."""
  model_inputs = {key: batch[key].to(device) for key in MODEL_INPUT_KEYS}
  # With start/end positions supplied, the first element of the model output is the loss.
  return model(**model_inputs)[0]


def _mean(values):
  """Arithmetic mean of a list of floats; NaN when the list is empty."""
  return sum(values) / len(values) if values else float('nan')


for epoch in range(NUM_EPOCHS):

  # ---- training ----
  model.train()
  batch_losses = []
  for batch in train_dataloader:  # for every batch (a smaller final batch is fine for this model)
    loss = _qa_loss(batch)
    # The loss may come out non-finite, e.g. when every target of the batch is the
    # ignored position; stepping on such a loss would corrupt the weights.
    if not torch.isfinite(loss):
      continue

    # Delete previously stored gradients
    optimizer.zero_grad()
    # Perform backpropagation starting from the loss of this batch
    loss.backward()
    # Perform gradient clipping to address exploding gradients
    #nn.utils.clip_grad_norm_(model.parameters(), clip)
    # Update model's weights based on the gradients calculated during backprop
    optimizer.step()

    # Store a plain float: keeping the tensor would pin GPU memory for the whole
    # epoch and leave tensors in the history lists.
    batch_losses.append(loss.item())

  # ---- validation ----
  model.eval()
  batch_losses_dev = []
  with torch.no_grad():
    for batch in validation_dataloader:
      loss_dev = _qa_loss(batch)
      if torch.isfinite(loss_dev):
        batch_losses_dev.append(loss_dev.item())

  # Average over the batches that actually contributed a loss.
  train_loss = _mean(batch_losses)
  valid_loss = _mean(batch_losses_dev)

  epoch_loss.append(train_loss)
  epoch_loss_dev.append(valid_loss)

  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation Loss = {valid_loss:.5f} ")

### Evaluation

In [None]:
# Span-level evaluation on the validation set: the most likely start and end token
# of every example is compared with its gold position.
pred = []        # predicted (start, end) token indices, one tuple per example
dev_labels = []  # gold (start, end) token indices, in the same order as `pred`

model.eval()
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
with torch.no_grad():
  for batch in validation_dataloader:
    inputs = batch['input_ids'].to(device)
    masks = batch['attention_mask'].to(device)
    # Without labels the first two outputs are the start and end logits.
    y_dev_pred = model(inputs, attention_mask=masks)
    start_guess = y_dev_pred[0].argmax(dim=1).tolist()
    end_guess = y_dev_pred[1].argmax(dim=1).tolist()
    pred.extend(zip(start_guess, end_guess))
    dev_labels.extend(zip(batch['start_positions'].tolist(), batch['end_positions'].tolist()))

# Compare predictions to actual labels.
# Gold positions that fall outside the encoded sequence can never be matched and
# therefore count as errors.
n_examples = max(len(pred), 1)
start_acc = sum(gold[0] == guess[0] for gold, guess in zip(dev_labels, pred)) / n_examples
end_acc = sum(gold[1] == guess[1] for gold, guess in zip(dev_labels, pred)) / n_examples
exact_match = sum(gold == guess for gold, guess in zip(dev_labels, pred)) / n_examples
print(f"Start accuracy = {start_acc:.5f} | End accuracy = {end_acc:.5f} | Exact span match = {exact_match:.5f}")

In [None]:
# NOTE(review): this cell looks like a leftover from a 3-class sentiment task, while the
# model loaded in this notebook is a question-answering model. It relies on `label` and
# `y_dev_pred` having been left behind by the evaluation loop, and those are not
# per-example class labels for the whole validation set — TODO confirm whether the cell
# should be removed or replaced by a QA-specific analysis.
target_names = ['neutral', 'anti-vax', 'pro-vax']

cm = confusion_matrix(label, y_dev_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
# NOTE(review): the printed heading says "Training Data", but `y_dev_pred` is only ever
# assigned from validation batches. Like the confusion-matrix cell, this expects
# class labels in `label` / `y_dev_pred`, which a question-answering model does not
# produce — TODO confirm whether the cell should be removed.
print("Precision-Recall-F1 - Training Data :")
print(precision_recall_fscore_support(label, y_dev_pred, average='weighted'))

In [None]:
def plot_graph_loss(epochs=None):
    """Plot the per-epoch training and validation loss.

    Reads the notebook-level histories `epoch_loss` and `epoch_loss_dev`.

    Args:
        epochs: number of leading epochs to draw. Defaults to, and is capped at,
            the number of epochs present in both histories, so the x axis always
            matches the data that was actually recorded.
    """
    # float() also unwraps 0-dim tensors, which matplotlib cannot plot directly
    # when they live on the GPU.
    train_history = [float(value) for value in epoch_loss]
    dev_history = [float(value) for value in epoch_loss_dev]

    recorded = min(len(train_history), len(dev_history))
    if epochs is None or epochs > recorded:
        epochs = recorded
    epoch_axis = np.arange(epochs) + 1

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("Train/Validation Loss")
    ax.plot(epoch_axis, train_history[:epochs], label='train')
    ax.plot(epoch_axis, dev_history[:epochs], label='validation')
    ax.set_xlabel('num_epochs', fontsize=12)
    ax.set_ylabel('loss', fontsize=12)
    ax.legend()

# Plot every recorded epoch instead of a hard-coded count.
plot_graph_loss()

In [None]:
def plot_graph_acc(epochs=None):
    """Plot the per-epoch training and validation accuracy.

    Reads the notebook-level histories `epoch_acc` and `epoch_acc_dev`; when they
    are empty the axes are drawn without any curve.

    Args:
        epochs: number of leading epochs to draw. Defaults to, and is capped at,
            the number of epochs present in both histories, so the x axis always
            matches the data that was actually recorded.
    """
    # float() also unwraps 0-dim tensors, which matplotlib cannot plot directly
    # when they live on the GPU.
    train_history = [float(value) for value in epoch_acc]
    dev_history = [float(value) for value in epoch_acc_dev]

    recorded = min(len(train_history), len(dev_history))
    if epochs is None or epochs > recorded:
        epochs = recorded
    epoch_axis = np.arange(epochs) + 1

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("Train/Validation Accuracy")
    ax.plot(epoch_axis, train_history[:epochs], label='train')
    ax.plot(epoch_axis, dev_history[:epochs], label='validation')
    ax.set_xlabel('num_epochs', fontsize=12)
    ax.set_ylabel('accuracy', fontsize=12)
    ax.legend()

# Plot every recorded epoch instead of a hard-coded count.
plot_graph_acc()