# Vaccine Sentiment Classification
*by Nefeli Tavoulari*

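#### In this notebook I load the SQuAD v2 dataset, tokenize it with the BERT tokenizer, and fine-tune and evaluate a pretrained BERT model in PyTorch.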

## Install Dependencies

In [5]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 4.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 34.2 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 37.8 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 33.2 MB/s 
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[K     |████████████████████████████████| 144 kB 12.5 MB/s 
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting mult

## Import Packages

In [6]:
%matplotlib inline
import io
import re
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import re
import csv

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.metrics import roc_curve, accuracy_score

import transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertForQuestionAnswering
from datasets import load_dataset
import logging
logging.basicConfig(level=logging.INFO)

# Seed every random number generator this notebook relies on so that a
# Restart & Run All reproduces the same shuffling and weight initialisation.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # all visible GPUs; silently ignored without CUDA
# Ask cuDNN for deterministic kernels and switch off its benchmark autotuner,
# which may otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Use GPU for faster processing

In [7]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Available device:", device)

Available device: cuda


## Load dataset - Create and Clean dataframes

In [8]:
# Download (or reuse the cached copy of) the SQuAD v2 splits; one dataset is
# returned per requested split, in the order the splits are listed.
squad_splits = load_dataset('squad_v2', split=['train', 'validation'])
train_df, dev_df = squad_splits

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Materialise both splits as pandas DataFrames (one row per example).
train_df, dev_df = (pd.DataFrame(split) for split in (train_df, dev_df))

In [10]:
# Remove rows without a question and the columns that are not used below.
# (No de-duplication happens here.)
# The frames are rebound instead of mutated in place, and missing columns are
# ignored, so re-running this cell is harmless instead of raising a KeyError
# for the already-dropped 'id' / 'title' columns.
UNUSED_COLUMNS = ['id', 'title']

train_df = train_df.dropna(subset=["question"]).drop(columns=UNUSED_COLUMNS, errors="ignore")
dev_df = dev_df.dropna(subset=["question"]).drop(columns=UNUSED_COLUMNS, errors="ignore")

In [11]:
def flatten_answers(squad_df):
  """Expand a SQuAD-style frame to one row per reference answer.

  Args:
    squad_df: DataFrame with 'context', 'question' and 'answers' columns,
      where each 'answers' value is a mapping whose 'text' entry lists the
      reference answer strings.

  Returns:
    A new DataFrame with columns 'context', 'question', 'answer'. A question
    with several reference answers yields one row per answer; a question with
    no reference answer yields a single row whose answer is "".
  """
  rows = []
  for context, question, answers in zip(squad_df['context'], squad_df['question'], squad_df['answers']):
    answer_texts = answers['text']
    if len(answer_texts) == 0:
      # question without a reference answer: keep it, with an empty answer
      rows.append((context, question, ""))
    else:
      rows.extend((context, question, text) for text in answer_texts)
  return pd.DataFrame(rows, columns=['context', 'question', 'answer'])


# One helper for both splits: avoids the copy-pasted loops and no longer
# rebinds the builtin name `dict` at notebook level.
train_df = flatten_answers(train_df)
dev_df = flatten_answers(dev_df)

In [12]:
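# remove special characters, urls, emojis and lowercase tweets
# (disabled: leftover from a tweet dataset; the frames built above only have
# the columns 'context', 'question' and 'answer', so there is no "tweet" column)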
# train_df["tweet"] = train_df["tweet"].apply(lambda line: re.sub('[^A-Za-z0-9]+', ' ', re.sub(r'http\S+', ' ',line.lower().strip())))
# dev_df["tweet"] = dev_df["tweet"].apply(lambda line: re.sub('[^A-Za-z0-9]+', ' ', re.sub(r'http\S+', ' ',line.lower().strip())))

In [13]:
# remove empty instances again
# train_df.dropna(subset = ["tweet"], inplace=True)
# dev_df.dropna(subset = ["tweet"], inplace=True)

In [14]:
# training data: a bare last expression uses the notebook's rich (HTML)
# DataFrame rendering instead of the plain-text print() dump
train_df

                                                  context  ...               answer
0       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...  ...    in the late 1990s
1       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...  ...  singing and dancing
2       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...  ...                 2003
3       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...  ...       Houston, Texas
4       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...  ...           late 1990s
...                                                   ...  ...                  ...
130314  The term "matter" is used throughout physics i...  ...                     
130315  The term "matter" is used throughout physics i...  ...                     
130316  The term "matter" is used throughout physics i...  ...                     
130317  The term "matter" is used throughout physics i...  ...                     
130318  The term "matter" is used throughout physics i...  ...              

In [15]:
# validation data: a bare last expression uses the notebook's rich (HTML)
# DataFrame rendering instead of the plain-text print() dump
dev_df

                                                 context  ...                   answer
0      The Normans (Norman: Nourmands; French: Norman...  ...                   France
1      The Normans (Norman: Nourmands; French: Norman...  ...                   France
2      The Normans (Norman: Nourmands; French: Norman...  ...                   France
3      The Normans (Norman: Nourmands; French: Norman...  ...                   France
4      The Normans (Norman: Nourmands; French: Norman...  ...  10th and 11th centuries
...                                                  ...  ...                      ...
26242  The pound-force has a metric counterpart, less...  ...                   sthène
26243  The pound-force has a metric counterpart, less...  ...                         
26244  The pound-force has a metric counterpart, less...  ...                         
26245  The pound-force has a metric counterpart, less...  ...                         
26246  The pound-force has a metric counter

## Load BERT tokenizer and model

In [16]:
# Load the tokenizer and a question-answering BERT model from the same
# pretrained checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForQuestionAnswering.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [18]:
transformers.logging.set_verbosity_error()

# bert-base-uncased has 512 position embeddings; longer inputs cannot be fed
# to the model, so every (context, question) pair is truncated to this length.
MAX_LEN = 512


def encode_pairs(frame):
  """Tokenize the (context, question) pairs of `frame` into padded tensors.

  Args:
    frame: DataFrame with 'context' and 'question' columns.

  Returns:
    The tokenizer output for all rows, as PyTorch tensors padded to the
    longest encoded pair and truncated to at most MAX_LEN tokens.
  """
  return tokenizer(
      frame["context"].tolist(),
      frame["question"].tolist(),
      padding=True,
      truncation=True,  # without this, a long pair produces an over-length batch
      max_length=MAX_LEN,
      return_tensors="pt",
  )


# training data
label = train_df["answer"].tolist()
encoding = encode_pairs(train_df)
input_ids = encoding["input_ids"] # token ids
attention_mask = encoding["attention_mask"] # 1 for real tokens, 0 for padding

# validation data
label_dev = dev_df["answer"].tolist()
encoding = encode_pairs(dev_df)
input_ids_dev = encoding["input_ids"] # token ids
attention_mask_dev = encoding["attention_mask"] # 1 for real tokens, 0 for padding

In [31]:
# Build the tensors, datasets and dataloaders used for training/validation.
# The encodings were requested with return_tensors="pt", so the ids and masks
# are already tensors; clone().detach() only makes independent copies of them.

train_inputs = input_ids.clone().detach()
dev_inputs = input_ids_dev.clone().detach()

train_masks = attention_mask.clone().detach()
dev_masks = attention_mask_dev.clone().detach()

# NOTE(review): `train_labels` / `dev_labels` are used below, but their only
# assignments in this cell are the two commented-out lines that follow, so on
# a fresh kernel the names are undefined. `label` / `label_dev` hold answer
# strings (train_df["answer"].tolist()), which torch.tensor() cannot convert;
# presumably that is why the lines were disabled. Numeric labels must be
# defined before this cell can run -- TODO decide the label encoding.
# train_labels = torch.tensor(label)
# dev_labels = torch.tensor(label_dev)

# create datasets, dataloaders
BATCH_SIZE = 32
# each dataset item is a (token ids, attention mask, label) triple
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# NOTE(review): shuffle=True is not needed to compute validation metrics and
# makes the batch order differ from the row order of dev_df -- confirm intent.
validation_dataset = torch.utils.data.TensorDataset(dev_inputs, dev_masks, dev_labels)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

ValueError: ignored

## Configurations

In [None]:
# Hyperparameters
learning_rate = 1e-5
clip = 2  # maximum gradient norm used for gradient clipping during training

# Initialize model, loss and optimizer.
# This rebinds `model`, replacing the BertForQuestionAnswering instance
# loaded earlier in the notebook.
# problem_type is "single_label_classification" so that the model config
# agrees with how the logits are used: one of 3 mutually exclusive classes,
# scored with CrossEntropyLoss and an argmax for accuracy.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    problem_type="single_label_classification",
).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
NUM_EPOCHS = 1

# Per-epoch history (plain Python floats) read by the plotting cells.
epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []


def run_epoch(model, dataloader, criterion, device, optimizer=None, clip=None):
  """Run one pass over `dataloader` and return (mean loss, accuracy).

  Args:
    model: classifier called as model(input_ids, attention_mask); the first
      element of its output is used as the logits.
    dataloader: yields (input_ids, attention_mask, labels) batches.
    criterion: loss called as criterion(logits, labels).
    device: device the batches are moved to.
    optimizer: when given, the model is put in train mode and updated after
      every batch; when None, the pass is evaluation only (eval mode, no
      gradients).
    clip: optional maximum gradient norm applied before each optimizer step.

  Returns:
    (loss, accuracy) as Python floats, both averaged over the samples seen.
    Both are NaN when the dataloader yields no samples.
  """
  training = optimizer is not None
  model.train(training)

  loss_sum = 0.0
  correct = 0
  seen = 0
  with torch.set_grad_enabled(training):
    for inputs, masks, labels in dataloader:
      inputs = inputs.to(device)
      masks = masks.to(device)
      labels = labels.to(device)

      logits = model(inputs, masks)[0]
      loss = criterion(logits, labels)

      if training:
        # Delete previously stored gradients
        optimizer.zero_grad()
        # Backpropagate the loss of this batch
        loss.backward()
        # Clip gradients to address exploding gradients
        if clip is not None:
          nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Update the model's weights
        optimizer.step()

      batch_size = labels.size(0)
      # .item() stores a plain float; keeping the loss tensor itself would
      # keep every batch's autograd graph alive for the whole epoch.
      loss_sum += loss.item() * batch_size
      correct += (logits.argmax(dim=1) == labels).sum().item()
      seen += batch_size

  if seen == 0:
    return float("nan"), float("nan")
  # Averaging over the samples actually processed keeps the mean correct even
  # when the last batch is smaller than the others.
  return loss_sum / seen, correct / seen


for epoch in range(NUM_EPOCHS):
  train_loss, accuracy = run_epoch(model, train_dataloader, criterion, device, optimizer=optimizer, clip=clip)
  valid_loss, accuracy_dev = run_epoch(model, validation_dataloader, criterion, device)

  epoch_loss.append(train_loss)
  epoch_loss_dev.append(valid_loss)
  epoch_acc.append(accuracy)
  epoch_acc_dev.append(accuracy_dev)

  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Train Accuracy = {accuracy:.5f} | Validation Loss = {valid_loss:.5f} | Validation Accuracy = {accuracy_dev:.5f} ")

### Evaluation

In [None]:
# Score the whole dev set in dataset order so that predictions and true
# labels line up element by element. A separate loader is created instead of
# rebinding `validation_dataloader`, so re-running the training cell still
# validates with BATCH_SIZE-sized batches.
eval_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

true_labels = []
pred = []
model.eval()
with torch.no_grad():  # inference only: no gradients needed
  for (inputs, masks, batch_labels) in eval_dataloader:
    logits = model(inputs.to(device), masks.to(device))[0]
    # keep predicted class ids (not raw model outputs) as plain Python ints
    pred.extend(logits.argmax(dim=1).cpu().tolist())
    # assumes one integer class id per example -- TODO confirm label format
    true_labels.extend(batch_labels.tolist())

# The following cells read `label` and `y_dev_pred`; bind them to the full,
# aligned label / prediction lists rather than to the last loop batch.
label = true_labels
y_dev_pred = pred

# Compare predictions to actual labels
print(classification_report(true_labels, pred))

In [None]:
target_names = ['neutral', 'anti-vax', 'pro-vax']

# NOTE(review): `label` / `y_dev_pred` are expected to hold the true and the
# predicted class ids for the dev set -- confirm they are provided in that
# form by the evaluation cell.
# Passing `labels` pins the matrix to one row/column per entry of
# target_names, even if a class never occurs in the data; this assumes class
# ids 0..2 follow the order of target_names -- TODO confirm.
class_ids = list(range(len(target_names)))
cm = confusion_matrix(label, y_dev_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
# Weighted precision / recall / F1. The inputs are the dev-set variables
# (`label`, `y_dev_pred`), so the header says validation, not training.
print("Precision-Recall-F1 - Validation Data :")
print(precision_recall_fscore_support(label, y_dev_pred, average='weighted'))

In [None]:
def plot_graph_loss(epochs=None):
    """Plot the per-epoch training and validation loss.

    Reads the notebook-level lists `epoch_loss` and `epoch_loss_dev`.

    Args:
        epochs: maximum number of epochs to draw; by default every recorded
            epoch is drawn. The x axis always matches the number of recorded
            values, so asking for more epochs than were run does not fail.
    """
    # float() also accepts 0-dim tensors, which matplotlib cannot always plot
    train_curve = [float(value) for value in epoch_loss[:epochs]]
    dev_curve = [float(value) for value in epoch_loss_dev[:epochs]]

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("Train/Validation Loss")
    ax.plot(np.arange(1, len(train_curve) + 1), train_curve, label='train')
    ax.plot(np.arange(1, len(dev_curve) + 1), dev_curve, label='validation')
    ax.set_xlabel('num_epochs', fontsize=12)
    ax.set_ylabel('loss', fontsize=12)
    ax.legend()

plot_graph_loss()

In [None]:
def plot_graph_acc(epochs=None):
    """Plot the per-epoch training and validation accuracy.

    Reads the notebook-level lists `epoch_acc` and `epoch_acc_dev`.

    Args:
        epochs: maximum number of epochs to draw; by default every recorded
            epoch is drawn. The x axis always matches the number of recorded
            values, so asking for more epochs than were run does not fail.
    """
    # float() also accepts 0-dim tensors, which matplotlib cannot always plot
    train_curve = [float(value) for value in epoch_acc[:epochs]]
    dev_curve = [float(value) for value in epoch_acc_dev[:epochs]]

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("Train/Validation Accuracy")
    ax.plot(np.arange(1, len(train_curve) + 1), train_curve, label='train')
    ax.plot(np.arange(1, len(dev_curve) + 1), dev_curve, label='validation')
    ax.set_xlabel('num_epochs', fontsize=12)
    ax.set_ylabel('accuracy', fontsize=12)
    ax.legend()

plot_graph_acc()