# Vaccine Sentiment Classification
*by Nefeli Tavoulari*

#### In this notebook I classify tweets as Neutral, Pro-vax or Anti-vax.

## Install Dependencies

In [2]:
!pip install -U torch==1.8.0 torchtext==0.9.0

Collecting torch==1.8.0
  Downloading torch-1.8.0-cp37-cp37m-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 12 kB/s 
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 23.9 MB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.10.0+cu111
    Uninstalling torch-1.10.0+cu111:
      Successfully uninstalled torch-1.10.0+cu111
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.8.0 which is incompatible.
torchaudio 0.10.0

## Import Packages

In [3]:
%matplotlib inline
import io
import re
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from wordcloud import WordCloud
import nltk
import re
import csv
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

import torch
import torch.nn as nn
from torchtext.legacy import data   
from torchtext.vocab import GloVe
from torchtext.legacy.data import BucketIterator
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torchvision.transforms as transforms
import torchvision.datasets as dsets

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.metrics import roc_curve, accuracy_score, mean_absolute_error

SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Use GPU

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cpu


## Upload dataset - Create and Clean dataframe

In [5]:
upload_train = files.upload()

Saving vs_train.csv to vs_train.csv


In [6]:
upload_dev = files.upload()

Saving vs_dev.csv to vs_dev.csv


In [7]:
train_df = pd.read_csv(io.BytesIO(upload_train['vs_train.csv']))
dev_df = pd.read_csv(io.BytesIO(upload_dev['vs_dev.csv']))

In [8]:
train_df.dropna(subset = ["tweet"], inplace=True)
train_df.drop_duplicates(subset = ["tweet"], inplace=True)

dev_df.dropna(subset = ["tweet"], inplace=True)

train_df.drop(['Unnamed: 0'], axis = 1, inplace = True) 
dev_df.drop(['Unnamed: 0'], axis = 1, inplace = True) 

In [9]:
train_df["tweet"] = train_df["tweet"].apply(lambda line: re.sub('[^A-Za-z0-9]+', ' ', re.sub(r'http\S+', ' ',line.lower().strip())))
dev_df["tweet"] = dev_df["tweet"].apply(lambda line: re.sub('[^A-Za-z0-9]+', ' ', re.sub(r'http\S+', ' ',line.lower().strip())))

In [10]:
train_df.dropna(subset = ["tweet"], inplace=True)
dev_df.dropna(subset = ["tweet"], inplace=True)

In [11]:
print(train_df) # training data

                                                   tweet  label
0      sip n shop come thru right now marjais popular...      0
1      i don t know about you but my family and i wil...      1
2       msignorile immunizations should be mandatory ...      2
3      president obama spoke in favor of vaccination ...      0
4       myfoxla arizona monitoring hundreds for measl...      0
...                                                  ...    ...
15971   salon if u believe the anti vax nutcases caus...      1
15972  how do you feel about parents who don t vaccin...      0
15973  70 preschoolers tested for measles in simi val...      0
15974  finance minister budget offers room to procure...      0
15975  are you up to date on vaccines take cdc s vacc...      2

[15881 rows x 2 columns]


In [12]:
print(dev_df) # validation data

                                                  tweet  label
0      user they had a massive surge in with covid d...      1
1     required vaccines for school parents and guard...      0
2      kcstar two more johnson county children have ...      0
3     nv can do better which states are the best and...      2
4     nothing like killing ourselves w our own fear ...      2
...                                                 ...    ...
2277  rt abc7 number of measles cases reported in ca...      0
2278  evidence points to the idea that measles affec...      0
2279  where s savedyouaclick voxdotcom why you shoul...      2
2280  some of my favorite people have autism if that...      2
2281  coronavirus the married couple behind the succ...      0

[2282 rows x 2 columns]


## Use Glove Embeddings

In [13]:
TEXT = data.Field(sequential=True, tokenize="spacy", batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True)

In [14]:
train_df.to_csv("train.csv", index=False)
dev_df.to_csv("valid.csv", index=False)

train_data, valid_data = data.TabularDataset.splits(
    path="", train="train.csv", 
    validation="valid.csv",format="csv", skip_header=True, 
    fields=[('tweet', TEXT), ('label', LABEL)]
)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 15881
Number of validation examples: 2282


In [15]:
TEXT.build_vocab(train_data, valid_data, vectors='glove.twitter.27B.200d', max_size=20000, min_freq=5)

# get the vocab instance
vocab = TEXT.vocab
vocab.vectors

.vector_cache/glove.twitter.27B.zip: 1.52GB [04:47, 5.30MB/s]                            
100%|█████████▉| 1193513/1193514 [01:36<00:00, 12427.54it/s]


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4935,  0.3570,  0.6607,  ...,  0.1771, -0.5369, -0.2970],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1947,  0.3329,  0.2097,  ..., -0.1532, -0.4263,  0.2719],
        [-0.0272, -1.2332, -0.5078,  ...,  0.3071, -0.4746,  0.1992]])

In [16]:
TEXT.vocab.freqs.most_common(10) # seems right

[('the', 9936),
 ('to', 7894),
 (' ', 6059),
 ('vaccine', 5815),
 ('a', 5787),
 ('measles', 5075),
 ('of', 4966),
 ('and', 4559),
 ('i', 4488),
 ('in', 4301)]

In [17]:
LABEL.build_vocab(train_data)
vocab_label = LABEL.vocab
print("Size of LABEL vocabulary:",len(vocab_label))
vocab_label.freqs

Size of LABEL vocabulary: 4


Counter({'0': 7385, '1': 2070, '2': 6426})

In [18]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 4893
Unique tokens in LABEL vocabulary: 4


## Create Batch Iterator

In [19]:
BATCH_SIZE = 64
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), sort_key=lambda x: len(x.tweet),
    batch_size=BATCH_SIZE, shuffle=True, device=device)

LABEL.vocab.freqs

Counter({'0': 7385, '1': 2070, '2': 6426})

In [20]:
pretrained_embeddings = TEXT.vocab.vectors.to(device)
pretrained_embeddings.shape

torch.Size([4893, 200])

In [21]:
pretrained_embeddings

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4935,  0.3570,  0.6607,  ...,  0.1771, -0.5369, -0.2970],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1947,  0.3329,  0.2097,  ..., -0.1532, -0.4263,  0.2719],
        [-0.0272, -1.2332, -0.5078,  ...,  0.3071, -0.4746,  0.1992]])

## Attention Layer

In [102]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.concat_linear = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size, hidden_size)

    def forward(self, rnn_outputs, final_hidden_state):
        batch_size, seq_len, _ = rnn_outputs.shape
        attn_weights = self.attn(rnn_outputs) # (batch_size, seq_len, hidden_dim)
        attn_weights = torch.bmm(attn_weights, final_hidden_state.unsqueeze(2))
        attn_weights = F.softmax(attn_weights.squeeze(2), dim=1)
        context = torch.bmm(rnn_outputs.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)
        attn_hidden = torch.tanh(self.concat_linear(torch.cat((context, final_hidden_state), dim=1)))
        return attn_hidden, attn_weights

## LSTM

In [103]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, embedding_size, output_size, weights, bidirectional, skip_connections, attention):
        super(LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.bidirectional = bidirectional
        self.skip_connections = skip_connections
        if (self.skip_connections):
          self.skip = nn.Identity()
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.lstm = nn.LSTM(input_size=self.embedding_size,
                           hidden_size=self.hidden_size,
                           dropout=0.2,
                           num_layers=self.num_layers, 
                           bidirectional=self.bidirectional,
                           batch_first=True
                           )
        
        #self.tanh = nn.Tanh()
        #self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

        if self.bidirectional: 
          self.fc = nn.Linear(self.hidden_size*2, self.output_size)
          if (attention):
            self.attention = Attention(self.hidden_size * 2)
        else:
          self.fc = nn.Linear(hidden_size, self.output_size)
          if (attention):
            self.attention = Attention(self.hidden_size)

        self.attn = attention

    def forward(self, x):
        x = self.embedding(x)

        if self.bidirectional:
            h = torch.zeros((self.num_layers*2, x.size(0), self.hidden_size)).to(device)
            c = torch.zeros((self.num_layers*2, x.size(0), self.hidden_size)).to(device)
        else:
            h = torch.zeros((self.num_layers, x.size(0), self.hidden_size)).to(device)
            c = torch.zeros((self.num_layers, x.size(0), self.hidden_size)).to(device)

        out, hidden = self.lstm(x, (h, c))

        if (self.skip_connections):
          out = out + self.skip(out)
         

        final_hidden_state = None

        if self.bidirectional: 
          if (self.attn):
            final_state = hidden[0].view(self.num_layers, 2, x.size(0), self.hidden_size)[-1]
            h_1, h_2 = final_state[0], final_state[1]
            final_hidden_state = torch.cat((h_1, h_2), 1) 
          else:
            out = out.contiguous().view(-1, self.hidden_size*2)
        else:
          if (self.attn):
            final_state = hidden[0].view(self.num_layers, 1, x.size(0), self.hidden_size)[-1]
            final_hidden_state = final_state.squeeze(0)
          else:
            out = out.contiguous().view(-1, self.hidden_size)

    
        #out = self.tanh(out)
        #out = self.relu(out)
        out = self.dropout(out)
        if (self.attn):
          out, attn_weights = self.attention(out, final_hidden_state)
        
        out = self.fc(out)  
        
        out = out.view(x.size(0), -1, self.output_size)

        out = out[:, -1, :]

        return out.to(device)

### Configurations

In [104]:
#Define layer sizes
vocab_size = pretrained_embeddings.shape[0]
hidden_size = 256
num_layers = 2
embedding_size = pretrained_embeddings.shape[1]
output_size = 3

#Define Hyperparameters
learning_rate = 1e-4

#Initialize model, optimizer
model = LSTM(vocab_size, hidden_size, num_layers, embedding_size, output_size, pretrained_embeddings, False, False, True).to(device)
model.embedding.weight.data = pretrained_embeddings
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#, weight_decay=0.001)

clip = 4

In [105]:
model

LSTM(
  (embedding): Embedding(4893, 200)
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (attention): Attention(
    (concat_linear): Linear(in_features=1024, out_features=512, bias=True)
    (attn): Linear(in_features=512, out_features=512, bias=True)
  )
)

In [106]:
epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []


for epoch in range(40):

  batch_losses = []
  batch_acc = 0
  total = 0
  total_dev = 0
  loss = 0
  pred_proba = []

  # training
  model.train()
  for (tweet, label) in train_iterator:  # for every batch
    if tweet.shape[0] == BATCH_SIZE:
      tweet = tweet.to(device)
      label = label.to(device)
      y_pred = model(tweet)
      loss = criterion(y_pred, label)
      batch_losses.append(loss)
      #Delete previously stored gradients
      optimizer.zero_grad()
      #Perform backpropagation starting from the loss calculated in this epoch
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      #Update model's weights based on the gradients calculated during backprop
      optimizer.step()

      # Total number of labels
      total += label.size(0)
      # Total correct predictions
      _,pred_label = torch.max(y_pred, dim = 1)
      batch_acc += (pred_label == label).sum()

  # validation    
  with torch.no_grad():
    batch_losses_dev = []
    batch_acc_dev = 0
    model.eval()
    for (tweet, label) in valid_iterator:
      if tweet.shape[0] == BATCH_SIZE:
        tweet = tweet.to(device)
        label = label.to(device)
        y_dev_pred = model(tweet)
        prob = F.softmax(y_dev_pred, dim=1)   # probability that an instance belogs to each class
        for i in prob:
          pred_proba.append(i.tolist())
        loss_dev = criterion(y_dev_pred, label)
        batch_losses_dev.append(loss_dev)
        # number of labels
        total_dev += label.size(0)
        # correct predictions
        _,pred_label = torch.max(y_dev_pred, dim = 1)  # get max probability
        #print(y_dev_pred, pred_label)
        batch_acc_dev += (pred_label == label).sum()


  accuracy = batch_acc/total
  accuracy_dev = batch_acc_dev/total_dev

  train_loss = sum(batch_losses)/len(train_iterator)
  valid_loss = sum(batch_losses_dev)/len(valid_iterator)

  epoch_loss.append(train_loss)
  epoch_loss_dev.append(valid_loss)
  epoch_acc.append(accuracy)
  epoch_acc_dev.append(accuracy_dev)

  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Train Accuracy = {accuracy:.5f} | Validation Loss = {valid_loss:.5f} | Validation Accuracy = {accuracy_dev:.5f} ")

Epoch   0: | Train Loss = 0.82949 | Train Accuracy = 0.61448 | Validation Loss = 0.88441 | Validation Accuracy = 0.59911 


KeyboardInterrupt: ignored

### Evaluation

In [None]:
valid = pd.read_csv("valid.csv")

In [None]:
def predict(sentence):
    tokenized = [tok for tok in sentence.split()]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    tensor = torch.LongTensor(indexed)
    tensor = tensor.unsqueeze(0)
    prediction = model(tensor.to(device))

    preds, ind = torch.max(F.softmax(prediction, dim=-1), 1)
    return preds, ind, F.softmax(prediction, dim=-1)


In [None]:
pred = [predict(k)[1].cpu().numpy() for k in valid.tweet]
pred_proba = [predict(k)[2] for k in valid.tweet]

print(classification_report(valid.label, pred))

In [None]:
target_names = ['neutral', 'anti-vax', 'pro-vax']

cm = confusion_matrix(valid.label, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
print("Precision-Recall-F1 - Training Data :")
print(precision_recall_fscore_support(valid.label, pred, average='micro'))

In [None]:
proba_list = []
for i in pred_proba:
  proba_list.append(i.squeeze(0).tolist())
#proba_list

In [None]:
macro_roc_auc_ovo = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovo", average="macro")
weighted_roc_auc_ovo = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovo", average="weighted")
macro_roc_auc_ovr = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovr", average="macro")
weighted_roc_auc_ovr = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovr", average="weighted")
print(
    "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovo, weighted_roc_auc_ovo)
)
print()
print(
    "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr)
)

### Plots

In [None]:
# probabilities
df_prob = pd.DataFrame(proba_list)
df_prob

In [None]:
# roc curve for classes
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):    
    fpr[i], tpr[i], thresh[i] = roc_curve(valid.label, df_prob[i], pos_label=i)
    
# plotting    
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);  

In [None]:
def plot_graph_loss(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Loss")
    plt.plot(list(np.arange(epochs) + 1) , epoch_loss, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_loss_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(['train', 'validation']);

plot_graph_loss(40)

In [None]:
def plot_graph_acc(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Accuracy")
    plt.plot(list(np.arange(epochs) + 1) , epoch_acc, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_acc_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('accuracy', fontsize=12)
    plt.legend(['train', 'validation']);

plot_graph_acc(40)    

## GRU

In [118]:
class GRU(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, embedding_size, output_size, weights, bidirectional, skip_connections, attention):
        super(GRU, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.bidirectional = bidirectional

        self.skip_connections = skip_connections
        if (self.skip_connections):
          self.skip = nn.Identity()
        self.embedding = nn.Embedding.from_pretrained(weights)

        self.GRU = nn.GRU(input_size=self.embedding_size,
                           hidden_size=self.hidden_size,
                           #dropout=dropout,
                           num_layers=self.num_layers, 
                           bidirectional=self.bidirectional,
                           batch_first=True
                           )
        
        if self.bidirectional: 
          self.fc = nn.Linear(self.hidden_size*2, self.output_size)
          if (attention):
            self.attention = Attention(self.hidden_size * 2)
        else:
          self.fc = nn.Linear(self.hidden_size, self.output_size)
          if (attention):
            self.attention = Attention(self.hidden_size)

        self.attn = attention
        self.dropout = nn.Dropout()

    def forward(self, x):
        x = self.embedding(x)
        
        if self.bidirectional:
            h = torch.zeros((self.num_layers*2, x.size(0), self.hidden_size))
        else:
            h = torch.zeros((self.num_layers, x.size(0), self.hidden_size))

        out, hidden = self.GRU(x, h)

        if (self.skip_connections):
          out = out + self.skip(out)

        final_hidden_state = None

        if self.bidirectional: 
          if (self.attn):
            final_state = hidden.view(self.num_layers, 2, x.size(0), self.hidden_size)[-1]
            h_1, h_2 = final_state[0], final_state[1]
            final_hidden_state = torch.cat((h_1, h_2), 1) 
          else:
            out = out.contiguous().view(-1, self.hidden_size*2)
        else:
          if (self.attn):
            final_state = hidden.view(self.num_layers, 1, x.size(0), self.hidden_size)[-1]
            final_hidden_state = final_state.squeeze(0)
          else:
            out = out.contiguous().view(-1, self.hidden_size)

        #out = self.tanh(out)
        #out = self.relu(out)
        out = self.dropout(out)
        if (self.attn):
          out, attn_weights = self.attention(out, final_hidden_state)
        
        out = self.fc(out)   
         
        out = out.view(x.size(0), -1, self.output_size)
        out = out[:, -1, :]
        
        return out.to(device)

### Configurations

In [122]:
#Define layer sizes
vocab_size = pretrained_embeddings.shape[0]
hidden_size = 128
num_layers = 2
embedding_size = pretrained_embeddings.shape[1]
output_size = 3

#Define Hyperparameters
learning_rate = 1e-4

#Initialize model, optimizer
model = GRU(vocab_size, hidden_size, num_layers, embedding_size, output_size, pretrained_embeddings, False, False, False).to(device)
model.embedding.weight.data = pretrained_embeddings
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

clip = 5

In [123]:
model

GRU(
  (skip): Identity()
  (embedding): Embedding(4893, 200)
  (GRU): GRU(200, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [124]:
epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []


for epoch in range(40):

  batch_losses = []
  batch_acc = 0
  total = 0
  total_dev = 0
  loss = 0
  pred_proba = []

  # training
  model.train()
  for (tweet, label) in train_iterator:  # for every batch
    if tweet.shape[0] == BATCH_SIZE:
      tweet = tweet.to(device)
      label = label.to(device)
      y_pred = model(tweet)
      loss = criterion(y_pred, label)
      batch_losses.append(loss)
      #Delete previously stored gradients
      optimizer.zero_grad()
      #Perform backpropagation starting from the loss calculated in this epoch
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      #Update model's weights based on the gradients calculated during backprop
      optimizer.step()

      # Total number of labels
      total += label.size(0)
      # Total correct predictions
      _,pred_label = torch.max(y_pred, dim = 1)
      batch_acc += (pred_label == label).sum()

  # validation    
  with torch.no_grad():
    batch_losses_dev = []
    batch_acc_dev = 0
    model.eval()
    for (tweet, label) in valid_iterator:
      if tweet.shape[0] == BATCH_SIZE:
        tweet = tweet.to(device)
        label = label.to(device)
        y_dev_pred = model(tweet)
        prob = F.softmax(y_dev_pred, dim=1)   # probability that an instance belogs to each class
        #print(prob.shape)
        for i in prob:
          pred_proba.append(i.tolist())
        loss_dev = criterion(y_dev_pred, label)
        batch_losses_dev.append(loss_dev)
        # number of labels
        total_dev += label.size(0)
        # correct predictions
        _,pred_label = torch.max(y_dev_pred, dim = 1)  # get max probability
        batch_acc_dev += (pred_label == label).sum()


  accuracy = batch_acc/total
  accuracy_dev = batch_acc_dev/total_dev

  train_loss = sum(batch_losses)/len(train_iterator)
  valid_loss = sum(batch_losses_dev)/len(valid_iterator)

  epoch_loss.append(train_loss)
  epoch_loss_dev.append(valid_loss)
  epoch_acc.append(accuracy)
  epoch_acc_dev.append(accuracy_dev)

  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Train Accuracy = {accuracy:.5f} | Validation Loss = {valid_loss:.5f} | Validation Accuracy = {accuracy_dev:.5f} ")

KeyboardInterrupt: ignored

In [None]:
valid = pd.read_csv("valid.csv")

In [None]:
def predict(sentence):
    tokenized = [tok for tok in sentence.split()]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    tensor = torch.LongTensor(indexed)
    tensor = tensor.unsqueeze(0)
    prediction = model(tensor.to(device))

    preds, ind = torch.max(F.softmax(prediction, dim=-1), 1)
    return preds, ind, F.softmax(prediction, dim=-1)


In [None]:
pred = [predict(k)[1].cpu().numpy() for k in valid.tweet]
pred_proba = [predict(k)[2] for k in valid.tweet]

print(classification_report(valid.label, pred))

In [None]:
target_names = ['neutral', 'anti-vax', 'pro-vax']

cm = confusion_matrix(valid.label, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
print("Precision-Recall-F1 - Training Data :")
print(precision_recall_fscore_support(valid.label, pred, average='micro'))

In [None]:
proba_list = []
for i in pred_proba:
  proba_list.append(i.squeeze(0).tolist())
#proba_list

In [None]:
macro_roc_auc_ovo = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovo", average="macro")
weighted_roc_auc_ovo = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovo", average="weighted")
macro_roc_auc_ovr = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovr", average="macro")
weighted_roc_auc_ovr = roc_auc_score(valid.label.tolist(), proba_list, multi_class="ovr", average="weighted")
print(
    "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovo, weighted_roc_auc_ovo)
)
print()
print(
    "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr)
)

### Plots

In [None]:
# probabilities
df_prob = pd.DataFrame(proba_list)
df_prob

In [None]:
# roc curve for classes
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):    
    fpr[i], tpr[i], thresh[i] = roc_curve(valid.label, df_prob[i], pos_label=i)
    
# plotting    
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);  

In [None]:
def plot_graph_loss(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Loss")
    plt.plot(list(np.arange(epochs) + 1) , epoch_loss, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_loss_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(['train', 'validation']);

plot_graph_loss(40)

In [None]:
def plot_graph_acc(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Accuracy")
    plt.plot(list(np.arange(epochs) + 1) , epoch_acc, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_acc_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('accuracy', fontsize=12)
    plt.legend(['train', 'validation']);

plot_graph_acc(40)    