# Vaccine Sentiment Classification
*by Nefeli Tavoulari*

#### In this notebook I classify tweets as Neutral, Pro-vax or Anti-vax.

## Install Dependencies

In [None]:
!pip install -U torch==1.8.0 torchtext==0.9.0
!pip install pyprind



## Import Packages

In [None]:
%matplotlib inline
import io
import re
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from wordcloud import WordCloud
import pyprind

import torch
import torch.nn as nn
from torchtext.legacy import data   
from torchtext.vocab import GloVe
from torchtext.legacy.data import BucketIterator
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import torchvision.transforms as transforms
import torchvision.datasets as dsets
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Upload dataset - Create dataframe

In [None]:
upload_train = files.upload()

Saving vs_train.csv to vs_train (2).csv


In [None]:
upload_dev = files.upload()

Saving vs_dev.csv to vs_dev (2).csv


In [None]:
train_df = pd.read_csv(io.BytesIO(upload_train['vs_train.csv']))
dev_df = pd.read_csv(io.BytesIO(upload_dev['vs_dev.csv']))

In [None]:
print(train_df) # training data

       Unnamed: 0                                              tweet  label
0               0  Sip N Shop Come thru right now #Marjais #Popul...      0
1               1  I don't know about you but My family and I wil...      1
2               2  @MSignorile Immunizations should be mandatory....      2
3               3  President Obama spoke in favor of vaccination ...      0
4               4  "@myfoxla: Arizona monitoring hundreds for mea...      0
...           ...                                                ...    ...
15971       15971  @Salon if u believe the anti-vax nutcases caus...      1
15972       15972  How do you feel about parents who don't #vacci...      0
15973       15973  70 Preschoolers Tested for Measles in Simi Val...      0
15974       15974  Finance Minister: Budget offers room to procur...      0
15975       15975  Are you up to date on vaccines? Take CDC’s vac...      2

[15976 rows x 3 columns]


In [None]:
print(dev_df) # validation data

      Unnamed: 0                                              tweet  label
0              0  @user They had a massive surge in with covid d...      1
1              1  Required vaccines for school: Parents and guar...      0
2              2  “@KCStar: Two more Johnson County children hav...      0
3              3  NV can do better. Which states are the best (a...      2
4              4  Nothing like killing ourselves w/ our own fear...      2
...          ...                                                ...    ...
2277        2277  RT @abc7: Number of measles cases reported in ...      0
2278        2278  Evidence points to the idea that "measles affe...      0
2279        2279  Where's @SavedYouAClick "@voxdotcom: Why you s...      2
2280        2280  Some of my favorite people have autism. If tha...      2
2281        2281  Coronavirus: The married couple behind the suc...      0

[2282 rows x 3 columns]


## Remove empty / duplicate tweets

In [None]:
train_df.dropna(subset = ["tweet"], inplace=True)
train_df.drop_duplicates(subset = ["tweet"], inplace=True)

dev_df.dropna(subset = ["tweet"], inplace=True)

train_df.drop(['Unnamed: 0'], axis=1, inplace = True) 
dev_df.drop(['Unnamed: 0'], axis=1, inplace = True) 

print(train_df.shape)
print(dev_df.shape)

(15881, 2)
(2282, 2)


In [None]:
total = train_df.append(dev_df)
total.shape

(18163, 2)

## Use Word Embeddings

In [None]:
TEXT = data.Field(tokenize='basic_english', lower=True)
LABEL = data.LabelField(dtype = torch.long, sequential=False)

In [None]:
train_df.to_csv("train.csv", index=False)
dev_df.to_csv("valid.csv", index=False)

train_data, valid_data = data.TabularDataset.splits(
    path="", train="train.csv", 
    validation="valid.csv",format="csv", skip_header=True, 
    fields=[('Text', TEXT), ('Label', LABEL)]
)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 15881
Number of validation examples: 2282


In [29]:
TEXT.build_vocab(train_data, vectors='glove.twitter.27B.200d')

# get the vocab instance
vocab = TEXT.vocab
vocab.vectors

.vector_cache/glove.twitter.27B.zip: 1.52GB [05:29, 4.62MB/s]                            
100%|█████████▉| 1193513/1193514 [01:33<00:00, 12709.00it/s]


tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 3.5132e-01,  5.6084e-04, -2.1488e-01,  ...,  3.6684e-02,
         -3.7206e-02,  8.5384e-01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])

In [30]:
TEXT.vocab.freqs.most_common(10) # seems right

[('.', 26488),
 ('the', 8603),
 ('//t', 7914),
 (',', 6809),
 ('to', 6789),
 ("'", 5535),
 ('http', 5346),
 ('a', 4985),
 ('vaccine', 4619),
 ('of', 4299)]

In [31]:
print(TEXT.vocab.stoi)



In [32]:
LABEL.build_vocab(train_data)
vocab_label = LABEL.vocab
print("Size of LABEL vocabulary:",len(vocab_label))
vocab_label.freqs

Size of LABEL vocabulary: 3


Counter({'0': 7385, '1': 2070, '2': 6426})

In [33]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 37422
Unique tokens in LABEL vocabulary: 3


## Create batch iterator

In [34]:
BATCH_SIZE = 20

device = torch.device('cpu')

# keep in mind the sort_key option 
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), sort_key=lambda x: len(x.Text),
    batch_size=BATCH_SIZE,
    device=device)

## Create model

In [75]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(output_dim, output_dim)
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):

        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        out = self.fc(hidden)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out

In [76]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 400
OUTPUT_DIM = 3

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
# pretrained_embeddings = TEXT.vocab.vectors
# model.embedding.weight.data = pretrained_embeddings
# print(pretrained_embeddings.shape)

In [37]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    preds, ind= torch.max(F.softmax(preds, dim=-1), 1)
    correct = (ind == y).float()
    acc = correct.sum()/float(len(correct))
    return acc

In [49]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.Text).squeeze(0)
        #predictions = predictions.reshape(-1, 1)
        #print(predictions.shape, batch.Label.shape, model(batch.Text).shape)

        loss = criterion(predictions, batch.Label)
        #print(loss.shape)
        acc = binary_accuracy(predictions, batch.Label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [39]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char='█')
        for batch in iterator:

            predictions = model(batch.Text).squeeze(0)
            
            loss = criterion(predictions, batch.Label)
            
            acc = binary_accuracy(predictions, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [77]:
N_EPOCHS = 2

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:05
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 01 | Train Loss: 1.099 | Train Acc: 46.45% | Val. Loss: 1.099 | Val. Acc: 46.70% |



Total time elapsed: 00:00:00
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:05
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 02 | Train Loss: 1.099 | Train Acc: 46.57% | Val. Loss: 1.099 | Val. Acc: 46.70% |



Total time elapsed: 00:00:00


In [None]:
model.eval()
y_pred = model(x_dev)
after_train = criterion(y_pred.squeeze(), y_dev) 
print('Test loss after Training' , after_train.item())

In [None]:
from ignite.metrics import Precision
from ignite.metrics import Recall


# Define the metric
precision = Precision()
recall = Recall()


# Start accumulation:
for x, y in data:
    y_pred = model(x)
    precision.update((y_pred, y))
    recall.update((y_pred, y))


# Compute the result
print("Precision: ", precision.compute())
print("Recall: ", recall.compute())
F1 = (precision * recall * 2 / (precision + recall)).mean()