# Vaccine Sentiment Classification
*by Nefeli Tavoulari*

#### In this notebook I classify tweets as Neutral, Pro-vax or Anti-vax.

## Install Dependencies

In [2]:
!pip install -U torch==1.8.0 torchtext==0.9.0
!pip install 

[31mERROR: You must give at least one requirement to install (see "pip help install")[0m


## Import Packages

In [3]:
%matplotlib inline
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from wordcloud import WordCloud
import nltk
import re
import csv
import random
import os
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

import torch
import torch.nn as nn
from torchtext.legacy import data   
from torchtext.vocab import GloVe
from torchtext.legacy.data import BucketIterator
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim

import torchvision.transforms as transforms
import torchvision.datasets as dsets

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.metrics import roc_curve, accuracy_score, mean_absolute_error

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Set Reproducibility Seed

In [4]:
def set_seed(seed = 1234):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

device = 'cpu'
print('Working on:', device)

Working on: cpu


## Upload dataset - Create dataframe

In [5]:
upload_train = files.upload()

Saving vs_train.csv to vs_train (1).csv


In [6]:
upload_dev = files.upload()

Saving vs_dev.csv to vs_dev (1).csv


## Upload Glove Embeddings

In [7]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

In [8]:
!wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip
# free some space
!rm glove.twitter.27B.zip

--2021-12-22 19:03:28--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2021-12-22 19:03:28--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [application/zip]
Saving to: ‘glove.twitter.27B.zip’


2021-12-22 19:08:14 (5.08 MB/s) - ‘glove.twitter.27B.zip’ saved [1520408563/1520408563]

Archive:  glove.twitter.27B.zip
replace glove.twitter.27B.25d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: glove.twitter.27B.25d.tx

In [9]:
gloveModel = load_glove_model("glove.twitter.27B.25d.txt")
dim = 25
#
#gloveModel = load_glove_model("glove.twitter.27B.50d.txt")
#dim = 50
#
#gloveModel = load_glove_model("glove.twitter.27B.100d.txt")
#dim = 100
#
#gloveModel = load_glove_model("glove.twitter.27B.200d.txt")
#dim = 200


Loading Glove Model
1193514 words loaded!


In [10]:
train_df = pd.read_csv(io.BytesIO(upload_train['vs_train.csv']))
dev_df = pd.read_csv(io.BytesIO(upload_dev['vs_dev.csv']))

In [11]:
print(train_df) # training data

       Unnamed: 0                                              tweet  label
0               0  Sip N Shop Come thru right now #Marjais #Popul...      0
1               1  I don't know about you but My family and I wil...      1
2               2  @MSignorile Immunizations should be mandatory....      2
3               3  President Obama spoke in favor of vaccination ...      0
4               4  "@myfoxla: Arizona monitoring hundreds for mea...      0
...           ...                                                ...    ...
15971       15971  @Salon if u believe the anti-vax nutcases caus...      1
15972       15972  How do you feel about parents who don't #vacci...      0
15973       15973  70 Preschoolers Tested for Measles in Simi Val...      0
15974       15974  Finance Minister: Budget offers room to procur...      0
15975       15975  Are you up to date on vaccines? Take CDC’s vac...      2

[15976 rows x 3 columns]


In [12]:
print(dev_df) # validation data

      Unnamed: 0                                              tweet  label
0              0  @user They had a massive surge in with covid d...      1
1              1  Required vaccines for school: Parents and guar...      0
2              2  “@KCStar: Two more Johnson County children hav...      0
3              3  NV can do better. Which states are the best (a...      2
4              4  Nothing like killing ourselves w/ our own fear...      2
...          ...                                                ...    ...
2277        2277  RT @abc7: Number of measles cases reported in ...      0
2278        2278  Evidence points to the idea that "measles affe...      0
2279        2279  Where's @SavedYouAClick "@voxdotcom: Why you s...      2
2280        2280  Some of my favorite people have autism. If tha...      2
2281        2281  Coronavirus: The married couple behind the suc...      0

[2282 rows x 3 columns]


## Remove empty / duplicate tweets

In [13]:
train_df.dropna(subset = ["tweet"], inplace=True)
train_df.drop_duplicates(subset = ["tweet"], inplace=True)

dev_df.dropna(subset = ["tweet"], inplace=True)

train_df.drop(['Unnamed: 0'], axis=1, inplace = True) 
dev_df.drop(['Unnamed: 0'], axis=1, inplace = True) 

print(train_df.shape)
print(dev_df.shape)

(15881, 2)
(2282, 2)


## Clean text

In [14]:
def clean_text(text):
  text = text.lower()                                           # lowercase
  text = text.strip()                                           # remove white spaces
  #text = ' '.join(re.sub("(@[A-Za-z0-9]+)"," ",text).split())   # remove twitter user accounts
  text = re.sub(r'http\S+', ' ', text)                          # remove urls
  text = re.sub('[^A-Za-z0-9]+', ' ', text)                     # remove special characters
  # perform lemmatization
  cleaned_text = ""
  for word in text.split() :
    if word in stop_words:                                      # remove stopwords
      continue
    temp = lemmatizer.lemmatize(word)                           # lemmatize
    #temp = snowball.stem(word)
    #temp = lancaster.stem(word)
    #temp = porter.stem(word)
    cleaned_text += (temp + " ")
  return cleaned_text

In [15]:
cleaned_text = []                                  # clean training data
for line in train_df["tweet"]:
  cleaned_text.append(clean_text(line))
cleaned_text_val = []                              # clean validation data
for line in dev_df["tweet"]:
  cleaned_text_val.append(clean_text(line))

train_df = train_df.assign(clean_tweet = lambda x: cleaned_text)
dev_df = dev_df.assign(clean_tweet = lambda x: cleaned_text_val)

train_df.drop(['tweet'], axis=1, inplace = True) 
dev_df.drop(['tweet'], axis=1, inplace = True) 

In [16]:
total_df = train_df.append(dev_df)
total_df.shape

(18163, 2)

### Use Word Embeddings

In [17]:
vocab = {}
for row in range(0, total_df.shape[0]):
  vocab[row] = []
i = 0
for tweet in total_df["clean_tweet"]: # each tweet
  count = np.zeros(dim)  
  for word in tweet.split():
    if (word in gloveModel):
      count += gloveModel[word]
  for num in count/dim:
    vocab[i].append(num)
  i += 1

In [18]:
df = pd.DataFrame.from_dict(vocab, orient='index')
#df = df.transpose()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,-0.074209,0.174633,-0.018545,-0.030889,-0.051354,-0.050267,0.174427,0.031691,-0.011138,0.126046,-0.059712,-0.051399,-0.903162,-0.034106,-0.020152,-0.055038,0.026370,-0.129461,-0.080249,-0.004640,-0.054612,-0.052254,0.154567,0.132431,0.081430
1,-0.096885,0.198095,-0.036882,-0.027684,-0.064159,-0.100399,0.301494,-0.063744,-0.024514,0.023850,-0.076308,0.067945,-0.922091,0.012661,0.092259,0.003399,0.080930,-0.173531,0.035397,-0.089007,-0.018312,0.031987,-0.018892,0.075888,-0.009154
2,-0.040021,0.258818,-0.079311,-0.117769,-0.042948,-0.029355,0.461787,-0.403149,-0.017695,0.054543,-0.156850,0.092332,-1.501643,0.117023,0.250349,0.064133,0.097205,-0.161577,0.073152,-0.167225,-0.092468,0.156980,0.077036,-0.134057,-0.034076
3,0.035770,0.332920,-0.080705,-0.037391,0.115634,-0.350312,0.212475,-0.222832,0.052082,-0.137239,0.060121,0.017058,-1.428052,0.010468,0.194881,-0.033724,-0.066033,-0.011467,0.148273,-0.270828,0.099840,0.127823,-0.090725,-0.291199,-0.178715
4,0.010700,0.106362,-0.045609,-0.027944,-0.061994,-0.040095,0.175118,-0.341968,-0.027564,0.030356,0.145610,0.041177,-0.824256,0.154942,0.065042,-0.105719,0.084132,0.015968,0.198163,-0.051885,-0.170522,-0.036614,-0.022403,-0.270810,-0.051687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18158,0.068196,0.232128,0.084904,-0.170493,0.003719,-0.088877,0.527260,0.026907,-0.020015,0.118675,-0.070675,0.160058,-1.776028,0.137248,0.220457,-0.129746,0.188813,-0.176381,0.013349,-0.074251,-0.018184,0.080001,0.130804,-0.048743,-0.079874
18159,0.135183,0.103666,-0.183617,0.069301,0.194723,-0.035908,0.397737,-0.391050,0.025930,0.104589,0.088990,0.076809,-1.422444,0.272032,0.283920,0.007851,0.152214,0.103808,0.302962,-0.237584,-0.146742,0.087779,0.016166,-0.096209,-0.205875
18160,0.012991,0.094557,0.045770,0.002997,-0.000554,-0.057806,0.143313,-0.167676,-0.063106,-0.008966,-0.018790,0.015629,-0.523774,0.007312,0.009046,-0.010261,0.048903,-0.101382,0.002362,-0.044212,-0.071073,0.078687,0.016030,-0.094016,-0.049905
18161,-0.094936,0.232104,-0.101476,0.067977,0.059048,-0.099540,0.410378,-0.217557,-0.181291,0.041748,-0.102457,0.161872,-1.260708,-0.074044,0.073398,0.053315,0.095231,-0.091243,0.197620,-0.116799,0.012127,0.090565,-0.012347,-0.159820,-0.066846


In [19]:
train = df.iloc[:train_df.shape[0], :]
dev = df.iloc[train_df.shape[0]:, :]

x = torch.tensor(df.values, dtype=torch.float)
y = torch.tensor(total_df['label'].values)

x_train = torch.tensor(train.values, dtype=torch.float)
y_train = torch.tensor(train_df['label'].values)

x_dev = torch.tensor(dev.values, dtype=torch.float)
y_dev = torch.tensor(dev_df['label'].values)

# Initialize dataloaders
train_dataset = TensorDataset(x_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

dev_dataset = TensorDataset(x_dev, y_dev)
dev_dataloader = DataLoader(dev_dataset, batch_size=64, shuffle=True)

## Models

### LSTM

In [20]:
class LSTMCell(nn.Module):
    def __init__(self, num_features, num_hidden, num_classes):
        super().__init__()
        
        self.num_features = num_features
        self.num_hidden = num_hidden
        self.num_classes = num_classes
        
        # Network Parameters
        # New cell content
        self.Wxh = nn.Parameter(torch.randn((num_features, num_hidden)))
        self.Whh = nn.Parameter(torch.randn((num_hidden, num_hidden)))
        self.bh = nn.Parameter(torch.zeros((num_hidden)))
        
        # Input gate parameters
        self.Wxh_i = nn.Parameter(torch.randn_like(self.Wxh))
        self.Whh_i = nn.Parameter(torch.randn_like(self.Whh))
        self.bh_i = nn.Parameter(torch.randn_like(self.bh))
        
        # Forget gate parameters
        self.Wxh_f = nn.Parameter(torch.randn_like(self.Wxh))
        self.Whh_f = nn.Parameter(torch.randn_like(self.Whh))
        self.bh_f = nn.Parameter(torch.randn_like(self.bh))
        
        # Output gate parameters
        self.Wxh_o = nn.Parameter(torch.randn_like(self.Wxh))
        self.Whh_o = nn.Parameter(torch.randn_like(self.Whh))
        self.bh_o = nn.Parameter(torch.randn_like(self.bh))
        
        # Hidden -> Output
        self.Why = nn.Parameter(torch.randn((num_hidden, self.num_classes)))
        self.by = nn.Parameter(torch.zeros((self.num_classes))) 
        
        # Activations
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
    
    def init(self):
        self.h = torch.zeros((self.num_hidden))  # Hidden state
        self.c = torch.zeros((self.num_hidden))  # Cell state
        
    def forward(self, x):
        potential_input = self.tanh((x @ self.Wxh) + (self.h @ self.Whh + self.bh))
        
        # Gate updates
        input_gate = self.sigmoid((x @ self.Wxh_i) + (self.h @ self.Whh_i + self.bh_i))
        forget_gate = self.sigmoid((x @ self.Wxh_f) + (self.h @ self.Whh_f + self.bh_f))
        output_gate = self.sigmoid((x @ self.Wxh_o) + (self.h @ self.Whh_o + self.bh_o))
        
        # Update c and h
        self.c = self.c * forget_gate + potential_input * input_gate
        self.h = output_gate * self.tanh(self.c)
        
        y_output = self.h @ self.Why + self.by
        
        return y_output


### GRU

In [21]:
class GRUCell(nn.Module):
    def __init__(self, num_features, num_hidden, num_classes):
        super().__init__()
        
        self.num_features = num_features
        self.num_hidden = num_hidden
        self.num_classes = num_classes
        
        # Network Parameters
        # Potential Input
        self.Wxh = nn.Parameter(torch.randn((num_features, num_hidden)))
        self.Whh = nn.Parameter(torch.randn((num_hidden, num_hidden)))
        self.bh = nn.Parameter(torch.zeros((num_hidden)))
        
        # Update gate parameters
        self.Wxh_u = nn.Parameter(torch.randn_like(self.Wxh))
        self.Whh_u = nn.Parameter(torch.randn_like(self.Whh))
        self.bh_u = nn.Parameter(torch.randn_like(self.bh))
        
        # Reset gate parameters
        self.Wxh_r = nn.Parameter(torch.randn_like(self.Wxh))
        self.Whh_r = nn.Parameter(torch.randn_like(self.Whh))
        self.bh_r = nn.Parameter(torch.randn_like(self.bh))
        
        # Hidden -> Output
        self.Why = nn.Parameter(torch.randn((num_hidden, self.num_classes)))
        self.by = nn.Parameter(torch.zeros((self.num_classes))) 
        
        # Activations
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
    
    def init(self):
        self.h = torch.zeros((self.num_hidden))  # Hidden state
        
    def forward(self, x):
        
        # Gate updates
        update_gate = self.sigmoid((x @ self.Wxh_u) + (self.h @ self.Whh_u + self.bh_u))
        reset_gate = self.sigmoid((x @ self.Wxh_r) + (self.h @ self.Whh_r + self.bh_r))
        
        potential_input = self.tanh((x @ self.Wxh) + (reset_gate @ self.Whh + self.bh))
        
        self.h = self.h * (1-update_gate) + (potential_input * update_gate)
        y_output = self.h @ self.Why + self.by
        
        return y_output

In [22]:
class RNN(nn.Module):
    """
    Wrapper class that will hold the interface for LSTMs and GRUs
    """
    cells = {
        "LSTM"    : LSTMCell,
        "GRU"     : GRUCell,
    }
    
    def __init__(self, num_features, num_hidden=10, num_classes=3, cell_type='LSTM'):
        super().__init__()
        self.cell_type = cell_type
        self.cell = RNN.cells[cell_type](num_features, num_hidden, num_classes)

        self._init_weights()
    
    def _init_weights(self):
        for param in self.cell.parameters():
            # Keep track of gradient for backprop
            param.requires_grad_(True)
            # If we deal with weights xavier initialization
            if param.data.ndimension() >= 2:
                nn.init.xavier_uniform_(param.data) #keep variance stability
            # Else is a bias term so all zeros
            else: 
                nn.init.zeros_(param.data)
                
    def forward(self, X):
        # Setup outputs container (the output at each step)
        outputs = torch.zeros(X.size(0),3)
        # Iterate through sequence
        self.cell.init()
        for i, x in enumerate(X):
          outputs[i] = self.cell(x)
        return outputs

### Setup

In [None]:
N_EPOCHS = 100
LR = 0.01
num_hidden = 10

epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []

for CELL_TYPE in ['LSTM', 'GRU']:
    print(f'Fitting {CELL_TYPE}...')

    model = RNN(num_features=dim, num_hidden=num_hidden, num_classes=3, cell_type=CELL_TYPE)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    losses = []

    # training
    model.train()
    for epoch in range(N_EPOCHS):

      batch_losses = []
      batch_acc = 0
      total = 0
      total_dev = 0
      loss = 0
      pred_proba = []

      for x_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        y_pred = model(x_batch)#.squeeze(1)
        loss = criterion(y_pred, y_batch)
        loss.backward()

        # Clip params
        for param in model.parameters():
            if param.grad is None:
                continue
            grad_val = torch.clamp(param.grad, -5, 5)
        optimizer.step()

        # Total number of labels
        total += y_batch.size(0)
        # Total correct predictions
        _,pred_label = torch.max(y_pred, dim = 1)
        batch_acc += (pred_label == y_batch).sum()
        # Track loss
        batch_losses.append(loss)

      # validation    
      with torch.no_grad():
        
        batch_losses_dev = []
        batch_acc_dev = 0
        model.eval()

        for x_batch, y_batch in dev_dataloader:
            y_dev_pred = model(x_batch).squeeze(1)
            prob = F.softmax(y_dev_pred, dim=1)   # probability that an instance belogs to each class
            for i in prob:
              pred_proba.append(i.tolist())
            loss_dev = criterion(y_dev_pred, y_batch)
            batch_losses_dev.append(loss_dev)
            # number of labels
            total_dev += y_batch.size(0)
            # correct predictions
            _,pred_label = torch.max(y_dev_pred, dim = 1)  # get max probability
            batch_acc_dev += (pred_label == y_batch).sum()  

      accuracy = batch_acc/total
      accuracy_dev = batch_acc_dev/total_dev
      train_loss = sum(batch_losses)/len(train_dataloader)
      valid_loss = sum(batch_losses_dev)/len(dev_dataloader)
      epoch_loss.append(train_loss)
      epoch_loss_dev.append(valid_loss)
      epoch_acc.append(accuracy)
      epoch_acc_dev.append(accuracy_dev)

      print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Train Accuracy = {accuracy:.5f} | Validation Loss = {valid_loss:.5f} | Validation Accuracy = {accuracy_dev:.5f}")

Fitting LSTM...
Epoch   0: | Train Loss = 0.91320 | Train Accuracy = 0.55733 | Validation Loss = 0.85167 | Validation Accuracy = 0.59290
Epoch   1: | Train Loss = 0.85131 | Train Accuracy = 0.60160 | Validation Loss = 0.82557 | Validation Accuracy = 0.61613
Epoch   2: | Train Loss = 0.83470 | Train Accuracy = 0.61375 | Validation Loss = 0.81622 | Validation Accuracy = 0.61394
Epoch   3: | Train Loss = 0.82650 | Train Accuracy = 0.61564 | Validation Loss = 0.82249 | Validation Accuracy = 0.60868
Epoch   4: | Train Loss = 0.82152 | Train Accuracy = 0.61621 | Validation Loss = 0.80621 | Validation Accuracy = 0.61394
Epoch   5: | Train Loss = 0.81439 | Train Accuracy = 0.62005 | Validation Loss = 0.81941 | Validation Accuracy = 0.61087
Epoch   6: | Train Loss = 0.81190 | Train Accuracy = 0.62005 | Validation Loss = 0.80879 | Validation Accuracy = 0.61744
Epoch   7: | Train Loss = 0.80873 | Train Accuracy = 0.62282 | Validation Loss = 0.79565 | Validation Accuracy = 0.61963
Epoch   8: | Tra

### Complexity

In [None]:
num_features, num_hidden, num_classes = 300, 500, 1
print('Number of Params:..')
for cell_type in ['LSTM', 'GRU']:
    model = RNN(num_features=dim, num_hidden=num_hidden, cell_type=cell_type)
    model_param_count = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            c = 1
            for val in param.shape:
                c *= val
        model_param_count += c
    print(f'{cell_type} : {model_param_count}')

### Evaluation

In [None]:
model.eval()
true = y_dev
pred = model(x_dev).squeeze(1)
after_train = criterion(pred, true) 
print('Test loss after Training' , after_train.item())

In [None]:
_,pred_label = torch.max(pred, dim = 1)

In [None]:
target_names = ['neutral', 'anti-vax', 'pro-vax']
df = pd.DataFrame({'true':true.numpy()})
df2 = pd.DataFrame({'pred':pred_label.numpy()})
cm = confusion_matrix(true.numpy(), pred_label.numpy())
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
print("Precision-Recall-F1 - Test Data :")
print(precision_recall_fscore_support(true.numpy(), pred_label.numpy(), average='micro'))
print()

In [None]:
print(classification_report(true.numpy(), pred_label.numpy()))

In [None]:
prob = F.softmax(pred, dim=1)
pred_proba = []
for i in prob:
  pred_proba.append(i.tolist())

In [None]:
macro_roc_auc_ovo = roc_auc_score(true.tolist(), pred_proba, multi_class="ovo", average="macro")
weighted_roc_auc_ovo = roc_auc_score(true.tolist(), pred_proba, multi_class="ovo", average="weighted")
macro_roc_auc_ovr = roc_auc_score(true.tolist(), pred_proba, multi_class="ovr", average="macro")
weighted_roc_auc_ovr = roc_auc_score(true.tolist(), pred_proba, multi_class="ovr", average="weighted")
print(
    "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovo, weighted_roc_auc_ovo)
)
print()
print(
    "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr)
)

### Plots

In [None]:
# probabilities
df_prob = pd.DataFrame(pred_proba)
df_prob

In [None]:
# roc curve for classes
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):    
    fpr[i], tpr[i], thresh[i] = roc_curve(true, df_prob[i], pos_label=i)
    
# plotting    
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);  

In [None]:
def plot_graph_loss(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Loss")
    plt.plot(list(np.arange(epochs) + 1) , epoch_loss, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_loss_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(['train', 'validation']);

plot_graph_loss(100)

In [None]:
def plot_graph_acc(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Accuracy")
    plt.plot(list(np.arange(epochs) + 1) , epoch_acc, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_acc_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('accuracy', fontsize=12)
    plt.legend(['train', 'validation']);

plot_graph_acc(100)    

## Second Model

In [None]:
class Net2(nn.Module):
    def __init__(self, D_in, H1, H2, H3, D_out):
        super(Net2, self).__init__()
        self.linear1 = nn.Linear(D_in, H1)
        self.batchnorm1 = nn.BatchNorm1d(H1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(H1, H2)
        self.sigmoid = nn.Sigmoid()
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, D_out)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        out = self.linear1(x)
        out = self.batchnorm1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.linear2(out)
        out = self.sigmoid(out)
        out = self.linear3(out)
        out = self.logsoftmax(out)
        return out

In [None]:
#Define layer sizes
D_in = x.shape[1]
H1 = 128
H2 = 64
H3 = 32
D_out = 3

#Define Hyperparameters
learning_rate = 1e-4

#Initialize model, optimizer, loss function
model = Net2(D_in, H1, H2, H3, D_out)

loss_func = nn.NLLLoss()                    # with softmax in output of model

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.2, weight_decay=0.01)
#optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [None]:
model

In [None]:
epoch_loss = []
epoch_loss_dev = []
epoch_acc = []
epoch_acc_dev = []

for epoch in range(100):

  batch_losses = []
  batch_acc = 0
  total = 0
  total_dev = 0
  loss = 0

  # training
  model.train()
  for x_batch, y_batch in train_dataloader:  # for every batch
    y_pred = model(x_batch).squeeze(1)
    loss = loss_func(y_pred, y_batch)
    batch_losses.append(loss)
    #Delete previously stored gradients
    optimizer.zero_grad()
    #Perform backpropagation starting from the loss calculated in this epoch
    loss.backward()
    #Update model's weights based on the gradients calculated during backprop
    optimizer.step()

    # Total number of labels
    total += y_batch.size(0)
    # Total correct predictions
    _,pred_label = torch.max(y_pred, dim = 1)
    batch_acc += (pred_label == y_batch).sum()

  # validation    
  with torch.no_grad():
    batch_losses_dev = []
    batch_acc_dev = 0
    model.eval()

    for x_batch, y_batch in dev_dataloader:
        y_dev_pred = model(x_batch).squeeze(1)
        loss_dev = loss_func(y_dev_pred, y_batch)
        batch_losses_dev.append(loss_dev)
        # number of labels
        total_dev += y_batch.size(0)
        # correct predictions
        _,pred_label = torch.max(y_dev_pred, dim = 1)  # get max probability
        batch_acc_dev += (pred_label == y_batch).sum()

  accuracy = batch_acc/total
  accuracy_dev = batch_acc_dev/total_dev

  train_loss = sum(batch_losses)/len(train_dataloader)
  valid_loss = sum(batch_losses_dev)/len(dev_dataloader)

  epoch_loss.append(train_loss)
  epoch_loss_dev.append(valid_loss)
  epoch_acc.append(accuracy)
  epoch_acc_dev.append(accuracy_dev)

  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Train Accuracy = {accuracy:.5f} | Validation Loss = {valid_loss:.5f} | Validation Accuracy = {accuracy_dev:.5f} ")

### Evaluation

In [None]:
model.eval()
true = y_dev
pred = model(x_dev).squeeze(1)
after_train = criterion(pred, true) 
print('Test loss after Training' , after_train.item())

In [None]:
_,pred_label = torch.max(pred, dim = 1)

In [None]:
target_names = ['neutral', 'anti-vax', 'pro-vax']
df = pd.DataFrame({'true':true})
df2 = pd.DataFrame({'pred':pred_label})
cm = confusion_matrix(true, pred_label)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot()
plt.show()

In [None]:
print("Precision-Recall-F1 - Training Data :")
print(precision_recall_fscore_support(true, pred_label, average='micro'))

In [None]:
print(classification_report(true, pred_label))

### Plots

In [None]:
def plot_graph_loss(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Loss")
    plt.plot(list(np.arange(epochs) + 1) , epoch_loss, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_loss_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(['train', 'validation']);
plot_graph_loss(100)

In [None]:
def plot_graph_acc(epochs):
    fig = plt.figure(figsize=(12,12))
    plt.title("Train/Validation Accuracy")
    plt.plot(list(np.arange(epochs) + 1) , epoch_acc, label='train')
    plt.plot(list(np.arange(epochs) + 1), epoch_acc_dev, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('accuracy', fontsize=12)
    plt.legend(['train', 'validation']);
plot_graph_acc(100)    