# Mounting Content from Google Drive

In [None]:
# Colab-only step: attach Google Drive at /content/drive, where the dataset CSVs are read from.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


# Import necessary libraries and set the seeds

In [None]:
!pip install transformers

In [None]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
% matplotlib inline
import pandas as pd
import random
import numpy as np
import os
import time # time module 




def set_seed(seed = 1234):
    '''Seed every random number generator the notebook uses. This is for REPRODUCIBILITY.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    '''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one
    # (the call is safe when no GPU is available).
    torch.cuda.manual_seed_all(seed)
    # When running on the cuDNN backend, two further options must be set:
    # force deterministic kernels and switch off the auto-tuner, which may
    # otherwise choose a different algorithm from one run to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Exported for processes started from this notebook; the hash seed of the
    # interpreter that is already running was fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Fix every RNG seed before any model or data loader is created.
set_seed(23456)

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Working on:', device)

Working on: cuda


# Import datasets


Labels in dataset:
*   label 0: neutral
*   label 1: anti-vax
*   label 2: pro-vax







In [None]:
# train set
Train_set_Location = r'/content/drive/MyDrive/Colab Notebooks/AI 2/vaccine_train_set.csv'

# validation set
Validation_set_Location = r'/content/drive/MyDrive/Colab Notebooks/AI 2/vaccine_validation_set.csv'

# Read each CSV (its first column is the row index) and drop the rows where at
# least one element is missing. dropna() returns a new frame instead of
# modifying the original, so its result has to be kept.
df_train = pd.read_csv(Train_set_Location, index_col=0).dropna()
df_test = pd.read_csv(Validation_set_Location, index_col=0).dropna()

df_train.head(5)

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


In [None]:
# Report (rows, columns) for the training frame, then for the validation frame.
for frame in (df_train, df_test):
    print(frame.shape)

(15976, 2)
(2282, 2)


# Tokenization with BertTokenizer and Encoding the data

In [None]:
# Build the tokenizer that belongs to the 'bert-base-uncased' checkpoint;
# the input text is lower-cased before it is tokenized.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# Upper bound on the number of tokens kept per tweet; longer inputs are truncated.
MAX_LENGTH = 100


def encode_tweets(texts):
    '''Tokenize a collection of tweets into padded PyTorch tensors.

    Both data splits go through this one function so that they are always
    encoded with identical settings.

    Args:
        texts: Iterable of tweet strings (e.g. the values of the ``tweet`` column).

    Returns:
        The tokenizer's batch encoding; later cells read its ``'input_ids'``
        and ``'attention_mask'`` entries.
    '''
    return tokenizer(
        list(texts),                 # hand the tokenizer a plain list of strings
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,                # padding = 'max_length' , this takes too much time for fine-tuning
        truncation=True,             # for using max_length
        max_length=MAX_LENGTH,
        return_tensors='pt',         # return PyTorch
    )


# encoding of training data
encoded_data_train = encode_tweets(df_train.tweet.values)

# encoding of testing data
encoded_data_test = encode_tweets(df_test.tweet.values)

In [None]:
# Pull the token ids and attention masks out of each encoding and turn the
# label column of the matching frame into a tensor.
ENCODING_KEYS = ('input_ids', 'attention_mask')

# training data
input_ids_train, attention_masks_train = (encoded_data_train[key] for key in ENCODING_KEYS)
labels_train = torch.tensor(df_train['label'].values)

# test data
input_ids_test, attention_masks_test = (encoded_data_test[key] for key in ENCODING_KEYS)
labels_test = torch.tensor(df_test['label'].values)

In [None]:
# Bundle (input ids, attention mask, label) into one dataset per split.
train_tensors = (input_ids_train, attention_masks_train, labels_train)
test_tensors = (input_ids_test, attention_masks_test, labels_test)

dataset_train = TensorDataset(*train_tensors)
dataset_test = TensorDataset(*test_tensors)

# BERT Pre-trained Model

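We use `BertForSequenceClassification`: the pre-trained `bert-base-uncased` model with a classification head, configured with one output per label (neutral, anti-vax, pro-vax).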

In [None]:
# Class name -> integer id; the position in the tuple is the id used in the dataset.
label_dict = {name: idx for idx, name in enumerate(('neutral', 'anti-vax', 'pro-vax'))}

In [None]:
# Create the Bert model from the pre-trained checkpoint and move it to the selected device.
num_output_labels = len(label_dict)  # one output per entry of label_dict

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_output_labels,
    output_hidden_states=False,
)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Construct the DataLoaders

In [None]:
# Training batches are drawn in random order; test batches keep the dataset order.
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

BATCH_SIZE = 16

train_sampler = RandomSampler(dataset_train)
test_sampler = SequentialSampler(dataset_test)

# dataloader for training data
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, sampler=train_sampler)
# dataloader for testing data
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, sampler=test_sampler)

# Optimizer & Scheduler

In [None]:
from transformers import  get_linear_schedule_with_warmup

epochs = 2

# The scheduler is stepped once per training batch, so its horizon is
# (batches per epoch) x (number of epochs).
total_training_steps = len(dataloader_train) * epochs

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
)

# Linear schedule without any warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Train the model

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    '''Return the weighted F1 score of arg-max predictions against the true labels.

    Args:
        preds: Array of per-class scores; the predicted class of each row is
            the index of its largest value along axis 1.
        labels: Array of true class ids.
    '''
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes, y_pred=predicted_classes, average='weighted')



def _train_one_epoch(epoch):
    '''Run one optimisation pass over ``dataloader_train``.

    Args:
        epoch: Zero-based epoch index (displayed one-based in the progress bar).

    Returns:
        Mean training loss over all batches of the epoch.
    '''
    batch_losses = []
    model.train()
    with torch.set_grad_enabled(True):
        progress_bar = tqdm(dataloader_train, desc=f"Epoch: {epoch+1}")
        for batch in progress_bar:
            # Clear the gradients left over from the previous batch.
            model.zero_grad()

            input_ids, attention_mask, labels = (b.to(device) for b in batch)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)   # get the output from bert model

            loss = outputs[0]   # get the loss from the training batch
            batch_losses.append(loss.item())

            loss.backward()
            # Cap the gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            # Show the running mean of the loss next to the progress bar.
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(np.mean(batch_losses))})
    return np.mean(batch_losses)


def _evaluate(dataloader):
    '''Evaluate the model on ``dataloader`` without computing gradients.

    Args:
        dataloader: Yields (input ids, attention mask, labels) batches.

    Returns:
        Tuple ``(mean_loss, logits, labels)`` where ``logits`` and ``labels``
        are NumPy arrays concatenated over all batches along axis 0.
    '''
    batch_losses, batch_logits, batch_labels = [], [], []
    model.eval()  # Evaluate the model
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (b.to(device) for b in batch)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # With labels supplied, the first output is the loss and the second the logits.
            batch_losses.append(outputs[0].item())
            batch_logits.append(outputs[1].detach().cpu().numpy())
            batch_labels.append(labels.cpu().numpy())
    return (np.mean(batch_losses),
            np.concatenate(batch_logits, axis=0),
            np.concatenate(batch_labels, axis=0))


# Per-epoch history of the mean training / validation loss.
train_losses = []
test_losses =  []

for epoch in range(epochs):
    train_loss_temp = _train_one_epoch(epoch)
    train_losses.append(train_loss_temp)

    # `predictions` (logits) and `true_vals` of the last epoch are reused by
    # the scoring cells further down.
    test_loss_temp, predictions, true_vals = _evaluate(dataloader_test)
    test_losses.append(test_loss_temp)

    val_f1 = f1_score_func(predictions, true_vals)

    # The reported validation metric is the weighted F1 score, not accuracy.
    tqdm.write(f'''Epoch: {epoch+1}  |  Train Loss: {train_loss_temp:.3f}  |  Val Loss: {test_loss_temp:.3f}  |  Val F1: {val_f1*100:.2f}% ''')



Epoch: 1:   0%|          | 0/999 [00:00<?, ?it/s]

Epoch: 1  |  Train Loss: 0.712  |  Val Loss: 0.669  |  Val Acc: 71.00% 


Epoch: 2:   0%|          | 0/999 [00:00<?, ?it/s]

Epoch: 2  |  Train Loss: 0.537  |  Val Loss: 0.589  |  Val Acc: 75.83% 


# Get the scores

In [None]:
# `predictions` is already a single NumPy array of logits (one row per sample),
# so it can be wrapped as a tensor directly without going through Python lists.
torch_pred = torch.as_tensor(predictions)

# get the predicted labels: the index of the largest logit in each row
pred_classes = torch.argmax(torch_pred, dim=1)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 plus the averaged rows, printed with three decimals.
report = metrics.classification_report(y_true=true_vals, y_pred=pred_classes, digits=3)
print(report)

              precision    recall  f1-score   support

           0      0.866     0.786     0.824      1065
           1      0.599     0.500     0.545       296
           2      0.699     0.811     0.751       921

    accuracy                          0.759      2282
   macro avg      0.721     0.699     0.707      2282
weighted avg      0.764     0.759     0.758      2282

