In [None]:
!pip install transformers
!pip install emoji
!pip install jsonlines
!pip install sentence_transformers

In [None]:
import torch
from sentence_transformers import SentenceTransformer,util
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import recall_score,precision_score,f1_score,precision_recall_fscore_support
import jsonlines
import json
import re
import numpy as np
from helper_function import *
from Tweet_Info_Obj import *

# Mount Google Drive in the Colab runtime so the project files become reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Base directory for the data files read (and the output file written) by later cells.
folder='/content/gdrive/My Drive/B_NLP_Project/project-data'

In [None]:
# Load the pretrained BERTweet encoder and its tokenizer (created with normalization=True).
# Both objects are reused as notebook-level globals by later cells.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
# Sample tweet used to smoke-test the encoder.
line = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"

# Wrap the encoded ids in a list so the tensor gets a leading batch dimension.
input_ids = torch.tensor([tokenizer.encode(line)])
# NOTE: this bare expression is not the last line of the cell, so it displays nothing.
input_ids

# Forward pass without gradient tracking; prints only the shape of the final hidden states.
with torch.no_grad():
    features = bertweet(input_ids)
    print(features.last_hidden_state.shape)



In [None]:
# Repeats the forward pass on the sample tweet and prints the full hidden-state tensor.
with torch.no_grad():
    features = bertweet(input_ids)
    print(features.last_hidden_state)

In [None]:
def print_scores(y_true,y_pred):
    """Print F1, precision and recall (in that order) for the predictions on one line."""
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f1, precision, recall)

In [None]:
# Read the three data splits. extract_data is defined outside this notebook (star import);
# it returns three values per file.
# NOTE(review): presumably tweet-text groups, their ids and extra metadata — confirm in the helper module.
train_tweets_corpus,train_tweet_id,train_tweet_info=extract_data(folder+'/train.data.jsonl')
dev_tweets_corpus,dev_tweet_id,dev_tweet_info=extract_data(folder+'/dev.data.jsonl')
test_tweets_corpus,test_tweet_id,test_tweet_info=extract_data(folder+'/test.data.jsonl')

# Labels exist only for train and dev; get_labels is given the ids of the matching split.
train_data_label=get_labels(folder+'/train.label.json',train_tweet_id)
dev_data_label=get_labels(folder+'/dev.label.json',dev_tweet_id)

# Run the externally defined text preprocessing over every split.
preprocess_train_tweet_corpous=preprocees_tweets(train_tweets_corpus)
preprocess_test_tweet_corpous=preprocees_tweets(test_tweets_corpus)
preprocess_dev_tweet_corpous=preprocees_tweets(dev_tweets_corpus)

In [None]:
# Binary targets: 0 for 'non-rumour', 1 for every other label.
TrainY = [int(label != 'non-rumour') for label in train_data_label]
DevY = [int(label != 'non-rumour') for label in dev_data_label]

In [None]:
def get_min_max_similar(corpus):
  """Find which later texts are most and least similar to the first text.

  The first element of `corpus` is compared (cosine similarity of `model1`
  embeddings) against every remaining element.

  Returns a pair `(max_idx, min_idx)` of positions within `corpus[1:]`:
    * `(None, None)` when `corpus` holds a single text,
    * `(0, None)` when it holds exactly two texts (no model call needed),
    * otherwise the argmax and argmin of the similarity scores.
  """
  size = len(corpus)
  if size == 1:
    return None, None
  if size == 2:
    return 0, None

  head_embedding = model1.encode(corpus[0], convert_to_tensor=True)
  rest_embeddings = model1.encode(corpus[1:], convert_to_tensor=True)
  # Row 0 holds the scores of the first text against each remaining text.
  similarities = util.pytorch_cos_sim(head_embedding, rest_embeddings).tolist()[0]

  return np.argmax(similarities), np.argmin(similarities)

def get_new_input_text(sets):
  """Build one '.'-joined input string per group of texts.

  For each group the result contains the first text, followed by:
    * nothing, when the group has one text,
    * the second text, when the group has two texts,
    * the most similar and then the least similar remaining text
      (as chosen by get_min_max_similar) otherwise.
  """
  data = []
  for group in sets:
    most_similar, least_similar = get_min_max_similar(group)
    if least_similar is not None:
      # Indices are relative to group[1:], hence the +1 offset.
      parts = [group[0], group[most_similar + 1], group[least_similar + 1]]
    elif most_similar is not None:
      parts = [group[0], group[1]]
    else:
      parts = [group[0]]
    data.append(".".join(parts))
  return data

In [None]:
def combine_tweets(corpus):
  """Return one string per group, made by joining the group's texts with '.'."""
  combined = []
  for group in corpus:
    combined.append(".".join(group))
  return combined

In [None]:
# Sentence-embedding model; get_min_max_similar reads it through the global name `model1`.
model1 = SentenceTransformer('stsb-mpnet-base-v2')
# Collapse every preprocessed group into a single input string per split.
train_tweets=get_new_input_text(preprocess_train_tweet_corpous)
dev_tweets=get_new_input_text(preprocess_dev_tweet_corpous)
test_tweets=get_new_input_text(preprocess_test_tweet_corpous)

In [None]:
from torch.utils.data import Dataset

class SSTDataset(Dataset):
    """Map-style dataset that turns raw texts into fixed-length token-id tensors.

    Each item is ``(token_ids, attention_mask, label)``, or only
    ``(token_ids, attention_mask)`` when no label is available for it.
    Every sequence is wrapped in the tokenizer's own start/end tokens and
    then padded or truncated to exactly ``maxlen`` ids.
    """

    def __init__(self, data,labels, maxlen):
        """
        Args:
            data: indexable collection of input strings.
            labels: indexable collection of labels aligned with ``data``,
                or ``None`` for unlabelled data.
            maxlen: exact length of every returned id / mask tensor.
        """
        self.data = data
        self.labels = labels
        # Uses the notebook-level tokenizer, so ids match the encoder's vocabulary.
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded text at ``index`` (plus its label when present)."""
        sentence = self.data[index]
        label = self.labels[index] if self.labels is not None else None

        # Take the special tokens from the tokenizer instead of hard-coding
        # BERT's '[CLS]' / '[SEP]' / '[PAD]': those strings are not in every
        # vocabulary (BERTweet uses '<s>' / '</s>' / '<pad>') and would
        # otherwise be converted to the unknown-token id.
        cls_token = self.tokenizer.cls_token
        sep_token = self.tokenizer.sep_token
        pad_token = self.tokenizer.pad_token

        tokens = [cls_token] + self.tokenizer.tokenize(sentence) + [sep_token]
        if len(tokens) < self.maxlen:
            # Right-pad short sequences up to maxlen.
            tokens = tokens + [pad_token] * (self.maxlen - len(tokens))
        else:
            # Truncate long sequences but keep a closing separator.
            tokens = tokens[:self.maxlen - 1] + [sep_token]

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # 1 for real tokens, 0 for padding. Compare against the tokenizer's
        # actual pad id: id 0 is not the pad id in every vocabulary.
        attn_mask = (tokens_ids_tensor != self.tokenizer.pad_token_id).long()

        if label is not None:
            return tokens_ids_tensor, attn_mask, label
        return tokens_ids_tensor, attn_mask

In [None]:
from torch.utils.data import DataLoader
train_set = SSTDataset(train_tweets, TrainY,maxlen = 100)
dev_set = SSTDataset(dev_tweets, DevY,maxlen = 100)

# Training batches are reshuffled every epoch so the optimiser does not see the
# examples in file order. The dev loader keeps its original order because later
# cells line its predictions up with DevY by position.
train_loader = DataLoader(train_set, batch_size = 64, shuffle = True, num_workers = 2)
dev_loader = DataLoader(dev_set, batch_size = 64, num_workers = 2)

In [None]:
# Show the first encoded training example (token ids, attention mask, label).
train_set[0]

In [None]:
import torch.nn as nn

class RumourClassifier(nn.Module):
    """Binary classifier: the notebook's BERTweet encoder followed by one linear layer."""

    def __init__(self):
        super().__init__()
        # Reuse the encoder instance loaded earlier (global `bertweet`); it is
        # registered as a sub-module, so its weights are part of net.parameters().
        self.bert_layer = bertweet

        # Maps the 768-dimensional first-token representation to a single logit.
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Args:
            seq: tensor of shape [B, T] holding the token ids of each sequence.
            attn_masks: tensor of shape [B, T] marking which positions the
                encoder should attend to.

        Returns:
            The raw (pre-sigmoid) logits produced by the linear layer, one per sequence.
        '''
        hidden_states = self.bert_layer(seq, attention_mask = attn_masks).last_hidden_state

        # The representation of the first token stands in for the whole sequence.
        first_token_rep = hidden_states[:, 0]

        return self.cls_layer(first_token_rep)

In [None]:
gpu = 0 #gpu ID

# Build the classifier around the already-loaded encoder and move it to the selected GPU.
print("Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...")
net = RumourClassifier()
net.cuda(gpu)
print("Done creating the rumour classifier.")

In [None]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy computed directly on raw logits (no separate sigmoid needed).
criterion = nn.BCEWithLogitsLoss()
# Adam over every parameter of `net` (encoder and classification layer) with lr 2e-5.
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune `net` and checkpoint it whenever development accuracy improves.

    Args:
        net: model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        opti: optimiser over the parameters of ``net``.
        train_loader: iterable yielding ``(seq, attn_masks, labels)`` batches.
        dev_loader: loader passed to ``evaluate`` after every epoch.
        max_eps: number of epochs to run.
        gpu: CUDA device id the batches are moved to.

    Side effects:
        Prints progress every 100 iterations and after each epoch, and writes
        ``sstcls_<epoch>.dat`` (the state dict) when dev accuracy beats the best so far.
    """
    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        # evaluate() switches the network to eval mode at the end of every
        # epoch; switch back so dropout stays active while training.
        net.train()

        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if it % 100 == 0:
                # Accuracy of the current batch only; a cheap progress signal.
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of examples whose thresholded prediction equals its label.

    A prediction is 1 when ``sigmoid(logit) > 0.5`` and 0 otherwise. All
    size-1 dimensions of the predictions are dropped before comparing with
    ``labels``. The result is a 0-d float tensor.
    """
    predictions = (torch.sigmoid(logits) > 0.5).long().squeeze()
    matches = predictions == labels
    return matches.float().mean()

def evaluate(net, criterion, dataloader, gpu):
    """Score `net` on every batch of `dataloader` without tracking gradients.

    Puts the network in eval mode (and leaves it there), then returns
    ``(mean_accuracy, mean_loss)``, each averaged over batches rather than
    over individual examples.
    """
    net.eval()

    total_acc = 0
    total_loss = 0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            seq, attn_masks, labels = (tensor.cuda(gpu) for tensor in batch)
            logits = net(seq, attn_masks)
            total_loss += criterion(logits.squeeze(-1), labels.float()).item()
            total_acc += get_accuracy_from_logits(logits, labels)
            num_batches += 1

    return total_acc / num_batches, total_loss / num_batches

In [None]:
num_epoch = 2
# Combined train+dev data and a loader over it.
# NOTE(review): new_loader is built here but not passed to train() below, which
# still receives train_loader — confirm which training set was intended.
new_trainX=train_tweets+dev_tweets
new_trainY=TrainY+DevY
new_set = SSTDataset(new_trainX, new_trainY,maxlen = 100)
new_loader = DataLoader(new_set, batch_size = 64, num_workers = 2)
# Fine-tune the model on train_loader, evaluating on dev_loader after each epoch.
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

In [None]:
# Unlabelled test split (labels=None, so items are (token_ids, attention_mask) pairs).
# NOTE(review): test_loader is not consumed by any later cell visible in this notebook.
test_set = SSTDataset(test_tweets, None,maxlen = 100)
test_loader = DataLoader(test_set, batch_size = 64, num_workers = 2)

In [None]:
# Load the COVID tweets and join each group's texts into one string.
# NOTE(review): unlike the train/dev/test inputs, these texts do not go through
# preprocees_tweets or get_new_input_text — confirm the mismatch is intended.
covid_tweets_corpus,covid_tweet_id,covid_tweet_info=extract_data(folder+'/covid.data.jsonl')
covid_tweets=combine_tweets(covid_tweets_corpus)
# Unlabelled dataset, so batches carry only token ids and attention masks.
covid_set = SSTDataset(covid_tweets, None,maxlen = 100)
covid_loader = DataLoader(covid_set, batch_size = 64, num_workers = 2)

In [None]:
# Predict a 0/1 label for every development example, in loader order.
net.eval()  # make sure dropout is disabled regardless of what ran before this cell
pred = []
with torch.no_grad():
  for seq, attn_masks, _ in dev_loader:
    seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
    logits = net(seq, attn_masks)
    batch_pred = (torch.sigmoid(logits) > 0.5).long()
    # reshape(-1) always yields a list, even when the final batch holds one
    # example (squeeze() would collapse it to a bare int and break `+=`).
    pred += batch_pred.reshape(-1).tolist()

In [None]:
# F1, precision and recall of the development-set predictions.
print_scores(DevY,pred)

In [None]:
# Compare the number of predictions in `pred` with the number of COVID tweet ids.
# NOTE(review): `pred` was filled from dev_loader, so the two counts need not match.
print(len(pred))
print(len(covid_tweet_id))

In [None]:
# Classify the COVID tweets themselves. `pred` holds the development-set
# predictions, so indexing it with COVID positions would assign labels that
# belong to unrelated tweets (or run past the end of the list).
net.eval()
covid_pred = []
with torch.no_grad():
    for seq, attn_masks in covid_loader:
        logits = net(seq.cuda(gpu), attn_masks.cuda(gpu))
        covid_pred += (torch.sigmoid(logits) > 0.5).long().reshape(-1).tolist()

# Every tweet id must have exactly one prediction before the mapping is built.
assert len(covid_pred) == len(covid_tweet_id), "prediction / id count mismatch"

# Map each COVID tweet id to its predicted class name.
output_dict = {
    tweet_id: ('non-rumour' if label == 0 else 'rumour')
    for tweet_id, label in zip(covid_tweet_id, covid_pred)
}

In [None]:
# Persist the id -> label mapping as JSON inside the project data folder.
with open(folder+'/covid-output.json', 'w') as f:
    json.dump(output_dict, f)

In [None]:
from collections import Counter

# Class balance of the predictions stored in `pred` (filled from dev_loader above).
Counter(pred)