In [1]:
import json
import pandas as pd

In [2]:
# Load the train/dev tweets and their labels.
# Each file is opened in a context manager so its handle is closed as soon as
# the JSON has been parsed, instead of being left open until garbage collection.
with open('total_train_data.json') as f:
    train_data = json.load(f)
with open('total_train_data_labels.json') as f:
    train_labels = json.load(f)

with open('total_dev_data.json') as f:
    dev_data = json.load(f)
with open('total_dev_data_labels.json') as f:
    dev_labels = json.load(f)

In [3]:
import re
import emoji

def process_data(data, data_labels):
    """Turn a dict of tweet records into a DataFrame of cleaned text.

    Records are looked up under the string keys "0", "1", ... up to
    ``len(data) - 1``. From each record the function reads
    'source tweet text', 'source tweet id' and (optionally) 'retweets';
    the text and every retweet go through ``clean_tweet``.

    Args:
        data: mapping from string position to tweet record.
        data_labels: labels for the 'tweet_labels' column. When falsy
            (e.g. ``None``) the column is filled with the scalar 2.

    Returns:
        DataFrame with columns tweet_ids, tweet_texts, tweet_retweets,
        tweet_labels.
    """
    ids, texts, retweet_lists = [], [], []

    for position in range(len(data)):
        record_key = str(position)

        cleaned_text = clean_tweet(data.get(record_key).get('source tweet text'))

        # A missing or empty retweet list is stored as-is (None / [])
        replies = data.get(record_key).get('retweets')
        if replies:
            replies = [clean_tweet(reply) for reply in replies]

        ids.append(data.get(record_key).get('source tweet id'))
        texts.append(cleaned_text)
        retweet_lists.append(replies)

    # Unlabelled data gets the scalar 2, which pandas broadcasts to every row
    labels = data_labels if data_labels else 2

    return pd.DataFrame(data={
        'tweet_ids': ids,
        'tweet_texts': texts,
        'tweet_retweets': retweet_lists,
        'tweet_labels': labels,
    })

def clean_tweet(text):
    """Normalise one tweet string before tokenisation.

    Steps, in order: drop '#' markers, decode/strip HTML entities, remove
    '$' tickers, URLs, retweet prefixes and @mentions, convert emoji to their
    text aliases via ``emoji.demojize``, and collapse runs of whitespace.

    Args:
        text: raw tweet text.

    Returns:
        The cleaned string. Leading/trailing whitespace is not stripped.
    """
    # Remove the hashtag marker while keeping the hashtag text
    text = re.sub(r'#', '', text)

    # Decode the entities we want to keep FIRST; the generic strip below would
    # otherwise delete "&amp;", "&lt;" and "&gt;" before they can be converted.
    text = re.sub(r'&amp;?', 'and', text)
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)
    # Remove any remaining HTML special entities (e.g. &nbsp;)
    text = re.sub(r'&\w*;', '', text)

    # Remove tickers
    text = re.sub(r'\$\w*', '', text)

    # Remove hyperlinks. Only the URL token itself is removed: a greedy ".*"
    # would swallow ordinary text up to the last '/' in the tweet.
    text = re.sub(r'http\S+', '', text)
    # Remove a truncated link written as a literal "http ..." (dots escaped so
    # that three arbitrary characters are not deleted)
    text = re.sub(r'http \.\.\.', '', text)

    # Remove retweet prefixes and @mentions
    text = re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+', '', text)
    text = re.sub(r'RT[ ]?@', '', text)
    text = re.sub(r'@[\S]+', '', text)

    # Convert emoji to their textual aliases (they are not removed)
    text = emoji.demojize(text)

    # Collapse redundant whitespace (including new lines) into a single space,
    # so neighbouring words are never glued together
    text = text.replace('\n', ' ')
    text = re.sub(r'\s{2,}', ' ', text)

    return text
  

In [4]:
# Clean the raw tweets and build one DataFrame per split
# (columns: tweet_ids, tweet_texts, tweet_retweets, tweet_labels)
df_train = process_data(train_data,train_labels)
df_dev = process_data(dev_data,dev_labels)

In [None]:
#Load the pretrained BERT base model
#(it has already been trained on a large corpus)
#NOTE(review): `bert_model` is not referenced by any later cell visible here;
#RumourClassifier loads its own copy of the same checkpoint.
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

print("Done loading BERT model.")

In [6]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import pandas as pd
import random

class TweetDataset(Dataset):
    """Dataset that encodes a tweet and its retweets as one BERT sentence pair.

    Each item is ``(input_ids, attention_mask, token_type_ids, label)`` where
    the first segment is the source tweet and the second segment is the
    retweets joined with the literal text '[SEP]'.
    """

    def __init__(self, dataframe, maxlen, shuffle_retweets=True):
        """
        Args:
            dataframe: frame with 'tweet_texts', 'tweet_retweets' and
                'tweet_labels' columns.
            maxlen: length every encoded pair is padded/truncated to.
            shuffle_retweets: when True (the default, matching the previous
                behaviour) the retweets are put in random order on every
                access; pass False for reproducible evaluation/prediction.
        """
        #Store the contents of the file in a pandas dataframe
        self.df = dataframe

        #Initialize the BERT tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen
        self.shuffle_retweets = shuffle_retweets

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #Samplers hand out positions 0..len-1, so rows are selected by
        #position; label-based .loc would break on a non-default index
        tweet_text = self.df['tweet_texts'].iloc[index]
        tweet_retweet = self.df['tweet_retweets'].iloc[index]
        tweet_labels = self.df['tweet_labels'].iloc[index]

        retweet_text = ""
        #Only real, non-empty sequences are joined (guards against None/NaN cells)
        if isinstance(tweet_retweet, (list, tuple)) and tweet_retweet:
            if self.shuffle_retweets:
                #random.sample returns a shuffled COPY, so the list stored in
                #the dataframe is never reordered in place
                retweets = random.sample(tweet_retweet, len(tweet_retweet))
            else:
                retweets = list(tweet_retweet)
            retweet_text = '[SEP]'.join(retweets)

        #Tokenize the (tweet, retweets) pair, padded/truncated to maxlen
        tokens = self.tokenizer(tweet_text,
                                retweet_text,
                                truncation=True,
                                padding='max_length',
                                max_length=self.maxlen)

        #Converting the token id list to a pytorch tensor
        tokens_ids_tensor = torch.tensor(tokens['input_ids'])

        #Attention mask: 1s for real tokens and 0s for padded ones
        attn_mask = torch.tensor(tokens['attention_mask'])

        #Segment ids as returned by the tokenizer under 'token_type_ids'
        seg_ids = torch.tensor(tokens['token_type_ids'])

        return tokens_ids_tensor, attn_mask, seg_ids, tweet_labels

In [None]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
train_set = TweetDataset(dataframe = df_train, maxlen = 512) 
dev_set = TweetDataset(dataframe = df_dev, maxlen = 512)

#Creating instances of training and development dataloaders
#The training loader reshuffles the samples every epoch so that mini-batches
#do not follow the order of the source file; the dev loader keeps file order
train_loader = DataLoader(train_set, batch_size = 4, shuffle = True)
dev_loader = DataLoader(dev_set, batch_size = 4)

print("Done preprocessing training and development data.")

In [8]:
import torch
import torch.nn as nn
from transformers import BertModel

class RumourClassifier(nn.Module):
    """Pretrained BERT encoder topped with a single-logit linear head."""

    def __init__(self):
        super().__init__()
        #Pretrained encoder
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Linear head: 768 inputs (size of the [CLS] embedding) to 1 output,
        #because this is a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks, seg_ids):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -seg_ids : Tensor passed to BERT as token_type_ids
        Returns:
            Tensor with one logit per sequence (output of the linear head)
        '''
        #Contextualized representations for every token
        encoded = self.bert_layer(seq, attention_mask = attn_masks, token_type_ids = seg_ids)

        #Keep only the first position of each sequence, i.e. the [CLS] token
        first_token = encoded.last_hidden_state[:, 0]

        #Map the [CLS] representation to a logit
        return self.cls_layer(first_token)

In [None]:
gpu = 0 #gpu ID; reused below by train/evaluate/predict to place tensors

# Build the classifier and move its parameters to the selected CUDA device.
# NOTE(review): net.cuda() requires a CUDA-capable GPU to be available.
print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
net = RumourClassifier() #initialize the net
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the sentiment classifier.")

In [10]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy applied directly to the raw logit returned by the network
criterion = nn.BCEWithLogitsLoss()
# Adam over every parameter of the network (BERT encoder and linear head)
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [11]:
import time
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
import numpy as np

def eval_metrics(y_true, y_pred):
    """Score binary predictions against the reference labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are
        computed with ``average="binary"``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        scorer(y_true, y_pred, average="binary")
        for scorer in (precision_score, recall_score, f1_score)
    )
    return (accuracy, precision, recall, f1)

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune ``net`` and checkpoint it whenever a dev metric improves.

    Args:
        net: model called as ``net(seq, attn_masks, seg_ids)`` returning logits.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        opti: optimizer over the parameters of ``net``.
        train_loader: iterable of ``(seq, attn_masks, seg_ids, labels)`` batches.
        dev_loader: batches forwarded to ``evaluate`` after every epoch.
        max_eps: number of epochs to run.
        gpu: CUDA device id every batch is moved to.

    Side effects:
        Prints running metrics every 100 iterations and after each epoch, and
        writes ``bert_epoch{ep}_acc_{acc}.dat`` / ``bert_epoch{ep}_f1_{f1}.dat``
        when the dev accuracy / dev F1 beats the best value seen so far.
    """
    best_acc = 0
    best_f1 = 0

    st = time.time()

    for ep in range(max_eps):
        #evaluate() switches the network to eval mode at the end of every
        #epoch, so training mode has to be re-enabled here, not just once
        net.train()

        loss_list = []
        pred_list = []
        true_list = []
        for it, (seq, attn_masks, seg_ids, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            seq, attn_masks, seg_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks, seg_ids)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            loss_list.append(loss.item())
            #squeeze(-1) drops only the logit axis; a bare squeeze() would also
            #drop the batch axis of a size-1 batch and break np.concatenate
            preds = (torch.sigmoid(logits) > 0.5).long().squeeze(-1)
            pred_list.append(preds.cpu().numpy())
            true_list.append(labels.cpu().numpy())

            if it % 100 == 0:
                #Report metrics accumulated since the previous report, then reset
                y_pred = np.concatenate(pred_list)
                y_true = np.concatenate(true_list)
                mean_loss = float(np.mean(loss_list))
                acc, p, r, f1 = eval_metrics(y_true, y_pred)
                print("Iteration {} of epoch {} complete. Loss: {}; acc: {}; f1: {}; Time taken (s): {}".format(it, ep, mean_loss, acc,f1, (time.time()-st)))
                st = time.time()
                loss_list = []
                pred_list = []
                true_list = []

        dev_acc, dev_f1, dev_loss = evaluate(net, criterion, dev_loader, gpu)

        print("Epoch {} complete! Development acc: {}; Development f1: {}; Development Loss: {}".format(ep, dev_acc, dev_f1 ,dev_loss))

        if dev_acc > best_acc:
            print("Best development acc improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'bert_epoch{}_acc_{}.dat'.format(ep, best_acc))
        if dev_f1 > best_f1:
            print("Best development f1 improved from {} to {}, saving model...".format(best_f1, dev_f1))
            best_f1 = dev_f1
            #The F1 checkpoint is named after the F1 score (not the accuracy)
            torch.save(net.state_dict(), 'bert_epoch{}_f1_{}.dat'.format(ep, best_f1))

In [12]:
import time

def predict(net, data_set, gpu, batch_size=1):
    """Predict a 0/1 label for every sample of ``data_set``.

    Args:
        net: model called as ``net(seq, attn_masks, seg_ids)`` returning one
            logit per sample.
        data_set: dataset yielding ``(seq, attn_masks, seg_ids, label)``; the
            label is ignored.
        gpu: CUDA device id the inputs are moved to.
        batch_size: samples per forward pass. The default of 1 reproduces the
            previous behaviour; larger values only change the speed.

    Returns:
        Dict mapping the sample position (0-based, in dataset order) to the
        predicted label, i.e. 1 when sigmoid(logit) > 0.5 and 0 otherwise.
    """
    data_loader = DataLoader(data_set, batch_size = batch_size)
    net.eval()
    predicted_dict = {}

    #Inference only: no gradients are needed
    with torch.no_grad():
        for seq, attn_masks, seg_ids, _ in data_loader:
            seq, attn_masks, seg_ids = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids.cuda(gpu)

            logits = net(seq, attn_masks, seg_ids)

            #Flatten to one hard label per sample in the batch
            batch_labels = (torch.sigmoid(logits) > 0.5).long().reshape(-1).tolist()
            for predicted_label in batch_labels:
                #Keys are running sample positions, independent of batch size
                predicted_dict[len(predicted_dict)] = int(predicted_label)

    return predicted_dict

In [13]:
def evaluate(net, criterion, dataloader, gpu):
    """Compute accuracy, F1 and mean batch loss of ``net`` on ``dataloader``.

    Args:
        net: model called as ``net(seq, attn_masks, seg_ids)`` returning logits.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        dataloader: iterable of ``(seq, attn_masks, seg_ids, labels)`` batches.
        gpu: CUDA device id every batch is moved to.

    Returns:
        Tuple ``(accuracy, f1, mean_loss)``.

    Note:
        The network is switched to eval mode and left in that mode.
    """
    net.eval()
    with torch.no_grad():
        loss_list = []
        pred_list = []
        true_list = []
        for seq, attn_masks, seg_ids, labels in dataloader:
            seq, attn_masks, seg_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks, seg_ids)
            loss = criterion(logits.squeeze(-1), labels.float())
            loss_list.append(loss.item())
            #squeeze(-1) drops only the logit axis; a bare squeeze() would turn
            #a size-1 (e.g. final) batch into a 0-d array that np.concatenate rejects
            preds = (torch.sigmoid(logits) > 0.5).long().squeeze(-1)
            pred_list.append(preds.cpu().numpy())
            true_list.append(labels.cpu().numpy())
        y_pred = np.concatenate(pred_list)
        y_true = np.concatenate(true_list)
        loss = np.mean(loss_list)
        acc, p, r, f1 = eval_metrics(y_true, y_pred)
    return acc, f1,loss

In [None]:
# Number of passes over the training loader
num_epoch =40

#fine-tune the model; train() prints progress and saves a checkpoint whenever
#the development accuracy or f1 improves
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)
print("------------------- Finish -------------------")

## Test prediction and output

In [None]:
# load existing model
# Rebuild the architecture, then restore weights from a saved checkpoint.
# NOTE(review): the checkpoint file name is hard-coded; it must match a file
# written by train() in a previous run.
net = RumourClassifier()
net.load_state_dict(torch.load('./bert_epoch3_f1_0.9700934579439252.dat'))
net.cuda(gpu) #Enable gpu support for the model #tell the model to move to GPU
net.eval() # inference mode

In [16]:
# load dataset
# The context manager closes the file handle once the JSON is parsed
with open('total_test_data.json') as f:
    test_data = json.load(f)
# No labels are available for this split, so process_data fills the label
# column with its placeholder value
df_test = process_data(test_data,None)
test_set = TweetDataset(df_test, maxlen = 512)

In [17]:
# Maps each test sample position to its predicted 0/1 label
test_predicted_dict = predict(net, test_set ,gpu)

In [18]:
import csv
# Dump the predictions as CSV: a header line, then one "id,label" line per sample
with open('test.csv', 'w') as f:
    f.write("Id,Predicted\n")
    for key, predicted_label in test_predicted_dict.items():
        f.write("%s,%s\n" % (key, predicted_label))

## Covid tweet prediction and output

In [None]:
# Rebuild the architecture, then restore weights from a saved checkpoint.
# NOTE(review): the checkpoint file name is hard-coded; it must match a file
# written by train() in a previous run.
net = RumourClassifier()
net.load_state_dict(torch.load('bert_epoch3_f1_0.9700934579439252.dat'))
net.cuda(gpu) #Enable gpu support for the model #tell the model to move to GPU
net.eval() # inference mode

In [None]:
# The context manager closes the file handle once the JSON is parsed
with open('total_covid_data.json') as f:
    covid_data = json.load(f)
# No labels are available for these tweets, so process_data fills the label
# column with its placeholder value
df_covid = process_data(covid_data,None)
covid_set = TweetDataset(df_covid, maxlen = 512)

In [None]:
from torch.utils.data import DataLoader
# Maps each covid sample position to its predicted 0/1 label
covid_predicted_dict = predict(net, covid_set ,gpu)

In [None]:
# Pair every predicted label with the id of its source tweet, keyed by the
# sample position, and save the result as JSON
covid_prediction = {}
for key, predicted_label in covid_predicted_dict.items():
    prediction = {
        'id': covid_data[str(key)].get('source tweet id'),
        'prediction': predicted_label,
    }
    covid_prediction[key] = prediction

with open('covid_prediction.json', 'w') as f:
    json.dump(covid_prediction, f)