Project 1 Task 1 COMP90042

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torch torchvision transformers



In [3]:
#load pretrained bert base model
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

print("Done loading BERT model.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done loading BERT model.


In [4]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import json

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('brown')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import brown

import re
import gensim
from gensim.utils import simple_preprocess

import math
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

class RumourDataset(Dataset):
    def __init__(self, data_file, label_file, maxlen):
        # Open dataset
        self.ID = 'id_str'
        self.TEXT = 'text'
        self.PARENT_ID = 'in_reply_to_status_id_str'
        self.PARENT_TWEET = 'text_x'
        self.REPLY_TWEETS = 'text_y'
        self.LABEL = 'label'

        # Libraries
        self.lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        self.stopwords = stopwords.words('english')
        
        #words from brown corpus
        raw_brown_words = brown.words()
        brown_words = []
        for word in raw_brown_words:
          brown_words.append(word.lower())
        self.brown_words = brown_words

        # Create count_dict, total_word, total_unique_word from brown_corpus
        count_dict = {}
        total_word = 0
        total_unique_word = 0
        for word in self.brown_words:
            word = word.lower()
            if word not in count_dict:
                total_unique_word += 1
                count_dict[word] = 1
            else:
                count_dict[word] += 1
        for key in count_dict:
            total_word += count_dict[key]
        self.count_dict = count_dict
        self.total_word = total_word
        self.total_unique_word = total_unique_word

        # Open dataset
        self.dataset_df = pd.read_json(path_or_buf=data_file, lines=True) # returns a df where row = a collection of tweets, with each column = a single tweet

        # Open label set
        if label_file is not None:
          with open(label_file) as f:
              loaded_json = json.load(f)
              label_df = pd.DataFrame.from_dict(loaded_json, orient='index', columns=[self.LABEL])
          # Combine dataset and label set
          # convert labels into 1s and 0s
          label_encoder = preprocessing.LabelEncoder()
          label = label_df[self.LABEL]
          self.label_dataset = label_encoder.fit_transform(label)
        else:
          self.label_dataset = None

        #Initialize the BERT tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen
    
    def preprocess_text(self, text):
      text = text.lower()
      temp = text.split()

      for index, word in enumerate(temp):
        if word.startswith('@'):
          text = text.replace(word, '')
        if word.startswith('#'):
          new_word = word[1:]
          text = text.replace(word, new_word)
          # tokens = self.max_match_or_rev(new_word, self.count_dict, self.total_word, self.total_unique_word)
          # tokens = self.max_match(new_word)
          # tokens_str = ''
          # for token in tokens:
          #   tokens_str += token + ' '
          # text = text.replace(word, tokens_str)

      text_url = re.sub(r"http\S+", "", text)
      
      return text_url

    def lemmatize(self, word):
      lemma = self.lemmatizer.lemmatize(word,'v')
      if lemma == word:
          lemma = self.lemmatizer.lemmatize(word,'n')
      return lemma

    def max_match(self, hashtag):
        tokens = []
        longest_index = 0
        while longest_index < len(hashtag):
            temp = ''
            lemma = ''
            longest_word = ''
            for i in range(longest_index,len(hashtag)):
                temp = hashtag[longest_index:i+1]
                lemma = self.lemmatize(temp)
                if lemma != temp or lemma in self.brown_words:
                    if lemma in self.brown_words:
                        longest_word = temp
            if len(longest_word) == 0:
                longest_word = hashtag[longest_index]
            longest_index += len(longest_word)
            tokens.append(longest_word)
        return tokens

    def rev_max_match(self, hashtag):
        tokens = []
        longest_index = len(hashtag) - 1
        while longest_index > -1:
            temp = ''
            lemma = ''
            longest_word = ''
            i = longest_index
            while i > -1:
                temp = hashtag[i:longest_index + 1]
                lemma = self.lemmatize(temp)
                if lemma != temp or lemma in self.brown_words:
                    if lemma in self.brown_words:
                        longest_word = temp
                i -= 1
            if len(longest_word) == 0:
                longest_word = hashtag[longest_index]
            longest_index -= len(longest_word)
            tokens.append(longest_word)
        tokens.reverse()
        return tokens
    
    def find_prob(self, tokens, count_dict, total_word, total_unique_word):
      # called in max_match_or_rev
      prob = 0
      for token in tokens:
          count = 0
          if token in count_dict:
              count = count_dict[token] + 1
          else:
              count += 1
          prob += math.log((count / (total_word + total_unique_word)))
      return prob

    def max_match_or_rev(self, hashtag, count_dict, total_word, total_unique_word):
      normal = self.max_match(hashtag)
      rev = self.rev_max_match(hashtag)

      if normal != rev: # if MaxMatch and reversed MaxMatch differs
          normal_prob = self.find_prob(normal, count_dict, total_word, total_unique_word)
          rev_prob = self.find_prob(rev, count_dict, total_word, total_unique_word)
          if rev_prob > normal_prob:
            return rev
      return normal

    def __len__(self):
        return len(self.dataset_df)

    def __getitem__(self, index):
        row_twt = list(self.dataset_df.iloc[index])
        parent_tweet = ''
        reply_tweets = ''

        for twt in row_twt:
          if twt == None:
            break
          
          if twt[self.PARENT_ID] == None:
            parent_tweet = twt[self.TEXT]
            parent_tweet = twt[self.TEXT]
          else:
            reply_tweets += twt[self.TEXT] + ' '
            reply_tweets += twt[self.TEXT] + ' '

        # #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer(parent_tweet, 
                                reply_tweets, 
                                padding='max_length', 
                                truncation=True,
                                max_length=self.maxlen) #Tokenize the sentence

        
        # tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens['input_ids']) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = torch.tensor(tokens['attention_mask'])

        seg_ids = torch.tensor(tokens['token_type_ids'])
        
        # Check label, if available return
        if self.label_dataset is not None:
          label_twt = self.label_dataset[index]
          return tokens_ids_tensor, attn_mask, seg_ids, label_twt
        else:
          return tokens_ids_tensor, attn_mask, seg_ids

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [5]:
from torch.utils.data import DataLoader
MAXLEN = 512
BATCH_SIZE = 16
NUM_WORKERS = 2

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
ROOT = '/content/drive/My Drive/nlp-data/'

TRAIN_SET = ROOT + 'train.data.jsonl'
DEV_SET = ROOT + 'dev.data.jsonl'

TRAIN_SET_LABEL = ROOT + 'train.label.json'
DEV_SET_LABEL = ROOT + 'dev.label.json'
train_set = RumourDataset(data_file = TRAIN_SET, label_file = TRAIN_SET_LABEL, maxlen = MAXLEN)
dev_set = RumourDataset(data_file = DEV_SET, label_file = DEV_SET_LABEL, maxlen = MAXLEN)

#Creating intsances of training and development dataloaders
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, num_workers = NUM_WORKERS)
dev_loader = DataLoader(dev_set, batch_size = BATCH_SIZE, num_workers = NUM_WORKERS)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [6]:
import torch
import torch.nn as nn
from transformers import BertModel

class RumourClassifier(nn.Module):
    def __init__(self):
        super(RumourClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Classification layer
        #input dimension is 768 because [CLS] embedding has a dimension of 768
        #output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks, seg_ids):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks, token_type_ids = seg_ids)
        cont_reps = outputs.last_hidden_state

        #Obtaining the representation of [CLS] head (the first token)
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [7]:
gpu = 0 #gpu ID

print("Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...")
net = RumourClassifier()
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the rumour classifier.")

Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the rumour classifier.


In [8]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [9]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        
        for it, (seq, attn_masks, seg_ids, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, seg_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks, seg_ids)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        torch.save(net.state_dict(), ROOT + 'sstcls_{}.dat'.format(ep))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc

In [10]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, seg_ids, labels in dataloader:
            seq, attn_masks, seg_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks, seg_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [11]:
num_epoch = 2

#fine-tune the model
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.7296366095542908; Accuracy: 0.1875; Time taken (s): 1.1983132362365723
Iteration 100 of epoch 0 complete. Loss: 0.14117911458015442; Accuracy: 0.9375; Time taken (s): 83.45783996582031
Iteration 200 of epoch 0 complete. Loss: 0.4235019087791443; Accuracy: 0.8125; Time taken (s): 83.43763780593872
Epoch 0 complete! Development Accuracy: 0.8665540814399719; Development Loss: 0.2907700264776075
Best development accuracy improved from 0 to 0.8665540814399719, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.24530521035194397; Accuracy: 0.9375; Time taken (s): 88.22898197174072
Iteration 100 of epoch 1 complete. Loss: 0.03604999929666519; Accuracy: 1.0; Time taken (s): 83.6326355934143
Iteration 200 of epoch 1 complete. Loss: 0.17435328662395477; Accuracy: 0.9375; Time taken (s): 83.40871739387512
Epoch 1 complete! Development Accuracy: 0.8614864945411682; Development Loss: 0.3874914037013376


In [12]:
from torch.utils.data import DataLoader
ROOT = '/content/drive/My Drive/nlp-data/'

def predict_rumour(net, test_set, output_path):
  RUMOUR = 'rumour'
  NON_RUMOUR = 'non-rumour'

  net = net.eval()
  test_loader = DataLoader(test_set)
  
  id_lst = []
  for index, row in test_set.dataset_df.iterrows():
    for twt in row:
      if twt[test_set.PARENT_ID] is None:
        id_lst.append(twt[test_set.ID])
        break
  
  predictions_lst = []
  with torch.no_grad():
      for seq, attn_masks, seg_ids in test_loader:
          seq, attn_masks, seg_ids = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids.cuda(gpu)
          logits = net(seq, attn_masks, seg_ids)
          probs = torch.sigmoid(logits)
          soft_prob = (probs > 0.5).long()
          if soft_prob.squeeze().item() == 0:
            predictions_lst.append(NON_RUMOUR)
          else:
            predictions_lst.append(RUMOUR)
  predictions_dct = {}
  for id, pred in zip(id_lst, predictions_lst):
    predictions_dct[id] = pred
  
  with open(output_path, 'w') as out:
    json.dump(predictions_dct, out)
  print('Finish Prediction.')

In [13]:
ROOT = '/content/drive/My Drive/nlp-data/'
OUTPUT = ROOT + 'test-output.json'

TEST_SET = ROOT + 'test.data.jsonl'
test_set = RumourDataset(data_file = TEST_SET, label_file = None, maxlen = MAXLEN)
predict_rumour(net, test_set, OUTPUT)

Finish Prediction.


Project 1 Task 2 COMP90042

In [14]:
# Predicting the COVID dataset - beware, huge file
# COVID_SET = ROOT + 'covid.data.jsonl'
# COVID_OUTPUT = ROOT + 'covid-output.json'

# covid_set = RumourDataset(data_file = COVID_SET, label_file = None, maxlen = MAXLEN)
# predict_rumour(net, covid_set, COVID_OUTPUT)