# Downloading and importing libraries

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/685

Mounted at /content/gdrive
/content/gdrive/MyDrive/685


In [2]:
!pip install -q transformers
!pip install -q datasets

[K     |████████████████████████████████| 3.4 MB 4.4 MB/s 
[K     |████████████████████████████████| 895 kB 35.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 39.3 MB/s 
[K     |████████████████████████████████| 61 kB 476 kB/s 
[K     |████████████████████████████████| 596 kB 42.5 MB/s 
[K     |████████████████████████████████| 298 kB 4.0 MB/s 
[K     |████████████████████████████████| 243 kB 39.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 36.7 MB/s 
[K     |████████████████████████████████| 132 kB 49.9 MB/s 
[K     |████████████████████████████████| 160 kB 38.4 MB/s 
[K     |████████████████████████████████| 271 kB 52.3 MB/s 
[K     |████████████████████████████████| 192 kB 54.1 MB/s 
[?25h

In [3]:
import pandas as pd
import numpy as np
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Loading and saving SMS text from NUS corpus

In [4]:
import json

file = open("/content/gdrive/MyDrive/685/dataset/smsCorpus_en_2015.03.09_all.json")
data = json.load(file)
data = data['smsCorpus']['message']

all_text = [item['text']['$'] for item in data]

In [5]:
df = pd.DataFrame(all_text)
df.to_csv ('dataset/smstext.csv', index=False, header=False)
df.head()

Unnamed: 0,0
0,Bugis oso near wat...
1,"Go until jurong point, crazy.. Available only ..."
2,I dunno until when... Lets go learn pilates...
3,Den only weekdays got special price... Haiz......
4,Meet after lunch la...


# Loading and evaluating different models 

In this section, we try some models to qualitatively evaluate the stuff that will be used for the final pipeline


## Roberta for Masked Language Modelling

In [18]:
from transformers import RobertaTokenizerFast, RobertaForMaskedLM
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

In [34]:
text = f"Replace me by any {tokenizer.mask_token} you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
masked_token = torch.where(encoded_input["input_ids"]==tokenizer.mask_token_id)[1].item()
output = model(**encoded_input)["logits"]
print(tokenizer.decode(torch.argmax(output[0][masked_token])))

 name


## Roberta for CoLA

In [35]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
tokenizer = RobertaTokenizerFast.from_pretrained('textattack/roberta-base-CoLA')
model = RobertaForSequenceClassification.from_pretrained('textattack/roberta-base-CoLA')

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
text = "I wanted to ask you to wait for me to finish the lecture. Because my lecture finishes in an hour anyway."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)[0].detach().squeeze()

probs = torch.softmax(output, dim=0)
print(probs)

tensor([0.0241, 0.9759])


## Electra for Out-of-Context masking

In [57]:
from transformers import ElectraForPreTraining, ElectraTokenizerFast
discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-base-discriminator")

In [47]:
# sentence = "The quick brown fox jumps over the lazy dog"
fake_sentence = "I want to play games wit my india in Arizona"

fake_tokens = tokenizer.tokenize(fake_sentence)
fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
discriminator_outputs = discriminator(fake_inputs)
predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)[0]

[print("%7s" % token, end="") for token in fake_tokens]
print()
[print("%7s" % int(prediction), end="") for prediction in predictions.data.tolist()]

      i   want     to   play  games    wit     my  india     inarizona
      0      0      0      0      0      0      0      0      1      0      0      0

[None, None, None, None, None, None, None, None, None, None, None, None]

## Roberta for Token Classification

In [50]:
from transformers import RobertaTokenizer, RobertaForTokenClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [51]:
sent = "Hello, my dog is cute"
inputs = tokenizer(sent, return_tensors="pt")
outputs = model(**inputs).logits
probs = torch.softmax(outputs, dim=2)
print(probs)

tensor([[[0.4334, 0.5666],
         [0.3368, 0.6632],
         [0.3031, 0.6969],
         [0.3890, 0.6110],
         [0.4316, 0.5684],
         [0.3734, 0.6266],
         [0.3789, 0.6211],
         [0.4399, 0.5601]]], grad_fn=<SoftmaxBackward0>)


# Loading the CoLA and Xsum dataset for training

We are working with only data that has ground truth label as 1 and unnormalized it

In [65]:
# CoLA Train and Test split

train = pd.read_csv("/content/gdrive/MyDrive/685/dataset/CoLA/raw/in_domain_train.tsv", sep="\t", header=None)
train_sentences = list(train[train[1]==1][3])

test = pd.read_csv("/content/gdrive/MyDrive/685/dataset/CoLA/raw/in_domain_dev.tsv", sep="\t", header=None)
test_sentences = list(test[test[1]==1][3])

In [71]:
# Xsum - Train-Val-Test split

train = pd.read_csv("/content/xsum_train_val.tsv", sep="\t")
test = pd.read_csv("/content/gdrive/MyDrive/685/dataset/xsum_test.csv", sep="\t")
all_sentences = list(train["gt_text"])
train_sentences = all_sentences[:50000]
test_sentences = all_sentences[55000:]
val_sentences = all_sentences[50000:55000]

In [53]:
from torch.utils.data import Dataset
import random
import nltk
nltk.download('punkt')

from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from transformers import pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Creating a Training dataset for Masking strategy model 

In [61]:
nertokenizer = BertTokenizerFast.from_pretrained("dslim/bert-base-NER")
nermodel = BertForTokenClassification.from_pretrained("dslim/bert-base-NER").to(device)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [62]:
class MyDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = [x.replace("'", "") for x in sentences]
        self.phonetics = {
            "ph": "f",
            "th": "d",
            "ck": "k",
            "q": "k",
            "kn": "n",
            "gn": "n",
            "pn": "n",
            "ae": "e",
            "wr": "r",
            "at": "8",
            "ate": "8",
            "to": "2",
            "too": "2",
            "for": "4",
            "z": "s",
            "x": "ks",
            "tch": "ch",
            "ci": "si",
            "si": "ci",
            "ght": "te",
            "tch": "ch",
            "you": "u",
        }
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        # self.ner = pipeline("ner", model=nermodel, tokenizer=nertokenizer).to(device)

    def __len__(self):
        return len(self.sentences)

    def prob(self, p):
        return np.random.uniform() < p 

    def add_chars(self, word):
        # Add characters at the end
        word = list(word)
        if self.prob(0.7): word[-1] = word[-1]*np.random.geometric(0.6)
        # Add characters in between
        for i in range(len(word) - 1):
            if self.prob(0.1): word[i] = word[i]*np.random.geometric(0.8)
        return "".join(word)

    def drop_chars(self, word):
        word = list(word)
        vowels = set(['a', 'e', 'i', 'o', 'u'])
        # Handle first and last character differently since they are less likely to be dropped if they are not vowels
        if self.prob(0.3) and word[0] in vowels: word[0] = ''
        if self.prob(0.3) and word[-1] in vowels: word[-1] = ''
        for i in range(1, len(word)-1):
            # Handle vowels differently since they are more likely to be dropped
            if word[i] in vowels:
                if self.prob(0.33): word[i] = ''
            elif self.prob(0.05): word[i] = ''
        return "".join(word)

    def swap_chars(self, word):
        word = list(word)
        for i in range(2, len(word)-1):
            if self.prob(0.1): word[i], word[i-1] = word[i-1], word[i]
        return "".join(word)

    def switch_cases(self, word):
        return "".join([i.swapcase() if self.prob(0.1) else i for i in list(word)])

    def phonetic_replacements(self, word):
        for k, v in self.phonetics.items():
            if k in word.lower() and self.prob(0.33): word = word.lower().replace(k, v)
        return word

    def modify(self, word):
        if self.prob(0.15): word = self.phonetic_replacements(word)
        if self.prob(0.2) and len(word) > 1: word = self.drop_chars(word)
        if self.prob(0.2) and word.isalnum(): word = self.add_chars(word)
        if self.prob(0.15): word = self.switch_cases(word)
        if self.prob(0.05) and len(word) > 4: word = self.swap_chars(word)
        return word

    def process(self, text):
        # entities = self.ner(text)
        # entities_set = set(nertokenizer.convert_tokens_to_string([x["word"] for x in entities]).split())
        words = nltk.word_tokenize(text)
        changed_words = []
        changed_masked_words = []
        changed_idx, modified, idx = [], [], 0
        for word in words:
            # if word in entities_set: changed = word
            # else: changed = self.modify(word)
            changed = self.modify(word)
            # Change according to case/uncased model
            if word.lower().strip() != changed.lower().strip():
                changed_idx.append(idx)
                changed_words.append(word.lower().strip())
                changed_masked_words.append(changed)
            modified.append(changed)
            idx += 1
        return " ".join(modified), set(changed_idx), changed_words, changed_masked_words

    def __getitem__(self, idx):
        unnormalized, changed_ids, changed_words, changed_masked_words = self.process(self.sentences[idx])
        normalized = self.sentences[idx]
        tokens = self.tokenizer(unnormalized, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        word_ids = tokens.word_ids()
        labels = [1 if i in changed_ids else 0 for i in word_ids]
        return tokens, torch.tensor(labels)

train_dataset = MyDataset(train_sentences)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Creating a Validation/Testing data for evaluations on the model 

This is the same as above but also returns multiple items during get_item function which cannot be loaded in a batch because the size of not all items are consistent. 

In [72]:
class MyVDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = [x.replace("'", "") for x in sentences]
        self.phonetics = {
            "ph": "f",
            "th": "d",
            "ck": "k",
            "q": "k",
            "kn": "n",
            "gn": "n",
            "pn": "n",
            "ae": "e",
            "wr": "r",
            "at": "8",
            "ate": "8",
            "to": "2",
            "too": "2",
            "for": "4",
            "z": "s",
            "x": "ks",
            "tch": "ch",
            "ci": "si",
            "si": "ci",
            "ght": "te",
            "tch": "ch",
            "you": "u",
        }
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        # self.ner = pipeline("ner", model=nermodel, tokenizer=nertokenizer).to(device)

    def __len__(self):
        return len(self.sentences)

    def prob(self, p):
        return np.random.uniform() < p 

    def add_chars(self, word):
        # Add characters at the end
        word = list(word)
        if self.prob(0.7): word[-1] = word[-1]*np.random.geometric(0.6)
        # Add characters in between
        for i in range(len(word) - 1):
            if self.prob(0.1): word[i] = word[i]*np.random.geometric(0.8)
        return "".join(word)

    def drop_chars(self, word):
        word = list(word)
        vowels = set(['a', 'e', 'i', 'o', 'u'])
        # Handle first and last character differently since they are less likely to be dropped if they are not vowels
        if self.prob(0.3) and word[0] in vowels: word[0] = ''
        if self.prob(0.3) and word[-1] in vowels: word[-1] = ''
        for i in range(1, len(word)-1):
            # Handle vowels differently since they are more likely to be dropped
            if word[i] in vowels:
                if self.prob(0.33): word[i] = ''
            elif self.prob(0.05): word[i] = ''
        return "".join(word)

    def swap_chars(self, word):
        word = list(word)
        for i in range(2, len(word)-1):
            if self.prob(0.1): word[i], word[i-1] = word[i-1], word[i]
        return "".join(word)

    def switch_cases(self, word):
        return "".join([i.swapcase() if self.prob(0.1) else i for i in list(word)])

    def phonetic_replacements(self, word):
        for k, v in self.phonetics.items():
            if k in word.lower() and self.prob(0.33): word = word.lower().replace(k, v)
        return word

    def modify(self, word):
        if self.prob(0.15): word = self.phonetic_replacements(word)
        if self.prob(0.2) and len(word) > 1: word = self.drop_chars(word)
        if self.prob(0.2) and word.isalnum(): word = self.add_chars(word)
        if self.prob(0.15): word = self.switch_cases(word)
        if self.prob(0.05) and len(word) > 4: word = self.swap_chars(word)
        return word

    def process(self, text):
        # entities = self.ner(text)
        # entities_set = set(nertokenizer.convert_tokens_to_string([x["word"] for x in entities]).split())
        words = nltk.word_tokenize(text)
        changed_words = []
        changed_masked_words = []
        changed_idx, modified, idx = [], [], 0
        for word in words:
            # if word in entities_set: changed = word
            # else: changed = self.modify(word)
            changed = self.modify(word)
            # Change according to case/uncased model
            if word.lower().strip() != changed.lower().strip():
                changed_idx.append(idx)
                changed_words.append(word.lower().strip())
                changed_masked_words.append(changed)
            modified.append(changed)
            idx += 1
        return " ".join(modified), set(changed_idx), changed_words, changed_masked_words

    def __getitem__(self, idx):
        unnormalized, changed_ids, changed_words, changed_masked_words = self.process(self.sentences[idx])
        normalized = self.sentences[idx]
        tokens = self.tokenizer(unnormalized, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        word_ids = tokens.word_ids()
        labels = [1 if i in changed_ids else 0 for i in word_ids]
        return tokens, torch.tensor(labels), unnormalized, normalized, changed_words, changed_masked_words

val_dataset = MyVDataset(val_sentences)
test_dataset = MyVDataset(test_sentences)

In [73]:
len(train_dataset), len(val_dataset), len(test_dataset)

(50000, 5000, 5000)

# Evaluations on downstream SNLI data after unnormalizing it

In [76]:
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast
from datasets import load_dataset

data = load_dataset('snli')

Reusing dataset snli (/root/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [81]:
mnli_model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
mnli_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large-mnli')

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [77]:
sub_data = data['train']
df = pd.DataFrame(sub_data)
df = df.sample(frac=1, random_state=42)
df_small = df.iloc[:1000]

premise_dataset = MyDataset(df_small['premise'].tolist())
hypo_dataset = MyDataset(df_small['hypothesis'].tolist())
labels = df_small['label'].tolist()

In [79]:
df_small.head()

Unnamed: 0,premise,hypothesis,label
451411,An older gentleman in an orange jumpsuit and g...,A man is cleaning up around his house.,1
70668,Boy and girl running along the beach.,Two people run on the beach.,0
208057,Four men do repair work on a roof.,Four men work with tools.,0
126424,A long-haired young man skateboarding on the r...,A girl is riding a bike.,2
191670,A band performs on stage.,People are listening to a rock concert.,1


# Creating a unnormalized-normalized sentence pairs and storing the file externally for evaluations and input to proposed models

In [84]:
norm_unnorm_df = pd.DataFrame(columns = ['labels', 'unnormalized', 'normalized'])
norm, unnorm = [], []
changed = []
# for i in range(len(premise_dataset)):
#     _, _, unnormalized1, normalized1, changed_words1 = premise_dataset[i]
#     _, _, unnormalized2, normalized2, changed_words2 = hypo_dataset[i]
#     unnorm.append([unnormalized1, unnormalized2])
#     norm.append([normalized1, normalized2])
#     changed.append([changed_words1, changed_words2])

# labels = []
# norm_unnorm_df = pd.read_csv('/content/gdrive/MyDrive/685/dataset/norm_unnorm_snli_new.csv')
# norm_unnorm_df['normalized'] = norm
# norm_unnorm_df['unnormalized'] = unnorm
# norm_unnorm_df['labels'] = labels
# for i, row in norm_unnorm_df.iterrows():
#   d = df[df['hypothesis']==eval(row['normalized'])[1]]
#   # labels.append(row['label'][eval(row['normalized'])[0]==df['premise']])
#   try:
#     labels.append(d['label'].iloc[0])
#   except:
#     labels.append(-1)
# norm_unnorm_df['labels'] = labels
# norm_unnorm_df.to_csv('/content/gdrive/MyDrive/685/dataset/norm_unnorm_snli_new.csv')

# df2.to_csv('changed_words.csv', index=False)

norm_unnorm_df = pd.read_csv('/content/gdrive/MyDrive/685/dataset/xsum_test_new.csv')
dataset = MyVDataset(norm_unnorm_df['normalized'].tolist())
for i in range(len(dataset)):
    _, _, unnormalized, normalized, changed_words, _ = dataset[i]
    unnorm.append(unnormalized)
    norm.append(normalized)
    changed.append(changed_words)

norm_unnorm_df['unnormalized'] = unnorm
norm_unnorm_df['normalized'] = norm
norm_unnorm_df.head()

Unnamed: 0.2,Unnamed: 0,index,Unnamed: 0.1,normalized,unnormalized
0,0,0,0,Bangor City manager Kevin Nicholson says it wo...,Bangor City manager Kevin Nicholson says it wo...
1,1,1,1,The actor who played Darth Vader in the origin...,The ctorrr wh played Drth Vader in the origina...
2,2,2,2,Catalans Dragons earned a narrow victory at Hu...,Catalans Dragons earned a narrow victory at Hu...
3,3,3,3,"A driver who crashed into a shop, killing a te...","A driver who crashed into a shop , killing a t..."
4,4,4,4,Kenyan police have been involved in a fierce g...,Kenyan police have been nvoLvd in a fierce gun...


In [85]:
changed[:5]

[['achievement', 'overturn', 'first-leg'],
 ['actor', 'who', 'darth', 'wars', 'says', 'hopefuls', 'auditioning', 'the'],
 [],
 ['been', 'eight'],
 ['involved', 'gun', 'battle', 'one', 'wounded']]

In [86]:
# changed_df = pd.DataFrame(changed, columns = ['changed_words_premise', 'changed_words_hypo'])
# changed_df = changed_df.drop(737)
# norm_unnorm_df = norm_unnorm_df.drop(737)
# changed_df.head()
# norm_unnorm_df.reset_index(inplace=True)
# changed_df.reset_index(inplace=True)
# changed_df.to_csv('/content/gdrive/MyDrive/685/dataset/changed_words_snli.csv')
# norm_unnorm_df.to_csv('/content/gdrive/MyDrive/685/dataset/norm_unnorm_snli_new.csv')

changed_df = pd.DataFrame(columns = ['changed_words'])
changed_df['changed_words'] = changed
norm_unnorm_df.reset_index(inplace=True)
changed_df.reset_index(inplace=True)
changed_df.to_csv('/content/gdrive/MyDrive/685/dataset/changed_words_xsum.csv')
norm_unnorm_df.to_csv('/content/gdrive/MyDrive/685/dataset/xsum_test_new.csv')

# Training the Masking model to predict which tokens to mask

In [87]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, num_workers=2)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, num_workers=2)

In [88]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

## Getting the validation accuracy 

In [97]:
def get_val_accuracy(model_new, btf, val_dataset):
    acc_num, acc_den = 0, 0

    for i in range(len(val_dataset)):
        _, ground_truth, unnorm_sent, norm_sent, _, to_mask = val_dataset[i]

        x = btf(unnorm_sent, return_tensors='pt')

        input_ids = x['input_ids'].to(device)
        attention_mask = x['attention_mask'].to(device)
        token_ids = x['token_type_ids'].to(device)
        logits = model_new(input_ids, attention_mask=attention_mask, token_type_ids=token_ids).logits.squeeze()

        probs = torch.softmax(logits, dim=1).squeeze().cpu().detach().numpy()

        preds = np.argmax(probs, axis=1)
        labels = ground_truth.squeeze().cpu().detach().numpy()
        n = probs.shape[0]

        acc_num += np.sum(preds==labels[:n])
        acc_den += n

    return acc_num/acc_den

## Training the model 

In [90]:
from transformers import BertForTokenClassification, BertTokenizerFast

btf = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
def train(model, optimizer, train_loader, EPOCHS = 10):
    for epoch in range(EPOCHS):
        total_loss = 0
        for batch, labels in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            token_ids = batch['token_type_ids'].squeeze(1).to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_ids, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} => Total Loss: {total_loss}")
        model.save_pretrained(f"/content/gdrive/MyDrive/685/models/predict_masks_xsum/{epoch}")
        val_acc = get_val_accuracy(model, btf, val_dataset)
        print(f"Epoch {epoch+1} => Validation Accuracy: {val_acc}")

train(model, optimizer, train_loader, 10)

Epoch 1 => Total Loss: 117.36510821990669
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
Epoch 1 => Validation Accuracy: 0.9864371663080723
Epoch 2 => Total Loss: 54.325558891519904
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
Epoch 2 => Validation Accuracy: 0.9885319274325852
Epoch 3 => Total Loss: 42.68788924487308
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
Epoch 3 => Validation Accuracy: 0.9884172959882739
Epoch 4 => Total Loss: 36.77711867680773
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
Epoch 4 => Validation Accuracy: 0.9907507641365257


## Loading the pretrained model from drive

In [95]:
model_cola = BertForTokenClassification.from_pretrained("/content/gdrive/MyDrive/685/models/predict_masks", num_labels=2).to(device)
model_xsum = BertForTokenClassification.from_pretrained("/content/gdrive/MyDrive/685/models/predict_masks_xsum/3", num_labels=2).to(device)

## Get accuracy on val and test data of model trained on CoLA and XSum

In [98]:
print(f"Cola Model accuracy on validation data => {get_val_accuracy(model_cola, btf, val_dataset)}")
print(f"Cola Model accuracy on test data => {get_val_accuracy(model_cola, btf, test_dataset)}")

Cola Model accuracy on validation data => 0.9300112088454889
Cola Model accuracy on test data => 0.9309912488443662


In [99]:
print(f"XSum Model accuracy on validation data => {get_val_accuracy(model_xsum, btf, val_dataset)}")
print(f"Xsum Model accuracy on test data => {get_val_accuracy(model_xsum, btf, test_dataset)}")

XSum Model accuracy on validation data => 0.9918715682420522
Xsum Model accuracy on test data => 0.9916871004327523


# Making Predictions

In [100]:
model_new = model_xsum
btf = BertTokenizerFast.from_pretrained('bert-base-uncased')

x = btf("Let's nt killlll each other.", return_tensors='pt')

input_ids = x['input_ids'].to(device)
attention_mask = x['attention_mask'].to(device)
token_ids = x['token_type_ids'].to(device)
logits = model_new(input_ids, attention_mask=attention_mask, token_type_ids=token_ids).logits.squeeze()

torch.softmax(logits, dim=1)

tensor([[9.9988e-01, 1.1852e-04],
        [9.9822e-01, 1.7768e-03],
        [9.9857e-01, 1.4344e-03],
        [9.9726e-01, 2.7406e-03],
        [1.1051e-04, 9.9989e-01],
        [1.5099e-04, 9.9985e-01],
        [7.2227e-05, 9.9993e-01],
        [8.7432e-05, 9.9991e-01],
        [9.9957e-01, 4.3445e-04],
        [9.9941e-01, 5.8530e-04],
        [9.9999e-01, 8.8733e-06],
        [9.9999e-01, 9.4282e-06]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [101]:
sent = "Ireland levelled theee fIve-match oned-ay srisss against Afghanistan attttt 1-1 with a six-iwcket victttoryyyy at Stormont ."

x = btf(sent, return_tensors='pt')

input_ids = x['input_ids'].to(device)
attention_mask = x['attention_mask'].to(device)
token_ids = x['token_type_ids'].to(device)
logits = model_new(input_ids, attention_mask=attention_mask, token_type_ids=token_ids).logits.squeeze()

probs = torch.softmax(logits, dim=1).squeeze().cpu().detach().numpy()
n = probs.shape[0]

words = nltk.word_tokenize(sent)
words_ids = x.word_ids()
all_words = []

for i in range(n):
    if probs[i][1] > 0.5:
        words[words_ids[i]] = '[MASK]'
print(" ".join(words))

Ireland levelled [MASK] fIve-match [MASK] [MASK] against Afghanistan [MASK] 1-1 with a six-iwcket [MASK] at Stormont .
