In [1]:
# Reference: https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a

# Setup

In [2]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification
import torch
from torch.utils.data import DataLoader
from torch.optim import SGD
from tqdm import tqdm

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Please update the file path here
file_path = '/content/drive/MyDrive/Colab Notebooks/data/capstone/anno_14_tc.csv'

# Data Preparation

### Load in data

In [6]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Word,IOB-tag
0,S1,*
1,Enter,O
2,email,O
3,address,O
4,to,O


### Reformat dataset

In [7]:
# Sents -- list of tuples (sentence, tags)
sents = []
sent, tags = [], []
for idx, row in df.iterrows():
  word, tag = row['Word'], row['IOB-tag']
  if tag == '*':
    if len(sent) > 0:
        # sent = ' '.join(sent)
        sents.append((sent, tags))
        sent, tags = [], []
    else:
        continue
  else:
    sent.append(word)
    tags.append(tag)

In [8]:
# Hyperparameter - Max length
MAX_LENGTH = max(len(sent) for sent, _ in sents)
MAX_LENGTH

16

In [9]:
# Tags
tags = df['IOB-tag'].values.tolist()
unique_tags = set(tags) - set('*')

tags2ids = {k: v for v, k in enumerate(sorted(unique_tags))}
ids2tags = {v: k for v, k in enumerate(sorted(unique_tags))}

print(tags2ids)
print(ids2tags)

{'B-location': 0, 'B-time': 1, 'B-value': 2, 'I-location': 3, 'I-time': 4, 'I-value': 5, 'O': 6}
{0: 'B-location', 1: 'B-time', 2: 'B-value', 3: 'I-location', 4: 'I-time', 5: 'I-value', 6: 'O'}


### Tokenization



In [10]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

##### Try tokenization with one example

In [11]:
# ' '.join(sents[1][0])

In [12]:
# text_tokenized = tokenizer(' '.join(sents[1][0]), padding='max_length', max_length=MAX_LENGTH, return_tensors="pt")
# text_tokenized['input_ids']

In [13]:
# tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0])

In [14]:
# word_ids = text_tokenized.word_ids()
# print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))
# print(word_ids)

### Align tags to tokenized texts

In [15]:
def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=MAX_LENGTH)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(tags2ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(tags2ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

##### Try align tags for one

In [16]:
# print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))
# print(align_label(' '.join(sents[1][0]), sents[1][1]))

### Data Class

In [17]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, tagged_sents):
        self.sents = [' '.join(sent) for sent, _ in tagged_sents]
        self.tags = [tags for _, tags in tagged_sents]
        self.texts = [tokenizer(' '.join(sent), padding='max_length', max_length = MAX_LENGTH, return_tensors="pt") for sent, _ in tagged_sents]
        self.labels = [align_label(' '.join(sent), tag) for sent, tag in tagged_sents]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

### Split train and test 

In [18]:
train_set, dev_set = train_test_split(sents, test_size=0.2, shuffle=True)

In [19]:
print(len(train_set), len(dev_set))

69 18


# NER Model

### Load pre-trained BERT model

In [20]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_tags))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

### Evaluation

In [21]:
def evaluate(sys_spacy_data, gold_spacy_data):
    precision, recall, fscore = 0, 0, 0

    tp = 0
    fp = 0
    fn = 0

    for sys_ex, gold_ex in zip(sys_spacy_data, gold_spacy_data):
        gold_annotations = set([tuple(e) for e in gold_ex])
        sys_annotations = set([tuple(e) for e in sys_ex])

        tp += len(sys_annotations.intersection(gold_annotations))
        fp += len(sys_annotations.difference(gold_annotations))
        fn += len(gold_annotations.difference(sys_annotations))

    if tp != 0:
        recall = (tp/(tp+fn)) * 100
        precision = (tp/(tp+fp)) * 100
        fscore = 2*recall*precision/(recall+precision)

    return precision, recall, fscore

### Tagger

In [22]:
# predict tags for sentences
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=MAX_LENGTH)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
          label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def predict(model, sentence):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = MAX_LENGTH, return_tensors="pt")

    mask = text['attention_mask'][0].unsqueeze(0).to(device)

    input_id = text['input_ids'][0].unsqueeze(0).to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids2tags[i] for i in predictions]
    return prediction_label

### Train data

In [23]:
def train_loop(model, train_set, dev_set):

    train_dataset = DataSequence(train_set)
    dev_dataset = DataSequence(dev_set)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=1, shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label[0].to(device)
            mask = train_data['attention_mask'][0].to(device)
            input_id = train_data['input_ids'][0].to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            logits_clean = logits[0][train_label != -100]
            label_clean = train_label[train_label != -100]

            predictions = logits_clean.argmax(dim=1)

            acc = (predictions == label_clean).float().mean()
            total_acc_train += acc
            total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        predictions = [predict(model, sent) for sent in dev_dataset.sents]
        p,r,f = evaluate(predictions, dev_dataset.tags)
        print("  PRECISION: %.2f%%, RECALL: %.2f%%, F-SCORE: %.2f%%" % (p,r,f))

In [24]:
# Hyperparameters
LEARNING_RATE = 1e-2
EPOCHS = 5

model = BertModel()
train_loop(model, train_set, dev_set)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

  PRECISION: 94.34%, RECALL: 86.21%, F-SCORE: 90.09%


100%|██████████| 69/69 [00:03<00:00, 17.57it/s]


  PRECISION: 98.15%, RECALL: 91.38%, F-SCORE: 94.64%


100%|██████████| 69/69 [00:03<00:00, 22.81it/s]


  PRECISION: 100.00%, RECALL: 96.55%, F-SCORE: 98.25%


100%|██████████| 69/69 [00:03<00:00, 22.88it/s]


  PRECISION: 95.08%, RECALL: 100.00%, F-SCORE: 97.48%


100%|██████████| 69/69 [00:03<00:00, 22.52it/s]


  PRECISION: 100.00%, RECALL: 100.00%, F-SCORE: 100.00%


# Save model

In [25]:
model_path = '/content/drive/MyDrive/Colab Notebooks/ckpt/katalon-bert-tagger.pt'
torch.save(model.state_dict(), model_path)

# Load model

In [26]:
model = BertModel()
model.load_state_dict(torch.load(model_path))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

<All keys matched successfully>

# Results on Dev set

In [27]:
for sent, gold_tags in dev_set: 
    sent = ' '.join(sent)
    sys = predict(model, sent)
    print(sent)
    print('Predict: ')
    print(sys)
    print('Gold: ')
    print(gold_tags)
    print()

Click submit button
Predict: 
['O', 'B-value', 'I-value']
Gold: 
['O', 'B-value', 'I-value']

Click submit button
Predict: 
['O', 'B-value', 'I-value']
Gold: 
['O', 'B-value', 'I-value']

Click button Create Library
Predict: 
['O', 'O', 'B-value', 'I-value']
Gold: 
['O', 'O', 'B-value', 'I-value']

Click button Submit
Predict: 
['O', 'O', 'B-value']
Gold: 
['O', 'O', 'B-value']

Verify create library button appears
Predict: 
['O', 'B-value', 'I-value', 'I-value', 'O']
Gold: 
['O', 'B-value', 'I-value', 'I-value', 'O']

Enter password to Password textbox Admin@123
Predict: 
['O', 'O', 'O', 'B-location', 'I-location', 'B-value', 'B-value', 'B-value']
Gold: 
['O', 'O', 'O', 'B-location', 'I-location', 'B-value']

Click submit button
Predict: 
['O', 'B-value', 'I-value']
Gold: 
['O', 'B-value', 'I-value']

Verify the add question button appears
Predict: 
['O', 'O', 'B-value', 'I-value', 'I-value', 'O']
Gold: 
['O', 'O', 'B-value', 'I-value', 'I-value', 'O']

Get password error message
Pred