<a href="https://colab.research.google.com/github/matthewleechen/woodcroft_patents/blob/main/ner/notebooks/ner_patents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is a modified version of Niels Rogge's (extremely helpful!) notebook, "Fine-tuning BERT for named entity recognition", linked [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb). 

It is **not** recommended to run this notebook on the Colab free plan. This notebook's training loop was originally run using Colab Pro on 1 Nvidia Tesla V100 (16GB) GPU. You can also run this locally on a virtual machine or server, but carefully check for dependencies.

This notebook uses the BERT model 'bert-base-uncased' (linked [here](https://huggingface.co/bert-base-uncased)), but any pre-trained model on [HuggingFace](https://huggingface.co) can be used. I have experimented with BERT (base, both cased and uncased), RoBERTa (base and distilled), XLNet (base) and SBERT (fine-tuned MPNet) models, and found no meaningful differences in performance on the patents data given the hyperparameters set below (validation accuracy 97-98%).

**Setup**

In [None]:
%%capture
!pip install transformers seqeval[gpu]
!pip install conllu

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import set_seed, AutoConfig, AutoTokenizer, AutoModelForTokenClassification, pipeline
import conllu
import csv
import json
import os
from tqdm import tqdm

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Set seed (transformers.set_seed) with a fixed value for reproducibility
set_seed(42)

**Data Preprocessing**

Upload .conll dataset exported from Label Studio

In [None]:
# Visualize annotations as .conll dataset
# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded (the handle was previously left open).
with open("/path/to/ner_patents.conll", mode="r", encoding="utf-8") as data:
    annotations = data.read()

# Preview only the first 1000 characters of the raw export
print(annotations[:1000])

In [None]:
# Convert the .conll export into a flat CSV with one row per token:
# (sentence_no, word, tag).
# define input and output file names
input_file = "ner_patents.conll" ## assume name of .conll file is ner_patents
output_file = "ner_patents.csv" ## same here

# Both files are opened in a single `with` so the CSV is flushed and closed
# before any later cell reads it back; the writer's file handle used to be
# left open, which can leave buffered rows unwritten.
with open(input_file, "r", encoding="utf-8") as f, \
        open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    # initialize the csv writer and write the header row
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["sentence_no", "word", "tag"])

    # read the input file line by line
    sentence_num = 1
    for line in f:
        line = line.strip()
        if not line:
            # empty line denotes end of sentence
            sentence_num += 1
            continue

        # split line into whitespace-separated columns: the first is the
        # token, the last is its tag (str.split() never yields empty strings,
        # so no placeholder value is needed)
        columns = line.split()
        word = columns[0]
        tag = columns[-1]
        # write row to csv file
        csv_writer.writerow([sentence_num, word, tag])

In [None]:
# Visualize data
# Load the token-level CSV, drop its first data row, persist the result and
# preview it.
# NOTE(review): row 0 is presumably the export's document-header token
# (e.g. -DOCSTART-) - confirm against the .conll file.
# NOTE: the file is rewritten in place, so re-running this cell drops one
# more row each time.
#
# The CSV is written as UTF-8, so it is read back as UTF-8 ('unicode_escape'
# would mis-decode non-ASCII characters and backslashes).
# keep_default_na=False keeps literal tokens such as "NA" or "null" as strings
# instead of float NaN, which would break the string joins done when words
# are grouped by sentence.
data = (
    pd.read_csv("ner_patents.csv", encoding="utf-8", keep_default_na=False)
    .drop(0)
    .reset_index(drop=True)
)
data.to_csv("ner_patents.csv", index=False, encoding="utf-8")
data.head()

In [None]:
# Group by sentence
# Every token row receives two sentence-level columns, computed per sentence_no:
#   "sentence"    - all words of the sentence joined with single spaces
#   "word_labels" - all tags of the sentence joined with commas
data['sentence'] = (
    data.groupby('sentence_no')['word']
        .transform(lambda words: ' '.join(words))
)
data['word_labels'] = (
    data.groupby('sentence_no')['tag']
        .transform(lambda tags: ','.join(tags))
)
# Show data
data.head()

In [None]:
# Build the two lookups between tags and integer indices; tags are numbered
# in the order in which they first appear in the data
id2label = dict(enumerate(data.tag.unique()))
label2id = {tag: index for index, tag in id2label.items()}
label2id

In [None]:
# Collapse the token-level rows to the two sentence-level columns and remove
# duplicate rows, then renumber the index from 0
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

**Dataset and Dataloader**

In [None]:
# Define variables
# MAX_LEN: token length every example is truncated or padded to (special tokens included)
MAX_LEN = 128
# Batch sizes of the training and evaluation DataLoaders
TRAIN_BATCH_SIZE = 120
VALID_BATCH_SIZE = 60
# Number of passes over the training set made by the training loop
EPOCHS = 150
# Learning rate handed to the Adam optimizer
LEARNING_RATE = 1e-05
# max_norm passed to gradient clipping in the training step
MAX_GRAD_NORM = 10
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # can change tokenizer here

In [None]:
# Word-level tokenization
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
class dataset(Dataset):
    """Torch dataset turning (sentence, word_labels) rows into fixed-length model inputs.

    Each item is tokenized word by word, wrapped in [CLS]/[SEP], truncated or
    padded to ``max_len`` and converted to tensors.

    Relies on two names defined elsewhere in the notebook: the
    ``tokenize_and_preserve_labels`` function and the ``label2id`` mapping
    (which must contain the "O" label used for special and padding tokens).

    Args:
        dataframe: frame with ``sentence`` and ``word_labels`` columns,
            indexed so that ``column[index]`` works for 0..len-1.
        tokenizer: tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length every example is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return a dict of ``ids``, ``mask`` and ``targets`` long tensors of length ``max_len``."""
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        # The outside label goes at both ends. `labels.insert(-1, "O")` must not
        # be used for [SEP]: it inserts *before* the last element, which shifts
        # the final word's label onto [SEP] and labels the final word "O".
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding to exactly max_len entries
        # NOTE: when a sentence is truncated, the trailing [SEP] is cut off too.
        maxlen = self.max_len
        tokenized_sentence = tokenized_sentence[:maxlen]
        labels = labels[:maxlen]
        n_pad = maxlen - len(tokenized_sentence)
        tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad
        labels = labels + ["O"] * n_pad

        # step 4: obtain the attention mask (0 on padding, 1 elsewhere)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids and labels to label ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.len

In [None]:
# Split the sentences into training and test sets: a random sample (fixed
# random_state) goes to training, every remaining row goes to testing
train_size = 0.8
train_dataset = data.sample(frac=train_size, random_state=200)
test_dataset = data[~data.index.isin(train_dataset.index)].reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shapes of the full data and of both splits
print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits in the torch dataset defined above
training_set = dataset(dataframe=train_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = dataset(dataframe=test_dataset, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Sanity check: print the first 30 tokens of the first training example
# side by side with their labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

In [None]:
# DataLoader settings for each split; both loaders shuffle and load data in
# the main process (num_workers=0)
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)


**Define the model**

In [None]:
# Load the pretrained checkpoint for token classification, with one output
# class per tag and both tag <-> index mappings passed along
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', # can change model here
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
# Move the model to the device selected earlier
model.to(device)

**Training**

In [None]:
# Sanity check before training: push the first training example through the
# model as a batch of one (unsqueeze adds the batch dimension) and look at the loss
ids = training_set[0]["ids"].unsqueeze(0).to(device)
mask = training_set[0]["mask"].unsqueeze(0).to(device)
targets = training_set[0]["targets"].unsqueeze(0).to(device)

outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
# The loss is the first element of the model output
initial_loss = outputs[0]
initial_loss

tensor(2.7869, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
# The second element of the model output holds the logits; inspect their shape
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 17])

In [None]:
# Adam over all model parameters, using the learning rate defined above
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one training epoch over ``training_loader`` and print loss and accuracy.

    Uses the notebook-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: index of the current epoch. Kept for the caller's signature;
            it is not used inside the function.

    Prints the running mean loss every 100 steps and, at the end, the mean
    loss and the mean per-batch token accuracy of the epoch.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)
        # use the attention mask to compare only non-padding positions
        # (this includes the [CLS] and [SEP] positions)
        active_accuracy = mask.view(-1) == 1  # shape (batch_size * seq_len,)
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        active_predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must come after backward() and before step().
        # Clipping before zero_grad()/backward() only touches the previous
        # step's gradients, which are then discarded, so the update would
        # be applied unclipped.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
# Training loop for model
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

**Evaluate model**

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return gold/predicted tags.

    Uses the notebook-level ``device`` and ``id2label``.

    Args:
        model: token-classification model; called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        testing_loader: iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        ``(labels, predictions)``: two flat lists of tag strings covering
        every position whose attention mask is 1, in batch order.

    Prints the running mean loss every 100 steps, then the mean loss and
    the mean per-batch accuracy.
    """
    # put model in evaluation mode
    model.eval()

    running_loss = 0
    running_accuracy = 0
    step_count = 0
    gold_ids = []
    predicted_ids = []

    with torch.no_grad():
        for step, batch in enumerate(testing_loader):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention = batch['mask'].to(device, dtype=torch.long)
            gold = batch['targets'].to(device, dtype=torch.long)

            outputs = model(input_ids=input_ids, attention_mask=attention, labels=gold)
            running_loss += outputs.loss.item()
            step_count += 1

            if step % 100 == 0:
                loss_step = running_loss / step_count
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Highest-scoring class per position, flattened over the batch
            best_class = outputs.logits.view(-1, model.num_labels).argmax(dim=1)
            # Keep only positions the attention mask marks as real tokens
            # (this still includes [CLS] and [SEP])
            keep = attention.view(-1) == 1
            batch_gold = gold.view(-1)[keep]
            batch_pred = best_class[keep]

            gold_ids.extend(batch_gold.tolist())
            predicted_ids.extend(batch_pred.tolist())

            running_accuracy += accuracy_score(batch_gold.cpu().numpy(), batch_pred.cpu().numpy())

    # Map integer ids back to tag strings
    labels = [id2label[label_id] for label_id in gold_ids]
    predictions = [id2label[label_id] for label_id in predicted_ids]

    eval_loss = running_loss / step_count
    eval_accuracy = running_accuracy / step_count
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
# Evaluate on the test loader; keeps the gold and predicted tag lists for the report below
labels, predictions = valid(model, testing_loader)

In [None]:
from seqeval.metrics import classification_report

# Each flat list is wrapped in an outer list, i.e. handed over as one single sequence
print(classification_report([labels], [predictions]))

In [None]:
# Save model weights and tokenizer
# Both are written to the same directory, which the inference section loads them back from
model_path = "/path/to/directory"

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

**Inference**

The code below runs inference on the merged text boxes (each is referred to as a "sentence") using the HuggingFace Pipelines API - documentation is linked [here](https://huggingface.co/docs/transformers/main_classes/pipelines).

This assumes that you have an input directory consisting of .txt files, and an output directory that you want the .csv files to be exported to. The output is a csv file containing the labelled classes as columns, with each patent recorded as a row (observation). Entities of a class can be output to separate numbered columns instead of one joined column, e.g. for the classes "PER", "LOC" and "OCC"; as written, the code below does this only for "PER" ("PER_1", "PER_2", ...) and joins multiple entities of every other class with "& ". You may need to modify this code depending on your desired output format.

Running this code on a GPU is strongly recommended. An Nvidia Tesla T4 GPU (provided on the Colab free plan) is orders of magnitude faster than the CPU. On the T4, inference on 1000 patents takes approximately 15-20 seconds, but it takes several hours on the Colab CPU.

In [None]:
# Load the saved model weights and configuration
# Model and tokenizer are both read from the directory they were saved to (model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Set input and output directories
# input_dir is scanned for .txt files; one .csv per .txt file is written to output_dir
input_dir = "/path/to/input/dir"
output_dir = "/path/to/output/dir"

In [None]:
# Deploy Pipeline API: device = 0 selects the first GPU, device = -1 (the default) runs on the CPU.
# The GPU is requested only when CUDA is actually available, so this cell also
# runs on CPU-only machines instead of failing on a hard-coded device = 0.
pipe = pipeline(task="token-classification", model=model, device=0 if torch.cuda.is_available() else -1, tokenizer=tokenizer, aggregation_strategy="simple")

In [None]:
# Loop over all .txt files in input directory
for filename in tqdm(os.listdir(input_dir)):
    if filename.endswith(".txt"):
        # Specify file paths
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename[:-4] + ".csv")  # Remove .txt extension and add .csv extension

        # Read sentences from text file
        with open(input_path, "r") as f:
            sentences = f.read().split("\n\n")

        # Create list of dictionaries to store entities for each sentence
        all_entities = []
        fieldnames = ["NUM", "PER", "DATE", "LOC", "COMM", "OCC", "MISC", "INFO"]  # Specify the fieldnames

        for sentence in sentences:
            # Extract entities
            combined_entities = {}
            for entity in pipe(sentence):
                entity_group = entity['entity_group']
                word = entity['word']
                if entity_group not in combined_entities:
                    combined_entities[entity_group] = []
                combined_entities[entity_group].append(word)

            # Create a new dictionary to store the updated entities
            updated_entities = {}
            for entity_group, words in combined_entities.items():
                if entity_group in ["PER"]:
                    for i, word in enumerate(words):
                        column_name = f"{entity_group}_{i + 1}"
                        if column_name not in fieldnames:
                            updated_entities[column_name] = word
                else:
                    updated_entities[entity_group] = '& '.join(words)

            # Add updated entities for this sentence to list
            all_entities.append(updated_entities)

        # Update fieldnames to include separate columns for PER, LOC, and OCC
        max_columns = {key: 0 for key in ["PER"]}
        for entity_dict in all_entities:
            for key in max_columns.keys():
                max_columns[key] = max(max_columns[key], len([k for k in entity_dict.keys() if k.startswith(key)]))

        for key, count in max_columns.items():
            for i in range(count):
                column_name = f"{key}_{i + 1}"
                if column_name not in fieldnames:
                    fieldnames.append(column_name)

        # Write entities to CSV file
        with open(output_path, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_entities)