In [2]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import Dataset, DatasetDict
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, get_scheduler
import evaluate
from accelerate import Accelerator
import shutil

In [3]:
df = pd.read_parquet('/kaggle/input/ner_data_raw.prqt')
df = df.sample(frac=1, random_state=111).reset_index(drop=True)

label2id = {
    'O': 0,
    'B-TARGET': 1, 
    'I-TARGET': 2,
    'B-SUBSTRATE': 3,
    'I-SUBSTRATE': 4
}
id2label = {v: k for k, v in label2id.items()}
label_names = list(label2id.keys())

df['tokens'] = df.ner_data.apply(lambda x: [_[0] for _ in x])
df['ner_tags'] = df.ner_data.apply(lambda x: [label2id[_[1]] for _ in x])

raw_datasets = DatasetDict(
    {
    'train': Dataset.from_pandas(df[['tokens', 'ner_tags']].iloc[:7000, :]),
    'validation': Dataset.from_pandas(df[['tokens', 'ner_tags']].iloc[7000:7500, :]),
    'test': Dataset.from_pandas(df[['tokens', 'ner_tags']].iloc[7500:, :])
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 577
    })
})

In [4]:
model_checkpoint = "michiyasunaga/BioLinkBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
model.to('cuda')

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28895, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [5]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [6]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [7]:
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=64,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=64
)
test_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=64
)

optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

! mkdir linkbert_bioassay_ner
output_dir = "linkbert_bioassay_ner"

In [8]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):

    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

  0%|          | 0/550 [00:00<?, ?it/s]

epoch 0: {'precision': 0.9138166894664843, 'recall': 0.835, 'f1': 0.8726322664924886, 'accuracy': 0.9782276268841477}
epoch 1: {'precision': 0.9233926128590971, 'recall': 0.8522727272727273, 'f1': 0.8864084044648719, 'accuracy': 0.9778635403771936}
epoch 2: {'precision': 0.9247606019151847, 'recall': 0.8813559322033898, 'f1': 0.9025367156208278, 'accuracy': 0.9820141265564698}
epoch 3: {'precision': 0.9220246238030095, 'recall': 0.8730569948186528, 'f1': 0.8968729208250165, 'accuracy': 0.9812859535425618}
epoch 4: {'precision': 0.9247606019151847, 'recall': 0.8745148771021992, 'f1': 0.898936170212766, 'accuracy': 0.9816500400495157}


In [9]:
model.eval()
for batch in test_dataloader:
    batch.to('cuda')
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(
    f"Final metric on test set:",
    {
        key: results[f"overall_{key}"]
        for key in ["precision", "recall", "f1", "accuracy"]
    },
)

Final metric on test set: {'precision': 0.9422632794457275, 'recall': 0.8898582333696837, 'f1': 0.9153112731351654, 'accuracy': 0.9840529648819805}


In [10]:
text = 'Activation of human beta-adrenergic receptor by isoproterenol at 0.5 uM measured as increase in cAMP levels'

example = tokenizer(text, return_tensors="pt").to('cuda')

with torch.no_grad():
    logits = model(**example).logits

predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

tokens = tokenizer.convert_ids_to_tokens(example['input_ids'][0])
max_length = max(len(token) for token in tokens)

for token, label in zip(tokens, predicted_token_class):
    print(f"{token: <{max_length}} {label}")

[CLS]         O
activation    O
of            O
human         O
beta          B-TARGET
-             I-TARGET
adrenergic    I-TARGET
receptor      I-TARGET
by            O
isoproterenol B-SUBSTRATE
at            O
0             O
.             O
5             O
um            O
measured      O
as            O
increase      O
in            O
camp          O
levels        O
[SEP]         O


In [11]:
source_directory = 'linkbert_bioassay_ner'
archive_path = 'linkbert_bioassay_ner.zip'
shutil.make_archive(archive_path.replace('.zip', ''), 'zip', source_directory)

'/kaggle/working/linkbert_bioassay_ner.zip'