In [None]:
!pip install -U transformers datasets accelerate evaluate seqeval

In [86]:
import os
import random
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Any

import torch
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from evaluate import load as load_metric


### Paths 

In [61]:
# Directory that holds labels.txt and the bc5cdr_*.conll split files read below.
DATA_DIR = "../data/"  
# Checkpoint name handed to AutoTokenizer.from_pretrained further down.
MODEL_NAME = "bert-base-cased"  


### Load label list & mappings

In [63]:

# Read one label per line. The context manager closes the file handle
# deterministically, and blank lines are skipped so they can never turn into
# an empty-string label with its own id.
with open(os.path.join(DATA_DIR, "labels.txt"), encoding="utf-8") as labels_file:
    label_list: List[str] = [line.strip() for line in labels_file if line.strip()]

# Lookups in both directions: the id of a label is its position in label_list.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [64]:
id2label

{0: 'O', 1: 'B-CHEMICAL', 2: 'I-CHEMICAL', 3: 'B-DISEASE', 4: 'I-DISEASE'}

In [65]:
label2id

{'O': 0, 'B-CHEMICAL': 1, 'I-CHEMICAL': 2, 'B-DISEASE': 3, 'I-DISEASE': 4}

In [66]:

def read_conll(path: str):
    """Parse a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line is split on whitespace; its first field is used as
    the token and its last field as the tag (a single-field line therefore
    uses that field for both). Blank or whitespace-only lines end the current
    sentence.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, tags)`` tuple where ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``tags[i]`` is the list of its tags,
        both of the same length.
    """
    sentences, tags = [], []
    cur_tokens, cur_tags = [], []

    with open(path, encoding="utf-8") as f:
        for line in f:
            # split() without arguments ignores all surrounding whitespace, so a
            # line holding only spaces, tabs or a stray "\r" yields no fields.
            # Such lines count as sentence separators instead of raising
            # IndexError on parts[0].
            parts = line.split()
            if not parts:
                if cur_tokens:
                    sentences.append(cur_tokens)
                    tags.append(cur_tags)
                    cur_tokens, cur_tags = [], []
                continue

            # each line: "<token> ... <tag>"
            cur_tokens.append(parts[0])
            cur_tags.append(parts[-1])

    # Flush the last sentence when the file does not end with a blank line.
    if cur_tokens:
        sentences.append(cur_tokens)
        tags.append(cur_tags)

    return sentences, tags


# Resolve the location of each BC5CDR split file inside DATA_DIR.
train_path = os.path.join(DATA_DIR, "bc5cdr_train.conll")
val_path = os.path.join(DATA_DIR, "bc5cdr_validation.conll")
test_path = os.path.join(DATA_DIR, "bc5cdr_test.conll")

# Parse every split into parallel lists of token sequences and tag sequences.
train_sents, train_tags = read_conll(train_path)
val_sents, val_tags = read_conll(val_path)
test_sents, test_tags = read_conll(test_path)


In [67]:
# Wrap in HF datasets
def to_hf_dataset(sents, tags):
    """Build a ``Dataset`` with a "tokens" column from ``sents`` and a "ner_tags" column from ``tags``."""
    columns = {"tokens": sents, "ner_tags": tags}
    return Dataset.from_dict(columns)

# Wrap each split on its own, then collect them under the keys
# "train", "validation" and "test".
train_ds = to_hf_dataset(train_sents, train_tags)
validation_ds = to_hf_dataset(val_sents, val_tags)
test_ds = to_hf_dataset(test_sents, test_tags)

data = DatasetDict({"train": train_ds, "validation": validation_ds, "test": test_ds})


In [68]:
ner_tags = data['train'][0]['ner_tags']
print(ner_tags)
print(len(ner_tags))

['O', 'O', 'O', 'B-DISEASE', 'O', 'O']
6


### Model Building (Tokenizer and model)

#### Tokenization 

In [69]:

# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [70]:
tokenizer.is_fast

True

In [71]:
tokens = data['train'][0]['tokens']#original tokens 
print(tokens)
print(len(tokens))

['Antihypertensive', 'drugs', 'and', 'depression:', 'a', 'reappraisal.']
6


In [72]:
# Tokenize the first training sentence. The sentence is already a list of
# words, which is what is_split_into_words=True signals to the tokenizer.
first_sentence_words = data['train'][0]['tokens']
inputs = tokenizer(first_sentence_words, is_split_into_words=True)

In [73]:
# Show the raw encoding, the sub-word tokens it maps to, and the token count.
print(inputs)
print(inputs.tokens())
print(len(inputs.tokens()) - 2 )  # token count without the two special tokens [CLS] and [SEP]

{'input_ids': [101, 8329, 7889, 17786, 5026, 2109, 5557, 1105, 7560, 131, 170, 1231, 11478, 20488, 15630, 1233, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'Anti', '##hy', '##pert', '##ens', '##ive', 'drugs', 'and', 'depression', ':', 'a', 're', '##ap', '##pra', '##isa', '##l', '.', '[SEP]']
16


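- Here we can see that the tokenizer splits single words into multiple sub-word tokens (for example, `Antihypertensive` is divided into `Anti`, `##hy`, `##pert`, `##ens`, `##ive`), so the 6 original words become 16 tokens.
- The word-level labels therefore no longer line up with the tokens, and we need to fix this misalignment.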

In [74]:

print(inputs.word_ids())

[None, 0, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 5, 5, 5, 5, 5, None]


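- The word IDs tell us which original word each sub-token came from: repeated IDs (like 3, 3) mean that several sub-tokens belong to the same original word, and `None` marks the special tokens.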

#### Preprocessing for Misaligned Labels

In [75]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag strings to one label id per sub-token.

    Args:
        labels: Tag strings, one per original word.
        word_ids: For every sub-token, the index of the word it came from,
            or ``None`` for positions that belong to no word.

    Returns:
        A list as long as ``word_ids``. The first sub-token of each word gets
        ``label2id`` of that word's tag; every other position gets -100.
    """
    aligned = []
    previous_word = None

    for wid in word_ids:
        starts_new_word = wid is not None and wid != previous_word

        if starts_new_word:
            previous_word = wid
            aligned.append(label2id[labels[wid]])
        else:
            # -100 means "don't compute loss for this position", i.e. it is
            # the ignore_index of PyTorch's CrossEntropyLoss. It is used for
            # positions without a word and for the non-first pieces of a word.
            aligned.append(-100)

    return aligned

In [76]:
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels)
print(word_ids)


['O', 'O', 'O', 'B-DISEASE', 'O', 'O']
[None, 0, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 5, 5, 5, 5, 5, None]


In [77]:
new_labels_ids = align_labels_with_tokens(labels,word_ids)
print(new_labels_ids)

[-100, 0, -100, -100, -100, -100, 0, 0, 3, -100, 0, 0, -100, -100, -100, -100, -100, -100]


In [78]:
def tokenize_and_align_labels(batch):
    """Tokenize a batch of pre-split sentences and attach sub-token labels.

    Args:
        batch: Mapping with a "tokens" entry (one list of words per example)
            and a "ner_tags" entry (one list of tag strings per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds, per example, the list returned by ``align_labels_with_tokens``.
    """
    # truncation=True caps every example at the tokenizer's maximum length;
    # without it an over-long sentence is encoded in full and can exceed what
    # the model accepts. The labels are derived from word_ids() of the
    # (possibly truncated) encoding, so they stay aligned with input_ids.
    tokenized_inputs = tokenizer(
        batch["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(batch_index=i))
        for i, labels in enumerate(batch["ner_tags"])
    ]
    return tokenized_inputs

In [79]:
# Run tokenization and label alignment over every split, one batch at a time.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4648 [00:00<?, ? examples/s]

Map:   0%|          | 0/4657 [00:00<?, ? examples/s]

Map:   0%|          | 0/4861 [00:00<?, ? examples/s]

In [80]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4648
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4657
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4861
    })
})

In [81]:
# Drop the columns that are not needed as model inputs.
# NOTE(review): this rebinds tokenized_datasets to its own reduced copy, so
# re-running the cell will presumably fail because the columns are already
# gone — confirm.
tokenized_datasets = tokenized_datasets.remove_columns(["tokens", "ner_tags","token_type_ids"])

For token classification we only need `input_ids`, `attention_mask` and `labels`, so the other columns can be removed.

In [82]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4648
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4657
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4861
    })
})

In [83]:
# NOTE(review): this maps the original `data` again and rebinds
# tokenized_datasets, so the result of the earlier remove_columns cell is
# discarded and the dataset saved afterwards still contains "tokens",
# "ner_tags" and "token_type_ids" — confirm whether that is intended.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/4648 [00:00<?, ? examples/s]

Map:   0%|          | 0/4657 [00:00<?, ? examples/s]

Map:   0%|          | 0/4861 [00:00<?, ? examples/s]

In [84]:

# Persist the tokenized splits. The target is built from DATA_DIR (like every
# other file access in this notebook) so it follows the configured data
# directory instead of a second hard-coded copy of the path.
tokenized_datasets.save_to_disk(os.path.join(DATA_DIR, "processed", "bio_ner"))

Saving the dataset (0/1 shards):   0%|          | 0/4648 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4657 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4861 [00:00<?, ? examples/s]