In [None]:
!pip install -U transformers datasets accelerate evaluate seqeval

In [27]:
import os
import random
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Any

import torch
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

### Paths 

In [3]:
DATA_DIR = "../data"  # directory holding the .conll splits and labels.txt
MODEL_NAME = "bert-base-cased"  # checkpoint name passed to AutoTokenizer.from_pretrained
OUTPUT_DIR = "outputs/bert_ner"  # output directory, created just below

# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

### Reproducibility

In [4]:
seed = 42

# Seed each random number generator with the same value, in a fixed order:
# Python's `random`, NumPy, PyTorch, then all CUDA devices.
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(seed)

In [5]:

# NOTE(review): `data` is reassigned further down from the read_conll() output,
# and nothing in between reads it, so this load looks redundant — confirm
# whether it can be dropped. Paths are built from DATA_DIR so they stay in sync
# with the other cells that read the same files.
data = load_dataset(
    "text",
    data_files={
        "train": os.path.join(DATA_DIR, "train.conll"),
        "validation": os.path.join(DATA_DIR, "validation.conll"),
        "test": os.path.join(DATA_DIR, "test.conll"),
    },
)


### Load label list & mappings

In [6]:

# Read one label per line. The context manager closes the file handle, and
# blank lines are skipped so they cannot turn into an empty-string label.
with open(os.path.join(DATA_DIR, "labels.txt"), encoding="utf-8") as labels_file:
    label_list: List[str] = [line.strip() for line in labels_file if line.strip()]

# Forward and inverse mappings; ids follow the order of the labels in the file.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [7]:
# Display the id -> label mapping to check the label order read from labels.txt.
id2label

{0: 'O',
 1: 'B-FACTORY_LOCATION',
 2: 'I-FACTORY_LOCATION',
 3: 'B-MACHINE_TYPE',
 4: 'I-MACHINE_TYPE',
 5: 'B-DOWNTIME_CAUSE',
 6: 'I-DOWNTIME_CAUSE',
 7: 'B-PRODUCTION_VOLUME',
 8: 'I-PRODUCTION_VOLUME',
 9: 'B-WORKFORCE_AVAILABILITY',
 10: 'I-WORKFORCE_AVAILABILITY'}

In [8]:
# Display the inverse label -> id mapping.
label2id

{'O': 0,
 'B-FACTORY_LOCATION': 1,
 'I-FACTORY_LOCATION': 2,
 'B-MACHINE_TYPE': 3,
 'I-MACHINE_TYPE': 4,
 'B-DOWNTIME_CAUSE': 5,
 'I-DOWNTIME_CAUSE': 6,
 'B-PRODUCTION_VOLUME': 7,
 'I-PRODUCTION_VOLUME': 8,
 'B-WORKFORCE_AVAILABILITY': 9,
 'I-WORKFORCE_AVAILABILITY': 10}

In [9]:

def read_conll(path: str):
    """Parse a whitespace-separated CoNLL-style file into sentences and tags.

    Each non-blank line is split on whitespace; the first field is taken as the
    token and the last field as its tag (a line with a single field therefore
    yields a tag equal to the token). Sentences are separated by blank lines.

    Args:
        path: Path to the UTF-8 encoded file.

    Returns:
        A ``(sentences, tags)`` tuple of two parallel lists: ``sentences[i]`` is
        the list of tokens of sentence ``i`` and ``tags[i]`` the list of tags.
    """
    sentences, tags = [], []
    cur_tokens, cur_tags = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            # A line with no fields (empty OR whitespace-only) ends the current
            # sentence. Checking the split result instead of the raw line avoids
            # an IndexError on lines that contain only spaces or tabs.
            if not parts:
                if cur_tokens:
                    sentences.append(cur_tokens)
                    tags.append(cur_tags)
                    cur_tokens, cur_tags = [], []
                continue

            # each line: "<token> ... <tag>"
            cur_tokens.append(parts[0])
            cur_tags.append(parts[-1])

    # Flush the last sentence when the file does not end with a blank line.
    if cur_tokens:
        sentences.append(cur_tokens)
        tags.append(cur_tags)
    return sentences, tags

# Parse each split into parallel lists of token sequences and tag sequences.
train_sents, train_tags = read_conll(os.path.join(DATA_DIR, "train.conll"))
val_sents,   val_tags   = read_conll(os.path.join(DATA_DIR, "validation.conll"))
test_sents,  test_tags  = read_conll(os.path.join(DATA_DIR, "test.conll"))


In [10]:
# Wrap in HF datasets
def to_hf_dataset(sents, tags):
    """Build a ``Dataset`` with a "tokens" column and a "ner_tags" column.

    Args:
        sents: Value stored under the "tokens" column.
        tags: Value stored under the "ner_tags" column.

    Returns:
        The result of ``Dataset.from_dict`` on those two columns.
    """
    columns = {"tokens": sents, "ner_tags": tags}
    return Dataset.from_dict(columns)

# Collect the parsed splits by name, then wrap each one as a Dataset.
split_columns = {
    "train": (train_sents, train_tags),
    "validation": (val_sents, val_tags),
    "test": (test_sents, test_tags),
}
data = DatasetDict({
    split_name: to_hf_dataset(split_sents, split_tags)
    for split_name, (split_sents, split_tags) in split_columns.items()
})


In [11]:
# Inspect the tag sequence of the first training sentence and its length.
ner_tags = data['train'][0]['ner_tags']
print(ner_tags)
print(len(ner_tags))

['B-FACTORY_LOCATION', 'O', 'B-DOWNTIME_CAUSE', 'I-DOWNTIME_CAUSE', 'O', 'O', 'O', 'O', 'B-MACHINE_TYPE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRODUCTION_VOLUME', 'I-PRODUCTION_VOLUME', 'O', 'O', 'O', 'O']
21


### Model Building (Tokenizer and model)

#### Tokenization 

In [12]:

# Load the tokenizer for the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [13]:
# The label alignment below calls word_ids() on the tokenizer output, which
# needs a fast tokenizer — this should display True.
tokenizer.is_fast

True

In [14]:
# Inspect the word-level tokens of the first training sentence and their count.
tokens = data['train'][0]['tokens']  # original word-level tokens, before sub-word splitting
print(tokens)
print(len(tokens))

['Manitoba', 'experienced', 'reservoir', 'depletion', 'in', 'May', '2024', ';', 'intertie', 'capacity', 'tightened', 'and', 'authorities', 'recorded', 'a', '6.4%', 'decrease', 'relative', 'to', 'normal', 'levels.']
21


In [15]:
# Tokenize the first training sentence. It is already a list of words, hence
# is_split_into_words=True.
inputs = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)

In [16]:
# Show the raw encoding, its sub-word tokens, and the sub-token count.
print(inputs)
print(inputs.tokens())
print(len(inputs.tokens()) - 2 )  # exclude the [CLS] and [SEP] special tokens

{'input_ids': [101, 10391, 4531, 10462, 1260, 7136, 2116, 1107, 1318, 17881, 1527, 132, 9455, 9570, 3211, 7974, 1105, 3912, 1802, 170, 127, 119, 125, 110, 9711, 5236, 1106, 2999, 3001, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'Manitoba', 'experienced', 'reservoir', 'de', '##ple', '##tion', 'in', 'May', '202', '##4', ';', 'inter', '##tie', 'capacity', 'tightened', 'and', 'authorities', 'recorded', 'a', '6', '.', '4', '%', 'decrease', 'relative', 'to', 'normal', 'levels', '.', '[SEP]']
29


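- Here we can see that the tokenizer splits some single words into multiple sub-tokens (for example, "depletion" becomes `de`, `##ple`, `##tion`), so there are now 29 sub-tokens for the 21 original words and labels.
- The word-level labels therefore no longer line up with the tokens, and we need to fix this misalignment.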

In [17]:

# word_ids() gives, for each sub-token, the index of the word it came from
# (None for the special tokens at both ends).
print(inputs.word_ids())

[None, 0, 1, 2, 3, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 16, 17, 18, 19, 20, 20, None]


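- The word IDs tell us which original word each sub-token came from: a repeated ID (e.g. 3, 3, 3) means those sub-tokens all belong to the same original word, and `None` marks the special tokens `[CLS]` and `[SEP]`.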

#### Preprocessing for Misaligned Labels

In [18]:
def align_labels_with_tokens(labels, word_ids, label_to_id=None):
    """Expand word-level string labels to one label id per sub-token.

    The first sub-token of each word receives the id of that word's label.
    Special tokens (word id ``None``) and the remaining sub-tokens of a word
    receive -100, the default ``ignore_index`` of PyTorch's CrossEntropyLoss,
    so no loss is computed at those positions.

    Args:
        labels: Sequence of string labels, one per original word.
        word_ids: Sequence with, for each sub-token, the index of the word it
            belongs to, or ``None`` for a special token.
        label_to_id: Optional mapping from label string to integer id. When
            omitted, the notebook-level ``label2id`` mapping is used.

    Returns:
        A list of integer label ids with the same length as ``word_ids``.

    Raises:
        KeyError: If a label is missing from the mapping.
    """
    ignore_index = -100
    # Resolve the mapping at call time so the global is only needed as a fallback.
    mapping = label2id if label_to_id is None else label_to_id

    new_labels = []
    current_word = None

    for word_id in word_ids:
        if word_id is None or word_id == current_word:
            # Special token, or a continuation sub-token of the current word.
            new_labels.append(ignore_index)
        else:
            # First sub-token of a new word carries the word's label.
            current_word = word_id
            new_labels.append(mapping[labels[word_id]])

    return new_labels

In [19]:
# Gather the word-level labels and the sub-token word ids of the first
# training sentence, to try the alignment on one example.
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels)
print(word_ids)


['B-FACTORY_LOCATION', 'O', 'B-DOWNTIME_CAUSE', 'I-DOWNTIME_CAUSE', 'O', 'O', 'O', 'O', 'B-MACHINE_TYPE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRODUCTION_VOLUME', 'I-PRODUCTION_VOLUME', 'O', 'O', 'O', 'O']
[None, 0, 1, 2, 3, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 16, 17, 18, 19, 20, 20, None]


In [20]:
# Check the alignment on the first sentence: one entry per sub-token, with
# -100 at special tokens and at continuation sub-tokens.
new_labels_ids = align_labels_with_tokens(labels,word_ids)
print(new_labels_ids)

[-100, 1, 0, 5, 6, -100, -100, 0, 0, 0, -100, 0, 3, -100, 0, 0, 0, 0, 0, 0, 7, -100, -100, -100, 8, 0, 0, 0, 0, -100, -100]


In [None]:
def tokenize_and_align_labels(batch):
    """Tokenize a batch of pre-split sentences and attach aligned label ids.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a "tokens" entry (list of word lists) and a
            "ner_tags" entry (list of string-label lists), one item per sentence.

    Returns:
        The tokenizer output for the batch with an extra "labels" entry holding,
        for each sentence, one label id per sub-token (see
        ``align_labels_with_tokens``).
    """
    # truncation=True caps each sequence at the tokenizer's maximum length, so
    # an unusually long sentence cannot exceed what the model accepts.
    # word_ids() reflects the truncated sequence, so labels stay aligned.
    tokenized_inputs = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
    )

    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(batch_index=i))
        for i, labels in enumerate(batch["ner_tags"])
    ]
    return tokenized_inputs

In [28]:
# Tokenize every split and attach the aligned labels; with batched=True the
# function receives batches of examples rather than single rows.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [29]:
# Inspect the columns produced by the mapping step.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 110
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
})

In [30]:
# Keep only input_ids, attention_mask and labels. Only columns that are still
# present are removed, so re-running this cell is a no-op instead of an error
# about missing columns.
columns_to_drop = [
    name
    for name in ("tokens", "ner_tags", "token_type_ids")
    if name in tokenized_datasets["train"].column_names
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

For token classification we only need `input_ids`, `attention_mask` and `labels`, so the other columns can be removed.

In [31]:
# Confirm which columns remain after the removal step.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 110
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
})

In [32]:
# NOTE(review): this repeats the earlier map over `data`, so `tokenized_datasets`
# is rebuilt from scratch and the result of the preceding remove_columns() call
# is discarded (tokens, ner_tags and token_type_ids are present again) —
# confirm this is intended.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]