In [1]:
!pip install datasets transformers accelerate==0.28.0



In [2]:
import os
from datasets import Dataset, load_dataset

# Function to load NER data from a file
def load_ner_data(file_path):
    """Read a two-column NER file into parallel lists of words and tags.

    Each non-blank line must contain exactly two whitespace-separated fields,
    ``word tag``. One or more blank (or whitespace-only) lines end the current
    sentence. A final sentence that is not followed by a blank line is kept.

    Args:
        file_path: Path of the annotated text file, read as UTF-8.

    Returns:
        A tuple ``(sentences, labels)``: ``sentences[i]`` is the list of words
        of the i-th sentence and ``labels[i]`` is the matching list of tags.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields. The
            message names the file, the 1-based line number and the content.
    """
    sentences, labels = [], []
    sentence, label = [], []
    # Explicit encoding: the default depends on the platform locale, which
    # would make non-ASCII tokens parse differently from machine to machine.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'word tag', got {line.strip()!r}"
                )
            word, tag = fields
            sentence.append(word)
            label.append(tag)
    # Catch the last sentence if the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Read the three corpus splits from disk.
train_sentences, train_labels = load_ner_data('train.txt')
valid_sentences, valid_labels = load_ner_data('valid.txt')
test_sentences, test_labels = load_ner_data('test.txt')

# Gather the tag vocabulary over every sentence of every split.
all_labels = {
    tag
    for split_labels in (train_labels, valid_labels, test_labels)
    for sentence_tags in split_labels
    for tag in sentence_tags
}

# Number the tags in sorted order so the ids are the same on every run.
ordered_labels = sorted(all_labels)
label_map = dict(zip(ordered_labels, range(len(ordered_labels))))
print("Label Map:", label_map)

# id -> tag lookup, used to decode predictions later on.
inverse_label_map = {tag_id: tag for tag, tag_id in label_map.items()}

# Encode labels as integers
def encode_labels(labels):
    """Translate tag strings to integer ids using the module-level ``label_map``.

    Args:
        labels: List of sentences, each a list of tag strings.

    Returns:
        A list with the same nesting where every tag is replaced by its id.

    Raises:
        KeyError: If a tag is not present in ``label_map``.
    """
    encoded = []
    for sentence_labels in labels:
        sentence_ids = []
        for tag in sentence_labels:
            sentence_ids.append(label_map[tag])
        encoded.append(sentence_ids)
    return encoded

# Integer-encode the tags of each split (train, then valid, then test).
train_labels_encoded, valid_labels_encoded, test_labels_encoded = map(
    encode_labels, (train_labels, valid_labels, test_labels)
)


def _as_dataset(sentences, encoded_labels):
    """Pack one split's words and tag ids into a Dataset with the expected columns."""
    return Dataset.from_dict({"tokens": sentences, "ner_tags": encoded_labels})


# Convert to Dataset objects
train_dataset = _as_dataset(train_sentences, train_labels_encoded)
valid_dataset = _as_dataset(valid_sentences, valid_labels_encoded)
test_dataset = _as_dataset(test_sentences, test_labels_encoded)

Label Map: {'B-eve': 0, 'B-geo': 1, 'B-gpe': 2, 'B-org': 3, 'B-per': 4, 'I-eve': 5, 'I-geo': 6, 'I-gpe': 7, 'I-org': 8, 'I-per': 9, 'None': 10, 'O': 11}


In [3]:
from transformers import BertTokenizerFast

# Cased BERT tokenizer shared by training and inference. The fast variant is
# used: later cells call word_ids() on its output and request
# return_offsets_mapping=True when tokenizing.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tag ids to word pieces.

    Every sequence is truncated/padded to 128 tokens. Each token position
    receives a label id as follows:

    * special and padding tokens (no source word): -100
    * first piece of a word: the word's own tag id
    * later pieces of a word tagged ``B-X``: the id of ``I-X``, or -100 when
      the tag set has no ``I-X`` entry
    * later pieces of any other word: -100

    Uses the module-level ``tokenizer`` and ``label_map``.

    Args:
        examples: Batch with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of tag-id lists, parallel to ``'tokens'``).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one list
        of label ids per sequence, the same length as its ``input_ids`` row.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',
        max_length=128,
        is_split_into_words=True,
        return_tensors="pt"  # Return tensors directly
    )

    # Map each B-X id to the id used for the remaining pieces of that word.
    # Built once per batch instead of doing string work for every token, and
    # using .get() so a B-X tag without an I-X counterpart is ignored (-100)
    # instead of raising KeyError.
    continuation_ids = {
        tag_id: label_map.get('I-' + tag[2:], -100)
        for tag, tag_id in label_map.items()
        if tag.startswith('B-')
    }

    labels = []
    for i, word_tags in enumerate(examples['ner_tags']):
        label_ids = []
        previous_word_idx = None
        # word_ids() has one entry per token, so label_ids ends up with the
        # same (already padded) length as the matching input_ids row.
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(word_tags[word_idx])
            else:
                label_ids.append(continuation_ids.get(word_tags[word_idx], -100))
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Run tokenization and label alignment over each split, one batch at a time.
train_dataset, valid_dataset, test_dataset = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
]



Map:   0%|          | 0/33560 [00:00<?, ? examples/s]

Map:   0%|          | 0/7193 [00:00<?, ? examples/s]

Map:   0%|          | 0/7193 [00:00<?, ? examples/s]

In [4]:
from transformers import BertForTokenClassification

# Instantiate BERT with a token-classification head sized to the tag set.
# Passing id2label/label2id stores the real tag names in the model config, so
# anything that reads the config sees e.g. 'B-per' instead of 'LABEL_4'.
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(label_map),
    id2label=inverse_label_map,
    label2id=label_map,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Checkpointing: target folder and how many checkpoints to keep on disk
    output_dir='./results',
    save_total_limit=2,
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device) for training and evaluation
    per_device_train_batch_size=80,
    per_device_eval_batch_size=80,
    # Evaluate on the validation split at the end of every epoch
    evaluation_strategy='epoch',
    # Empty list: do not report to any logging integration
    report_to=[],
)

In [10]:
import torch

# Ask PyTorch to release cached GPU memory before training starts.
torch.cuda.empty_cache()

# cuDNN: deterministic mode on, benchmarking off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# TF32 switched off for both cuDNN and CUDA matmul.
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False

# Wire the model, the hyperparameters and the two data splits into a Trainer.
trainer = Trainer(
    args=training_args,           # hyperparameters defined in the previous cell
    model=model,                  # token-classification model to fine-tune
    eval_dataset=valid_dataset,   # split evaluated during training
    train_dataset=train_dataset,  # split used for the gradient updates
)

# Run fine-tuning; the returned training summary is the cell's output.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss
1,No log,0.090539
2,0.066800,0.090772
3,0.053400,0.092408


KeyboardInterrupt: 

In [14]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

sentence = "Hello, my name is Amir. I work at OpenAI as a senior ML engineer. I live in Almaty"

# Tokenize the raw sentence and also request character offsets for each token.
inputs = tokenizer(
    sentence,
    truncation=True,
    padding=True,
    return_offsets_mapping=True,
    return_tensors="pt",
)

# Model inputs go to the model's device; offsets are only needed on the host,
# as a NumPy array for the single sentence in the batch.
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)
offsets = inputs["offset_mapping"][0].cpu().numpy()

# Forward pass in evaluation mode with gradient tracking switched off.
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.logits

# Highest-scoring class per token for the single sentence in the batch.
predicted_token_class_ids = logits.argmax(dim=2).cpu().numpy()[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())

# Decode class ids back to tag strings.
predicted_labels = [inverse_label_map[class_id] for class_id in predicted_token_class_ids]

# Pair every token with its predicted tag and its character span in `sentence`.
results = [(token, label, (start, end)) for token, label, (start, end) in zip(tokens, predicted_labels, offsets)]

# Group word pieces into words. Each word is tracked as [start, end, label],
# where start/end are character offsets into `sentence` and the label is the
# one predicted for the word's first piece.
word_spans = []
for token, label, (start, end) in results:
    if start == 0 and end == 0:  # [CLS] and [SEP] tokens
        continue
    if token.startswith("##") and word_spans:
        # Continuation piece: stretch the current word to cover it.
        word_spans[-1][1] = int(end)
    else:
        word_spans.append([int(start), int(end), label])

# Word text is sliced from the original sentence instead of being rebuilt
# from the pieces, so it always matches the input text.
aligned_results = [(sentence[start:end], label) for start, end, label in word_spans]

# Merge a B-X word with the I-X words that directly follow it into a single
# entity. The entity text is the slice from the first word's start to the last
# word's end, which keeps the original spacing and punctuation ("New York",
# "Al-Qaeda") rather than gluing the words together without a separator.
final_results = []
i = 0
while i < len(word_spans):
    start, end, label = word_spans[i]
    j = i + 1
    if label.startswith("B-"):
        inside_label = f"I-{label[2:]}"
        while j < len(word_spans) and word_spans[j][2] == inside_label:
            end = word_spans[j][1]
            j += 1
    final_results.append((sentence[start:end], label))
    i = j

# Print the final aligned results
for word, label in final_results:
    print(f"{word:10} {label}")

Hello      O
,          O
my         O
name       O
is         O
Amir       B-per
.          O
I          O
work       O
at         O
OpenAI     B-org
as         O
a          O
senior     O
ML         O
engineer   O
.          O
I          O
live       O
in         O
Almaty     B-geo


In [13]:
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the weights.
os.makedirs('ner', exist_ok=True)
torch.save(model.state_dict(), 'ner/model.pth')