In [1]:
import os
from datasets import Dataset, load_dataset

# Function to load NER data from a file
def load_ner_data(file_path):
    sentences, labels = [], []
    with open(file_path, 'r') as f:
        sentence, label = [], []
        for line in f:
            line = line.strip()
            if line:
                word, tag = line.split()
                sentence.append(word)
                label.append(tag)
            else:
                if sentence:  # End of a sentence
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
        # Catch the last sentence if there's no newline at the end
        if sentence:
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# Load your NER data
train_sentences, train_labels = load_ner_data('../data/train_dataset/train.txt')
valid_sentences, valid_labels = load_ner_data('../data/train_dataset/valid.txt')
test_sentences, test_labels = load_ner_data('../data/train_dataset/test.txt')

# Collect all unique labels from all datasets
all_labels = set()
for label_list in train_labels + valid_labels + test_labels:
    all_labels.update(label_list)

# Create label mapping
label_map = {label: idx for idx, label in enumerate(sorted(all_labels))}
print("Label Map:", label_map)

# Inverse mapping for decoding (if needed later)
inverse_label_map = {v: k for k, v in label_map.items()}

# Encode labels as integers
def encode_labels(labels):
    return [[label_map[label] for label in sentence_labels] for sentence_labels in labels]

train_labels_encoded = encode_labels(train_labels)
valid_labels_encoded = encode_labels(valid_labels)
test_labels_encoded = encode_labels(test_labels)

# Convert to Dataset objects
train_dataset = Dataset.from_dict({"tokens": train_sentences, "ner_tags": train_labels_encoded})
valid_dataset = Dataset.from_dict({"tokens": valid_sentences, "ner_tags": valid_labels_encoded})
test_dataset = Dataset.from_dict({"tokens": test_sentences, "ner_tags": test_labels_encoded})


Label Map: {'B-eve': 0, 'B-geo': 1, 'B-gpe': 2, 'B-org': 3, 'B-per': 4, 'I-eve': 5, 'I-geo': 6, 'I-gpe': 7, 'I-org': 8, 'I-per': 9, 'None': 10, 'O': 11}


In [2]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',
        max_length=128,
        is_split_into_words=True,
        return_tensors="pt"  # Return tensors directly
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])  # Start with -100 for all tokens

        word_ids = tokenized_inputs.word_ids(batch_index=i)
        for word_idx in range(len(label)):
            if word_ids[word_idx] is not None:  # Only map if the word index is valid
                label_ids[word_idx] = label[word_idx]  # Assign the correct label to the token index

        # Ensure that label_ids are the same length as tokenized inputs
        labels.append(label_ids)

    # Ensure that the labels are padded to the same length as input IDs
    max_length = max(len(l) for l in labels)
    for i in range(len(labels)):
        labels[i] += [-100] * (max_length - len(labels[i]))  # Pad with -100

    tokenized_inputs['labels'] = labels
    return tokenized_inputs


# Tokenize datasets
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
valid_dataset = valid_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)




Map:   0%|          | 0/33560 [00:00<?, ? examples/s]

Map:   0%|          | 0/7193 [00:00<?, ? examples/s]

Map:   0%|          | 0/7193 [00:00<?, ? examples/s]

In [3]:
from transformers import BertForTokenClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


num_labels = len(set(tag for tags in train_labels for tag in tags))  # Number of unique labels
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cl

In [4]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',           # Output directory
    evaluation_strategy='epoch',      # Evaluation strategy
    learning_rate=2e-5,               # Learning rate
    per_device_train_batch_size=16,   # Training batch size
    per_device_eval_batch_size=16,    # Evaluation batch size
    num_train_epochs=3,               # Total number of training epochs
    weight_decay=0.01,                # Strength of weight decay
    save_total_limit=2,               # Limit the total amount of checkpoints
    report_to=[]
)

trainer = Trainer(
    model=model,                       # The instantiated 🤗 Transformers model to be trained
    args=training_args,                # Training arguments, defined above
    train_dataset=train_dataset,       # Training dataset
    eval_dataset=valid_dataset         # Evaluation dataset
)

# Start training
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.1602,0.13346
2,0.1156,0.107394
3,0.0964,0.098014


TrainOutput(global_step=6294, training_loss=0.1481890773864106, metrics={'train_runtime': 4315.3962, 'train_samples_per_second': 23.33, 'train_steps_per_second': 1.458, 'total_flos': 6577433975070720.0, 'train_loss': 0.1481890773864106, 'epoch': 3.0})

In [5]:
torch.cuda.is_available()

True

In [6]:
# Evaluate the model
trainer.evaluate(test_dataset)


{'eval_loss': 0.09713272005319595,
 'eval_runtime': 94.8473,
 'eval_samples_per_second': 75.838,
 'eval_steps_per_second': 4.744,
 'epoch': 3.0}

In [7]:
model.save_pretrained('../models/ner')
tokenizer.save_pretrained('../models/ner')


('../models/ner\\tokenizer_config.json',
 '../models/ner\\special_tokens_map.json',
 '../models/ner\\vocab.txt',
 '../models/ner\\added_tokens.json',
 '../models/ner\\tokenizer.json')

In [21]:
sentence = "Hello, my name is Amir. I work at OpenAI as a senior ML engineer."

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt", is_split_into_words=False, padding=True, truncation=True)

# Move inputs to the same device as the model
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Run inference
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits

# Get predicted labels
predicted_token_class_ids = torch.argmax(logits, dim=2).cpu().numpy()[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())

# Convert the predicted class IDs back to NER labels
predicted_labels = [inverse_label_map[label_id] for label_id in predicted_token_class_ids]

# Combine tokens with their corresponding labels for readability
results = [(token, label) for token, label in zip(tokens, predicted_labels)]

# Print the results
for token, label in results:
    print(f"{token:10} {label}")

[CLS]      O
Hello      O
,          O
my         O
name       O
is         B-per
Amir       O
.          O
I          O
work       O
at         B-org
Open       O
##A        O
##I        O
as         O
a          O
senior     O
M          O
##L        O
engineer   O
.          O
[SEP]      O


In [10]:
inverse_label_map

{0: 'B-eve',
 1: 'B-geo',
 2: 'B-gpe',
 3: 'B-org',
 4: 'B-per',
 5: 'I-eve',
 6: 'I-geo',
 7: 'I-gpe',
 8: 'I-org',
 9: 'I-per',
 10: 'None',
 11: 'O'}

In [20]:
import pickle
with open('../models/label_map.pickle', 'wb') as f:
    pickle.dump(inverse_label_map,f)