# Necessary imports

In [26]:
from datasets import load_from_disk
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments
from training import compute_metrics1, CustomTrainer
from inference import infer, highlight_words
import os

# Load the test dataset and the model

In [50]:
# Load the test split from its on-disk dataset directory (relative to the notebook's working directory).
test_loaded = load_from_disk('./data/test_data')

In [42]:
# The model weights and the tokenizer files are fetched from the same checkpoint.
checkpoint = 'huggingsaurusRex/bert-base-uncased-for-mountain-ner'
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Perform the dataset preprocessing steps

In [43]:
def align_labels_with_tokens(labels, word_ids):
    """
    expand word-level labels to token-level labels
    :param labels: one label per word of the sentence
    :param word_ids: for each token, the index of the word it belongs to
                     (None for special tokens)
    :return: one label per token; special tokens get -100, and every
             sub-token of a word gets that word's label
    """
    # Every token of a word (first or continuation) carries the word's label,
    # so the mapping depends only on the token's own word id.
    return [
        -100 if word_id is None else labels[word_id]
        for word_id in word_ids
    ]


def tokenize_and_align_labels(examples):
    """
    tokenize a batch of pre-split sentences and attach token-level labels
    :param examples: batch of dataset samples; reads the "tokens" and
                     "fine_ner_tags" columns
    :return: the tokenizer output with an added "labels" entry
    """
    # relies on the notebook-level `tokenizer`; inputs are already split into words
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) gives, for sentence idx, the source word index of each token
    encoded["labels"] = [
        align_labels_with_tokens(sentence_labels, encoded.word_ids(idx))
        for idx, sentence_labels in enumerate(examples["fine_ner_tags"])
    ]
    return encoded

In [44]:
# Tokenize and label the whole test split; batched=True hands the function batches of samples.
tokenized_test = test_loaded.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

In [45]:
# Collator the trainer uses to assemble tokenized samples into batches
# (built around the same tokenizer that produced the samples).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Prepare for testing the model

In [46]:
# Create the directory passed to TrainingArguments; exist_ok=True keeps the cell safe to re-run.
os.makedirs('./models/logs', exist_ok=True)

In [47]:
# This notebook only evaluates, so no evaluation schedule is configured:
# `trainer.evaluate(eval_dataset=...)` works without one. The former
# `evaluation_strategy="epoch"` argument is a deprecated keyword in newer
# transformers releases, and a scheduled strategy without an `eval_dataset`
# at construction time is rejected there.
args = TrainingArguments(
    output_dir="./models/logs",
    report_to="none",  # do not send logs to any experiment tracker
)

# CustomTrainer and compute_metrics1 come from the project's `training` module.
trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    compute_metrics=compute_metrics1,
    tokenizer=tokenizer,
)

# Test the model on test split

In [48]:
# Compute the model's scores on the tokenized test split; the result is a
# mapping that the next cell indexes by its "eval_*" keys.
test_scores = trainer.evaluate(eval_dataset=tokenized_test)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [49]:
# Report the "class1" metrics produced by compute_metrics1
# (presumably the mountain tag — confirm in training.py).
print(f'Precision = {test_scores["eval_precision_class1"]}')
print(f'Recall = {test_scores["eval_recall_class1"]}')
print(f'F1-score = {test_scores["eval_f1_class1"]}')

Precision = 0.835233541743288
Recall = 0.9051414906337186
F1-score = 0.8687834736036726


# Test the model on sample data (inference)

In [32]:
# create sample text and put it into 'sample1.txt'
# NOTE: "Evelest" is misspelled on purpose to check the model's behaviour on a typo,
# and the second sentence uses 'Everest' as a company name rather than a mountain.
sample_text = "I spent days climbing the Mount Evelest.\nI founded a company called 'Everest'."
with open('sample1.txt', 'w', encoding='utf-8') as f:
    f.write(sample_text)

In [33]:
# perform inference on the sample file; `infer` returns three values
# (presumably the file text, per-word labels and the words to tag — confirm in inference.py)
content, word_labels, highlights = infer(model, tokenizer, 'sample1.txt')
# write the text with the predicted words wrapped in <mountain> tags to 'sample1_pred.txt'
highlight_words(content, 'sample1_pred.txt', highlights)

In [34]:
# Read the tagged output into its own name so the `content` returned by `infer`
# is not overwritten; otherwise re-running the highlight cell would operate on
# already-tagged text.
with open('sample1_pred.txt', 'r', encoding='utf-8') as f:
    highlighted_text = f.read()
print(highlighted_text)

I spent days climbing the <mountain>Mount</mountain> <mountain>Evelest</mountain>.
I founded a company called 'Everest'.


As we can see, the model distinguishes between the company 'Everest' and Mount Everest.
The misspelling 'Evelest' did not confuse it either.

Let's look at another example.

In [52]:
# create sample text with several multi-word mountain names and put it into 'sample2.txt'
sample_text = "The tallest mountains in the Alps include: Mont Blanc, Piz Bernina, the Dom, the Grand Combin, and others."
with open('sample2.txt', 'w', encoding='utf-8') as f:
    f.write(sample_text)

In [53]:
# perform inference on the second sample file
content, word_labels, highlights = infer(model, tokenizer, 'sample2.txt')
# put the results into 'sample2_pred.txt' neatly
highlight_words(content, 'sample2_pred.txt', highlights)

In [54]:
# Read the tagged output into its own name so the `content` returned by `infer`
# is not overwritten; otherwise re-running the highlight cell would operate on
# already-tagged text.
with open('sample2_pred.txt', 'r', encoding='utf-8') as f:
    highlighted_text = f.read()
print(highlighted_text)

The tallest mountains in <mountain>the</mountain> <mountain>Alps</mountain> include: <mountain>Mont</mountain> <mountain>Blanc</mountain>, <mountain>Piz</mountain> <mountain>Bernina</mountain>, the <mountain>Dom</mountain>, the <mountain>Grand</mountain> <mountain>Combin</mountain>, and others.


This example shows that the model sometimes tags the article 'the' as part of a mountain name, as in 'the Alps' (although not in 'the Dom' or 'the Grand Combin'). On the other hand, it handled the many multi-word mountain names correctly.