In this notebook we build the anonomization NER model.

In [None]:
import os
import numpy as np
import pandas as pd 
from ast import literal_eval

#I created separate csv for both ng and bruk datasets
for name in ["ng","bruk"]:

    #simple csv
    csv_data = pd.DataFrame(columns=['id', 'tokens', 'ner_tags'])

    for file in os.listdir(f'/kaggle/input/ner-uk/data/{name}'):
        #check annotation files only - txt has the same name but different extension
        if file.endswith('.ann'):

            #just reading files
            ann_lines = []
            text = ''
            text_lines = []
            with open(os.path.join(f'/kaggle/input/ner-uk/data/{name}', file), 'r', encoding='utf-8') as ann_file:
                ann_lines = ann_file.readlines()
            with open(os.path.join(f'/kaggle/input/ner-uk/data/{name}', file[:-4] + '.txt'), 'r', encoding='utf-8') as txt_file:
                text_lines = txt_file.readlines()
            
            #quick fix to handle /n in bruk dataset
            for line in text_lines:
                if line == '\n':
                    text += ' SEPARATOR '
                text += line
            
            #parse and filter out only relevant entities
            entities = []
            for line in ann_lines:
                parts = line.strip().split('\t')
                if len(parts) < 4:
                    continue
                if parts[1] not in ["PERS", "LOC", "ORG"]:
                    continue
                entities.append(parts[1:])
            
            #get words from text
            words = text.split()
            
            #if no relevant entities, tag all as O
            if not entities:
                ner_tags = [6] * len(words)
                csv_row = {
                    'id': len(csv_data),
                    'tokens': words,
                    'ner_tags': ner_tags
                }
                csv_data.loc[len(csv_data)] = csv_row
                continue

            #align entities with words
            ner_tags = []
            file_offset = 0
            current_entity_index = 0
            

            for word in words:
                #quickfix for separator from above
                if word == 'SEPARATOR':
                    file_offset += 1
                    continue

                word_start = file_offset
                word_end = file_offset + len(word)
                # Check if the current word overlaps with the current entity (Also give +1 as there are signs like < or etc.)
                # We can also do check for end but for example in НКВД and НКВД-исти example its better to check only start to map correctly

                if int(entities[current_entity_index][1]) <= word_start + 1:
                    # if word is at the beginning of an entity or previous tag was O, it's a B- tag
                    if (ner_tags[-1] if ner_tags else None) == 'O' or word_start == int(entities[current_entity_index][1]) or word_start + 1 == int(entities[current_entity_index][1]):
                        ner_tags.append('B-' + entities[current_entity_index][0])
                    #else it's an I- tag
                    else:
                        ner_tags.append('I-' + entities[current_entity_index][0])

                    # If the word end surpasses the entity end, move to the next entity
                    if word_end >= int(entities[current_entity_index][2]):
                        current_entity_index += 1
                else:
                    ner_tags.append('O')
                
                #no need to continue if all entities are processed
                if current_entity_index == len(entities):
                    break

                file_offset += len(word) + 1  # +1 for the space
            
            #remove seperator tokens
            words = [word for word in words if word != 'SEPARATOR']

            #extend ner tags because we break after all entities are processed
            ner_tags.extend(['O']*(len(words)-len(ner_tags)))

            #validation checks
            ner_words = [word for words in entities for word in words[3].split()]

            if not len(ner_words) == len([words[i] for i in range(len(ner_tags)) if ner_tags[i] != 'O']):
                print(f"Mismatch between entity words and NER tags {file} : {len(ner_words)} vs {len([words[i] for i in range(len(ner_tags)) if ner_tags[i] != 'O'])}")
                continue

            if not len(entities) == sum(1 for tag in ner_tags if tag.startswith('B-')):
                print(f"Mismatch between entities and NER tags {file}   : {len(entities)} vs {sum(1 for tag in ner_tags if tag.startswith('B-'))}")
                continue
            
            #map ner tags to integers
            ner_tags_map = {
                'B-LOC': 0, 'B-ORG': 1, 'B-PERS': 2, 'I-LOC': 3, 'I-ORG': 4, 'I-PERS': 5, 'O': 6
            }

            #create csv row
            csv_row = {
                'id': len(csv_data),
                'tokens': words,
                'ner_tags': [ner_tags_map[tag] for tag in ner_tags]
            }
            # append to dataframe
            csv_data.loc[len(csv_data)] = csv_row

    #save to csv
    csv_data.to_csv(f'{name}_ner.csv', index=False)

Mismatch between entities and NER tags f5e3415a3653.ann   : 31 vs 30
Mismatch between entity words and NER tags c5930ba45ebe.ann : 59 vs 55
Mismatch between entity words and NER tags 3262eb989bd6.ann : 51 vs 52
Mismatch between entity words and NER tags 5d3d7e0d5bae.ann : 51 vs 50
Mismatch between entity words and NER tags 6d47a8c4d755.ann : 93 vs 94
Mismatch between entity words and NER tags 6f6281133248.ann : 67 vs 63
Mismatch between entity words and NER tags a4eee55da896.ann : 72 vs 71
Mismatch between entities and NER tags a6b3166e4d3b.ann   : 13 vs 12
Mismatch between entity words and NER tags 5a4b4498fe83.ann : 72 vs 74
Mismatch between entity words and NER tags 6dd15465059a.ann : 32 vs 34
Mismatch between entity words and NER tags 76b9b07172fb.ann : 44 vs 46
Mismatch between entity words and NER tags ba4204adf371.ann : 83 vs 84
Mismatch between entities and NER tags b9ea4aa52bae.ann   : 37 vs 36
Mismatch between entities and NER tags 816a89664813.ann   : 38 vs 37
Mismatch betwe

In [4]:
import ast
from datasets import load_dataset, Dataset, Features, ClassLabel, Value, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score

seqeval = evaluate.load("seqeval")

2025-12-12 00:47:24.963576: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765500444.987859     221 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765500444.995453     221 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [5]:
import datasets

data_bruk = datasets.load_dataset('csv', data_files='/kaggle/working/bruk_ner.csv')
data_ng = datasets.load_dataset('csv', data_files='/kaggle/working/ng_ner.csv')
combined = datasets.concatenate_datasets([data_bruk['train'], data_ng['train']])
combined

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 517
})

In [6]:
combined[0]

{'id': 0,
 'tokens': "['Уже', 'точилися', 'розмови', ',', 'що', 'коли', 'донецькі', 'шахтарі', 'до', 'певного', 'часу', 'не', 'встигнуть', 'провести', 'необхідні', 'захисні', 'роботи', 'біля', 'підірваного', 'реактора', ',', 'станеться', 'ще', 'один', 'вибух', '.', 'Мені', 'як', 'активісту', 'профспілки', 'доводилося', 'добувати', 'квитки', 'на', 'виїзд', 'сім’ям', 'наших', 'співробітників', '.', 'Прийшов', 'одного', 'дня', 'на', 'залізничний', 'вокзал', 'до', 'кас', 'в', 'депутатському', 'залі', ',', 'а', 'там', '—', 'ніколи', 'більше', 'такого', 'не', 'бачив', '—', 'довжелезна', 'черга', 'з', 'одних', 'тільки', 'Героїв', 'Радянського', 'Союзу', '.', 'Всі', 'при', 'своїх', 'зірках', ',', 'у', 'декого', 'бачив', 'і', 'дві', 'зірки', 'Героя', '.', 'Вразило', ',', 'як', 'вони', 'матюкалися', 'між', 'собою', ',', 'доводячи', 'один', 'одному', ',', 'які', 'роди', 'військ', 'мають', 'більше', 'заслуг', 'у', 'перемозі', 'над', 'Гітлером', '.', 'Було', 'чимало', 'п’яних', '.', 'Гнітюче', 'вра

We keep only the person tags, because this is our main goal - to anonimize personal information.

In [7]:
allowed_tags = {2, 5, 6}

def preprocess(example):
    # Parse the strings into Python lists
    tokens = ast.literal_eval(example["tokens"])
    ner_tags = ast.literal_eval(example["ner_tags"])

    # Keep only allowed tags, set all others to 6 (O)
    ner_tags = [tag if tag in allowed_tags else 6 for tag in ner_tags]

    example["tokens"] = tokens
    example["ner_tags"] = ner_tags
    return example

cleaned = combined.map(preprocess)

# Inspect result
print(cleaned[0])

Map:   0%|          | 0/517 [00:00<?, ? examples/s]

{'id': 0, 'tokens': ['Уже', 'точилися', 'розмови', ',', 'що', 'коли', 'донецькі', 'шахтарі', 'до', 'певного', 'часу', 'не', 'встигнуть', 'провести', 'необхідні', 'захисні', 'роботи', 'біля', 'підірваного', 'реактора', ',', 'станеться', 'ще', 'один', 'вибух', '.', 'Мені', 'як', 'активісту', 'профспілки', 'доводилося', 'добувати', 'квитки', 'на', 'виїзд', 'сім’ям', 'наших', 'співробітників', '.', 'Прийшов', 'одного', 'дня', 'на', 'залізничний', 'вокзал', 'до', 'кас', 'в', 'депутатському', 'залі', ',', 'а', 'там', '—', 'ніколи', 'більше', 'такого', 'не', 'бачив', '—', 'довжелезна', 'черга', 'з', 'одних', 'тільки', 'Героїв', 'Радянського', 'Союзу', '.', 'Всі', 'при', 'своїх', 'зірках', ',', 'у', 'декого', 'бачив', 'і', 'дві', 'зірки', 'Героя', '.', 'Вразило', ',', 'як', 'вони', 'матюкалися', 'між', 'собою', ',', 'доводячи', 'один', 'одному', ',', 'які', 'роди', 'військ', 'мають', 'більше', 'заслуг', 'у', 'перемозі', 'над', 'Гітлером', '.', 'Було', 'чимало', 'п’яних', '.', 'Гнітюче', 'враже

In [8]:
data = cleaned.train_test_split(test_size=0.2, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 413
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 104
    })
})

In [9]:
label_list = ['B-LOC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-ORG', 'I-PERS', 'O']
print("Label list:", label_list)

id2label = {i: label for i, label in enumerate(label_list)}
print("\nid2label:")
for i, label in id2label.items():
    print(f"    {i}: \"{label}\",")

label2id = {label: i for i, label in enumerate(label_list)}
print("\nlabel2id:")
for label, i in label2id.items():
    print(f"    \"{label}\": {i},")

Label list: ['B-LOC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-ORG', 'I-PERS', 'O']

id2label:
    0: "B-LOC",
    1: "B-ORG",
    2: "B-PERS",
    3: "I-LOC",
    4: "I-ORG",
    5: "I-PERS",
    6: "O",

label2id:
    "B-LOC": 0,
    "B-ORG": 1,
    "B-PERS": 2,
    "I-LOC": 3,
    "I-ORG": 4,
    "I-PERS": 5,
    "O": 6,


In [11]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    average_method = "macro"
    results = {
        "f1": f1_score(true_labels, true_predictions, average=average_method),
        "precision": precision_score(true_labels, true_predictions, average=average_method),
        "recall": recall_score(true_labels, true_predictions, average=average_method),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }
    return {
        "precision": results["precision"],
        "recall": results["recall"],
        "f1": results["f1"],
        "accuracy": results["accuracy"],
    }

In [12]:

def chunk_tokenize_and_align(examples,tokenizer, max_length=512):
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for tokens, labels in zip(examples["tokens"], examples["ner_tags"]):
        tokenized = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=False,
            return_offsets_mapping=False,
        )

        word_ids = tokenized.word_ids()
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]

        for i in range(0, len(input_ids), max_length):
            chunk_input_ids = input_ids[i:i + max_length]
            chunk_attention_mask = attention_mask[i:i + max_length]
            chunk_labels = label_ids[i:i + max_length]

            all_input_ids.append(chunk_input_ids)
            all_attention_masks.append(chunk_attention_mask)
            all_labels.append(chunk_labels)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels,
    }


We use xlm-roberta-base as itt performed the best in the homework task.

In [13]:
from datasets import Dataset, DatasetDict


dataset = data

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

chunked_data = {}

for split_name in ["train", "test"]:
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for example in dataset[split_name]:
        chunks = chunk_tokenize_and_align(
            {"tokens": [example["tokens"]], "ner_tags": [example["ner_tags"]]},
            tokenizer=tokenizer,
            max_length=512
        )

        all_input_ids.extend(chunks["input_ids"])
        all_attention_masks.extend(chunks["attention_mask"])
        all_labels.extend(chunks["labels"])

    chunked_data[split_name] = Dataset.from_dict({
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels
    })

chunked_dataset = DatasetDict(chunked_data)
print(chunked_dataset)


Token indices sequence length is longer than the specified maximum sequence length for this model (1038 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1006
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 276
    })
})


In [15]:
def train_and_evaluate_model(model_name, data, tokenizer):
    local_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        trust_remote_code=True
    )
    
    args = TrainingArguments(
        output_dir=f"./results/{model_name.replace('/', '_')}",
        
        num_train_epochs=6,
        eval_strategy="epoch",
        save_strategy="epoch",
        
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        weight_decay=0.01,
        
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
    
        logging_strategy="epoch",
        report_to="none",
        push_to_hub=False
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=data["train"],
        eval_dataset=data["test"],
        tokenizer=tokenizer,
        data_collator=local_data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results['eval_f1']

In [16]:
print(train_and_evaluate_model('xlm-roberta-base', chunked_dataset, tokenizer))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3434,0.064285,0.0,0.0,0.0,0.973298
2,0.0412,0.018104,0.876281,0.908497,0.892098,0.99575
3,0.0142,0.010245,0.938161,0.941993,0.940073,0.997707
4,0.0079,0.008796,0.944992,0.940359,0.94267,0.997803
5,0.0066,0.008366,0.948361,0.945261,0.946809,0.997995
6,0.0058,0.008484,0.948887,0.940359,0.944604,0.997947


  _warn_prf(average, modifier, msg_start, len(result))


0.9468085106382979


Performs fairly good on general data. Lets test on artificial test cases.

Here we load the data from deanon notebook, where we prepared the test cases.

In [None]:
import json
import re
import csv

def process_json_to_csv(json_file_path, csv_file_path):
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{json_file_path}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from '{json_file_path}'.")
        return
    except Exception as e:
        print(f"An error occurred while reading the JSON file: {e}")
        return

    token_regex = re.compile(r"[\w'-]+|[^\w\s]", re.UNICODE)

    try:
        with open(csv_file_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['tokens', 'ner_tags'])

            for item in data:
                original_text = item.get('text', '')
                entities = item.get('generated_entities', [])

                entity_map = {
                    e['start']: e['text']
                    for e in entities
                    if 'start' in e and 'text' in e
                }

                final_tokens = []
                final_ner_tags = []

                for match in token_regex.finditer(original_text):
                    token_text = match.group(0)
                    token_start = match.start()

                    if token_start in entity_map:
                        full_name = entity_map[token_start]
                        name_parts = full_name.split()

                        if len(name_parts) == 3:
                            final_tokens.extend(name_parts)
                            final_ner_tags.extend([2, 5, 5])
                        else:
                            print(f"Warning: Entity '{full_name}' at char {token_start} did not have 3 words. Skipping replacement, tagging as 'O'.")
                            final_tokens.append(token_text)
                            final_ner_tags.append(6)
                    else:

                        final_tokens.append(token_text)
                        final_ner_tags.append(6)

                writer.writerow([str(final_tokens), str(final_ner_tags)])

        print(f"Successfully processed data and saved to '{csv_file_path}'")

    except IOError:
        print(f"Error: Could not write to CSV file '{csv_file_path}'.")
    except Exception as e:
        print(f"An error occurred during processing: {e}")

In [18]:
process_json_to_csv("/kaggle/input/test-deacon/first_50_rows_anonymized.json", "test.csv")

Successfully processed data and saved to 'test.csv'


In [19]:
test_ds = datasets.load_dataset('csv', data_files='/kaggle/working/test.csv')
test_ds = test_ds['train']
test_ds = test_ds.map(preprocess)
test_ds[0]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

{'tokens': ['УХВАЛА',
  '30',
  'листопада',
  '2023',
  'року',
  'м',
  '.',
  'Київ',
  'справа',
  '№',
  '296',
  '/',
  '5017',
  '/',
  '18',
  'провадження',
  '№',
  '61-14884ск23',
  'Верховний',
  'Суд',
  'у',
  'складі',
  'колегії',
  'суддів',
  'Третьої',
  'судової',
  'палати',
  'Касаційного',
  'цивільного',
  'суду',
  ':',
  'Ігнатенка',
  'В',
  '.',
  'М',
  '.',
  '(',
  'суддя-доповідач',
  ')',
  ',',
  'Карпенко',
  'С',
  '.',
  'О',
  '.',
  ',',
  'Фаловської',
  'І',
  '.',
  'М',
  '.',
  ',',
  'розглянув',
  'клопотання',
  'приватного',
  'акціонерного',
  'товариства',
  '«',
  'Вібросепаратор',
  '»',
  'про',
  'зупинення',
  'виконання',
  'рішення',
  'Корольовського',
  'районного',
  'суду',
  'м',
  '.',
  'Житомира',
  'від',
  '18',
  'лютого2022',
  'року',
  'та',
  'постанови',
  'Житомирського',
  'апеляційного',
  'суду',
  'від',
  '04',
  'жовтня',
  '2023',
  'року',
  'в',
  'справі',
  'за',
  'позовом',
  'Савченка',
  'Євгенія',

In [20]:
def classify_text(tokens):
    all_word_predictions = []
    start_word = 0
    tokenizer.model_max_length = 512

    while start_word < len(tokens):
        tokenized = tokenizer(
            tokens[start_word:],
            is_split_into_words=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(model.device)

        logits = model(**tokenized).logits
        predictions = torch.argmax(logits, dim=2)[0].tolist()
        word_ids = tokenized.word_ids(batch_index=0)

        chunk_word_preds = []
        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            if word_idx is not None and word_idx != previous_word_idx:
                chunk_word_preds.append(predictions[i])
            previous_word_idx = word_idx

        all_word_predictions.extend(chunk_word_preds)

        last_word_idx = max([wid for wid in word_ids if wid is not None])
        start_word += last_word_idx + 1

    return all_word_predictions


In [22]:
import ast
import torch

MODEL_PATH="/kaggle/working/results/xlm-roberta-base/checkpoint-160/"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()




classify_text(["Американський", "президент", "Іван", "Трамп", "на", "тлі", "нового", "загострення", "торговельних", "відносин", "з", "Китаєм", "запевнив", "–", "все", "буде", "добре."])

[6, 6, 2, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

In [None]:
def evaluate_model(test_dataset, classify_function):

    ner_metric = evaluate.load("seqeval")

    all_predictions = []
    all_true_labels = []

    print(f"Running inference on {len(test_dataset)} items...")

    for item in test_dataset:
        input_ids = item['tokens']
        true_label_ids = item['ner_tags']
        predicted_labels = classify_function(input_ids)
        
        predicted_labels_str = [id2label[pid] for pid in predicted_labels]
        all_predictions.append(predicted_labels_str)
        
        true_labels_cleaned = []
        for label_id in true_label_ids:
            if label_id != -100:
                true_labels_cleaned.append(id2label[label_id])
        
        all_true_labels.append(true_labels_cleaned)

    print("Inference complete. Calculating metrics...")

    results = ner_metric.compute(predictions=all_predictions, references=all_true_labels)
    
    return results
    

In [24]:
print(evaluate_model(test_ds, classify_text))

Running inference on 50 items...
Inference complete. Calculating metrics...
{'PERS': {'precision': 0.5228628230616302, 'recall': 0.8129829984544049, 'f1': 0.6364186327888687, 'number': 647}, 'overall_precision': 0.5228628230616302, 'overall_recall': 0.8129829984544049, 'overall_f1': 0.6364186327888687, 'overall_accuracy': 0.9744571204511082}


We get much worse results here but it is obvious why. We have great accuracy 0.97. Lets see at the false positives that our model decided to anonimize.

In [None]:
def evaluate_model(test_dataset, classify_function):

    ner_metric = evaluate.load("seqeval")

    all_predictions = []
    all_true_labels = []

    print(f"Running inference on {len(test_dataset)} items...")
    
    false_positive_log = []

    for idx, item in enumerate(test_dataset):
        input_ids = item['tokens']
        true_label_ids = item['ner_tags']

        predicted_ids = classify_function(input_ids)
        predicted_labels = [id2label[pid] for pid in predicted_ids]

        true_labels_cleaned = [id2label[lid] for lid in true_label_ids if lid != -100]

        tokens = item.get("tokens_str", None)
        if tokens is None:
            tokens = [str(t) for t in input_ids]

        for t, pred, true in zip(tokens, predicted_labels, true_labels_cleaned):
            if pred != "O" and true == "O":  
                false_positive_log.append((idx, t, true, pred))

        all_predictions.append(predicted_labels)
        all_true_labels.append(true_labels_cleaned)

    print("Inference complete.")

    if false_positive_log:
        print("\n===== FALSE POSITIVES FOUND =====")
        for sample_idx, token, true_label, pred_label in false_positive_log:
            print(f"[Sample {sample_idx}] Token: '{token}' | True: {true_label} | Pred: {pred_label}")
    else:
        print("\nNo false positives detected.")

    # --- Compute Metrics ---
    print("Calculating metrics...")
    results = ner_metric.compute(predictions=all_predictions, references=all_true_labels)
    return results


In [55]:
print(evaluate_model(test_ds, classify_text))

Downloading builder script: 0.00B [00:00, ?B/s]

Running inference on 50 items...
Inference complete.

===== FALSE POSITIVES FOUND =====
[Sample 0] Token: 'Ігнатенка' | True: O | Pred: B-PERS
[Sample 0] Token: 'В' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'М' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'Карпенко' | True: O | Pred: B-PERS
[Sample 0] Token: 'С' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'О' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'Фаловської' | True: O | Pred: I-PERS
[Sample 0] Token: 'І' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'М' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'В' | True: O | Pred: B-PERS
[Sample 0] Token: '.' | True: O | Pred: I-PERS
[Sample 0] Token: 'М' | True: O | Pred: I-PERS
[Sample 0] Token: '.' | True: O | Pred: I-

We can see that almost all false postives are actual personal information, that wasnt labeled as such in our test cases. It wasnt marked, becuase it wasnt initially masked in government data.

Overall, we have a great model, maskes identities well and thats all we need at this stage.