In [None]:

import os

os.environ["OPENAI_API_KEY"]=""
os.environ["GROQ_API_KEY"] = ""
os.environ["GOOGLE_API_KEY"] = ""

https://huggingface.co/datasets/boltuix/conll2025-ner

In [3]:
from datasets import load_dataset

ds = load_dataset("boltuix/conll2025-ner")

print(f"Dataset:\n{ds}\nTrain example:\n{ds['train'][1]}")

Dataset:
DatasetDict({
    train: Dataset({
        features: ['split', 'tokens', 'ner_tags'],
        num_rows: 143709
    })
})
Train example:
{'split': 'train', 'tokens': ['In', 'recent', 'years', ',', 'advanced', 'education', 'for', 'professionals', 'has', 'become', 'a', 'hot', 'topic', 'in', 'the', 'business', 'community', '.'], 'ner_tags': ['O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [4]:
from sugardata.utility.ner import NERExampleFormatter

WANTED_LABELS = ["PERSON", "DATE", "ORG", "LOC", "EVENT"]
FULL_TO_SHORT = {
    "PERSON": "PER",
    "DATE": "DATE",
    "ORG": "ORG",
    "LOC": "LOC",
    "EVENT": "EVENT"
}

tokens_list = [example["tokens"] for example in ds["train"]]
bio_tags_list = [example["ner_tags"] for example in ds["train"]]

print(f"First tokens: {tokens_list[1]}")
print(f"First BIO tags: {bio_tags_list[1]}")

formatter = NERExampleFormatter(wanted_labels=WANTED_LABELS, full_to_short=FULL_TO_SHORT)
examples = formatter.build_from_bio(tokens_list, bio_tags_list)

print(f"Formatted NER example:\n{examples[1]}")

First tokens: ['In', 'recent', 'years', ',', 'advanced', 'education', 'for', 'professionals', 'has', 'become', 'a', 'hot', 'topic', 'in', 'the', 'business', 'community', '.']
First BIO tags: ['O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Formatted NER example:
{'text': 'In recent years, advanced education for professionals has become a hot topic in the business community.', 'ner_tags': [{'recent years': 'DATE'}]}


In [5]:
import random

sample_count = 5_000

sample_indices = random.sample(range(len(examples)), sample_count)

sampled_examples = [examples[i] for i in sample_indices]

print(f"Sampled Results (first 5 examples):")

for res in sampled_examples[:5]:
    print(res)
    print("----")


Sampled Results (first 5 examples):
{'text': "so anyway I'm just so stressed out right now that I feel like crying", 'ner_tags': []}
----
{'text': 'The Israeli spokesman said there were two simultaneous attacks at an intersection.', 'ner_tags': []}
----
{'text': 'Talks may resume after the Israeli elections.', 'ner_tags': []}
----
{'text': 'Does he make any sounds /?', 'ner_tags': []}
----
{'text': 'They offer these suggestions:', 'ner_tags': []}
----


In [6]:
from sugardata import localize_ner_data


examples = sampled_examples
language = "Turkish"
model = "gpt-4o-mini"
vendor = "openai"
tokenizer = "dbmdz/bert-base-turkish-cased"
entity_labels = {"PER": (1, 2), "ORG": (3, 4), "LOC": (5, 6), "DATE": (7, 8), "EVENT": (9, 10)}
batch_size = 32
verbose = True


results_example = localize_ner_data(
    examples=examples[:10],
    language=language,
    model=model,
    vendor=vendor,
    tokenizer=tokenizer,
    batch_size=batch_size,
    entity_labels=entity_labels,
    export_type="default",
)

In [7]:
print(f"Localization Results (first 2 examples) compared to examples:")
for i in range(2):
    print(f"Example {i+1}:")
    print("Original:", examples[i])
    print("Localized:", results_example[i])
    print("----")

Localization Results (first 2 examples) compared to examples:
Example 1:
Original: {'text': "so anyway I'm just so stressed out right now that I feel like crying", 'ner_tags': []}
Localized: {'index': 0, 'localized_text': 'Her neyse, şu anda o kadar stresli hissediyorum ki ağlamak istiyorum.', 'localized_word_mappings': {}, 'tokens': ['Her', 'neyse', ',', 'şu', 'anda', 'o', 'kadar', 'stres', '##li', 'hissediyorum', 'ki', 'ağ', '##lamak', 'istiyorum', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ner_tag_labels': {'PER': (1, 2), 'ORG': (3, 4), 'LOC': (5, 6), 'DATE': (7, 8), 'EVENT': (9, 10)}}
----
Example 2:
Original: {'text': 'The Israeli spokesman said there were two simultaneous attacks at an intersection.', 'ner_tags': []}
Localized: {'index': 1, 'localized_text': 'İsrailli sözcü, bir kavşakta iki eş zamanlı saldırı olduğunu söyledi.', 'localized_word_mappings': {}, 'tokens': ['İsrailli', 'sözcü', ',', 'bir', 'kavşak', '##ta', 'iki', 'eş', 'zamanlı', 'saldırı', '

In [8]:
from sugardata import localize_ner_data_async

local_example_async = await localize_ner_data_async(
    examples=examples[:10],
    language=language,
    model=model,
    vendor=vendor,
    tokenizer=tokenizer,
    batch_size=batch_size,
    entity_labels=entity_labels,
    export_type="default",
)

In [9]:
print(f"Localization Results (first 2 examples) compared to examples:")
for i in range(2):
    print(f"Example {i+1}:")
    print("Original:", examples[i])
    print("Localized:", local_example_async[i])
    print("----")

Localization Results (first 2 examples) compared to examples:
Example 1:
Original: {'text': "so anyway I'm just so stressed out right now that I feel like crying", 'ner_tags': []}
Localized: {'index': 0, 'localized_text': 'Yani şu an o kadar stresli hissediyorum ki ağlamak istiyorum.', 'localized_word_mappings': {}, 'tokens': ['Yani', 'şu', 'an', 'o', 'kadar', 'stres', '##li', 'hissediyorum', 'ki', 'ağ', '##lamak', 'istiyorum', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ner_tag_labels': {'PER': (1, 2), 'ORG': (3, 4), 'LOC': (5, 6), 'DATE': (7, 8), 'EVENT': (9, 10)}}
----
Example 2:
Original: {'text': 'The Israeli spokesman said there were two simultaneous attacks at an intersection.', 'ner_tags': []}
Localized: {'index': 4, 'localized_text': 'Bu önerileri sunuyorlar:', 'localized_word_mappings': {}, 'tokens': ['Bu', 'önerileri', 'sunuyor', '##lar', ':'], 'ner_tags': [0, 0, 0, 0, 0], 'ner_tag_labels': {'PER': (1, 2), 'ORG': (3, 4), 'LOC': (5, 6), 'DATE': (7, 8), 'EVENT'

In [10]:
from sugardata import localize_ner_data_multi_vendor_async

examples = sampled_examples
batch_size = 32
verbose = True

vendors = {
    "openai": "gpt-4o-mini",
    "gemini": "gemini-2.0-flash-lite",
    "groq": "meta-llama/llama-4-scout-17b-16e-instruct",
    "ollama": "gemma3:12b",
}

results = await localize_ner_data_multi_vendor_async(
    examples=sampled_examples,
    language=language,
    tokenizer=tokenizer,
    entity_labels=entity_labels,
    batch_size=batch_size,
    vendors=vendors,
    verbose=verbose
)

[gpt-4o-mini] Starting text generation: 40 batches
[meta-llama/llama-4-scout-17b-16e-instruct] Starting text generation: 40 batches
[models/gemini-2.0-flash-lite] Starting text generation: 40 batches
[gemma3:12b] Starting text generation: 40 batches
[models/gemini-2.0-flash-lite] Generated batch 1/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 1/40
[models/gemini-2.0-flash-lite] Generated batch 2/40
[gpt-4o-mini] Generated batch 1/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 2/40
[models/gemini-2.0-flash-lite] Generated batch 3/40
[models/gemini-2.0-flash-lite] Generated batch 4/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 3/40
[models/gemini-2.0-flash-lite] Generated batch 5/40
[gpt-4o-mini] Generated batch 2/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 4/40
[models/gemini-2.0-flash-lite] Generated batch 6/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 5/40
[models/gemini-2.0-flash-lite] Generated ba

In [11]:
local_data = []
for vendor, vendor_results in results.items():
    local_data.extend(vendor_results)

print(f"Total localized examples: {len(local_data)}")

Total localized examples: 4743


In [12]:
from typing import Dict, Tuple

NER_LABELS: Dict[str, Tuple[int,int]] = {
    'PER': (1, 2),
    'ORG':    (3, 4),
    'LOC':    (5, 6),
    'DATE':   (7, 8),
    'EVENT':  (9,10),
}

id2label = {0: "O"}
for ent, (b_id, i_id) in NER_LABELS.items():
    id2label[b_id] = f"B-{ent}"
    id2label[i_id] = f"I-{ent}"
label2id = {v: k for k, v in id2label.items()}

num_labels = max(id2label.keys()) + 1


assert all(isinstance(x["tokens"], list) and isinstance(x["ner_tags"], list) for x in local_data)
assert all(len(x["tokens"]) == len(x["ner_tags"]) for x in local_data)

In [13]:
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Train / validation / test split (80/10/10)
random.seed(42)
idx = list(range(len(local_data)))
train_idx, tmp_idx = train_test_split(idx, test_size=0.2, random_state=42, shuffle=True)
valid_idx, test_idx = train_test_split(tmp_idx, test_size=0.5, random_state=42, shuffle=True)

train_data = [local_data[i] for i in train_idx]
valid_data = [local_data[i] for i in valid_idx]
test_data  = [local_data[i] for i in test_idx]

ds = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(valid_data),
    "test": Dataset.from_list(test_data),
})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['index', 'localized_text', 'localized_word_mappings', 'tokens', 'ner_tags', 'ner_tag_labels'],
        num_rows: 3794
    })
    validation: Dataset({
        features: ['index', 'localized_text', 'localized_word_mappings', 'tokens', 'ner_tags', 'ner_tag_labels'],
        num_rows: 474
    })
    test: Dataset({
        features: ['index', 'localized_text', 'localized_word_mappings', 'tokens', 'ner_tags', 'ner_tag_labels'],
        num_rows: 475
    })
})


In [14]:
from transformers import AutoTokenizer

model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id
PAD_ID = tokenizer.pad_token_id

def encode_example(example, max_length=256):
    # Convert your piecewise tokens → ids (unknown pieces become [UNK])
    input_ids = tokenizer.convert_tokens_to_ids(example["tokens"])
    attention_mask = [1]*len(input_ids)

    # Add special tokens [CLS] ... [SEP]
    input_ids = [CLS_ID] + input_ids + [SEP_ID]
    attention_mask = [1] + attention_mask + [1]

    # Shift labels: add -100 for [CLS] and [SEP] so they are ignored by the loss
    labels = [-100] + example["ner_tags"] + [-100]

    # Truncate if needed (keep room already considered)
    if len(input_ids) > max_length:
        input_ids   = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    # No padding here; DataCollator will pad dynamically per batch
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "localized_text": example.get("localized_text", ""),
        "index": example.get("index", -1),
    }

encoded_ds = ds.map(encode_example, remove_columns=ds["train"].column_names)

print(f"Encoded dataset:\n{encoded_ds}")

Map:   0%|          | 0/3794 [00:00<?, ? examples/s]

Map:   0%|          | 0/474 [00:00<?, ? examples/s]

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Encoded dataset:
DatasetDict({
    train: Dataset({
        features: ['index', 'localized_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3794
    })
    validation: Dataset({
        features: ['index', 'localized_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 474
    })
    test: Dataset({
        features: ['index', 'localized_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 475
    })
})


In [15]:
import evaluate
import numpy as np
from transformers import AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

seqeval = evaluate.load("seqeval")

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)

data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, max_length=256)

# Build a function to convert label ids back to strings (excluding -100 and specials)
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)

    batch_preds = []
    batch_refs = []

    for pred, lab in zip(preds, label_ids):
        true_preds = []
        true_labels = []
        for p, l in zip(pred, lab):
            if l == -100:
                continue
            true_preds.append(id2label[p])
            true_labels.append(id2label[l])
        batch_preds.append(true_preds)
        batch_refs.append(true_labels)
    return batch_preds, batch_refs

def compute_metrics(p):
    predictions, labels = p
    preds_list, refs_list = align_predictions(predictions, labels)
    results = seqeval.compute(predictions=preds_list, references=refs_list)
    # Friendly flatten of main scores
    return {
        "precision": results["overall_precision"],
        "recall":    results["overall_recall"],
        "f1":        results["overall_f1"],
        "accuracy":  results["overall_accuracy"],
    }

training_args = TrainingArguments(
    output_dir="outputs/bert-tr-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    fp16=True,
    report_to="none",   # set "wandb" etc. if you want
    seed=42,
    dataloader_num_workers=2,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,     # keeps special tokens, etc.
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1233,0.07988,0.639405,0.767857,0.697769,0.972706
2,0.0829,0.087053,0.625899,0.776786,0.693227,0.970422
3,0.0413,0.09178,0.666667,0.785714,0.721311,0.972597


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  _warn_prf(average, modifier, msg_start, len(result))
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been 

TrainOutput(global_step=714, training_loss=0.11032439196477083, metrics={'train_runtime': 16.1785, 'train_samples_per_second': 703.524, 'train_steps_per_second': 44.133, 'total_flos': 309988498909440.0, 'train_loss': 0.11032439196477083, 'epoch': 3.0})

In [17]:
test_metrics = trainer.evaluate(encoded_ds["test"])
print(test_metrics)  # precision / recall / f1 / accuracy


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 0.08436267077922821, 'eval_precision': 0.7209302325581395, 'eval_recall': 0.7833935018050542, 'eval_f1': 0.7508650519031141, 'eval_accuracy': 0.9747692307692307, 'eval_runtime': 0.2368, 'eval_samples_per_second': 2005.81, 'eval_steps_per_second': 126.683, 'epoch': 3.0}


In [18]:
# Get raw predictions for per-class report
predictions = trainer.predict(encoded_ds["test"])
preds_list, refs_list = align_predictions(predictions.predictions, predictions.label_ids)
report = evaluate.load("seqeval").compute(predictions=preds_list, references=refs_list)
print(report)  # includes per-entity scores like B-PERSON/I-PERSON aggregated as PER


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'DATE': {'precision': np.float64(0.5394736842105263), 'recall': np.float64(0.6721311475409836), 'f1': np.float64(0.5985401459854015), 'number': np.int64(61)}, 'EVENT': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(2)}, 'LOC': {'precision': np.float64(0.5), 'recall': np.float64(0.4444444444444444), 'f1': np.float64(0.47058823529411764), 'number': np.int64(9)}, 'ORG': {'precision': np.float64(0.7652173913043478), 'recall': np.float64(0.8), 'f1': np.float64(0.7822222222222223), 'number': np.int64(110)}, 'PER': {'precision': np.float64(0.8316831683168316), 'recall': np.float64(0.8842105263157894), 'f1': np.float64(0.8571428571428571), 'number': np.int64(95)}, 'overall_precision': np.float64(0.7209302325581395), 'overall_recall': np.float64(0.7833935018050542), 'overall_f1': np.float64(0.7508650519031141), 'overall_accuracy': 0.9747692307692307}


In [19]:
trainer.save_model("outputs/bert-tr-ner/final")
tokenizer.save_pretrained("outputs/bert-tr-ner/final")


('outputs/bert-tr-ner/final/tokenizer_config.json',
 'outputs/bert-tr-ner/final/special_tokens_map.json',
 'outputs/bert-tr-ner/final/vocab.txt',
 'outputs/bert-tr-ner/final/added_tokens.json',
 'outputs/bert-tr-ner/final/tokenizer.json')

In [20]:
from transformers import pipeline

ner_pipe = pipeline(
    "token-classification",
    model="outputs/bert-tr-ner/final",
    tokenizer="outputs/bert-tr-ner/final",
    aggregation_strategy="simple"   # merges sub-tokens into whole entities
)

text = "Japonya Başbakanı Fumio Kishida bugün Ankara'da Cumhurbaşkanı ile görüştü."
preds = ner_pipe(text)
preds


Device set to use cuda:0


[{'entity_group': 'PER',
  'score': np.float32(0.9883959),
  'word': 'Fumio Kishida',
  'start': 18,
  'end': 31},
 {'entity_group': 'DATE',
  'score': np.float32(0.83149445),
  'word': 'bugün',
  'start': 32,
  'end': 37}]