In [None]:

import os

os.environ["OPENAI_API_KEY"]=""
os.environ["GROQ_API_KEY"] = ""
os.environ["GOOGLE_API_KEY"] = ""

In [1]:
import sys
sys.path.append("/home/yeniguno/projects/sugardata")

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

https://huggingface.co/datasets/boltuix/conll2025-ner

In [3]:
from datasets import load_dataset

ds = load_dataset("boltuix/conll2025-ner")

print(f"Dataset:\n{ds}\nTrain example:\n{ds['train'][1]}")

Dataset:
DatasetDict({
    train: Dataset({
        features: ['split', 'tokens', 'ner_tags'],
        num_rows: 143709
    })
})
Train example:
{'split': 'train', 'tokens': ['In', 'recent', 'years', ',', 'advanced', 'education', 'for', 'professionals', 'has', 'become', 'a', 'hot', 'topic', 'in', 'the', 'business', 'community', '.'], 'ner_tags': ['O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [4]:
from sugardata.utility.ner import NERExampleFormatter

WANTED_LABELS = ["PERSON", "DATE", "ORG", "LOC", "EVENT"]
FULL_TO_SHORT = {
    "PERSON": "PER",
    "DATE": "DATE",
    "ORG": "ORG",
    "LOC": "LOC",
    "EVENT": "EVENT"
}

tokens_list = [example["tokens"] for example in ds["train"]]
bio_tags_list = [example["ner_tags"] for example in ds["train"]]

print(f"First tokens: {tokens_list[1]}")
print(f"First BIO tags: {bio_tags_list[1]}")

formatter = NERExampleFormatter(wanted_labels=WANTED_LABELS, full_to_short=FULL_TO_SHORT)
examples = formatter.build_from_bio(tokens_list, bio_tags_list)

print(f"Formatted NER example:\n{examples[1]}")

First tokens: ['In', 'recent', 'years', ',', 'advanced', 'education', 'for', 'professionals', 'has', 'become', 'a', 'hot', 'topic', 'in', 'the', 'business', 'community', '.']
First BIO tags: ['O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Formatted NER example:
{'text': 'In recent years, advanced education for professionals has become a hot topic in the business community.', 'ner_tags': [{'recent years': 'DATE'}]}


In [5]:
import random

sample_count = 5_000

sample_indices = random.sample(range(len(examples)), sample_count)

sampled_examples = [examples[i] for i in sample_indices]

print(f"Sampled Results (first 5 examples):")

for res in sampled_examples[:5]:
    print(res)
    print("----")


Sampled Results (first 5 examples):
{'text': 'She seemed startled to see someone that looked so different.', 'ner_tags': []}
----
{'text': 'and I love him /.', 'ner_tags': []}
----
{'text': 'You know you never can tell with people like that', 'ner_tags': []}
----
{'text': "If she announces that she's running for president will you do what is customary and support your fellow New York senator for president of the United States /?", 'ner_tags': []}
----
{'text': 'We know that everything God made has been waiting until now in pain like a woman ready to give birth to a child.', 'ner_tags': []}
----


In [6]:
from sugardata import localize_ner_data


examples = sampled_examples
language = "Turkish"
model = "gpt-4o-mini"
vendor = "openai"
tokenizer = "dbmdz/bert-base-turkish-cased"
entity_labels = {"PER": (1, 2), "ORG": (3, 4), "LOC": (5, 6), "DATE": (7, 8), "EVENT": (9, 10)}
batch_size = 32
verbose = True


results_example = localize_ner_data(
    examples=examples[:10],
    language=language,
    model=model,
    vendor=vendor,
    tokenizer=tokenizer,
    batch_size=batch_size,
    entity_labels=entity_labels,
    export_type="default",
)

In [7]:
print(f"Localization Results (first 2 examples) compared to examples:")
for i in range(2):
    print(f"Example {i+1}:")
    print("Original:", examples[i])
    print("Localized:", results_example[i])
    print("----")

Localization Results (first 2 examples) compared to examples:
Example 1:
Original: {'text': 'She seemed startled to see someone that looked so different.', 'ner_tags': []}
Localized: {'index': 0, 'localized_text': 'Birinin bu kadar farklı görünmesine şaşırmış gibiydi.', 'localized_word_mappings': {}, 'tokens': ['Birinin', 'bu', 'kadar', 'farklı', 'görün', '##mesine', 'şaşır', '##mış', 'gibiydi', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ner_tag_labels': {'PER': (1, 2), 'ORG': (3, 4), 'LOC': (5, 6), 'DATE': (7, 8), 'EVENT': (9, 10)}}
----
Example 2:
Original: {'text': 'and I love him /.', 'ner_tags': []}
Localized: {'index': 1, 'localized_text': 've onu seviyorum.', 'localized_word_mappings': {}, 'tokens': ['ve', 'onu', 'seviyorum', '.'], 'ner_tags': [0, 0, 0, 0], 'ner_tag_labels': {'PER': (1, 2), 'ORG': (3, 4), 'LOC': (5, 6), 'DATE': (7, 8), 'EVENT': (9, 10)}}
----


In [8]:
from sugardata import localize_ner_data_async

local_example_async = await localize_ner_data_async(
    examples=examples[:10],
    language=language,
    model=model,
    vendor=vendor,
    tokenizer=tokenizer,
    batch_size=batch_size,
    entity_labels=entity_labels,
    export_type="default",
)

In [9]:
print(f"Localization Results (first 2 examples) compared to examples:")
for i in range(2):
    print(f"Example {i+1}:")
    print("Original:", examples[i])
    print("Localized:", local_example_async[i])
    print("----")

Localization Results (first 2 examples) compared to examples:
Example 1:
Original: {'text': 'She seemed startled to see someone that looked so different.', 'ner_tags': []}
Localized: {'index': 3, 'localized_text': 'Eğer başkanlık için koşacağını duyurursa, New Yorklu senatörünüzü Amerika Birleşik Devletleri başkanlığı için desteklemenizin şimdilik olanı yapar mısınız?', 'localized_word_mappings': {}, 'tokens': ['Eğer', 'başkanlık', 'için', 'koş', '##acağını', 'duyuru', '##r', '##sa', ',', 'New', 'York', '##lu', 'sen', '##atörü', '##nü', '##z', '##ü', 'Amerika', 'Birleşik', 'Devletleri', 'başkanlığı', 'için', 'destekleme', '##ni', '##zi', '##n', 'şimdilik', 'olanı', 'yapar', 'mısınız', '?'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ner_tag_labels': {'PER': (1, 2), 'ORG': (3, 4), 'LOC': (5, 6), 'DATE': (7, 8), 'EVENT': (9, 10)}}
----
Example 2:
Original: {'text': 'and I love him /.', 'ner_tags': []}
Localized: {'index': 2

In [None]:
from sugardata import localize_ner_data_multi_vendor_async

examples = sampled_examples
batch_size = 32
verbose = True

vendors = {
    "openai": "gpt-4o-mini",
    "gemini": "gemini-2.0-flash-lite",
    "groq": "meta-llama/llama-4-scout-17b-16e-instruct",
    "ollama": "gemma3:12b",
}

results = await localize_ner_data_multi_vendor_async(
    examples=sampled_examples,
    language=language,
    tokenizer=tokenizer,
    entity_labels=entity_labels,
    batch_size=batch_size,
    vendors=vendors,
    verbose=verbose
)

[gpt-4o-mini] Starting text generation: 40 batches
[models/gemini-2.0-flash-lite] Starting text generation: 40 batches
[meta-llama/llama-4-scout-17b-16e-instruct] Starting text generation: 40 batches
[gemma3:12b] Starting text generation: 40 batches
[models/gemini-2.0-flash-lite] Generated batch 1/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 1/40
[gpt-4o-mini] Generated batch 1/40
[models/gemini-2.0-flash-lite] Generated batch 2/40
[meta-llama/llama-4-scout-17b-16e-instruct] Generated batch 2/40
Error in batch item 20: OutputParserException: Failed to parse NERLocalText from completion null. Got: 1 validation error for NERLocalText
  Input should be a valid dictionary or instance of NERLocalText [type=model_type, input_value=None, input_t...
[models/gemini-2.0-flash-lite] Generated batch 3/40
[gpt-4o-mini] Generated batch 2/40
[models/gemini-2.0-flash-lite] Generated batch 4/40
Error in batch item 26: OutputParserException: Failed to parse NERLocalText from completion

In [None]:
local_data = []
for vendor, vendor_results in results.items():
    local_data.extend(vendor_results)

print(f"Total localized examples: {len(local_data)}")

In [None]:
from typing import Dict, Tuple

NER_LABELS: Dict[str, Tuple[int,int]] = {
    'PER': (1, 2),
    'ORG':    (3, 4),
    'LOC':    (5, 6),
    'DATE':   (7, 8),
    'EVENT':  (9,10),
}

id2label = {0: "O"}
for ent, (b_id, i_id) in NER_LABELS.items():
    id2label[b_id] = f"B-{ent}"
    id2label[i_id] = f"I-{ent}"
label2id = {v: k for k, v in id2label.items()}

num_labels = max(id2label.keys()) + 1


assert all(isinstance(x["tokens"], list) and isinstance(x["ner_tags"], list) for x in local_data)
assert all(len(x["tokens"]) == len(x["ner_tags"]) for x in local_data)

In [None]:
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Train / validation / test split (80/10/10)
random.seed(42)
idx = list(range(len(local_data)))
train_idx, tmp_idx = train_test_split(idx, test_size=0.2, random_state=42, shuffle=True)
valid_idx, test_idx = train_test_split(tmp_idx, test_size=0.5, random_state=42, shuffle=True)

train_data = [local_data[i] for i in train_idx]
valid_data = [local_data[i] for i in valid_idx]
test_data  = [local_data[i] for i in test_idx]

ds = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(valid_data),
    "test": Dataset.from_list(test_data),
})

print(ds)

In [None]:
from transformers import AutoTokenizer

model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id
PAD_ID = tokenizer.pad_token_id

def encode_example(example, max_length=256):
    # Convert your piecewise tokens → ids (unknown pieces become [UNK])
    input_ids = tokenizer.convert_tokens_to_ids(example["tokens"])
    attention_mask = [1]*len(input_ids)

    # Add special tokens [CLS] ... [SEP]
    input_ids = [CLS_ID] + input_ids + [SEP_ID]
    attention_mask = [1] + attention_mask + [1]

    # Shift labels: add -100 for [CLS] and [SEP] so they are ignored by the loss
    labels = [-100] + example["ner_tags"] + [-100]

    # Truncate if needed (keep room already considered)
    if len(input_ids) > max_length:
        input_ids   = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    # No padding here; DataCollator will pad dynamically per batch
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "localized_text": example.get("localized_text", ""),
        "index": example.get("index", -1),
    }

encoded_ds = ds.map(encode_example, remove_columns=ds["train"].column_names)

print(f"Encoded dataset:\n{encoded_ds}")

In [None]:
import evaluate
import numpy as np
from transformers import AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

seqeval = evaluate.load("seqeval")

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)

data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, max_length=256)

# Build a function to convert label ids back to strings (excluding -100 and specials)
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)

    batch_preds = []
    batch_refs = []

    for pred, lab in zip(preds, label_ids):
        true_preds = []
        true_labels = []
        for p, l in zip(pred, lab):
            if l == -100:
                continue
            true_preds.append(id2label[p])
            true_labels.append(id2label[l])
        batch_preds.append(true_preds)
        batch_refs.append(true_labels)
    return batch_preds, batch_refs

def compute_metrics(p):
    predictions, labels = p
    preds_list, refs_list = align_predictions(predictions, labels)
    results = seqeval.compute(predictions=preds_list, references=refs_list)
    # Friendly flatten of main scores
    return {
        "precision": results["overall_precision"],
        "recall":    results["overall_recall"],
        "f1":        results["overall_f1"],
        "accuracy":  results["overall_accuracy"],
    }

training_args = TrainingArguments(
    output_dir="outputs/bert-tr-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    fp16=True,
    report_to="none",   # set "wandb" etc. if you want
    seed=42,
    dataloader_num_workers=2,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,     # keeps special tokens, etc.
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
test_metrics = trainer.evaluate(encoded_ds["test"])
print(test_metrics)  # precision / recall / f1 / accuracy


In [None]:
# Get raw predictions for per-class report
predictions = trainer.predict(encoded_ds["test"])
preds_list, refs_list = align_predictions(predictions.predictions, predictions.label_ids)
report = evaluate.load("seqeval").compute(predictions=preds_list, references=refs_list)
print(report)  # includes per-entity scores like B-PERSON/I-PERSON aggregated as PER


In [None]:
trainer.save_model("outputs/bert-tr-ner/final")
tokenizer.save_pretrained("outputs/bert-tr-ner/final")


In [None]:
from transformers import pipeline

ner_pipe = pipeline(
    "token-classification",
    model="outputs/bert-tr-ner/final",
    tokenizer="outputs/bert-tr-ner/final",
    aggregation_strategy="simple"   # merges sub-tokens into whole entities
)

text = "Japonya Başbakanı Fumio Kishida bugün Ankara'da Cumhurbaşkanı ile görüştü."
preds = ner_pipe(text)
preds
