# Introduction

This notebook explores a multilingual Named Entity Recognition (NER) pipeline using the XLM-RoBERTa model, with a focus on cross-lingual transfer learning.

The model is fine-tuned on English data (PAN-X from the XTREME benchmark) and evaluated on Turkish in both zero-shot and few-shot settings. By gradually increasing the number of training samples in the target language, we analyze how low-resource adaptation impacts NER performance.

Key goals of this notebook include:
* Evaluating cross-lingual NER transfer from English to Turkish,
* Investigating few-shot adaptation behavior under varying data sizes,
* Providing a reproducible baseline for multilingual NER fine-tuning using Hugging Face Transformers.

# 1 Load The Dataset

In [None]:
# Install the libraries this notebook imports. `-q` keeps pip quiet and
# `--no-deps` tells pip NOT to pull in their dependencies, so those must already
# be present in the environment.
!pip install -q datasets transformers seqeval --no-deps

import warnings
# Silences every Python warning for the rest of the session (including
# deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

In [None]:
from datasets import load_dataset, DatasetDict
from collections import defaultdict

def load_panx_datasets(langs):
    """Load the PAN-X configuration of XTREME for each language code.

    Args:
        langs: Iterable of language codes; each one is loaded as the
            ``PAN-X.<lang>`` configuration.

    Returns:
        A ``defaultdict(DatasetDict)`` mapping language code -> split name ->
        split, where every split has been shuffled with ``seed=0``.
    """
    corpora = defaultdict(DatasetDict)
    for lang in langs:
        raw_splits = load_dataset("xtreme", name=f"PAN-X.{lang}")
        for split_name, split_data in raw_splits.items():
            # Fixed seed so the shuffled order is the same on every run.
            corpora[lang][split_name] = split_data.shuffle(seed=0)
    return corpora

In [None]:
# Language codes used throughout the notebook: "en" is the fine-tuning source,
# "tr" is the transfer target evaluated later on.
langs=["en","tr"]
# Mapping language code -> split name -> shuffled split.
panx_ch = load_panx_datasets(langs)

In [None]:
# Number of examples per split, for each language
import pandas as pd

# One column per language, one row per split.
pd.DataFrame(
    {
        lang: [panx_ch[lang][split].num_rows for split in ("train", "validation", "test")]
        for lang in langs
    },
    index=["Train", "Validation", "Test"],
)

# 2 EDA & Data Preprocessing

## 2.1 Feature Items

In [None]:
# Features: print each column name of the English training split together with
# its feature description.
for key, value in panx_ch["en"]["train"].features.items():
    print(f"{key}: {value}")

## 2.2 NER Tags

In [None]:
# NER Tags
# `tags` is the per-token label feature behind `ner_tags`. Later cells rely on
# its `int2str`, `names` and `num_classes` to convert between ids and tag strings.
tags = panx_ch["en"]["train"].features["ner_tags"].feature
print(tags)

## 2.3 An Example

In [None]:
# Third example (index 2) of the English training split, in the shuffled order
# produced by load_panx_datasets.
element = panx_ch["en"]["train"][2]

# Print every field of the example as "name: value".
for key, value in element.items():
    print(f"{key}: {value}")

In [None]:
def ner_tags_int2str(batch):
    """Add a human-readable ``ner_tags_str`` column.

    Each integer in ``batch["ner_tags"]`` is converted to its tag string with
    the notebook-level ``tags`` feature.
    """
    readable_tags = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": readable_tags}

# English splits with an extra `ner_tags_str` column (tag names instead of ids).
panx_en = panx_ch["en"].map(ner_tags_int2str)

# Same example (index 2) that was printed in the previous cell.
en_example = panx_en["train"][2]

# Two-row table: tokens on top, their tag strings underneath.
pd.DataFrame(
    [en_example["tokens"], en_example["ner_tags_str"]],
    index=["Tokens", "Tags"])

In [None]:
# number of B- tags in en

from collections import Counter

# split name -> Counter of entity types, counted once per entity via its B- tag.
split2freqs = defaultdict(Counter)

for split, dataset in panx_en.items():
    split2freqs[split].update(
        tag.split("-")[1]            # "B-PER" -> "PER"
        for row in dataset["ner_tags_str"]
        for tag in row
        if tag.startswith("B")
    )

pd.DataFrame.from_dict(split2freqs, orient="index")

# 3 Tokenizer and Label Alignment

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "xlm-roberta-base" checkpoint; shared by every later cell
# that tokenizes text or pads batches.
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

## 3.1 XLM-R Tokenization Mechanism (SentencePiece-based)

In [None]:
# Short sample sentence, reused below for the pre-training inference demo.
text_example = "Mehmet Sahin lives in Halle!"

# Sub-word tokens produced by the tokenizer for the sample sentence.
xlmr_tokens = xlmr_tokenizer(text_example).tokens()

# Single-row table so the tokens can be read left to right.
pd.DataFrame([xlmr_tokens], index = ["XLM-R"])

## 3.2 Token-level Entity Prediction

In [None]:
from transformers import XLMRobertaForTokenClassification
import torch

xlmr_model_name = "xlm-roberta-base"

# Lookup tables between class index and tag string, in the order of tags.names.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}
num_labels = tags.num_classes

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Token-classification model with one output class per NER tag.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    num_labels=num_labels,
    id2label=index2tag,
    label2id=tag2index,
)
xlmr_model = xlmr_model.to(device)

In [None]:
def ner_infer(text, tags, model, tokenizer):
    """Tag one raw string with ``model`` and return the result as a table.

    Args:
        text: Raw input sentence.
        tags: Label feature whose ``names`` list maps class index -> tag string.
        model: Token-classification model. Its first output is treated as the
            logits and arg-maxed over the class axis (dim 2).
        tokenizer: Tokenizer used for BOTH the displayed tokens and the ids fed
            to the model, so the two rows of the table always line up.

    Returns:
        A two-row ``pd.DataFrame`` ("Tokens", "Tags") with one column per token
        returned by ``tokenizer``.
    """
    tokens = tokenizer(text).tokens()

    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level `device` variable.
    model_device = next(model.parameters()).device
    input_ids = tokenizer.encode(text, return_tensors="pt").to(model_device)

    # Inference only: switch dropout off and skip gradient tracking, then put
    # the model back in the mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_ids)[0]
    finally:
        model.train(was_training)

    predictions = torch.argmax(logits, dim=2)
    preds = [tags.names[p] for p in predictions[0].tolist()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [None]:
# Sanity check BEFORE any fine-tuning: no training cell has run yet, so these
# tags presumably come from an untrained classification head and are not
# expected to be meaningful.
ner_infer(text_example, tags, xlmr_model, xlmr_tokenizer)

## 3.3 Label Alignment

In [None]:
def tokenize_and_align_labels(example_batch):
    """Tokenize pre-split words and project word-level tags onto sub-words.

    Only the first sub-word of each word keeps the word's label. Positions
    with no word id and the remaining sub-words of a word are set to -100.

    Args:
        example_batch: Batch with a ``tokens`` column (lists of words) and a
            parallel ``ner_tags`` column (lists of integer labels).

    Returns:
        The tokenizer output for the batch with an added ``labels`` entry.
    """
    encoded = xlmr_tokenizer(
        example_batch["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    all_labels = []
    for row, word_labels in enumerate(example_batch["ner_tags"]):
        word_ids = list(encoded.word_ids(batch_index=row))
        # Pair every position with the word id of the position before it; a
        # position starts a new word when its id is set and differs from that.
        preceding = [None] + word_ids[:-1]
        all_labels.append([
            word_labels[current] if current is not None and current != before else -100
            for before, current in zip(preceding, word_ids)
        ])

    encoded["labels"] = all_labels
    return encoded

In [None]:
def encode_panx_dataset(dataset_split):
    """Tokenize and label-align a dataset, dropping the raw columns.

    ``tokenize_and_align_labels`` is applied in batched mode, and the original
    ``langs``, ``ner_tags`` and ``tokens`` columns are removed from the result.
    """
    raw_columns = ["langs", "ner_tags", "tokens"]
    return dataset_split.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )

# Encoded English splits used for fine-tuning and evaluation.
panx_en_encoded = encode_panx_dataset(panx_ch["en"])

In [None]:
# Inspect how one encoded training example lines tokens up with labels.
example = panx_en_encoded["train"][2]

tokens = xlmr_tokenizer.convert_ids_to_tokens(example["input_ids"])
label_ids = example["labels"]
# -100 marks the positions that received no label; show them as "IGN".
label_names = ["IGN" if label == -100 else index2tag[label] for label in label_ids]

pd.DataFrame(
    [tokens, label_ids, label_names],
    index=["Tokens", "Label IDs", "Labels"],
)

## 3.4 Preparing Batched Inputs with Data Collator

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to every Trainer below to assemble token-classification
# batches; it is built from the same tokenizer that encoded the data.
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

# 4 Model Training

## 4.1 Defining Training Arguments

In [None]:
from transformers import TrainingArguments
def get_training_arguments(output_dir="xlm-roberta-ner-multi",
                           num_epochs=3,
                           batch_size=32,
                           dataset_length=None,
                           push_to_hf=False
                          ):
    """Build the ``TrainingArguments`` shared by every run in this notebook.

    Args:
        output_dir: Directory passed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        dataset_length: Number of training examples. When given, logging
            happens every ``dataset_length // batch_size`` steps (at least 1).
            When ``None``, ``logging_steps`` is not passed, so the library
            default applies.
        push_to_hf: Forwarded as ``push_to_hub``.

    Returns:
        A configured ``TrainingArguments`` instance that evaluates every epoch,
        saves no checkpoints and reports to no external tracker.
    """
    optional_kwargs = {}
    if dataset_length is not None:
        # Floor division gives 0 for datasets smaller than one batch, which is
        # not a usable logging interval, so clamp to 1.
        optional_kwargs["logging_steps"] = max(1, dataset_length // batch_size)

    return TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        push_to_hub=push_to_hf,
        report_to="none",
        **optional_kwargs,
    )

## 4.2 Evaluation Metrics

In [None]:
import numpy as np

def align_predictions(predictions, label_ids):
    """Convert raw model scores and gold ids into lists of tag strings.

    Args:
        predictions: Score array that is arg-maxed over axis 2.
        label_ids: Gold label ids, indexed as ``label_ids[example][position]``;
            positions equal to -100 are skipped.

    Returns:
        ``(pred_labels, true_labels)``: two lists with one inner list of tag
        strings per example, covering only the positions whose gold label is
        not -100.
    """
    pred_ids = np.argmax(predictions, axis=2)
    n_examples, n_positions = pred_ids.shape

    pred_labels, true_labels = [], []
    for row in range(n_examples):
        # Positions that carry a real gold label for this example.
        kept = [col for col in range(n_positions) if label_ids[row][col] != -100]
        true_labels.append([index2tag[label_ids[row][col]] for col in kept])
        pred_labels.append([index2tag[pred_ids[row][col]] for col in kept])

    return pred_labels, true_labels

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

def compute_metrics(eval_pred):
    """Compute seqeval accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, label_ids)``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    raw_scores, gold_ids = eval_pred
    y_pred, y_true = align_predictions(raw_scores, gold_ids)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

## 4.3 Training with Hugging Face Trainer

In [None]:
from transformers import Trainer

# Arguments for the English fine-tuning run. dataset_length is what
# get_training_arguments divides by the batch size to obtain logging_steps.
training_args = get_training_arguments(
    output_dir="xlm-roberta-ner-en",
    dataset_length=len(panx_en_encoded["train"])
)

# NOTE: despite its name, `model_ft_en` is the Trainer object, not the bare
# model; later cells reach the underlying network through `model_ft_en.model`.
model_ft_en = Trainer(model = xlmr_model,
                                   args = training_args,
                                   tokenizer = xlmr_tokenizer,
                                   train_dataset = panx_en_encoded["train"],
                                   eval_dataset = panx_en_encoded["validation"],
                                   data_collator = data_collator,
                                   compute_metrics = compute_metrics
                                  )

In [None]:
# Fine-tune on the English training split; the validation split is evaluated
# every epoch (eval_strategy="epoch" in get_training_arguments).
model_ft_en.train()

# 5 Cross-Lingual Training & Adaptation

## 5.1 Phase 1: Fine-Tuned Model Evaluation on English (en)

In [None]:
# Qualitative check of the English-fine-tuned model on an English sentence.
text_en = "Alan Mathison Turing was an English mathematician, computer scientist from London." 

# `model_ft_en` is a Trainer, so the network itself is passed via `.model`.
ner_infer(text_en, tags, model_ft_en.model, xlmr_tokenizer)

In [None]:
def get_f1_score(model, dataset):
    """Return the ``test_f1`` metric from ``model.predict(dataset)``.

    Args:
        model: Object exposing ``predict(dataset)`` whose result has a
            ``metrics`` mapping (the Trainer instances in this notebook).
        dataset: Dataset to run prediction on.
    """
    prediction_output = model.predict(dataset)
    metrics = prediction_output.metrics
    return metrics["test_f1"]

# Test-set F1 of the English-fine-tuned model, keyed by language code.
f1_scores_en = defaultdict(dict)

In [None]:
# In-language baseline: English-fine-tuned model scored on the English test split.
f1_scores_en["en"] = get_f1_score(model_ft_en, panx_en_encoded["test"])
print(f"F1-score of [en] model on [en] dataset: {f1_scores_en['en']:.3f}")

## 5.2 Phase 2: Zero-Shot Evaluation on Turkish

In [None]:
# Qualitative zero-shot check: a Turkish sentence tagged by the model that has
# only been fine-tuned on English so far.
text_tr = "Vardar Kapısı'ndan çıkarken nişanlarımı söktüm, biraz müteessirdim. Böyle yakındı Enver Paşa"

ner_infer(text_tr, tags, model_ft_en.model, xlmr_tokenizer)

In [None]:
# Encode the Turkish splits with the same tokenization/label alignment as English.
panx_tr_encoded = encode_panx_dataset(panx_ch["tr"])
# Zero-shot score: no Turkish training has happened at this point. The value is
# reused later as the dashed reference line in the final plot.
f1_scores_en["tr"] = get_f1_score(model_ft_en, panx_tr_encoded["test"])
print(f"F1-score of [en] model on [tr] dataset: {f1_scores_en['tr']:.3f}")

## 5.3 Phase 3: Progressive Adaptation to Turkish (Few-Shot Fine-Tuning)

In [None]:
# subset training function for tr
def train_on_subset(dataset,
                    num_samples,
                    output_dir="ner-subset",
                    model=None
                   ):
    """Fine-tune on the first ``num_samples`` shuffled training examples.

    Args:
        dataset: Encoded dataset with ``train``, ``validation`` and ``test``
            splits.
        num_samples: Requested number of training examples. Values larger than
            the training split are clamped to its size; the size actually used
            is reported in the returned frame.
        output_dir: Output directory forwarded to ``get_training_arguments``.
        model: Model to fine-tune. Defaults to ``model_ft_en.model``, the
            English-fine-tuned network.

    Returns:
        A one-row ``pd.DataFrame`` with the columns ``num_samples`` (examples
        actually trained on) and ``f1_score`` (F1 on the test split).

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.

    Note:
        The model object is handed to the Trainer directly, not copied, so
        training updates it in place. Repeated calls with the default model
        therefore continue from wherever the previous call left off; pass a
        fresh ``model`` for independent runs.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")
    if model is None:
        model = model_ft_en.model

    # Fixed seed: every call sees the same order, so smaller subsets are
    # prefixes of larger ones.
    shuffled_train = dataset["train"].shuffle(seed=42)
    subset_size = min(num_samples, len(shuffled_train))
    train_subset = shuffled_train.select(range(subset_size))
    val_set = dataset["validation"]
    test_ds = dataset["test"]

    training_args = get_training_arguments(output_dir,
                                           dataset_length=len(train_subset),
                                           push_to_hf=False
                                          )

    sub_model_trainer = Trainer(model = model,
                                args = training_args,
                                tokenizer = xlmr_tokenizer,
                                train_dataset = train_subset,
                                eval_dataset = val_set,
                                data_collator = data_collator,
                                compute_metrics = compute_metrics
                               )

    sub_model_trainer.train()

    # Named `test_f1` so it does not shadow the `f1_score` function imported
    # from seqeval.
    test_f1 = get_f1_score(sub_model_trainer, test_ds)
    return pd.DataFrame.from_dict(
                {"num_samples": [len(train_subset)], "f1_score": [test_f1]})

In [None]:
# Training-set sizes for the few-shot sweep, smallest first.
subset_sizes = [250, 500, 1000, 2000, 5000, 10000, 20000]

# One single-row frame per size, run in ascending order.
subset_results = [
    train_on_subset(panx_tr_encoded, num_samples=size, output_dir="xlm-roberta-base-cased-ner-turkish")
    for size in subset_sizes
]

# Build the table once with the public pandas API instead of growing it row by
# row through the private DataFrame._append.
metrics_df = pd.concat(subset_results, ignore_index=True)

In [None]:
# test scores
metrics_df

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# Dashed red reference line: zero-shot F1 of the English-only model on Turkish.
ax.axhline(f1_scores_en['tr'], ls="--", color="r")
# Few-shot curve: test F1 against number of Turkish training samples.
metrics_df.set_index("num_samples").plot(ax=ax)

ax.legend(
    ["Phase 2: Zero-Shot Evaluation on Turkish", "Phase 3: Few-Shot Evaluation on Turkish"],
    loc="lower right",
)
ax.set_ylim((0, 1))
ax.set_xlabel("Number of Training Turkish Samples")
ax.set_ylabel("F1 Score")
ax.grid(True)
plt.show()