In [3]:
# -------------------------------
# 1. Install libraries
# -------------------------------
# !pip install transformers datasets seqeval

# -------------------------------
# 2. Imports
# -------------------------------
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
import numpy as np

# -------------------------------
# 3. Parse CoNLL file into dataset
# -------------------------------
def parse_conll(file_path):
    sentences = []
    tokens = []
    labels = []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append({'tokens': tokens, 'ner_tags': labels})
                    tokens = []
                    labels = []
            else:
                splits = line.split()
                if len(splits) >= 2:
                    tokens.append(splits[0])
                    labels.append(splits[1])
        # add last sentence
        if tokens:
            sentences.append({'tokens': tokens, 'ner_tags': labels})
    return sentences

# Replace with your labeled dataset path
data = parse_conll("amharic_ner_conll_labeled_2.txt")

# Check if dataset is empty
if len(data) == 0:
    raise ValueError("Parsed dataset is empty. Check your CoNLL file formatting.")

# Build label mapping
all_labels = sorted(list({label for sent in data for label in sent['ner_tags']}))
label2id = {l: i for i, l in enumerate(all_labels)}
id2label = {i: l for i, l in enumerate(all_labels)}

# Map string labels to integers
for sent in data:
    sent['ner_tags'] = [label2id[label] for label in sent['ner_tags']]

# Create Hugging Face dataset
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1)

# -------------------------------
# 4. Models to compare
# -------------------------------
model_names = {
    "XLM-Roberta": "xlm-roberta-base",
    "DistilBERT": "distilbert-base-multilingual-cased",
    "mBERT": "bert-base-multilingual-cased"
}

# -------------------------------
# 5. Tokenization & alignment
# -------------------------------
def tokenize_and_align_labels(batch, tokenizer):
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    aligned_labels = []
    for i, label in enumerate(batch["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # Inside subword: keep I- labels only
                current_label = label[word_idx]
                label_name = id2label[current_label]
                label_ids.append(current_label if label_name.startswith('I-') else -100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# -------------------------------
# 6. Evaluation Metrics
# -------------------------------
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)
    true_labels = [
        [id2label[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [id2label[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    # Avoid undefined metrics by filling zero_division
    results = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results.get("overall_precision", 0),
        "recall": results.get("overall_recall", 0),
        "f1": results.get("overall_f1", 0),
        "accuracy": results.get("overall_accuracy", 0)
    }

# -------------------------------
# 7. Loop through models
# -------------------------------
results = {}

for name, model_name in model_names.items():
    print(f"\n=== Fine-tuning {name} ({model_name}) ===")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    tokenized_dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    training_args = TrainingArguments(
        output_dir=f"./{name}_ner_model",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=2,  # reduce for CPU or GPU memory
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=1,
        logging_dir=f'./logs_{name}',
        logging_steps=10
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    results[name] = metrics
    trainer.save_model(f"./{name}_ner_model_final")
    tokenizer.save_pretrained(f"./{name}_ner_model_final")

# -------------------------------
# 8. Compare results (fixed)
# -------------------------------
for model, metrics in results.items():
    print(f"\nModel: {model}")
    print(f"Precision: {metrics.get('eval_precision', 0):.3f}")
    print(f"Recall:    {metrics.get('eval_recall', 0):.3f}")
    print(f"F1-score:  {metrics.get('eval_f1', 0):.3f}")
    print(f"Accuracy:  {metrics.get('eval_accuracy', 0):.3f}")




=== Fine-tuning XLM-Roberta (xlm-roberta-base) ===


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 45/45 [00:00<00:00, 2368.89 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 714.46 examples/s]
 14%|█▍        | 10/69 [00:26<02:06,  2.14s/it]

{'loss': 1.2556, 'grad_norm': 2.745151996612549, 'learning_rate': 4.27536231884058e-05, 'epoch': 0.43}


 29%|██▉       | 20/69 [00:45<01:36,  1.96s/it]

{'loss': 0.7006, 'grad_norm': 11.66637134552002, 'learning_rate': 3.5507246376811596e-05, 'epoch': 0.87}


 33%|███▎      | 23/69 [00:51<01:27,  1.89s/it]
 33%|███▎      | 23/69 [00:51<01:27,  1.89s/it]

{'eval_loss': 0.6350153684616089, 'eval_precision': 0.06666666666666667, 'eval_recall': 0.13333333333333333, 'eval_f1': 0.08888888888888888, 'eval_accuracy': 0.7363636363636363, 'eval_runtime': 0.4576, 'eval_samples_per_second': 10.926, 'eval_steps_per_second': 6.555, 'epoch': 1.0}


 43%|████▎     | 30/69 [01:05<01:22,  2.11s/it]

{'loss': 0.5875, 'grad_norm': 4.240194320678711, 'learning_rate': 2.826086956521739e-05, 'epoch': 1.3}


 58%|█████▊    | 40/69 [01:27<01:00,  2.08s/it]

{'loss': 0.4521, 'grad_norm': 2.441202163696289, 'learning_rate': 2.101449275362319e-05, 'epoch': 1.74}


 67%|██████▋   | 46/69 [01:38<00:43,  1.89s/it]
 67%|██████▋   | 46/69 [01:39<00:43,  1.89s/it]

{'eval_loss': 0.30138081312179565, 'eval_precision': 0.5454545454545454, 'eval_recall': 0.4, 'eval_f1': 0.4615384615384615, 'eval_accuracy': 0.9090909090909091, 'eval_runtime': 0.361, 'eval_samples_per_second': 13.85, 'eval_steps_per_second': 8.31, 'epoch': 2.0}


 72%|███████▏  | 50/69 [01:46<00:35,  1.86s/it]

{'loss': 0.367, 'grad_norm': 2.0312976837158203, 'learning_rate': 1.3768115942028985e-05, 'epoch': 2.17}


 87%|████████▋ | 60/69 [02:07<00:19,  2.17s/it]

{'loss': 0.3331, 'grad_norm': 5.527822494506836, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.61}



100%|██████████| 69/69 [02:30<00:00,  2.18s/it]


{'eval_loss': 0.1644371896982193, 'eval_precision': 0.45, 'eval_recall': 0.6, 'eval_f1': 0.5142857142857143, 'eval_accuracy': 0.9681818181818181, 'eval_runtime': 0.414, 'eval_samples_per_second': 12.078, 'eval_steps_per_second': 7.247, 'epoch': 3.0}
{'train_runtime': 150.4654, 'train_samples_per_second': 0.897, 'train_steps_per_second': 0.459, 'train_loss': 0.5625021215798198, 'epoch': 3.0}


100%|██████████| 3/3 [00:00<00:00, 11.15it/s]



=== Fine-tuning DistilBERT (distilbert-base-multilingual-cased) ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 45/45 [00:00<00:00, 2047.89 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 714.41 examples/s]
 14%|█▍        | 10/69 [00:11<00:54,  1.09it/s]

{'loss': 0.8712, 'grad_norm': 2.7913451194763184, 'learning_rate': 4.27536231884058e-05, 'epoch': 0.43}


 29%|██▉       | 20/69 [00:19<00:43,  1.12it/s]

{'loss': 0.6747, 'grad_norm': 1.3758013248443604, 'learning_rate': 3.5507246376811596e-05, 'epoch': 0.87}


 33%|███▎      | 23/69 [00:22<00:40,  1.15it/s]
 33%|███▎      | 23/69 [00:22<00:40,  1.15it/s]

{'eval_loss': 0.5708490014076233, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8545454545454545, 'eval_runtime': 0.205, 'eval_samples_per_second': 24.39, 'eval_steps_per_second': 14.634, 'epoch': 1.0}


 43%|████▎     | 30/69 [00:28<00:34,  1.14it/s]

{'loss': 0.6208, 'grad_norm': 1.2737773656845093, 'learning_rate': 2.826086956521739e-05, 'epoch': 1.3}


 58%|█████▊    | 40/69 [00:38<00:26,  1.09it/s]

{'loss': 0.4, 'grad_norm': 1.6577433347702026, 'learning_rate': 2.101449275362319e-05, 'epoch': 1.74}


 67%|██████▋   | 46/69 [00:43<00:19,  1.15it/s]
 67%|██████▋   | 46/69 [00:43<00:19,  1.15it/s]

{'eval_loss': 0.2568841576576233, 'eval_precision': 0.5555555555555556, 'eval_recall': 0.3333333333333333, 'eval_f1': 0.4166666666666667, 'eval_accuracy': 0.9318181818181818, 'eval_runtime': 0.207, 'eval_samples_per_second': 24.157, 'eval_steps_per_second': 14.494, 'epoch': 2.0}


 72%|███████▏  | 50/69 [00:46<00:16,  1.16it/s]

{'loss': 0.2985, 'grad_norm': 3.6539924144744873, 'learning_rate': 1.3768115942028985e-05, 'epoch': 2.17}


 87%|████████▋ | 60/69 [00:56<00:08,  1.10it/s]

{'loss': 0.2854, 'grad_norm': 5.608894348144531, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.61}



100%|██████████| 69/69 [01:06<00:00,  1.04it/s]


{'eval_loss': 0.21991176903247833, 'eval_precision': 0.75, 'eval_recall': 0.4, 'eval_f1': 0.5217391304347827, 'eval_accuracy': 0.9363636363636364, 'eval_runtime': 0.161, 'eval_samples_per_second': 31.056, 'eval_steps_per_second': 18.634, 'epoch': 3.0}
{'train_runtime': 66.389, 'train_samples_per_second': 2.033, 'train_steps_per_second': 1.039, 'train_loss': 0.48372066884801007, 'epoch': 3.0}


100%|██████████| 3/3 [00:00<00:00, 23.81it/s]



=== Fine-tuning mBERT (bert-base-multilingual-cased) ===


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 45/45 [00:00<00:00, 2999.79 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 833.13 examples/s]
 14%|█▍        | 10/69 [00:16<01:22,  1.40s/it]

{'loss': 0.8133, 'grad_norm': 3.134209394454956, 'learning_rate': 4.27536231884058e-05, 'epoch': 0.43}


 29%|██▉       | 20/69 [00:30<01:04,  1.32s/it]

{'loss': 0.5965, 'grad_norm': 3.8075315952301025, 'learning_rate': 3.5507246376811596e-05, 'epoch': 0.87}


 33%|███▎      | 23/69 [00:34<00:58,  1.27s/it]
 33%|███▎      | 23/69 [00:34<00:58,  1.27s/it]

{'eval_loss': 0.4556332528591156, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8727272727272727, 'eval_runtime': 0.35, 'eval_samples_per_second': 14.286, 'eval_steps_per_second': 8.571, 'epoch': 1.0}


 43%|████▎     | 30/69 [00:43<00:49,  1.28s/it]

{'loss': 0.4955, 'grad_norm': 1.8427023887634277, 'learning_rate': 2.826086956521739e-05, 'epoch': 1.3}


 58%|█████▊    | 40/69 [00:57<00:39,  1.36s/it]

{'loss': 0.3639, 'grad_norm': 1.588943362236023, 'learning_rate': 2.101449275362319e-05, 'epoch': 1.74}


 67%|██████▋   | 46/69 [01:04<00:29,  1.29s/it]
 67%|██████▋   | 46/69 [01:05<00:29,  1.29s/it]

{'eval_loss': 0.17030641436576843, 'eval_precision': 0.5384615384615384, 'eval_recall': 0.4666666666666667, 'eval_f1': 0.5, 'eval_accuracy': 0.95, 'eval_runtime': 0.363, 'eval_samples_per_second': 13.774, 'eval_steps_per_second': 8.265, 'epoch': 2.0}


 72%|███████▏  | 50/69 [01:10<00:25,  1.36s/it]

{'loss': 0.2401, 'grad_norm': 2.0008692741394043, 'learning_rate': 1.3768115942028985e-05, 'epoch': 2.17}


 87%|████████▋ | 60/69 [01:24<00:12,  1.34s/it]

{'loss': 0.245, 'grad_norm': 6.0889387130737305, 'learning_rate': 6.521739130434783e-06, 'epoch': 2.61}



100%|██████████| 69/69 [01:39<00:00,  1.44s/it]


{'eval_loss': 0.11059578508138657, 'eval_precision': 0.9090909090909091, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.7692307692307692, 'eval_accuracy': 0.9727272727272728, 'eval_runtime': 0.327, 'eval_samples_per_second': 15.291, 'eval_steps_per_second': 9.174, 'epoch': 3.0}
{'train_runtime': 99.2303, 'train_samples_per_second': 1.36, 'train_steps_per_second': 0.695, 'train_loss': 0.41962648992953094, 'epoch': 3.0}


100%|██████████| 3/3 [00:00<00:00, 13.57it/s]



Model: XLM-Roberta
Precision: 0.450
Recall:    0.600
F1-score:  0.514
Accuracy:  0.968

Model: DistilBERT
Precision: 0.750
Recall:    0.400
F1-score:  0.522
Accuracy:  0.936

Model: mBERT
Precision: 0.909
Recall:    0.667
F1-score:  0.769
Accuracy:  0.973
