In [None]:
!pip install --no-cache-dir transformers==4.41.2

In [None]:
!pip install transformers datasets seqeval

read coNLL file

In [24]:
def read_conll_file(filepath):
    sentences = []
    labels = []
    with open(filepath, 'r', encoding='utf-8') as f:
        sentence = []
        label = []
        for line in f:
            if line.strip() == "":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                splits = line.strip().split()
                if len(splits) == 2:
                    token, tag = splits
                    sentence.append(token)
                    label.append(tag)
        if sentence:  # Catch the last sentence if file doesn’t end in newline
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

sentences, ner_labels = read_conll_file("amharic_ner_dataset.conll")

# Preview
for i in range(2):
    print("Sentence:", sentences[i])
    print("Labels  :", ner_labels[i])
    print()


Sentence: ['3pcs', 'silicon', 'brush', 'spatulas', 'እስከ', '260°c', 'ሙቀት', 'መቆቆም', 'የሚችል', 'ዋጋ-550ብር', 'አድራሻ', 'ቁ.1', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', 'ቁ.2', 'ለቡ', 'መዳህኒዓለም', 'ቤተ/ክርስቲያን', 'ፊት', 'ለፊት', '#ዛም_ሞል', '2ኛ', 'ፎቅ', 'ቢሮ', 'ቁጥር.214', 'ለቡ', 'ቅርንጫፍ0973611819', '0909522840', '0923350054', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን', 'https://t.me/Shageronlinestore']
Labels  : ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'O', 'O', 'O', 'O', 'O', 'I-PRICE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: ['Mandoline', 'Slicer', 'ጊዜ', 'ቆጣቢ', 'ስላይስ', 'ማድረጊያ', 'ለእጅ', 'ሴፍቲ', 'ተመራጭ', 'ለድንች', 'ለካሮትና', 'ሌሎች', 'አታክልቶች', 'ተመራጭ', 'ጥራት', 'ያለው', 'ዕቃ', 'ዋጋ፦', '1,200', 'ብር', 'አድራሻ', 'ቁ.1', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 

tokenizer from Hugging Face's transformers library

In [25]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

following function tokenizes input sentences (splitting words into subwords if needed) and aligns the corresponding NER labels with each token, ensuring special tokens are ignored and subwords inherit or adapt labels from their original words.

In [10]:
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True, return_tensors="pt")

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                # For subwords, assign the same label or 'O'
                label_ids.append(label2id[label[word_idx]])  # Or optionally just label2id["O"]
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    return tokenized_inputs, aligned_labels


In [16]:
for label_seq in ner_labels:
    for i, tag in enumerate(label_seq):
        if tag == "I-PRODUC":
            label_seq[i] = "I-PRODUCT"

tokenized_inputs, aligned_labels = tokenize_and_align_labels(sentences, ner_labels)
print("Tokens:", tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0]))
print("Aligned Labels:", aligned_labels[0])
print("Label Meaning:", [id2label[id] if id != -100 else "IGNORED" for id in aligned_labels[0]])


Tokens: ['<s>', '▁3', 'pc', 's', '▁si', 'li', 'con', '▁brush', '▁spa', 'tul', 'as', '▁እስከ', '▁2', '60°', 'c', '▁ሙ', 'ቀት', '▁መቆ', 'ቆም', '▁የሚችል', '▁ዋጋ', '-', '550', 'ብር', '▁አድራሻ', '▁ቁ', '.', '1', '▁ስ', 'ሪ', '▁', 'ኤም', '▁ሲ', 'ቲ', '▁ሞ', 'ል', '▁', 'ሁለተኛ', '▁', 'ፎ', 'ቅ', '▁ቢሮ', '▁ቁ', '.', '▁SL', '-05', 'A', '(', 'ከ', '▁ሊ', 'ፍ', 'ቱ', '▁ፊት', '▁ለ', '▁ፊት', ')', '▁ቁ', '.', '2', '▁ለ', 'ቡ', '▁መ', 'ዳ', 'ህ', 'ኒ', 'ዓለም', '▁ቤተ', '/', 'ክር', 'ስቲ', 'ያን', '▁ፊት', '▁ለ', 'ፊት', '▁#', 'ዛ', 'ም', '_', 'ሞ', 'ል', '▁2', 'ኛ', '▁', 'ፎ', 'ቅ', '▁ቢሮ', '▁ቁጥር', '.', '2', '14', '▁ለ', 'ቡ', '▁ቅር', 'ን', 'ጫ', 'ፍ', '09', '736', '118', '19', '▁', '090', '95', '228', '40', '▁09', '233', '500', '54', '▁በ', 'Tele', 'gram', '▁ለማ', 'ዘ', 'ዝ', '▁ይ', 'ጠቀሙ', '▁@', 'sha', 'ger', '_', 'online', 'store', '▁ለተ', 'ጨ', 'ማሪ', '▁ማብራሪያ', '▁የ', 'ቴ', 'ሌ', 'ግራ', 'ም', '▁ገ', 'ፃ', 'ችን', '▁https', '://', 't', '.', 'me', '/', 'Sha', 'ger', 'online', 'store', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

prepare the data in the standard format used by Hugging Face's Trainer for training and evaluating NER models.

In [17]:
from datasets import Dataset

data = {
    "input_ids": tokenized_inputs["input_ids"],
    "attention_mask": tokenized_inputs["attention_mask"],
    "labels": aligned_labels
}

dataset = Dataset.from_dict(data)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
val_dataset = dataset['test']


initialize a transformer model for NER tasks using Hugging Face's transformers library.

In [18]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


set up the training configuration for fine-tuning the NER model

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)




In [None]:
!pip install -U accelerate


compute_metrics function

In [20]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_preds = []

    for pred, label in zip(predictions, labels):
        for p_i, l_i in zip(pred, label):
            if l_i != -100:
                true_preds.append(p_i)
                true_labels.append(l_i)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_preds, average='weighted')
    acc = accuracy_score(true_labels, true_preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


Create the Trainer object

In [21]:
from transformers import Trainer
print("✅ Trainer import success")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


✅ Trainer import success


In [22]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
trainer.train()
#train model

[34m[1mwandb[0m: Currently logged in as: [33mmijuualemu00[0m ([33mmijuualemu00-private[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3624,0.266951,0.890871,0.81307,0.890871,0.843714
2,0.1284,0.105772,0.96521,0.967434,0.96521,0.961046


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3624,0.266951,0.890871,0.81307,0.890871,0.843714
2,0.1284,0.105772,0.96521,0.967434,0.96521,0.961046
3,0.0653,0.062508,0.979681,0.980509,0.979681,0.979719
4,0.0444,0.055485,0.982753,0.982959,0.982753,0.982716
5,0.0401,0.054912,0.982654,0.982792,0.982654,0.982637


TrainOutput(global_step=160, training_loss=0.20182167124003172, metrics={'train_runtime': 5339.1435, 'train_samples_per_second': 0.24, 'train_steps_per_second': 0.03, 'total_flos': 179649640896000.0, 'train_loss': 0.20182167124003172, 'epoch': 5.0})

save model

In [28]:
trainer.save_model("amharic_ner_model")
tokenizer.save_pretrained("amharic_ner_model")

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Copy the model to Drive
!cp -r amharic_ner_model /content/drive/MyDrive/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!huggingface-cli login


import model tokenizer

In [38]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "distilbert-base-multilingual-cased" # Public model fine-tuned for NER

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
from datasets import Dataset
import os

def read_conll(filepath):
    tokens = []
    tags = []
    all_sentences = []

    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                if tokens:
                    all_sentences.append({"tokens": tokens, "ner_tags": tags})
                    tokens = []
                    tags = []
                continue
            splits = line.split()
            tokens.append(splits[0])
            tags.append(splits[-1])  # assuming last column is the NER tag

    return all_sentences

# Load train/val files
train_data = read_conll("amharic_ner_dataset.conll")
val_data = read_conll("amharic_ner_dataset.conll")

# Convert tags to IDs (if needed)
unique_labels = sorted(list(set(tag for sentence in train_data for tag in sentence["ner_tags"])))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map tags to IDs
def encode_labels(examples):
    examples["ner_tags"] = [label2id[tag] for tag in examples["ner_tags"]]
    return examples

train_dataset = Dataset.from_list(train_data).map(encode_labels)
val_dataset = Dataset.from_list(val_data).map(encode_labels)


Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

In [40]:

# Function to tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],  # Tokenized words
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=128
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Padding tokens are ignored
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # New word, take the label
            else:
                label_ids.append(-100)  # For subword tokens, ignore
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment function
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

In [41]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # where to save the results and model checkpoints
    evaluation_strategy="epoch",  # Evaluate every epoch
    learning_rate=2e-5,  # learning rate
    per_device_train_batch_size=8,  # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    num_train_epochs=3,  # number of epochs
    weight_decay=0.01,  # weight decay for regularization
    logging_dir="./logs",  # where to save the logs
    logging_steps=10,  # log every 10 steps
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [47]:
from transformers import Trainer, TrainingArguments

# Initialize the model (make sure it's the correct model you're using)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),  # number of labels you have
    id2label=id2label,
    label2id=label2id
)

# Initialize Trainer with the necessary arguments
trainer = Trainer(
    model=model,  # the model we are training
    args=training_args,  # the training arguments
    train_dataset=tokenized_train,  # our tokenized train dataset
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics# our tokenized validation dataset
)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2911,0.240295,0.930247,0.92899,0.930247,0.917719


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2911,0.240295,0.930247,0.92899,0.930247,0.917719
2,0.1697,0.141605,0.962061,0.961379,0.962061,0.959297
3,0.1394,0.124372,0.965539,0.964915,0.965539,0.962979


TrainOutput(global_step=120, training_loss=0.27172154386838276, metrics={'train_runtime': 1054.303, 'train_samples_per_second': 0.911, 'train_steps_per_second': 0.114, 'total_flos': 31358444912640.0, 'train_loss': 0.27172154386838276, 'epoch': 3.0})

In [49]:
trainer.save_model("amharic_ner_model_distilbert-base-multilingual-cased")
tokenizer.save_pretrained("amharic_ner_model_distilbert-base-multilingual-cased")

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Copy the model to Drive
!cp -r amharic_ner_model /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
