### Clone Repository and Change Directory



In [1]:
!git clone https://github.com/milki93/Amharic-E-commerce-Data-Extractor.git

Cloning into 'Amharic-E-commerce-Data-Extractor'...
remote: Enumerating objects: 18842, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 18842 (delta 19), reused 30 (delta 8), pack-reused 18795 (from 1)[K
Receiving objects: 100% (18842/18842), 106.69 MiB | 9.58 MiB/s, done.
Resolving deltas: 100% (2005/2005), done.
Updating files: 100% (18556/18556), done.


In [2]:
%cd /content/Amharic-E-commerce-Data-Extractor/notebooks

/content/Amharic-E-commerce-Data-Extractor/notebooks


###  Load Dataset Dependencies and Define Labels

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load CoNLL Dataset
from datasets import Dataset, DatasetDict
from collections import Counter
from itertools import chain
import torch

conll_file_path = "/content/Amharic-E-commerce-Data-Extractor/data/labeled_data.conll"
unique_labels = ['O', 'B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}

### CoNLL File Parsing

In [None]:
# Parse CoNLL

def parse_conll_file(file_path):
    data = []
    tokens, ner_tags = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 2:
                    token, label = parts
                    tokens.append(token)
                    ner_tags.append(label)
            else:
                if tokens:
                    data.append({"tokens": tokens, "ner_tags": ner_tags})
                tokens, ner_tags = [], []
        if tokens:
            data.append({"tokens": tokens, "ner_tags": ner_tags})
    return data

if os.path.exists(conll_file_path):
    loaded_data = parse_conll_file(conll_file_path)
    ner_dataset = Dataset.from_list(loaded_data)
    split_dataset = ner_dataset.train_test_split(test_size=0.2, seed=42)
    tokenized_datasets = DatasetDict({"train": split_dataset["train"], "validation": split_dataset["test"]})

### Load Tokenizer

In [None]:
def has_entity_labels(example):
    return any(label != 'O' for label in example['ner_tags'])

filtered_tokenized_datasets = tokenized_datasets.filter(has_entity_labels)

# Load Tokenizer
from transformers import AutoTokenizer
model_checkpoint = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization & Label Alignment
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)
    labels = []
    for i in range(len(examples["tokens"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label = examples["ner_tags"][i]
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                current_label = label[word_idx]
                label_ids.append(label2id.get(f"I-{current_label[2:]}" if current_label.startswith("B-") else current_label, 0))
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

processed_filtered_dataset = filtered_tokenized_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=filtered_tokenized_datasets["train"].column_names
)

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# !pip install torch

### Define Weighted Loss Model

In [None]:
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForTokenClassification

all_labels = list(chain(*tokenized_datasets["train"]["ner_tags"]))
label_counts = Counter(all_labels)
weights = [1.0 / label_counts.get(i, 1) for i in range(len(unique_labels))]
weights = torch.tensor(weights, dtype=torch.float)
weights = weights / weights.sum()

# Define Weighted Model
class WeightedTokenClassifier(nn.Module):
    def __init__(self, model_checkpoint, num_labels, weight_tensor):
        super().__init__()
        self.num_labels = num_labels
        self.base_model = AutoModelForTokenClassification.from_pretrained(
            model_checkpoint,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id
        )
        self.loss_fct = CrossEntropyLoss(weight=weight_tensor, ignore_index=-100)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = None
        if labels is not None:
            loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return {"loss": loss, "logits": logits}

model = WeightedTokenClassifier(model_checkpoint, len(unique_labels), weights)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# !pip install seqeval

### Set Up Training Arguments and Metrics Computation

In [None]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import f1_score, precision_score, recall_score
import numpy as np

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=30,
    report_to="none",
    seed=42,
    disable_tqdm=False
)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_filtered_dataset["train"],
    eval_dataset=processed_filtered_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


### Initialize Trainer and Evaluate Model

In [None]:
# trainer.train()
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

model_save_path = "./fine_tuned_amharic_ner_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

Evaluation Metrics: {'eval_loss': 0.4974081516265869, 'eval_precision': 0.5384615384615384, 'eval_recall': 0.4117647058823529, 'eval_f1': 0.4666666666666667, 'eval_runtime': 11.3111, 'eval_samples_per_second': 0.884, 'eval_steps_per_second': 0.177, 'epoch': 15.0}


('./fine_tuned_amharic_ner_model/tokenizer_config.json',
 './fine_tuned_amharic_ner_model/special_tokens_map.json',
 './fine_tuned_amharic_ner_model/sentencepiece.bpe.model',
 './fine_tuned_amharic_ner_model/added_tokens.json',
 './fine_tuned_amharic_ner_model/tokenizer.json')

### Save Model to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

model.base_model.save_pretrained("/content/drive/MyDrive/amharic_ner_model")
tokenizer.save_pretrained("/content/drive/MyDrive/amharic_ner_model")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/amharic_ner_model/tokenizer_config.json',
 '/content/drive/MyDrive/amharic_ner_model/special_tokens_map.json',
 '/content/drive/MyDrive/amharic_ner_model/sentencepiece.bpe.model',
 '/content/drive/MyDrive/amharic_ner_model/added_tokens.json',
 '/content/drive/MyDrive/amharic_ner_model/tokenizer.json')

In [None]:
# from transformers import AutoModelForTokenClassification, AutoTokenizer

# model_path = "/content/drive/MyDrive/amharic_ner_model"

# model = AutoModelForTokenClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)


### Predict named entities on new, unseen data


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Create a Hugging Face pipeline for token classification
ner_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Test with some example Amharic sentences
test_sentences = [
    "አዲስ አበባ ውስጥ ያማረ ሳምሰንግ ስልክ ገዛሁ።", # "I bought a beautiful Samsung phone in Addis Ababa."
    "የአይፎን 15 ዋጋ 20,000 ብር ነው?", # "Is the iPhone 15 price 20,000 Birr?"
    "በ500 ብር ጫማ መግዛት እፈልጋለሁ።", # "I want to buy shoes for 500 Birr."
    "ኢትዮጵያ ውስጥ ምርጥ ምርት እየፈለግኩ ነው።", # "I am looking for the best product in Ethiopia."
    "ዋጋው 300 ብር ሲሆን በአዲስ አበባ እና በአዳማ መላኪያ ይገኛል", # "The price is 300 Birr and delivery is available in Addis Ababa and Adama."
    "እጅግ በጣም ቆንጆ አይፎን 14 ፕሮ ማክስ በቅናሽ ዋጋ", # "Very beautiful iPhone 14 Pro Max at a discounted price"
    "የአንድሮይድ ስልክ ብራንድ ሳምሰንግ ጋላክሲ A52 በ25,000 ብር" # "Android phone brand Samsung Galaxy A52 for 25,000 Birr"
]

print("\n Model Predictions on New Text")
for sentence in test_sentences:
    print(f"\nSentence: {sentence}")
    predictions = ner_pipeline(sentence)
    if predictions:
        for pred in predictions:
            print(f"  Word: '{pred['word']}' | Entity: {pred['entity_group']} | Score: {pred['score']:.2f}")
    else:
        print("  No entities detected.")

Device set to use cpu



 Model Predictions on New Text

Sentence: አዲስ አበባ ውስጥ ያማረ ሳምሰንግ ስልክ ገዛሁ።
  Word: 'አዲስ አበባ' | Entity: Product | Score: 0.34
  Word: 'ም' | Entity: LOC | Score: 0.53
  Word: 'ንግ' | Entity: LOC | Score: 0.40

Sentence: የአይፎን 15 ዋጋ 20,000 ብር ነው?
  Word: 'ይ' | Entity: LOC | Score: 0.44

Sentence: በ500 ብር ጫማ መግዛት እፈልጋለሁ።
  Word: 'ጫ' | Entity: LOC | Score: 0.36

Sentence: ኢትዮጵያ ውስጥ ምርጥ ምርት እየፈለግኩ ነው።
  No entities detected.

Sentence: ዋጋው 300 ብር ሲሆን በአዲስ አበባ እና በአዳማ መላኪያ ይገኛል
  Word: '300' | Entity: PRICE | Score: 0.30

Sentence: እጅግ በጣም ቆንጆ አይፎን 14 ፕሮ ማክስ በቅናሽ ዋጋ
  Word: 'በጣም ቆንጆ አይፎን 14 ፕሮ ማክስ' | Entity: LOC | Score: 0.63

Sentence: የአንድሮይድ ስልክ ብራንድ ሳምሰንግ ጋላክሲ A52 በ25,000 ብር
  Word: 'የአንድሮይድ ስልክ ብራንድ ሳምሰንግ ጋላክሲ A52' | Entity: LOC | Score: 0.64


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
mv NER_model.ipynb /content/Amharic-E-commerce-Data-Extractor/notebooks

mv: cannot stat 'NER_model.ipynb': No such file or directory


In [None]:
!cp "/content/drive/My Drive/Colab Notebooks/NER_model.ipynb" /contentrep
