In [1]:
!git clone https://github.com/milki93/Amharic-E-commerce-Data-Extractor.git

fatal: destination path 'Amharic-E-commerce-Data-Extractor' already exists and is not an empty directory.


In [2]:
%cd /content/Amharic-E-commerce-Data-Extractor/notebooks

/content/Amharic-E-commerce-Data-Extractor/notebooks


### Load Dataset and Parse CoNLL file

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load CoNLL Dataset
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from collections import defaultdict

conll_file_path = "/content/Amharic-E-commerce-Data-Extractor/data/labeled_data.conll"
unique_labels = ['O', 'B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']

In [4]:
def parse_conll_file(file_path):
    data = []
    tokens, ner_tags = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 2:
                    token, label = parts
                    tokens.append(token)
                    ner_tags.append(label)
                else:
                    if tokens:
                        data.append({"tokens": tokens, "ner_tags": ner_tags})
                    tokens, ner_tags = [], []
            else:
                if tokens:
                    data.append({"tokens": tokens, "ner_tags": ner_tags})
                tokens, ner_tags = [], []
        if tokens:
            data.append({"tokens": tokens, "ner_tags": ner_tags})
    return data

if os.path.exists(conll_file_path):
    loaded_data = parse_conll_file(conll_file_path)
    ner_dataset = Dataset.from_list(loaded_data)
    ner_dataset = ner_dataset.filter(lambda example: len(example['tokens']) > 0)
    split_dataset = ner_dataset.train_test_split(test_size=0.2, seed=42)
    tokenized_datasets = DatasetDict({"train": split_dataset["train"], "validation": split_dataset["test"]})
    print(f"Train size: {len(tokenized_datasets['train'])}")
    print(f"Validation size: {len(tokenized_datasets['validation'])}")
else:
    raise FileNotFoundError(f"CoNLL file not found at: {conll_file_path}. Please ensure it exists.")


Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Train size: 40
Validation size: 10


### Labels and Tokenization Function

In [5]:
unique_ner_tags = set()
for tags in ner_dataset['ner_tags']:
    for tag in tags:
        unique_ner_tags.add(tag)

label_list = sorted(list(unique_ner_tags))
if 'O' in label_list:
    label_list.remove('O')
    label_list.insert(0, 'O')

id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
        padding="max_length" # ADDED THIS LINE to ensure consistent sequence lengths
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
        padding="max_length"
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
# !pip install seqeval

### Metric Computation

In [7]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")


In [8]:
model_checkpoints = [
    "Davlan/xlm-roberta-base-ner-hrl",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased",
]

### Model Training for each

In [14]:
from transformers import  AutoConfig

comparison_results = {}
for checkpoint in model_checkpoints:
    print(f"\n Starting Fine-tuning and Evaluation for: {checkpoint}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        config = AutoConfig.from_pretrained(checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id)
        model = AutoModelForTokenClassification.from_pretrained(checkpoint, config=config, ignore_mismatched_sizes=True)
        print(f"Successfully loaded tokenizer and model for {checkpoint}")
    except Exception as e:
        print(f"Error loading model or tokenizer for {checkpoint}: {e}")
        print("Skipping this model and moving to the next one.")
        continue

    print(f"Tokenizing dataset for {checkpoint}...")
    current_tokenized_datasets = tokenized_datasets.map(
        lambda examples: tokenize_and_align_labels(examples, tokenizer),
        batched=True,
        load_from_cache_file=False
    )
    print("Dataset tokenization complete.")

    output_dir_name = checkpoint.replace('/', '_').replace('-', '_').lower()
    output_dir = f"./results/{output_dir_name}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=1,
        logging_dir="./logs",
        logging_steps=10,
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=current_tokenized_datasets["train"],
        eval_dataset=current_tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print(f"Training {checkpoint}...")
    train_start_time = pd.Timestamp.now()
    trainer.train()
    train_end_time = pd.Timestamp.now()
    training_duration = (train_end_time - train_start_time).total_seconds() / 60
    print(f"Training for {checkpoint} completed in {training_duration:.2f} minutes.")

    print(f"Evaluating {checkpoint}...")
    eval_results = trainer.evaluate()
    print(f"Evaluation Results for {checkpoint}: {eval_results}")

    comparison_results[checkpoint] = {
        "F1-score": eval_results.get("eval_f1", 0.0),
        "Precision": eval_results.get("eval_precision", 0.0),
        "Recall": eval_results.get("eval_recall", 0.0),
        "Accuracy": eval_results.get("eval_accuracy", 0.0),
        "Training Duration (min)": training_duration,
        "Eval Runtime (s)": eval_results.get("eval_runtime", 0.0)
    }

    final_model_save_path = f"./final_models/{output_dir_name}"
    trainer.save_model(final_model_save_path)
    print(f"Best model for {checkpoint} saved to {final_model_save_path}")


 Starting Fine-tuning and Evaluation for: Davlan/xlm-roberta-base-ner-hrl


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-ner-hrl and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded tokenizer and model for Davlan/xlm-roberta-base-ner-hrl
Tokenizing dataset for Davlan/xlm-roberta-base-ner-hrl...


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset tokenization complete.
Training Davlan/xlm-roberta-base-ner-hrl...


  trainer = Trainer(


Step,Training Loss
10,1.6548
20,1.0476
30,0.8286


Training for Davlan/xlm-roberta-base-ner-hrl completed in 16.73 minutes.
Evaluating Davlan/xlm-roberta-base-ner-hrl...


Evaluation Results for Davlan/xlm-roberta-base-ner-hrl: {'eval_loss': 0.9116111993789673, 'eval_precision': 0.1, 'eval_recall': 0.0392156862745098, 'eval_f1': 0.05633802816901408, 'eval_accuracy': 0.703971119133574, 'eval_runtime': 18.127, 'eval_samples_per_second': 0.552, 'eval_steps_per_second': 0.165, 'epoch': 3.0}
Best model for Davlan/xlm-roberta-base-ner-hrl saved to ./final_models/davlan_xlm_roberta_base_ner_hrl

 Starting Fine-tuning and Evaluation for: distilbert-base-multilingual-cased


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded tokenizer and model for distilbert-base-multilingual-cased
Tokenizing dataset for distilbert-base-multilingual-cased...


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset tokenization complete.
Training distilbert-base-multilingual-cased...


  trainer = Trainer(


Step,Training Loss
10,1.5625
20,1.1757
30,1.0267


Training for distilbert-base-multilingual-cased completed in 7.39 minutes.
Evaluating distilbert-base-multilingual-cased...


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Results for distilbert-base-multilingual-cased: {'eval_loss': 1.2609272003173828, 'eval_precision': 0.09090909090909091, 'eval_recall': 0.0392156862745098, 'eval_f1': 0.05479452054794521, 'eval_accuracy': 0.5324909747292419, 'eval_runtime': 9.0887, 'eval_samples_per_second': 1.1, 'eval_steps_per_second': 0.33, 'epoch': 3.0}
Best model for distilbert-base-multilingual-cased saved to ./final_models/distilbert_base_multilingual_cased

 Starting Fine-tuning and Evaluation for: bert-base-multilingual-cased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded tokenizer and model for bert-base-multilingual-cased
Tokenizing dataset for bert-base-multilingual-cased...


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset tokenization complete.
Training bert-base-multilingual-cased...


  trainer = Trainer(


Step,Training Loss
10,1.4044
20,0.9133
30,0.7075


Training for bert-base-multilingual-cased completed in 13.90 minutes.
Evaluating bert-base-multilingual-cased...


Evaluation Results for bert-base-multilingual-cased: {'eval_loss': 0.9591091871261597, 'eval_precision': 0.375, 'eval_recall': 0.17647058823529413, 'eval_f1': 0.24, 'eval_accuracy': 0.7021660649819494, 'eval_runtime': 16.6156, 'eval_samples_per_second': 0.602, 'eval_steps_per_second': 0.181, 'epoch': 3.0}
Best model for bert-base-multilingual-cased saved to ./final_models/bert_base_multilingual_cased


### Final Model Comparison and Selection

In [18]:
print("FINAL MODEL COMPARISON")
results_df = pd.DataFrame.from_dict(comparison_results, orient='index')
results_df_sorted = results_df.sort_values(by='F1-score', ascending=False)
print(results_df_sorted.to_markdown())
print("\n Best Performing Model (by F1-score)")
best_model_name = results_df_sorted.index[0]
print(f"Recommended Model: {best_model_name}")
print(f"Details:\n{results_df_sorted.iloc[0].to_string()}")
print(" Highest F1-score indicates the best balance of precision and recall.")

FINAL MODEL COMPARISON
|                                    |   F1-score |   Precision |    Recall |   Accuracy |   Training Duration (min) |   Eval Runtime (s) |
|:-----------------------------------|-----------:|------------:|----------:|-----------:|--------------------------:|-------------------:|
| bert-base-multilingual-cased       |  0.24      |   0.375     | 0.176471  |   0.702166 |                  13.9046  |            16.6156 |
| Davlan/xlm-roberta-base-ner-hrl    |  0.056338  |   0.1       | 0.0392157 |   0.703971 |                  16.7278  |            18.127  |
| distilbert-base-multilingual-cased |  0.0547945 |   0.0909091 | 0.0392157 |   0.532491 |                   7.38947 |             9.0887 |

 Best Performing Model (by F1-score)
Recommended Model: bert-base-multilingual-cased
Details:
F1-score                    0.240000
Precision                   0.375000
Recall                      0.176471
Accuracy                    0.702166
Training Duration (min)    13.9046

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
%cd /content/Amharic-E-commerce-Data-Extractor

/content/Amharic-E-commerce-Data-Extractor


In [22]:
!ssh-keygen -t ed25519 -C "milkidida131@gmail.com" -f ~/.ssh/id_ed25519 -N ""

Generating public/private ed25519 key pair.
Created directory '/root/.ssh'.
Your identification has been saved in /root/.ssh/id_ed25519
Your public key has been saved in /root/.ssh/id_ed25519.pub
The key fingerprint is:
SHA256:vA8jlTsAAnz0UkTEGyLeFdk1oJ1F6A7Em41dVRw71lk milkidida131@gmail.com
The key's randomart image is:
+--[ED25519 256]--+
|o ..**+.==..oo. E|
| + ooBooo..  .o o|
|. =.=oXo.    + o |
| . o.B = .  . .  |
|      + S        |
|       + o       |
|      . *        |
|       . =       |
|          .      |
+----[SHA256]-----+


In [26]:
# !mkdir -p ~/.ssh
# !eval "$(ssh-agent -s)" && ssh-add ~/.ssh/id_ed25519

In [27]:
# !mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts

In [28]:
# !cat ~/.ssh/id_ed25519.pub

In [30]:
# !ssh -T git@github.com

In [32]:
!git remote set-url origin git@github.com:milki93/Amharic-E-commerce-Data-Extractor.git

In [33]:
!git checkout -b task-4

Switched to a new branch 'task-4'


In [None]:
!cp "/content/drive/My Drive/Colab Notebooks/NER_model.ipynb" /content/Amharic-E-commerce-Data-Extractor/notebooks