## Install necessary libraries

In [None]:
!pip install transformers datasets accelerate seqeval -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.3 MB/s[0m eta [36m

In [None]:
import os
import sys
import pandas as pd
from pathlib import Path
from typing import List, Dict

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [None]:
# Adjust sys.path to allow imports from src/data_labeling
# In Colab, you might need to adjust this path based on where you upload the conll_parser.py
# If conll_parser.py is directly in /content, you can simplify the import.
# Assuming you upload conll_parser.py into /content/src/data_labeling/
project_root = Path('/content') # Assuming your project root is /content in Colab
sys.path.insert(0, str(project_root))

## Reads a CoNLL formatted file and parses it into a list of sentences.

In [None]:
# Import from your conll_parser
#from conll_parser import read_conll

In [None]:
import os
import re # Import re for regex splitting
from typing import List, Dict

def read_conll(file_path: str) -> List[List[Dict[str, str]]]:
    """
    Read a CoNLL formatted file and parse it into a list of sentences.

    Every non-blank line must hold exactly two whitespace-separated fields
    (token, then label); any run of spaces or tabs is accepted as the
    delimiter. One or more blank lines end the current sentence.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    dropped so that it does not become part of the first token.

    Args:
        file_path (str): The path to the CoNLL formatted text file.

    Returns:
        List[List[Dict[str, str]]]: A list of sentences, where each sentence
                                     is a list of {'text': token, 'label': label} dictionaries.
                                     Returns an empty list if the file is empty or not found.
    Raises:
        ValueError: If a non-blank line does not contain exactly two parts
                    (token and label) after splitting by whitespace.
    """
    if not os.path.exists(file_path):
        print(f"Warning: CoNLL file not found at {file_path}. Returning empty list.")
        return []

    sentences: List[List[Dict[str, str]]] = []
    current_sentence: List[Dict[str, str]] = []

    # 'utf-8-sig' reads BOM-less UTF-8 exactly like 'utf-8', but strips a
    # leading BOM instead of leaving U+FEFF attached to the first token.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()  # Remove leading/trailing whitespace including newlines
            if not line:  # Blank line indicates end of a sentence
                if current_sentence:  # Consecutive blank lines must not create empty sentences
                    sentences.append(current_sentence)
                    current_sentence = []
                continue

            # Split on any run of whitespace so both tab- and space-delimited files parse
            parts = re.split(r'\s+', line)
            if len(parts) != 2:
                raise ValueError(
                    f"Malformed CoNLL line at {file_path}:{line_num}. "
                    f"Expected 'token\\tlabel' or 'token  label' (any whitespace delimiter), got '{line}'"
                )
            token, label = parts
            current_sentence.append({'text': token, 'label': label})

    # Flush the last sentence if the file doesn't end with a blank line
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

def write_conll(data: List[List[Dict[str, str]]], file_path: str) -> None:
    """
    Serialize sentences to a CoNLL formatted text file.

    Each token becomes one line holding the token text, a tab character and
    the label. A blank line is written after every sentence, including the
    last one. Missing parent directories of the target path are created.

    Args:
        data (List[List[Dict[str, str]]]): Sentences to write, in the format
                                            [[{'text': token, 'label': label}, ...], ...].
        file_path (str): The path to the output CoNLL formatted text file.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has no directory component, so there is nothing to create
    if parent_dir and not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    with open(file_path, 'w', encoding='utf-8') as out:
        for sent in data:
            # Output is always tab-delimited, regardless of how the input was delimited
            out.writelines(f"{tok['text']}\t{tok['label']}\n" for tok in sent)
            out.write("\n")  # Sentence separator



## 2. Configuration

In [None]:
# Define paths and model parameters
LABELED_DATA_PATH = '/content/data/labeled/01_labeled_telegram_product_price_location.txt' # Adjust if you manually corrected a different file
OUTPUT_MODEL_DIR = './fine_tuned_ner_model'
# Base checkpoint to fine-tune. Other candidates to try are a tiny Amharic BERT and
# AfroXLM-R (for example 'Davlan/bert-tiny-amharic' or 'Davlan/afro-xlmr-large').
# NOTE(review): those Hub IDs are suggestions only -- confirm they exist on the
# Hugging Face Hub before switching MODEL_NAME.
MODEL_NAME = "xlm-roberta-base"

# Ensure the parent directory for labeled data exists in Colab
os.makedirs(Path(LABELED_DATA_PATH).parent, exist_ok=True)
print(f"Using model: {MODEL_NAME}")
print(f"Loading data from: {LABELED_DATA_PATH}")


Using model: xlm-roberta-base
Loading data from: /content/data/labeled/01_labeled_telegram_product_price_location.txt


## 3. Load the labeled dataset

In [None]:
# Parse the labeled CoNLL file with the custom reader
raw_data = read_conll(LABELED_DATA_PATH)

if not raw_data:
    raise ValueError(f"No data loaded from {LABELED_DATA_PATH}. Please ensure the file exists and is correctly formatted.")

# Collect every distinct label string; sorting makes the label -> ID assignment deterministic
all_labels = sorted({item['label'] for sentence in raw_data for item in sentence})
label_to_id = {label: idx for idx, label in enumerate(all_labels)}
id_to_label = dict(enumerate(all_labels))

print(f"Detected labels: {all_labels}")
print(f"Label to ID mapping: {label_to_id}")

# Build the parallel per-sentence lists (tokens, label IDs) that Dataset.from_dict expects
valid_tokens = []
valid_ner_tags = []

for sentence_data in raw_data:
    if not sentence_data:
        # An empty sentence would yield empty token/tag lists; leave it out
        continue
    valid_tokens.append([item['text'] for item in sentence_data])
    valid_ner_tags.append([label_to_id[item['label']] for item in sentence_data])

hf_dataset_format = {
    "tokens": valid_tokens,
    "ner_tags": valid_ner_tags
}

dataset = Dataset.from_dict(hf_dataset_format)

print(f"Total samples: {len(dataset)}")

# Hold out 20% for validation when there is enough data to split
if len(dataset) >= 2:
    train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
else:
    print("Warning: Dataset has less than 2 samples. Skipping train/test split. All data used for training.")
    train_dataset = dataset
    eval_dataset = Dataset.from_dict({"tokens": [], "ner_tags": []})  # Empty placeholder eval set


print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
if len(train_dataset) > 0:  # Show one example as a sanity check
    first_train_example = train_dataset[0]
    print(f"First training example tokens: {first_train_example['tokens']}")
    print(f"First training example NER tags (IDs): {first_train_example['ner_tags']}")
    print(f"First training example NER tags (Labels): {[id_to_label[tag_id] for tag_id in first_train_example['ner_tags']]}")


Detected labels: ['B-CONTACT_INFO', 'B-LOC', 'B-PRICE', 'B-PRODUCT', 'I-LOC', 'I-LOCገ', 'I-PRICE', 'I-PRODUCT', 'O']
Label to ID mapping: {'B-CONTACT_INFO': 0, 'B-LOC': 1, 'B-PRICE': 2, 'B-PRODUCT': 3, 'I-LOC': 4, 'I-LOCገ': 5, 'I-PRICE': 6, 'I-PRODUCT': 7, 'O': 8}
Total samples: 3257
Train samples: 2605
Eval samples: 652
First training example tokens: ['Spring', 'Slicer', 'ጊዜ', 'ቆጣቢ', 'ስላይስ', 'ማድረጊያ', 'ለእጅ', 'ሴፍቲ', 'ተመራጭ', 'ለድንች', 'ለካሮትና', 'ሌሎች', 'አታክልቶች', 'ተመራጭ', 'ጥራት', 'ያለው', 'ዕቃ', 'ዋጋ፦', '1,200', 'ብር', 'አድራሻ', 'መገናኛ', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', '0909522840', '0923350054', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን', 'https://t.me/Shageronlinestore']
First training example NER tags (IDs): [3, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 6, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8]
First training example NER tags (Labels): ['B-PRODUCT', 'I

## 4. Tokenize the data and align labels

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def align_labels_with_tokens(examples):
    """
    Tokenize a batch of pre-split sentences and build per-sub-token labels.

    Uses the module-level `tokenizer`. For every word only its first
    sub-token receives the word's label ID; special tokens and the remaining
    sub-tokens of a word receive -100 so they are ignored by the loss.

    Args:
        examples: A batch with "tokens" (list of word lists) and "ner_tags"
                  (list of label-ID lists, one ID per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )

    aligned = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            # A sub-token is labeled only when it belongs to a word (not a special
            # token) and is the first piece of that word.
            opens_word = word is not None and word != last_word
            row.append(word_labels[word] if opens_word else -100)
            last_word = word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Tokenize both splits in batches; align_labels_with_tokens adds the "labels" column
tokenized_train_dataset = train_dataset.map(align_labels_with_tokens, batched=True)
tokenized_eval_dataset = eval_dataset.map(align_labels_with_tokens, batched=True)

# Drop the raw word-level columns; only the tokenizer outputs and "labels" are kept for training
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["tokens", "ner_tags"])
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(["tokens", "ner_tags"])


print("\nTokenization and label alignment complete.")
if len(tokenized_train_dataset) > 0:
    print(f"First tokenized training example input_ids: {tokenized_train_dataset[0]['input_ids']}")
    print(f"First tokenized training example labels: {tokenized_train_dataset[0]['labels']}")
    # Word-level labels are read from the untokenized train_dataset, which still has 'ner_tags'.
    # Guard against an example whose original tag list is empty.
    if len(train_dataset[0]['ner_tags']) > 0:
        print(f"Original labels (from IDs): {[id_to_label[tag_id] if tag_id != -100 else 'N/A' for tag_id in train_dataset[0]['ner_tags']]}")
    else:
        print("Original labels (from IDs): []") # Handle empty original labels
    # Sub-token-level labels; positions set to -100 during alignment are shown as 'N/A'
    print(f"Aligned labels (from IDs): {[id_to_label[tag_id] if tag_id != -100 else 'N/A' for tag_id in tokenized_train_dataset[0]['labels']]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/2605 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]


Tokenization and label alignment complete.
First tokenized training example input_ids: [0, 38026, 103216, 3443, 7982, 63853, 6970, 13942, 118693, 3841, 2095, 4363, 3236, 3376, 185247, 124054, 12261, 6, 25556, 4722, 14623, 2981, 41295, 13799, 2237, 49101, 5519, 6, 196396, 8418, 32763, 46973, 2350, 3348, 54991, 12528, 2981, 41295, 13799, 6, 155706, 23683, 35587, 6550, 80667, 54164, 106, 4, 5955, 35648, 140042, 2370, 60014, 9171, 17930, 7423, 6, 197688, 6021, 14623, 45694, 2202, 6, 115742, 6, 31531, 5653, 137526, 47885, 5, 42135, 87741, 284, 132, 5679, 10085, 4722, 4585, 125631, 2237, 125631, 16, 6, 143842, 8821, 134712, 2839, 3894, 105586, 4283, 12338, 728, 75809, 25561, 13253, 9039, 7872, 2934, 169422, 1374, 2420, 1505, 454, 25607, 45804, 91602, 16333, 124449, 243084, 623, 17680, 10824, 70317, 816, 24163, 29307, 23374, 3975, 696, 18, 5, 282, 64, 40798, 1505, 25607, 45804, 2]
First tokenized training example labels: [-100, 3, 7, -100, 8, 8, -100, -100, 8, -100, -100, 8, -100, -100, -100

### Inspect the structure and content of the tokenized dataset

In [None]:
print("\nInspecting tokenized_train_dataset structure and content:")
print(tokenized_train_dataset)

# Describe each column of the first row: its type and, for lists, length and first element
if len(tokenized_train_dataset) == 0:
    print("\ntokenized_train_dataset is empty.")
else:
    first_example = tokenized_train_dataset[0]
    print("\nFirst example in tokenized_train_dataset:")
    for key, value in first_example.items():
        print(f"  Key: {key}")
        print(f"  Type of value: {type(value)}")
        if not isinstance(value, list):
            print(f"  Value: {value}")
            continue
        print(f"  Length of list: {len(value)}")
        if value:
            head = value[0]
            print(f"  Type of first element: {type(head)}")
            print(f"  First element: {head}")


Inspecting tokenized_train_dataset structure and content:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2605
})

First example in tokenized_train_dataset:
  Key: input_ids
  Type of value: <class 'list'>
  Length of list: 128
  Type of first element: <class 'int'>
  First element: 0
  Key: attention_mask
  Type of value: <class 'list'>
  Length of list: 128
  Type of first element: <class 'int'>
  First element: 1
  Key: labels
  Type of value: <class 'list'>
  Length of list: 128
  Type of first element: <class 'int'>
  First element: -100


## 5. Set up training arguments

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_MODEL_DIR,
    eval_strategy="epoch",  # Evaluate on the validation split at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16, # Adjust based on GPU memory
    per_device_eval_batch_size=16,  # Adjust based on GPU memory
    num_train_epochs=3,             # Start with a small number, increase if needed
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",  # Checkpoint once per epoch, same cadence as evaluation
    load_best_model_at_end=True, # Set to False if eval_strategy is changed to "no"
    metric_for_best_model="f1", # "f1" is one of the keys returned by compute_metrics; has little effect if eval_strategy is "no"
    report_to="none", # Disable reporting to W&B, MLflow etc. for simplicity
    remove_unused_columns=False
)

## 6. Load the pre-trained model

In [None]:
# Load the base checkpoint with a token-classification head sized to the detected label set.
# id2label/label2id are passed so the label names travel with the model; the inference
# cell expects the id2label mapping to be loaded together with the saved model.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(all_labels), id2label=id_to_label, label2id=label_to_id
)

print(f"\nModel '{MODEL_NAME}' loaded with {len(all_labels)} labels.")

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model 'xlm-roberta-base' loaded with 9 labels.


## 7. Define metrics for evaluation

In [None]:
def compute_metrics(p):
    """
    Compute macro-averaged precision, recall and F1 for token classification.

    Positions whose gold label is -100 are excluded from both the gold and
    the predicted sequences before scoring. A classification report is
    printed as a side effect.

    Args:
        p: A (predictions, labels) pair; the class with the highest score along
           axis 2 of predictions is taken as the predicted label ID.

    Returns:
        dict with keys "precision", "recall" and "f1". All three are 0.0 when
        no labeled position is left after filtering.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    def keep_labelled(values, gold_row):
        # Map IDs to label names, skipping positions the gold row marks as ignored
        return [id_to_label[v] for v, g in zip(values, gold_row) if g != -100]

    gold_seqs = [keep_labelled(row, row) for row in gold_ids]
    pred_seqs = [keep_labelled(pred_row, gold_row) for pred_row, gold_row in zip(pred_ids, gold_ids)]

    # Drop sequences left empty by the filter, then trim both sides to a common length
    gold_kept = [seq for seq in gold_seqs if seq]
    pred_kept = [seq for seq in pred_seqs if seq]
    shared = min(len(gold_kept), len(pred_kept))
    gold_final = gold_kept[:shared]
    pred_final = pred_kept[:shared]

    if not (gold_final and pred_final):
        print("Warning: No valid labels or predictions to compute metrics. Returning zeros.")
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    metrics = {
        "precision": precision_score(gold_final, pred_final, average="macro"),
        "recall": recall_score(gold_final, pred_final, average="macro"),
        "f1": f1_score(gold_final, pred_final, average="macro"),  # Macro average, chosen for class imbalance
    }

    # Per-entity breakdown for a detailed view
    print("\nClassification Report:")
    print(classification_report(gold_final, pred_final))

    return metrics

## 8. Initialize the Trainer

In [None]:
from transformers import DataCollatorForTokenClassification

# Initialize the DataCollator: examples were tokenized without padding, so each
# batch is padded at collation time.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases (it
    # emits a warning when the Trainer is constructed); `processing_class` is
    # its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Use the DataCollatorForTokenClassification to handle padding
    data_collator=data_collator
)

print("\nTrainer initialized. Starting fine-tuning...")

  trainer = Trainer(



Trainer initialized. Starting fine-tuning...


## 9. Fine-tune the model

In [None]:
trainer.train()

print("\nFine-tuning complete. Evaluating model...")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3542,0.060432,0.880217,0.91437,0.896685
2,0.035,0.027276,0.95474,0.970698,0.962644
3,0.0252,0.025085,0.960575,0.971514,0.966012



Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1161
         LOC       0.84      0.91      0.87       565
       PRICE       0.94      0.94      0.94       942
     PRODUCT       0.75      0.81      0.78       641

   micro avg       0.90      0.93      0.92      3309
   macro avg       0.88      0.91      0.90      3309
weighted avg       0.90      0.93      0.92      3309


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1161
         LOC       0.96      0.99      0.97       565
       PRICE       0.95      0.97      0.96       942
     PRODUCT       0.91      0.93      0.92       641

   micro avg       0.96      0.97      0.97      3309
   macro avg       0.95      0.97      0.96      3309
weighted avg       0.96      0.97      0.97      3309


Classification Report:
              precision    recall  f1-score   support

CON

## 10. Evaluate the fine-tuned model

In [None]:
# Final evaluation on the held-out split; skipped when no validation data exists
if len(eval_dataset) == 0:
    print("\nSkipping evaluation because the evaluation dataset is empty.")
    print("Please provide a larger dataset to enable meaningful evaluation.")
else:
    results = trainer.evaluate()
    print("\nEvaluation Results:")
    for key, value in results.items():
        print(f"  {key}: {value:.4f}")


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1161
         LOC       0.97      0.98      0.97       565
       PRICE       0.96      0.98      0.97       942
     PRODUCT       0.92      0.93      0.93       641

   micro avg       0.97      0.98      0.97      3309
   macro avg       0.96      0.97      0.97      3309
weighted avg       0.97      0.98      0.97      3309


Evaluation Results:
  eval_loss: 0.0251
  eval_precision: 0.9606
  eval_recall: 0.9715
  eval_f1: 0.9660
  eval_runtime: 9.1243
  eval_samples_per_second: 71.4580
  eval_steps_per_second: 4.4940
  epoch: 3.0000


## 11. Save the model

In [None]:
trainer.save_model(OUTPUT_MODEL_DIR)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR) # Save tokenizer with the model

print(f"\nFine-tuned model and tokenizer saved to: {OUTPUT_MODEL_DIR}")


Fine-tuned model and tokenizer saved to: ./fine_tuned_ner_model


## 12. Inference with the Fine-tuned Model

Now that the model is fine-tuned and saved, you can load it and use it to predict NER tags on new text data.

In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_MODEL_DIR)
loaded_model = AutoModelForTokenClassification.from_pretrained(OUTPUT_MODEL_DIR)

# Create a NER pipeline
# The id2label mapping should be loaded with the model.
# aggregation_strategy="first": during fine-tuning only the first sub-token of each
# word carried a label, so predictions on the remaining sub-tokens are unreliable.
# "first" gives a whole word the tag of its first sub-token, which keeps a single
# word (e.g. a phone number) from being split into several entities.
ner_pipeline = pipeline("ner", model=loaded_model, tokenizer=loaded_tokenizer, aggregation_strategy="first")

print(f"Fine-tuned model loaded from: {OUTPUT_MODEL_DIR}")

Device set to use cuda:0


Fine-tuned model loaded from: ./fine_tuned_ner_model


Now you can test the model on a new sentence.

In [None]:
# Example sentence for inference
# Replace this with the actual text you want to process
text_to_predict = "Dell laptop with 16GB RAM for sale at Bole road, price 25000 ETB, contact +251912345678"

# Perform NER prediction
prediction = ner_pipeline(text_to_predict)

print("\nOriginal Text:")
print(text_to_predict)
print("\nNER Prediction:")
display(prediction)


Original Text:
Dell laptop with 16GB RAM for sale at Bole road, price 25000 ETB, contact +251912345678

NER Prediction:


[{'entity_group': 'PRODUCT',
  'score': np.float32(0.99223346),
  'word': 'Dell laptop with 16GB RAM',
  'start': 0,
  'end': 25},
 {'entity_group': 'PRICE',
  'score': np.float32(0.5511675),
  'word': 'price 25000',
  'start': 49,
  'end': 60},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.9749747),
  'word': '+',
  'start': 74,
  'end': 75},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.9853001),
  'word': '25',
  'start': 75,
  'end': 77},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.9624269),
  'word': '19',
  'start': 77,
  'end': 79},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.46889722),
  'word': '12',
  'start': 79,
  'end': 81},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.43011215),
  'word': '345',
  'start': 81,
  'end': 84},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.39568764),
  'word': '678',
  'start': 84,
  'end': 87}]

In [None]:
# Ensure the parent directory for labeled data exists in Colab
os.makedirs(Path(LABELED_DATA_PATH).parent, exist_ok=True)
# Path to the preprocessed input. It is read below as plain text with one token per
# line and blank lines between sentences, despite the .csv extension.
PREPROCESSED_DATA_PATH = '/content/data/preprocessed/predicted_data_for_labeling.csv' # Example path, CHANGE THIS

# Path of the CSV file the predictions are written to
OUTPUT_PREDICTIONS_CSV_PATH = './predicted_data_for_labeling.csv'

# --- Function to read preprocessed data (assuming one token per line, sentences separated by blank lines) ---
# Adapting read_conll to handle files with only tokens
def read_preprocessed_tokens(file_path: str) -> List[List[str]]:
    """
    Load unlabeled sentences from a one-token-per-line text file.

    Each non-blank line (with surrounding whitespace stripped) is one token;
    one or more blank lines end the current sentence.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        List[List[str]]: One list of token strings per sentence. Empty if the
                         file does not exist (a warning is printed) or holds
                         no tokens.
    """
    if not os.path.exists(file_path):
        print(f"Warning: Preprocessed data file not found at {file_path}. Returning empty list.")
        return []

    grouped: List[List[str]] = []
    pending: List[str] = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if token:
                pending.append(token)
                continue
            # Blank line: close the sentence, ignoring repeated blank lines
            if pending:
                grouped.append(pending)
                pending = []

    # The file may end without a trailing blank line
    if pending:
        grouped.append(pending)

    return grouped

# --- Helper: project predicted entity spans onto whitespace tokens ---
def _bio_labels_for_tokens(tokens, entities):
    """
    Turn character-span entity predictions into one BIO label per token.

    Args:
        tokens: The sentence's tokens. The text given to the pipeline must be
                " ".join(tokens) so that the offsets computed here line up.
        entities: Prediction dicts carrying 'start' and 'end' character
                  offsets plus either 'entity_group' (aggregated output, no
                  B-/I- prefix) or 'entity' (per-token output, already
                  prefixed).

    Returns:
        A list of labels, one per token, defaulting to "O".
    """
    labels = ["O"] * len(tokens)

    # Tokens are joined with single spaces, so each token's character span
    # follows directly from the lengths of the tokens before it.
    spans = []
    cursor = 0
    for token in tokens:
        spans.append((cursor, cursor + len(token)))
        cursor += len(token) + 1  # +1 for the joining space

    for ent in entities:
        group = ent.get('entity_group')
        started = False
        for i, (tok_start, tok_end) in enumerate(spans):
            overlap = min(tok_end, ent['end']) - max(tok_start, ent['start'])
            if overlap <= 0:
                continue
            if group is not None:
                # Aggregated entities have no prefix: the first covered token
                # opens the entity (B-), later ones continue it (I-).
                candidate = f"{'I' if started else 'B'}-{group}"
                started = True
            else:
                candidate = ent['entity']
            if labels[i] == "O":
                labels[i] = candidate
            elif labels[i].startswith("I-") and candidate.startswith("B-"):
                labels[i] = candidate  # A B- tag takes priority over an earlier I- tag
    return labels


# --- Load preprocessed data ---
preprocessed_sentences_tokens = read_preprocessed_tokens(PREPROCESSED_DATA_PATH)

if not preprocessed_sentences_tokens:
    print(f"No preprocessed data loaded from {PREPROCESSED_DATA_PATH}. Skipping prediction and CSV saving.")
else:
    print(f"Loaded {len(preprocessed_sentences_tokens)} sentences from {PREPROCESSED_DATA_PATH}")

    # One row per token, plus a blank separator row between sentences
    csv_data = []

    # --- Perform prediction and structure data for CSV ---
    print("Performing NER prediction on preprocessed data...")
    # Reuses the ner_pipeline created in the inference cell; run that cell first

    for sentence_id, sentence_tokens in enumerate(preprocessed_sentences_tokens):
        # The pipeline takes raw text, so rebuild the sentence from its tokens
        sentence_text = " ".join(sentence_tokens)
        prediction = ner_pipeline(sentence_text)

        predicted_labels = _bio_labels_for_tokens(sentence_tokens, prediction)

        for token, label in zip(sentence_tokens, predicted_labels):
            csv_data.append({
                'sentence_id': sentence_id,
                'token': token,
                'predicted_label': label
            })

        # Blank row after each sentence for readability during manual review
        csv_data.append({
            'sentence_id': sentence_id,  # Associate blank line with the sentence above
            'token': '',
            'predicted_label': ''
        })

    # The separator after the final sentence is not needed
    if csv_data and csv_data[-1]['token'] == '' and csv_data[-1]['predicted_label'] == '':
        csv_data.pop()

    # --- Save to CSV ---
    predicted_df = pd.DataFrame(csv_data)
    predicted_df.to_csv(OUTPUT_PREDICTIONS_CSV_PATH, index=False)

    print(f"\nPredicted labels saved to {OUTPUT_PREDICTIONS_CSV_PATH}")
    print("You can now manually review and correct this CSV to create more labeled data.")

No preprocessed data loaded from /content/data/preprocessed/predicted_data_for_labeling.csv. Skipping prediction and CSV saving.


# **Model Comparison & Selection**
Compare different models and select the best-performing one for the entity extraction task by fine-tuning and evaluating "xlm-roberta-base", "distilbert-base-multilingual-cased", and "bert-base-multilingual-cased" on the provided training and validation datasets.

## Define a function for fine-tuning and evaluation

Define a function that takes a model name as input, fine-tunes the model on the training data, evaluates it on the validation data, and returns the evaluation metrics.


Define the function `finetune_and_evaluate_model`, including tokenization, model loading, trainer setup, training, and evaluation.



In [None]:
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from datasets import Dataset, DatasetDict

def finetune_and_evaluate_model(model_name: str, train_dataset: Dataset, eval_dataset: Dataset, label_to_id: Dict[str, int], id_to_label: Dict[int, str]):
    """
    Fine-tune and evaluate a token classification model on a given dataset.

    The function tokenizes both datasets with the model's own tokenizer, aligns
    the word-level NER tags to the resulting sub-tokens, trains for three epochs
    and returns the metrics reported by ``Trainer.evaluate()``.

    Note:
        ``compute_metrics`` is not a parameter; it is resolved from the
        enclosing notebook scope and must be defined before this is called.

    Args:
        model_name (str): The name of the pre-trained model to use (from Hugging Face Hub).
        train_dataset (Dataset): The training dataset in Hugging Face Dataset format.
            Must provide the columns ``tokens`` and ``ner_tags``.
        eval_dataset (Dataset): The evaluation dataset in Hugging Face Dataset format,
            with the same columns. May be empty, in which case per-epoch
            evaluation and best-model selection are disabled.
        label_to_id (Dict[str, int]): Mapping from label names to IDs.
        id_to_label (Dict[int, str]): Mapping from IDs to label names.

    Returns:
        Dict[str, float]: A dictionary containing the evaluation metrics, or an
        empty dictionary when the evaluation dataset is empty.
    """
    # Local import: only needed to probe the installed Trainer signature below.
    import inspect

    print(f"\n--- Fine-tuning and evaluating model: {model_name} ---")

    # 1. Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer for {model_name} loaded.")

    # 2. Define tokenization and alignment function
    def align_labels_with_tokens_inner(examples):
        """
        Tokenize a batch of pre-split sentences and align labels to sub-tokens.

        Only the first sub-token of each word keeps the word's label; special
        tokens and the remaining sub-tokens of a word are set to -100.
        """
        tokenized_inputs = tokenizer(
            examples["tokens"], is_split_into_words=True, truncation=True
        )
        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    # Special token (no source word)
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # First sub-token of a new word carries the word's label
                    label_ids.append(label[word_idx])
                else:
                    # Continuation sub-token of the same word
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # 3. Apply tokenization and alignment
    print("Tokenizing and aligning labels...")
    tokenized_train_dataset = train_dataset.map(align_labels_with_tokens_inner, batched=True)
    tokenized_eval_dataset = eval_dataset.map(align_labels_with_tokens_inner, batched=True)

    # Remove original columns
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(["tokens", "ner_tags"])
    tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(["tokens", "ner_tags"])
    print("Tokenization and alignment complete.")

    # Per-epoch evaluation and best-model selection both need evaluation data,
    # so decide this before the training arguments are built rather than
    # after training has already run.
    has_eval_data = len(eval_dataset) > 0

    # 4. Load the pre-trained model
    print(f"Loading model: {model_name}...")
    model = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=len(label_to_id), id2label=id_to_label, label2id=label_to_id
    )
    print(f"Model '{model_name}' loaded with {len(label_to_id)} labels.")

    # 5. Define Training Arguments
    # Create a unique output directory for each model
    model_output_dir = f"./fine_tuned_ner_model_{model_name.replace('/', '_')}"
    os.makedirs(model_output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        eval_strategy="epoch" if has_eval_data else "no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f'./logs_{model_name.replace("/", "_")}',
        logging_steps=100,
        save_strategy="epoch",
        load_best_model_at_end=has_eval_data,
        metric_for_best_model="f1" if has_eval_data else None,
        report_to="none",
        remove_unused_columns=False
    )
    print("Training arguments defined.")

    # 6. Initialize Data Collator
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    print("Data collator initialized.")

    # 7. Initialize the Trainer
    # Newer transformers releases renamed Trainer's `tokenizer` argument to
    # `processing_class` and warn on the old name; use whichever keyword the
    # installed version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset if has_eval_data else None,
        compute_metrics=compute_metrics, # Use the compute_metrics function defined earlier
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer}
    )
    print("Trainer initialized.")

    # 8. Fine-tune the model
    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")

    # 9. Evaluate the model
    print("Evaluating model...")
    if has_eval_data:
        results = trainer.evaluate()
        print("\nEvaluation Results:")
        for key, value in results.items():
            try:
                print(f"  {key}: {value:.4f}")
            except (TypeError, ValueError):
                # Non-numeric metric value: print it unformatted instead of crashing
                print(f"  {key}: {value}")
    else:
        print("\nSkipping evaluation because the evaluation dataset is empty.")
        results = {} # Return empty dict if no evaluation
    print("Evaluation complete.")

    return results


## Fine-tune and evaluate xlm-roberta

Use the function created in the previous step to fine-tune and evaluate the "xlm-roberta-base" model.


Call the `finetune_and_evaluate_model` function with the specified model name and datasets, then print the results.



In [None]:
# Fine-tune XLM-RoBERTa on the shared train/validation split and keep its metrics
xlm_roberta_results = finetune_and_evaluate_model(
    "xlm-roberta-base",
    train_dataset,
    eval_dataset,
    label_to_id=label_to_id,
    id_to_label=id_to_label,
)

# Header line followed by the raw metrics dictionary
print("\nEvaluation results for xlm-roberta-base:", xlm_roberta_results, sep="\n")


--- Fine-tuning and evaluating model: xlm-roberta-base ---
Tokenizer for xlm-roberta-base loaded.
Tokenizing and aligning labels...


Map:   0%|          | 0/2605 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Tokenization and alignment complete.
Loading model: xlm-roberta-base...


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Model 'xlm-roberta-base' loaded with 9 labels.
Training arguments defined.
Data collator initialized.
Trainer initialized.
Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3132,0.043693,0.932027,0.950773,0.94119
2,0.0321,0.02765,0.960808,0.972695,0.966712
3,0.0218,0.022885,0.970171,0.978881,0.974498



Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      0.99      1161
         LOC       0.93      0.97      0.95       565
       PRICE       0.95      0.94      0.95       942
     PRODUCT       0.86      0.89      0.87       641

   micro avg       0.94      0.96      0.95      3309
   macro avg       0.93      0.95      0.94      3309
weighted avg       0.94      0.96      0.95      3309


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1161
         LOC       0.97      0.98      0.98       565
       PRICE       0.97      0.98      0.98       942
     PRODUCT       0.92      0.93      0.92       641

   micro avg       0.97      0.98      0.97      3309
   macro avg       0.96      0.97      0.97      3309
weighted avg       0.97      0.98      0.97      3309


Classification Report:
              precision    recall  f1-score   support

CON


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1161
         LOC       0.98      0.99      0.98       565
       PRICE       0.98      0.98      0.98       942
     PRODUCT       0.93      0.94      0.94       641

   micro avg       0.97      0.98      0.98      3309
   macro avg       0.97      0.98      0.97      3309
weighted avg       0.97      0.98      0.98      3309


Evaluation Results:
  eval_loss: 0.0229
  eval_precision: 0.9702
  eval_recall: 0.9789
  eval_f1: 0.9745
  eval_runtime: 9.0579
  eval_samples_per_second: 71.9810
  eval_steps_per_second: 4.5260
  epoch: 3.0000
Evaluation complete.

Evaluation results for xlm-roberta-base:
{'eval_loss': 0.02288529835641384, 'eval_precision': 0.9701709396485287, 'eval_recall': 0.9788812015250464, 'eval_f1': 0.9744977081604629, 'eval_runtime': 9.0579, 'eval_samples_per_second': 71.981, 'eval_steps_per_second': 4.526, 'epoch': 3.0}


## Fine-tune and evaluate distilbert

Use the function created in the first step to fine-tune and evaluate the "distilbert-base-multilingual-cased" model.


**Reasoning**:
Use the previously defined `finetune_and_evaluate_model` function to fine-tune and evaluate the "distilbert-base-multilingual-cased" model on the available training and evaluation datasets and store the results.



In [None]:
# Fine-tune multilingual DistilBERT on the shared train/validation split and keep its metrics
distilbert_results = finetune_and_evaluate_model(
    "distilbert-base-multilingual-cased",
    train_dataset,
    eval_dataset,
    label_to_id=label_to_id,
    id_to_label=id_to_label,
)

# Header line followed by the raw metrics dictionary
print("\nEvaluation results for distilbert-base-multilingual-cased:", distilbert_results, sep="\n")


--- Fine-tuning and evaluating model: distilbert-base-multilingual-cased ---
Tokenizer for distilbert-base-multilingual-cased loaded.
Tokenizing and aligning labels...


Map:   0%|          | 0/2605 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Tokenization and alignment complete.
Loading model: distilbert-base-multilingual-cased...


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Model 'distilbert-base-multilingual-cased' loaded with 9 labels.
Training arguments defined.
Data collator initialized.
Trainer initialized.
Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3746,0.122445,0.760479,0.712813,0.733226
2,0.0974,0.080384,0.860148,0.828742,0.843347
3,0.0807,0.070643,0.895075,0.869314,0.881563



Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1163
         LOC       0.51      0.37      0.43       565
       PRICE       0.83      0.74      0.78       943
     PRODUCT       0.71      0.74      0.72       642

   micro avg       0.82      0.77      0.79      3313
   macro avg       0.76      0.71      0.73      3313
weighted avg       0.81      0.77      0.79      3313


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1163
         LOC       0.76      0.67      0.71       565
       PRICE       0.86      0.80      0.83       943
     PRODUCT       0.83      0.85      0.84       642

   micro avg       0.89      0.86      0.87      3313
   macro avg       0.86      0.83      0.84      3313
weighted avg       0.88      0.86      0.87      3313


Classification Report:
              precision    recall  f1-score   support

CON


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1163
         LOC       0.85      0.78      0.81       565
       PRICE       0.89      0.83      0.86       943
     PRODUCT       0.85      0.87      0.86       642

   micro avg       0.91      0.89      0.90      3313
   macro avg       0.90      0.87      0.88      3313
weighted avg       0.91      0.89      0.90      3313


Evaluation Results:
  eval_loss: 0.0706
  eval_precision: 0.8951
  eval_recall: 0.8693
  eval_f1: 0.8816
  eval_runtime: 3.9093
  eval_samples_per_second: 166.7800
  eval_steps_per_second: 10.4880
  epoch: 3.0000
Evaluation complete.

Evaluation results for distilbert-base-multilingual-cased:
{'eval_loss': 0.0706433355808258, 'eval_precision': 0.89507548876094, 'eval_recall': 0.8693135761245455, 'eval_f1': 0.8815631664488628, 'eval_runtime': 3.9093, 'eval_samples_per_second': 166.78, 'eval_steps_per_second': 10.488, 'epoch': 3.0}


## Fine-tune and evaluate mBERT

Use the function created in the first step to fine-tune and evaluate the "bert-base-multilingual-cased" model.

Use the previously defined `finetune_and_evaluate_model` function to fine-tune and evaluate the "bert-base-multilingual-cased" model on the available training and evaluation datasets and store the results.

In [None]:
# Fine-tune multilingual BERT on the shared train/validation split and keep its metrics
mbert_results = finetune_and_evaluate_model(
    "bert-base-multilingual-cased",
    train_dataset,
    eval_dataset,
    label_to_id=label_to_id,
    id_to_label=id_to_label,
)

# Header line followed by the raw metrics dictionary
print("\nEvaluation results for bert-base-multilingual-cased:", mbert_results, sep="\n")


--- Fine-tuning and evaluating model: bert-base-multilingual-cased ---


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Tokenizer for bert-base-multilingual-cased loaded.
Tokenizing and aligning labels...


Map:   0%|          | 0/2605 [00:00<?, ? examples/s]

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Tokenization and alignment complete.
Loading model: bert-base-multilingual-cased...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Model 'bert-base-multilingual-cased' loaded with 9 labels.
Training arguments defined.
Data collator initialized.
Trainer initialized.
Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3021,0.123756,0.702709,0.724349,0.713197
2,0.0753,0.066392,0.880622,0.898229,0.889198
3,0.057,0.059816,0.896684,0.911673,0.903809



Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      0.99      1163
         LOC       0.34      0.33      0.33       565
       PRICE       0.76      0.80      0.78       943
     PRODUCT       0.72      0.77      0.75       642

   micro avg       0.76      0.78      0.77      3313
   macro avg       0.70      0.72      0.71      3313
weighted avg       0.76      0.78      0.77      3313


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1163
         LOC       0.80      0.85      0.83       565
       PRICE       0.86      0.85      0.85       943
     PRODUCT       0.87      0.89      0.88       642

   micro avg       0.90      0.91      0.90      3313
   macro avg       0.88      0.90      0.89      3313
weighted avg       0.90      0.91      0.90      3313


Classification Report:
              precision    recall  f1-score   support

CON


Classification Report:
              precision    recall  f1-score   support

CONTACT_INFO       0.99      1.00      1.00      1163
         LOC       0.82      0.89      0.85       565
       PRICE       0.90      0.87      0.88       943
     PRODUCT       0.88      0.89      0.88       642

   micro avg       0.91      0.92      0.92      3313
   macro avg       0.90      0.91      0.90      3313
weighted avg       0.91      0.92      0.92      3313


Evaluation Results:
  eval_loss: 0.0598
  eval_precision: 0.8967
  eval_recall: 0.9117
  eval_f1: 0.9038
  eval_runtime: 7.4788
  eval_samples_per_second: 87.1800
  eval_steps_per_second: 5.4820
  epoch: 3.0000
Evaluation complete.

Evaluation results for bert-base-multilingual-cased:
{'eval_loss': 0.05981598421931267, 'eval_precision': 0.8966844241582994, 'eval_recall': 0.9116733387049938, 'eval_f1': 0.9038094590438971, 'eval_runtime': 7.4788, 'eval_samples_per_second': 87.18, 'eval_steps_per_second': 5.482, 'epoch': 3.0}


## 13. Compare Model Results and Select the Best Model

Now that we have fine-tuned and evaluated all three models, we can compare their performance metrics to select the best one for the task. We will primarily look at the F1-score, precision, and recall on the evaluation dataset.

In [None]:
# Collect the evaluation metrics of every fine-tuned model, keyed by model name
model_comparison = {
    "xlm-roberta-base": xlm_roberta_results,
    "distilbert-base-multilingual-cased": distilbert_results,
    "bert-base-multilingual-cased": mbert_results
}

# Print the comparison in a readable format
print("--- Model Comparison Results ---")
for model_name, metrics in model_comparison.items():
    print(f"\nModel: {model_name}")
    if metrics:
        for title, key in (
            ("Eval Loss", "eval_loss"),
            ("Eval Precision", "eval_precision"),
            ("Eval Recall", "eval_recall"),
            ("Eval F1-Score", "eval_f1"),
        ):
            value = metrics.get(key)
            # Apply the numeric format only when the metric exists; formatting
            # the 'N/A' placeholder with ':.4f' would raise ValueError.
            shown = f"{value:.4f}" if value is not None else "N/A"
            print(f"  {title}: {shown}")
    else:
        print("  No evaluation results available.")

# Select the best model based on F1-score (first model wins on a tie)
best_model_name = None
best_f1_score = -1

for model_name, metrics in model_comparison.items():
    if metrics and 'eval_f1' in metrics:
        if metrics['eval_f1'] > best_f1_score:
            best_f1_score = metrics['eval_f1']
            best_model_name = model_name

if best_model_name is None:
    # Every result dict was empty or lacked 'eval_f1'
    print("\nNo model reported an F1-score, so no best model could be selected.")
else:
    print(f"\nBased on F1-score, the best performing model is: {best_model_name}")

# You can add further considerations here, like model size, inference speed, etc.
# For instance, if DistilBERT had a slightly lower F1 but was significantly faster,
# you might choose it depending on your production requirements.

--- Model Comparison Results ---

Model: xlm-roberta-base
  Eval Loss: 0.0229
  Eval Precision: 0.9702
  Eval Recall: 0.9789
  Eval F1-Score: 0.9745

Model: distilbert-base-multilingual-cased
  Eval Loss: 0.0706
  Eval Precision: 0.8951
  Eval Recall: 0.8693
  Eval F1-Score: 0.8816

Model: bert-base-multilingual-cased
  Eval Loss: 0.0598
  Eval Precision: 0.8967
  Eval Recall: 0.9117
  Eval F1-Score: 0.9038

Based on F1-score, the best performing model is: xlm-roberta-base


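Based on the evaluation metrics, the xlm-roberta-base model achieved the highest F1-score (0.9745) on the evaluation dataset, ahead of bert-base-multilingual-cased (0.9038) and distilbert-base-multilingual-cased (0.8816). The other two models are faster at evaluation time, which is consistent with their smaller size or architecture: distilbert-base-multilingual-cased took about 3.9 s and bert-base-multilingual-cased about 7.5 s, versus about 9.1 s for xlm-roberta-base on the same validation set. Even so, xlm-roberta-base demonstrates clearly superior performance on this specific dataset.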

Given the importance of accuracy for the NER task, and assuming the current computational resources can accommodate xlm-roberta-base, it is the best choice for production based on the evaluation results. If computational constraints were more stringent, further analysis on inference speed would be necessary to determine if the performance trade-off with the other models is acceptable.

## 14. Saving the Best Model to Hugging Face Hub

To save the fine-tuned model to the Hugging Face Hub, you need to:

1.  Install the `huggingface_hub` library.
2.  Log in to your Hugging Face account (you'll need an API token).
3.  Push the saved model and tokenizer files to the Hub.

In [None]:
!pip install huggingface_hub -q

Now, log in to the Hugging Face Hub. You will be prompted to enter your token.

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget; enter your access token there
# before running the push_to_hub cell below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Finally, push the saved model and tokenizer to your Hugging Face repository. Replace `"your-username/your-model-name"` with your desired repository name on the Hub.

In [None]:
# Target repository on the Hugging Face Hub, in "<username>/<model-name>" form;
# change it to the repository you want to publish to
repo_name = "michaWorku/amahric_ner_fine_tunning"

# Upload the model first, then the tokenizer, into the same repository
for artifact in (loaded_model, loaded_tokenizer):
    artifact.push_to_hub(repo_name)

print(f"\nModel and tokenizer successfully pushed to https://huggingface.co/{repo_name}")

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]


Model and tokenizer successfully pushed to https://huggingface.co/michaWorku/amahric_ner_fine_tunning


## 15. Loading and Using the Best Model for Inference

Now that we have identified the best model (`xlm-roberta-base`), we can load it and use it to predict NER tags on new text data.

Now you can test the model on a new sentence.

In [None]:
# Sample listing used to try out the NER pipeline; swap in any text you want to analyse
text_to_predict = "Dell laptop with 16GB RAM for sale at Bole road, price 25000 ETB, contact +251912345678"

# Run the NER pipeline on the sample text
prediction = ner_pipeline(text_to_predict)

# Echo the input, then render the predicted entities in the notebook
print("\nOriginal Text:", text_to_predict, sep="\n")
print("\nNER Prediction:")
display(prediction)


Original Text:
Dell laptop with 16GB RAM for sale at Bole road, price 25000 ETB, contact +251912345678

NER Prediction:


[{'entity_group': 'PRODUCT',
  'score': np.float32(0.99223346),
  'word': 'Dell laptop with 16GB RAM',
  'start': 0,
  'end': 25},
 {'entity_group': 'PRICE',
  'score': np.float32(0.5511675),
  'word': 'price 25000',
  'start': 49,
  'end': 60},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.9749747),
  'word': '+',
  'start': 74,
  'end': 75},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.9853001),
  'word': '25',
  'start': 75,
  'end': 77},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.9624269),
  'word': '19',
  'start': 77,
  'end': 79},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.46889722),
  'word': '12',
  'start': 79,
  'end': 81},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.43011215),
  'word': '345',
  'start': 81,
  'end': 84},
 {'entity_group': 'CONTACT_INFO',
  'score': np.float32(0.39568764),
  'word': '678',
  'start': 84,
  'end': 87}]

## Analyzing Per-Label Metrics

Looking at the classification reports generated during the evaluation of each model provides insights into how well each model performed on individual entity types (labels).

Let's summarize the per-label metrics (Precision, Recall, F1-score) for each model based on the previously printed classification reports:

**xlm-roberta-base:**

*   **CONTACT_INFO:** High Precision, Recall, and F1-score (close to 1.00)
*   **LOC:** High Precision, Recall, and F1-score (around 0.98-0.99)
*   **PRICE:** High Precision, Recall, and F1-score (around 0.98)
*   **PRODUCT:** High Precision, Recall, and F1-score (around 0.93-0.94)

**distilbert-base-multilingual-cased:**

*   **CONTACT_INFO:** High Precision, Recall, and F1-score (close to 1.00)
*   **LOC:** Lower Precision, Recall, and F1-score compared to xlm-roberta-base and mBERT (around 0.78-0.85)
*   **PRICE:** Moderate Precision, Recall, and F1-score (around 0.83-0.89)
*   **PRODUCT:** Moderate Precision, Recall, and F1-score (around 0.85-0.87)

**bert-base-multilingual-cased:**

*   **CONTACT_INFO:** High Precision, Recall, and F1-score (close to 1.00)
*   **LOC:** Good Precision, Recall, and F1-score (around 0.82-0.89)
*   **PRICE:** Good Precision, Recall, and F1-score (around 0.87-0.90)
*   **PRODUCT:** Good Precision, Recall, and F1-score (around 0.88-0.89)

**Summary:**

From the per-label metrics, it's clear that `xlm-roberta-base` matches or outperforms `distilbert-base-multilingual-cased` and `bert-base-multilingual-cased` on every entity type in this dataset. All three models perform very well on `CONTACT_INFO` (essentially a tie at 0.99-1.00), while `xlm-roberta-base` shows a significant advantage in identifying `LOC`, `PRICE`, and `PRODUCT` entities, with higher precision, recall, and F1-scores.

This detailed look at per-label performance further supports the selection of `xlm-roberta-base` as the best model for this task based on accuracy.