In [None]:
!pip install seqeval==0.0.10
!pip install lime

Collecting seqeval==0.0.10
  Downloading seqeval-0.0.10-py3-none-any.whl.metadata (3.4 kB)
Downloading seqeval-0.0.10-py3-none-any.whl (7.5 kB)
Installing collected packages: seqeval
Successfully installed seqeval-0.0.10
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=0b1a016e802c7fec7d61898e2f6e4932eaa5779c9e4c8a8c8be362a0ba9ca16a
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# New Section

In [None]:
from datasets import load_dataset, DatasetDict,Dataset

# Assuming your CoNLL files are named train.txt, validation.txt, test.txt
# and are in a directory called 'data'.

# If your CoNLL files are directly in a specified path
try:
    # Use 'conll2003' builder if your format is strictly CoNLL-2003-like
    # The 'features' argument might need adjustment based on your exact columns
    # For a custom CoNLL file with just 'tokens' and 'ner_tags' as the last column:
    # You might need to write a small custom loading script if it's not strictly CoNLL-2003
    # which has 4 columns (token, pos, chunk, ner_tag).

    # Common scenario: your custom data has two columns: token and NER tag.
    # You can adapt the loading or write a simple parser.

    # Option A: If your data closely matches CoNLL2003 (token, POS, chunk, NER_tag)
    # This is for public datasets following strict CoNLL2003 format
    # raw_datasets = load_dataset("conll2003")

    # Option B: If your custom CoNLL has only two columns (word, NER_tag)
    # You'll likely need a custom loading function.
    # Let's write a simple one.

    def read_conll_file(file_path):
        """Reads a CoNLL formatted file and returns a list of dictionaries."""
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        data = []
        tokens = []
        ner_tags = []

        for line in lines:
            line = line.strip()
            if not line: # Empty line signals end of a sentence
                if tokens: # Add sentence if not empty
                    data.append({"tokens": tokens, "ner_tags": ner_tags})
                tokens = []
                ner_tags = []
            elif line.startswith("-DOCSTART-"): # Skip DOCSTART lines
                continue
            else:
                parts = line.split()
                if len(parts) >= 2: # Ensure there are at least two columns
                    tokens.append(parts[0])
                    ner_tags.append(parts[-1]) # Take the last column as the NER tag
                else:
                    # Handle malformed lines if any, or raise an error
                    print(f"Warning: Malformed line encountered: {line}")

        # Add the last sentence if the file doesn't end with a blank line
        if tokens:
            data.append({"tokens": tokens, "ner_tags": ner_tags})

        return data

    # Load your custom CoNLL files
    train_data = read_conll_file("drive/MyDrive/model_data/data_splits/train.txt")

    test_data = read_conll_file("drive/MyDrive/model_data/test.txt")
    train_data = read_conll_file("drive/MyDrive/model_data/data_splits/train.txt")
    val_data = read_conll_file("drive/MyDrive/model_data/data_splits/validation.txt")
    test_data = read_conll_file("drive/MyDrive/model_data/test.txt")
    raw_datasets = DatasetDict({
        "train": Dataset.from_list(train_data),
        "validation": Dataset.from_list(val_data),
        "test": Dataset.from_list(test_data),
    })

except Exception as e:
    print(f"Could not load CoNLL data directly using load_dataset or custom parser: {e}")
    print("Please ensure your data files are correctly placed and formatted.")
    # Fallback or error handling
    raw_datasets = None # Or raise an error



In [None]:
# Collect all unique NER tags from your dataset
unique_tags = set()
for dataset_split in raw_datasets.values():
    for example in dataset_split:
        unique_tags.update(example["ner_tags"])

# Sort them for consistent ID assignment
label_list = sorted(list(unique_tags))
# Make sure "O" is always at ID 0 if preferred, or handle BIOES for correct metric calculation.
# For simplicity, if your tags are already BIO/BIOES, sorting is usually fine.
# If you need to add specific tags, e.g., for missing 'I-' tags, ensure they are present.

label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

print(f"Detected Labels: {label_list}")
print(f"Label to ID Mapping: {label_to_id}")

Detected Labels: ['B-LOC', 'B-PRICE', 'B-PRODUCT', 'I-LOC', 'I-PRICE', 'I-PRODUCT', 'O']
Label to ID Mapping: {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'I-PRODUCT': 5, 'O': 6}


In [None]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification,TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
models = ['']
model_name = "xlm-roberta-base" # or xlm-roberta-large
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None: # Special tokens
                label_ids.append(-100)
            elif word_idx != previous_word_idx: # First token of a new word
                label_ids.append(label_to_id[label[word_idx]])
            else: # Subsequent token of the same word
                current_tag = label[word_idx]
                # If original tag was B-X, make subsequent I-X. If I-X, keep I-X. Else -100.
                if current_tag.startswith("B-"):
                    label_ids.append(label_to_id["I-" + current_tag[2:]])
                elif current_tag.startswith("I-"):
                    label_ids.append(label_to_id[current_tag])
                else: # O tag or other types
                    label_ids.append(-100) # Or keep O if you want, but -100 is safer for loss
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment to your loaded datasets
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True, # Process in batches for speed
    remove_columns=raw_datasets["train"].column_names, # Remove original 'tokens' and 'ner_tags'
)

print(tokenized_datasets)
print(tokenized_datasets["train"][0]) # Check a sample to ensure it looks correct
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (where label is -100)
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
    return results
training_args = TrainingArguments(
    output_dir="drive/MyDrive/model_data/model/xlm_roberta_ner_results", # Where model checkpoints and logs will be saved
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3, # Adjust based on dataset size and convergence
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",       # Save checkpoint at the end of each epoch
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True, # Load the best model based on 'metric_for_best_model'
    metric_for_best_model="f1",  # Choose F1-score as the metric to track for best model
    report_to="tensorboard",     # Optional: enables logging to TensorBoard
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
results = trainer.evaluate(tokenized_datasets["test"])
print(results)

# Get a full classification report for detailed per-entity metrics
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

print("\n--- Final Classification Report on Test Set ---")
print(classification_report(true_labels, true_predictions))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 213
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 27
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})
{'input_ids': [0, 13253, 9039, 7872, 2934, 169422, 91602, 16333, 124449, 243084, 623, 17680, 10824, 70317, 816, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 6, -100, -100, 6, -100, 6, -100, -100, 6, 6, -100, -100, -100, -100, -100]}


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.22914,0.0,0.0,0
2,No log,0.79553,0.0,0.0,0
3,No log,0.718642,0.0,0.0,0


{'eval_loss': 1.2101311683654785, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0, 'eval_runtime': 3.931, 'eval_samples_per_second': 0.254, 'eval_steps_per_second': 0.254, 'epoch': 3.0}

--- Final Classification Report on Test Set ---
           precision    recall  f1-score   support

      LOC       0.00      0.00      0.00         4
    PRICE       0.00      0.00      0.00         4
  PRODUCT       0.00      0.00      0.00         6

micro avg       0.00      0.00      0.00        14
macro avg       0.00      0.00      0.00        14



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (where label is -100)
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
    return results

In [None]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="drive/MyDrive/model_data/model/xlm_roberta_ner_results", # Where model checkpoints and logs will be saved
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3, # Adjust based on dataset size and convergence
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",       # Save checkpoint at the end of each epoch
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True, # Load the best model based on 'metric_for_best_model'
    metric_for_best_model="f1",  # Choose F1-score as the metric to track for best model
    report_to="tensorboard",     # Optional: enables logging to TensorBoard
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.00675,0.0,0.0,0.0
2,No log,0.751001,0.142857,0.058824,0.083333
3,No log,0.685597,0.428571,0.176471,0.25


TrainOutput(global_step=42, training_loss=1.0723981403169178, metrics={'train_runtime': 780.3465, 'train_samples_per_second': 0.819, 'train_steps_per_second': 0.054, 'total_flos': 8741588208144.0, 'train_loss': 1.0723981403169178, 'epoch': 3.0})

In [None]:
results = trainer.evaluate(tokenized_datasets["test"])
print(results)

# Get a full classification report for detailed per-entity metrics
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

print("\n--- Final Classification Report on Test Set ---")
print(classification_report(true_labels, true_predictions))

{'eval_loss': 0.6741194128990173, 'eval_precision': 0.4, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.21052631578947364, 'eval_runtime': 3.143, 'eval_samples_per_second': 0.318, 'eval_steps_per_second': 0.318, 'epoch': 3.0}

--- Final Classification Report on Test Set ---
           precision    recall  f1-score   support

      LOC       0.50      0.50      0.50         4
  PRODUCT       0.00      0.00      0.00         6
    PRICE       0.00      0.00      0.00         4

micro avg       0.40      0.14      0.21        14
macro avg       0.14      0.14      0.14        14



In [None]:
model_names_to_compare = [
    "xlm-roberta-base",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased"
]

In [None]:
# Model comparison loop example
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

from datasets import DatasetDict
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import os

# Assume raw_datasets, label_to_id, id_to_label, read_conll_file, compute_metrics are defined from previous steps

# List of models to compare
model_names_to_compare = [
    "xlm-roberta-base",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased"
]

results_summary = {}

for model_name in model_names_to_compare:
    print(f"\n--- Fine-tuning {model_name} ---")

    try:
        # Load tokenizer specific to the model
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Apply tokenization and alignment
        # Need to re-apply map because tokenizer changes
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(
                examples["tokens"],
                truncation=True,
                is_split_into_words=True,
            )
            labels = []
            for i, label in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    else:
                        current_tag = label[word_idx]
                        if current_tag.startswith("B-"):
                            label_ids.append(label_to_id["I-" + current_tag[2:]])
                        elif current_tag.startswith("I-"):
                            label_ids.append(label_to_id[current_tag])
                        else:
                            label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        tokenized_datasets = raw_datasets.map(
            tokenize_and_align_labels,
            batched=True,
            remove_columns=raw_datasets["train"].column_names,
            load_from_cache_file=False # Important to re-process for each tokenizer
        )

        # Load model
        model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(label_list),
            id2label=id_to_label,
            label2id=label_to_id,
        )

        # Data collator (tokenizer needed for padding)
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

        # Training arguments (adjust output_dir for each model)
        output_dir = f"drive/MyDrive/model_data/model/results_{model_name.replace('/', '_')}_ner"
        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3, # Keep consistent for comparison
            weight_decay=0.01,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f'drive/MyDrive/model_data/model/logs_{model_name.replace("/", "_")}',
            logging_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            report_to="tensorboard",
            # disable_tqdm=True, # Optional: disable progress bars for cleaner logs if running many
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            processing_class=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics, # Re-use the seqeval compute_metrics
        )

        # Train
        trainer.train()

        # Evaluate on validation set (best model will be loaded)
        val_metrics = trainer.evaluate(tokenized_datasets["validation"])
        print(f"Validation metrics for {model_name}: {val_metrics}")

        # Store results
        results_summary[model_name] = {
            "validation_f1": val_metrics.get('eval_f1'),
            "validation_precision": val_metrics.get('eval_precision'),
            "validation_recall": val_metrics.get('eval_recall'),
            "validation_loss": val_metrics.get('eval_loss')
            # "training_time_seconds": trainer.state.global_step * training_args.logging_steps * (trainer.state.log_history[-1]['loss'] / trainer.state.log_history[-1]['learning_rate']) if trainer.state.log_history else 'N/A' # A rough estimate
        }

        # Save the best model explicitly (Trainer might already do it, but good to be sure)
        trainer.save_model(f'drive/MyDrive/model_data/model/{model_name}')

    except Exception as e:
        print(f"Error fine-tuning {model_name}: {e}")
        results_summary[model_name] = {"error": str(e)}

print("\n--- Comparison Summary ---")
for model, metrics in results_summary.items():
    print(f"Model: {model}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")


--- Fine-tuning xlm-roberta-base ---


Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.99883,0.0,0.0,0.0
2,No log,0.733158,0.0,0.0,0.0
3,No log,0.644321,0.222222,0.117647,0.153846


Validation metrics for xlm-roberta-base: {'eval_loss': 0.6443211436271667, 'eval_precision': 0.2222222222222222, 'eval_recall': 0.11764705882352941, 'eval_f1': 0.15384615384615383, 'eval_runtime': 4.2223, 'eval_samples_per_second': 6.395, 'eval_steps_per_second': 0.474, 'epoch': 3.0}

--- Fine-tuning distilbert-base-multilingual-cased ---


Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.053696,0.0,0.0,0.0
2,No log,0.803032,0.217391,0.294118,0.25
3,No log,0.748811,0.3,0.352941,0.324324


Validation metrics for distilbert-base-multilingual-cased: {'eval_loss': 0.7488110065460205, 'eval_precision': 0.3, 'eval_recall': 0.35294117647058826, 'eval_f1': 0.3243243243243243, 'eval_runtime': 1.1999, 'eval_samples_per_second': 22.502, 'eval_steps_per_second': 1.667, 'epoch': 3.0}

--- Fine-tuning bert-base-multilingual-cased ---


Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.760912,0.176471,0.176471,0.176471
2,No log,0.715468,0.157895,0.176471,0.166667
3,No log,0.682104,0.294118,0.294118,0.294118


Validation metrics for bert-base-multilingual-cased: {'eval_loss': 0.682104229927063, 'eval_precision': 0.29411764705882354, 'eval_recall': 0.29411764705882354, 'eval_f1': 0.29411764705882354, 'eval_runtime': 3.1109, 'eval_samples_per_second': 8.679, 'eval_steps_per_second': 0.643, 'epoch': 3.0}

--- Comparison Summary ---
Model: xlm-roberta-base
  validation_f1: 0.15384615384615383
  validation_precision: 0.2222222222222222
  validation_recall: 0.11764705882352941
  validation_loss: 0.6443211436271667
Model: distilbert-base-multilingual-cased
  validation_f1: 0.3243243243243243
  validation_precision: 0.3
  validation_recall: 0.35294117647058826
  validation_loss: 0.7488110065460205
Model: bert-base-multilingual-cased
  validation_f1: 0.29411764705882354
  validation_precision: 0.29411764705882354
  validation_recall: 0.29411764705882354
  validation_loss: 0.682104229927063


In [65]:
from transformers import pipeline
import torch # For checking GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for model_name in model_names_to_compare:
  final_best_model_path = f"drive/MyDrive/model_data/model/results_{model_name.replace('/', '_')}_ner/checkpoint-14"
  if model_name == "xlm-roberta-base":
    final_best_model_path = f"drive/MyDrive/model_data/model/xlm_roberta_ner_results/checkpoint-42"
  print(final_best_model_path)
  loaded_tokenizer = AutoTokenizer.from_pretrained(final_best_model_path)
  loaded_model = AutoModelForTokenClassification.from_pretrained(
    final_best_model_path,
    id2label=id_to_label,
    label2id=label_to_id
  )
  loaded_model.to(device)
  ner_pipeline = pipeline(
    "token-classification",
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    aggregation_strategy="simple", # Optional: aggregates subword tokens into full words
    device=0 if torch.cuda.is_available() else -1 # 0 for GPU, -1 for CPU
  )
  amharic_text = "Kids Hula Hoop In/Outdoor Fitness Collapsible Adjustable Colourful Fun Game Ring  1200 birr  ለልጆ ኦሪጅናል በቀጥታ ከካምፓኒ የሚመጡ እቃዎችን በቅናሽ ዋጋ የሚያገኙበት ቦታ SINA KIDS/ሲና ኪድስ ጥራታቸውን ባልጠበቁ እቃዎች ልጆን አደጋ ላይ እንዳይጥሉ ይጠንቀቁ  0909003864 0905707448    ሊንኩን በመጫን ቴሌግራማችንን ይቀላቀሉ  እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ        አድራሻ   1⃣ቁጥር1 ገርጂ ኢምፔሪያል ከሳሚ ህንፃ ጎን አልፎዝ ፕላዛ ግራውንድ ላይ እንደገቡ ያገኙናል  2⃣ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስፓልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል  ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ፕላዛ ላይ አራት ኪሎ ቅድስት ስላሴ ያሉት ሱቆቻችን ሲመጡ  አስተማማኝ ሰፊ ፓርኪንግ ያላቸው መሆናቸውን በታላቅ ደስታ እናበስራለን"
  print(f"\nInference on text 1: '{amharic_text}'{model_name}")
  results_1 = ner_pipeline(amharic_text)
  print(results_1)

drive/MyDrive/model_data/model/xlm_roberta_ner_results/checkpoint-42


Device set to use cpu



Inference on text 1: 'Kids Hula Hoop In/Outdoor Fitness Collapsible Adjustable Colourful Fun Game Ring  1200 birr  ለልጆ ኦሪጅናል በቀጥታ ከካምፓኒ የሚመጡ እቃዎችን በቅናሽ ዋጋ የሚያገኙበት ቦታ SINA KIDS/ሲና ኪድስ ጥራታቸውን ባልጠበቁ እቃዎች ልጆን አደጋ ላይ እንዳይጥሉ ይጠንቀቁ  0909003864 0905707448    ሊንኩን በመጫን ቴሌግራማችንን ይቀላቀሉ  እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ        አድራሻ   1⃣ቁጥር1 ገርጂ ኢምፔሪያል ከሳሚ ህንፃ ጎን አልፎዝ ፕላዛ ግራውንድ ላይ እንደገቡ ያገኙናል  2⃣ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስፓልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል  ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ፕላዛ ላይ አራት ኪሎ ቅድስት ስላሴ ያሉት ሱቆቻችን ሲመጡ  አስተማማኝ ሰፊ ፓርኪንግ ያላቸው መሆናቸውን በታላቅ ደስታ እናበስራለን'xlm-roberta-base
[{'entity_group': 'LOC', 'score': np.float32(0.26560524), 'word': 'INA', 'start': 145, 'end': 148}, {'entity_group': 'LOC', 'score': np.float32(0.23973036), 'word': '003', 'start': 208, 'end': 211}, {'entity_group': 'LOC', 'score': np.float32(0.23790057), 'word': '707', 'start': 219, 'end': 222}, {'entity_group': 'LOC', 'score': np.float32(0.5028381), 'word': '⃣ቁጥር

Device set to use cpu



Inference on text 1: 'Kids Hula Hoop In/Outdoor Fitness Collapsible Adjustable Colourful Fun Game Ring  1200 birr  ለልጆ ኦሪጅናል በቀጥታ ከካምፓኒ የሚመጡ እቃዎችን በቅናሽ ዋጋ የሚያገኙበት ቦታ SINA KIDS/ሲና ኪድስ ጥራታቸውን ባልጠበቁ እቃዎች ልጆን አደጋ ላይ እንዳይጥሉ ይጠንቀቁ  0909003864 0905707448    ሊንኩን በመጫን ቴሌግራማችንን ይቀላቀሉ  እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ        አድራሻ   1⃣ቁጥር1 ገርጂ ኢምፔሪያል ከሳሚ ህንፃ ጎን አልፎዝ ፕላዛ ግራውንድ ላይ እንደገቡ ያገኙናል  2⃣ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስፓልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል  ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ፕላዛ ላይ አራት ኪሎ ቅድስት ስላሴ ያሉት ሱቆቻችን ሲመጡ  አስተማማኝ ሰፊ ፓርኪንግ ያላቸው መሆናቸውን በታላቅ ደስታ እናበስራለን'distilbert-base-multilingual-cased
[{'entity_group': 'PRODUCT', 'score': np.float32(0.29278946), 'word': '##ss', 'start': 31, 'end': 33}, {'entity_group': 'PRODUCT', 'score': np.float32(0.20092161), 'word': '##just', 'start': 48, 'end': 52}, {'entity_group': 'LOC', 'score': np.float32(0.35705155), 'word': '##NA', 'start': 146, 'end': 148}, {'entity_group': 'LOC', 'score': np.float3

Device set to use cpu



Inference on text 1: 'Kids Hula Hoop In/Outdoor Fitness Collapsible Adjustable Colourful Fun Game Ring  1200 birr  ለልጆ ኦሪጅናል በቀጥታ ከካምፓኒ የሚመጡ እቃዎችን በቅናሽ ዋጋ የሚያገኙበት ቦታ SINA KIDS/ሲና ኪድስ ጥራታቸውን ባልጠበቁ እቃዎች ልጆን አደጋ ላይ እንዳይጥሉ ይጠንቀቁ  0909003864 0905707448    ሊንኩን በመጫን ቴሌግራማችንን ይቀላቀሉ  እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ        አድራሻ   1⃣ቁጥር1 ገርጂ ኢምፔሪያል ከሳሚ ህንፃ ጎን አልፎዝ ፕላዛ ግራውንድ ላይ እንደገቡ ያገኙናል  2⃣ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስፓልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል  ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ፕላዛ ላይ አራት ኪሎ ቅድስት ስላሴ ያሉት ሱቆቻችን ሲመጡ  አስተማማኝ ሰፊ ፓርኪንግ ያላቸው መሆናቸውን በታላቅ ደስታ እናበስራለን'bert-base-multilingual-cased
[]


In [64]:
# --- LIME Prediction Function Wrapper for a Specific Token ---
# Similar to SHAP, but LIME expects predict_proba to return (num_samples, num_classes)
# where num_classes are the classes for the target token's prediction.
choosen_model = f"drive/MyDrive/model_data/model/results_distilbert-base-multilingual-cased_ner"
tokenizer = AutoTokenizer.from_pretrained(final_best_model_path)
model = AutoModelForTokenClassification.from_pretrained(
    final_best_model_path,
    id2label=id_to_label,
    label2id=label_to_id
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval() # Set to evaluation mode
print("Model and tokenizer loaded successfully.")
def get_lime_prediction_wrapper(model, tokenizer, target_word_index_in_original_text):
    def predict_proba_fn(texts):
        all_target_token_probs = []
        for text in texts:
            tokenized_input = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                is_split_into_words=False
            )
            input_ids = tokenized_input["input_ids"].to(model.device)

            word_ids = tokenized_input.word_ids(batch_index=0)

            target_token_idx = None
            for i, wid in enumerate(word_ids):
                if wid == target_word_index_in_original_text:
                    target_token_idx = i
                    break

            if target_token_idx is None:
                # If target word is missing in perturbed text, return uniform distribution
                all_target_token_probs.append(np.full(len(label_list), 1.0 / len(label_list)))
                continue

            with torch.no_grad():
                outputs = model(input_ids)

            target_token_logits = outputs.logits.squeeze(0)[target_token_idx, :]
            target_token_probs = F.softmax(target_token_logits, dim=-1).cpu().numpy()

            all_target_token_probs.append(target_token_probs)

        return np.array(all_target_token_probs)

    return predict_proba_fn

# --- LIME Usage Example ---
text_to_explain_lime = "Smart Usb Ultrasonic Car And Home Air Humidifier With Colorful Led Light Original High-quality በኤሌክትሪክ የሚሰራ ለቤትና ለመኪና መልካም መዓዛን የሚሰጥ  Elevate the comfort level within your living premises with this fantastic Green Lion Air Mist Humidifier ዋጋ፦ 1100 ብር ውስን ፍሬ ነው ያለን አድራሻ #መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ S05/S06 0902660722 0928460606 በTelegram ለማዘዝ  ይጠቀሙ ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን"
# "Commercial Bank of Ethiopia opened a new branch in 2024."
original_words_lime = custom_word_tokenizer(text_to_explain_lime)

# Let's explain why "ባንክ" (Bank) at index 2 is predicted as I-ORG (assuming it is part of an ORG entity).
target_word_index_lime = 1 # Index of "ባንክ"

# Get the LIME prediction wrapper function for this specific token
lime_predict_proba_fn = get_lime_prediction_wrapper(model, tokenizer, target_word_index_lime)

# Create a LIME explainer
explainer_lime = LimeTextExplainer(
    class_names=label_list,
    split_expression=r'\s+' # Use regex for spaces to handle multiple spaces
)

print(f"\nGenerating LIME Explanations for word '{original_words_lime[target_word_index_lime]}' in sentence: '{text_to_explain_lime}' (This may take a while)...")

# Explain the instance
# `num_features` is the number of words to show in the explanation
# `num_samples` is how many perturbed samples LIME creates
# `top_labels` is how many of the top predicted labels for the target token to explain
explanation_lime = explainer_lime.explain_instance(
    text_to_explain_lime,
    lime_predict_proba_fn,
    num_features=5, # Number of words to highlight
    num_samples=1000, # Number of perturbations (can be slow)
    top_labels=3 # Show explanations for top 3 predicted labels of the target token
)

# --- Visualize LIME Explanation ---
print(f"\nLIME Explanation for '{original_words_lime[target_word_index_lime]}' (index {target_word_index_lime}):")
print(explanation_lime.top_labels)
for label_id in explanation_lime.top_labels:
    label_name = id_to_label[label_id]
    # print(f"  Explaining for label: {label_name} (Probability: {prob:.4f})")
    print("  Word contributions:")
    for word, weight in explanation_lime.as_list(label=label_id):
        print(f"    - '{word}': {weight:.4f}")



Model and tokenizer loaded successfully.

Generating LIME Explanations for word 'Usb' in sentence: 'Smart Usb Ultrasonic Car And Home Air Humidifier With Colorful Led Light Original High-quality በኤሌክትሪክ የሚሰራ ለቤትና ለመኪና መልካም መዓዛን የሚሰጥ  Elevate the comfort level within your living premises with this fantastic Green Lion Air Mist Humidifier ዋጋ፦ 1100 ብር ውስን ፍሬ ነው ያለን አድራሻ #መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ S05/S06 0902660722 0928460606 በTelegram ለማዘዝ  ይጠቀሙ ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን' (This may take a while)...

LIME Explanation for 'Usb' (index 1):
[np.int64(6), np.int64(5), np.int64(3)]
  Word contributions:
    - 'Usb': -0.0325
    - 'Ultrasonic': -0.0304
    - 'Smart': -0.0255
    - 'Colorful': -0.0253
    - 'And': 0.0237
  Word contributions:
    - 'Ultrasonic': 0.0446
    - 'Smart': 0.0291
    - 'With': -0.0287
    - 'Colorful': 0.0268
    - 'And': -0.0185
  Word contributions:
    - '#መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ': 0.0101
    - 'Usb': 0.0049
    - 'S05/S06': 0.0049
    - 'Smart': -0.0048
    - '