In [3]:
# Install required libraries
!pip install transformers datasets tokenizers seqeval torch pandas numpy scikit-learn -q

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report
from google.colab import files

# Function to read CoNLL file
def read_conll(file_path):
    """Parse CoNLL file into sentences and labels.
    Args:
        file_path: Path to CoNLL file.
    Returns:
        Tuple of (sentences, labels) where each is a list of lists.
    """
    sentences, labels = [], []
    current_sentence, current_labels = [], []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    token, label = line.strip().split()
                    current_sentence.append(token)
                    current_labels.append(label)
                else:
                    if current_sentence:
                        sentences.append(current_sentence)
                        labels.append(current_labels)
                        current_sentence, current_labels = [], []
        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)
        return sentences, labels
    except Exception as e:
        print(f"Error reading CoNLL file: {e}")
        return None, None

# Upload CoNLL file
print("Upload samples/labeled_data.conll")
uploaded = files.upload()
sentences, labels = read_conll('labeled_data.conll')

# Verify data
if sentences is None or len(sentences) < 30:
    raise ValueError(f"Expected 30 messages, got {len(sentences) if sentences else 0}")
print(f"Loaded {len(sentences)} messages")

# Define label mapping
label_list = ["O", "B-Product", "I-Product", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC", "B-CONTACT_INFO", "I-CONTACT_INFO"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# Load tokenizer and model
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id)

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize sentences and align labels with tokenized inputs.
    Args:
        examples: Dictionary with 'tokens' and 'labels'.
    Returns:
        Dictionary with tokenized inputs and aligned label IDs.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors=None
    )
    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Prepare dataset
data = [{"tokens": s, "labels": l} for s, l in zip(sentences, labels)]
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# Tokenize datasets
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)

# Debug: Print tokenized sample
print("Sample tokenized data:", train_tokenized[0])

# Custom data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    results = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
    return {
        "precision": results["weighted avg"]["precision"],
        "recall": results["weighted avg"]["recall"],
        "f1": results["weighted avg"]["f1-score"],
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,  # Increased for better learning
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # Increased for better training
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("Evaluation results:", results)

# Save model
model.save_pretrained("./finetuned_xlmroberta")
tokenizer.save_pretrained("./finetuned_xlmroberta")

# Zip and download model
!zip -r finetuned_xlmroberta.zip ./finetuned_xlmroberta
files.download('finetuned_xlmroberta.zip')

# Test model
from transformers import pipeline
ner_pipeline = pipeline("ner", model="./finetuned_xlmroberta", tokenizer="./finetuned_xlmroberta")
test_text = "SKECHERS QUANTUM FLEX በ4400 ብር ሜክሲኮ 0944222069"
results = ner_pipeline(test_text)
print("Test prediction:", results)

Upload samples/labeled_data.conll


Saving labeled_data.conll to labeled_data (2).conll
Loaded 30 messages


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Sample tokenized data: {'tokens': ['NB', 'Size', '41', 'Price', '3200', 'Free', 'Delivery', 'Inbox', 'Hiwe5266', 'ስልክ', '251945355266', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', 'አድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ'], 'labels': [-100, 0, 0, 3, 0, 3, -100, 0, 0, 0, -100, 0, -100, -100, -100, 0, 3, -100, -100, -100, -100, 0, -100, -100, 0, 0, 0, 0, 5, 6, 0, 0, -100, -100, 0, -100, -100, 0, -100, 0, -100, 5, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.735673,0.0,0.0,0.0
2,No log,0.891115,0.0,0.0,0.0
3,No log,0.723141,0.0,0.0,0.0
4,1.545800,0.693747,0.0,0.0,0.0
5,1.545800,0.676339,0.0,0.0,0.0


Evaluation results: {'eval_loss': 0.6763389110565186, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 0.1534, 'eval_samples_per_second': 39.112, 'eval_steps_per_second': 6.519, 'epoch': 5.0}
updating: finetuned_xlmroberta/ (stored 0%)
updating: finetuned_xlmroberta/special_tokens_map.json (deflated 52%)
updating: finetuned_xlmroberta/tokenizer_config.json (deflated 76%)
updating: finetuned_xlmroberta/config.json (deflated 55%)
updating: finetuned_xlmroberta/tokenizer.json (deflated 76%)
updating: finetuned_xlmroberta/sentencepiece.bpe.model (deflated 49%)
updating: finetuned_xlmroberta/model.safetensors (deflated 29%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Device set to use cuda:0


Test prediction: []
