<a href="https://colab.research.google.com/github/nanecha/Telegram_E-commerce-_extractor-/blob/main/fine_tune_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK-3 Fine-tuning

In [None]:
# Install Hugging Face Transformers, Datasets, and PEFT (for parameter-efficient fine-tuning)
!pip install transformers datasets accelerate peft


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
import torch
import evaluate
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline
)

In [None]:
# NOTE(review): redundant — `datasets` is already installed by the first install cell.
!pip install datasets


# Load the dataset.

In [None]:
# Colab-only helper for uploading a file from the local machine.
from google.colab import files

# The following cells read the uploaded file by the name `labeled_data.conll`.
uploaded = files.upload()


Saving labeled_data.conll to labeled_data.conll


In [None]:
!head -n 5 labeled_data.conll

[ O
' O
በ B-PRICE
ቀ O
ላ O
ሉ O
' O
, O
  O
' O
ከ O
ሰ O
ል O
' O
, O
  O
' O
ለ O
ማ O
ያ O


In [None]:
# Tokens that are artefacts of the raw export (list brackets, quotes, commas)
# rather than real text.
bad_tokens = {"'", ",", "[", "]", ""}

# Read the raw file, keeping only well-formed "token label" pairs.
# Blank lines are CoNLL sentence separators, so they are preserved (collapsed to
# a single separator) instead of being dropped; dropping them merges the whole
# corpus into one giant sentence.
cleaned_lines = []
with open("labeled_data.conll", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Sentence boundary: emit one separator, never a leading or repeated one.
            if cleaned_lines and cleaned_lines[-1] != "":
                cleaned_lines.append("")
            continue
        parts = line.split()
        if len(parts) != 2:
            continue  # skip lines that are not token-label pairs
        token, label = parts
        if token in bad_tokens:
            continue
        cleaned_lines.append(f"{token} {label}")

# Drop a trailing separator so the file does not end with an empty sentence.
if cleaned_lines and cleaned_lines[-1] == "":
    cleaned_lines.pop()

# Write cleaned result to new file
with open("cleaned_data.conll", "w", encoding="utf-8") as out:
    for line in cleaned_lines:
        out.write(line + "\n")

print("✅ Cleaned CoNLL data written to cleaned_data.conll")

✅ Cleaned CoNLL data written to cleaned_data.conll


In [None]:
!head cleaned_data.conll

በ B-PRICE
ቀ O
ላ O
ሉ O
ከ O
ሰ O
ል O
ለ O
ማ O
ያ O


In [None]:
def load_conll_file(file_path):
    """
    Load a CoNLL-style file into a Hugging Face Dataset.

    Every non-blank line must hold exactly two whitespace-separated fields
    (``token label``; spaces and tabs both work). Blank lines separate
    sentences. Lines with any other number of fields are reported and skipped.

    Args:
        file_path (str): Path to CoNLL file.
    Returns:
        Dataset: Dataset with 'tokens' and 'ner_tags' columns, one row per sentence.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line closes the sentence collected so far (if any).
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
                continue

            fields = line.split()
            if len(fields) != 2:
                print(f"Skipping malformed line: {line}")
                continue

            # Tokens and labels are always appended together, so the two lists
            # cannot drift out of sync; split() never yields empty strings.
            token, label = fields
            current_sentence.append(token)
            current_labels.append(label)

    # The file may end without a trailing blank line.
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    dataset = Dataset.from_dict({'tokens': sentences, 'ner_tags': labels})
    print(f"Loaded dataset with {len(dataset)} sentences")
    return dataset

In [None]:
# Sanity-check the loader: show the first sentence and its tags.
# load_conll_file builds the columns 'tokens' and 'ner_tags'; the dataset has
# no 'sentences' or 'labels' column, so indexing those names fails.
dataset = load_conll_file('cleaned_data.conll')
print(dataset['tokens'][0])
print(dataset['ner_tags'][0])

## Tokenization
I use a pre-trained model’s tokenizer to convert text into input tokens (IDs) that the model can process.

In [None]:
# NOTE(review): `evaluate` is already imported by an earlier cell, so on a fresh runtime this install runs too late for that import.
!pip install evaluate

In [None]:
def tokenize_and_align_labels(ex, tokenizer, label2id=None):
    """
    Tokenize a batch and align word-level labels with subword tokens.

    Note: a later cell defines a function with the same name; once that cell
    runs, its definition replaces this one.

    Args:
        ex (dict): Batch with 'tokens' (one list of words per sentence) and
            'ner_tags' (one list of per-word label IDs per sentence).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)``.
        label2id (dict, optional): Unused; accepted so this function can be
            called with the same arguments as the later definition.
    Returns:
        dict: Tokenized inputs with an added 'labels' entry holding one label
        ID per token position (-100 where the position is ignored).
    """
    tokenized_inputs = tokenizer(
        ex['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128
    )

    labels = []
    for i, label in enumerate(ex['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx and word_idx < len(label):
                # First subword of a word: take that word's label from this
                # sentence's own tag list.
                label_ids.append(label[word_idx])
            else:
                # Continuation subwords, and words without a tag, are ignored.
                # Appending here keeps label_ids the same length as word_ids.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs


In [None]:
def tokenize_and_align_labels(examples, tokenizer, label2id, max_length=128):
    """
    Tokenize a batch and align word-level label IDs with subword tokens.

    Only the first subword of each word keeps the word's label; special tokens
    and continuation subwords receive -100.

    Args:
        examples (dict): Batch with 'tokens' and 'ner_tags' (containing integer IDs).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)``.
        label2id (dict): Mapping of labels to IDs. Not used for lookup here
            (the tags are already integers); kept for call-site consistency.
        max_length (int): Length every sequence is truncated/padded to.
            Defaults to 128, the value previously hard-coded.
    Returns:
        dict: Tokenized inputs with aligned labels under the 'labels' key.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                # First subword of a word: use the integer label ID directly
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Ignore subword tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

## Define a simple accuracy metric

In [None]:
_SEQEVAL_METRIC = None  # loaded on first use, then reused by every evaluation


def _get_seqeval():
    """Return the seqeval metric object, loading it only once."""
    global _SEQEVAL_METRIC
    if _SEQEVAL_METRIC is None:
        _SEQEVAL_METRIC = evaluate.load('seqeval')
    return _SEQEVAL_METRIC


def compute_metrics(p, label_list):
    """
    Compute accuracy, precision, recall, and F1-score using seqeval.

    Positions whose gold label is -100 are excluded from both the references
    and the predictions before scoring.

    Args:
        p: Tuple of (predictions, labels); predictions hold per-label scores
            along axis 2, labels hold integer label IDs.
        label_list (list): List of label names, indexed by label ID.
    Returns:
        dict: 'accuracy', 'precision', 'recall' and 'f1' (seqeval's overall scores).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[gold] for gold in gold_row if gold != -100]
        for gold_row in labels
    ]
    pred_labels = [
        [label_list[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predictions, labels)
    ]

    results = _get_seqeval().compute(predictions=pred_labels, references=true_labels)
    return {
        'accuracy': results['overall_accuracy'],
        'precision': results['overall_precision'],
        'recall': results['overall_recall'],
        'f1': results['overall_f1'],
    }

In [None]:
# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


## Fine-tuning

In [None]:
# Paths: cleaned training data in, fine-tuned model out
conll_file = 'cleaned_data.conll'
output_dir = './fine_tuned_ner_model'

# Base checkpoint to fine-tune
model_name = 'xlm-roberta-base'

# Optimisation hyper-parameters
learning_rate = 2e-5
batch_size = 16
num_train_epochs = 3

In [None]:
# Label names; a label's position in this list is its integer ID.
label_list = ['O', 'B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']
# ID -> name, then the inverse name -> ID lookup.
id2label = dict(enumerate(label_list))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Step 1: Load dataset (path comes from the config cell) and convert the string
# tags to integer IDs.
dataset = load_conll_file(conll_file)
dataset = dataset.map(lambda x: {'ner_tags': [label2id[label] for label in x['ner_tags']]})

# Hold out a validation split so the reported metrics are not measured on the
# training examples. A split needs at least two sentences; with fewer, fall
# back to using the same data for both roles and say so explicitly.
if len(dataset) >= 2:
    train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test_split['train']
    val_dataset = train_test_split['test']
else:
    print('Warning: fewer than 2 sentences loaded; validating on the training data.')
    train_dataset = dataset
    val_dataset = dataset
print(f'Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}')

if len(train_dataset) == 0 or len(val_dataset) == 0:
    raise ValueError("Training or validation dataset is empty. Please check your data loading and splitting.")

Loaded dataset with 1 sentences


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Train size: 1, Validation size: 1


In [None]:
# Step 2: Load tokenizer and tokenize dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _tokenize_batch(batch):
    """Tokenize one batch and attach subword-aligned label IDs."""
    return tokenize_and_align_labels(batch, tokenizer, label2id)


# The same batched transform is applied to both splits.
tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_val = val_dataset.map(_tokenize_batch, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
# Step 3: Load model and data collator
# The classification head is sized from label_list; the id/label maps are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

data_collator = DataCollatorForTokenClassification(tokenizer)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): duplicate of the earlier `!pip install evaluate` cell.
!pip install evaluate

In [None]:
from transformers import TrainingArguments, Trainer

# Step 4: Set up training arguments
# Evaluate and checkpoint once per epoch, then restore the checkpoint with the
# best 'accuracy' when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [None]:
# Step 5: Initialize Trainer
# `processing_class` is the current name of the argument formerly called
# `tokenizer`, which recent transformers releases deprecate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    # Trainer passes only the predictions object; bind the label names here.
    compute_metrics=lambda p: compute_metrics(p, label_list),
)


  trainer = Trainer(


In [None]:
pip install seqeval

In [None]:
# Step 6: Train model
print('Starting training...')
# Left as the cell's last expression so the notebook displays the returned training summary.
trainer.train()



Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mnanechakebede[0m ([33mnanechakebede-haramaya-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.778577,0.020202
2,No log,1.599695,0.242424
3,No log,1.482121,0.818182


TrainOutput(global_step=3, training_loss=1.8065476417541504, metrics={'train_runtime': 401.7522, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.007, 'total_flos': 195981426432.0, 'train_loss': 1.8065476417541504, 'epoch': 3.0})

In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): none of the cells visible here read from or write to that path — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 7: Evaluate model
# Statements sit at column 0: indented top-level code is an IndentationError.
print('Evaluating model...')
eval_results = trainer.evaluate()
print('Evaluation Results:')
for key, value in eval_results.items():
    print(f'{key}: {value:.4f}')

In [None]:
# Step 8: Save model and tokenizer
# Both are written to the same directory so it can be loaded as one unit later.
# Statements sit at column 0: indented top-level code is an IndentationError.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f'Model and tokenizer saved to {output_dir}')

In [None]:
# Step 9: Test model on a sample sentence
# Statements sit at column 0: indented top-level code is an IndentationError.
# The pipeline reloads the model and tokenizer from the saved directory.
ner_pipeline = pipeline(
    'ner',
    model=output_dir,
    tokenizer=output_dir,
    device=0 if torch.cuda.is_available() else -1  # first GPU if present, else CPU
)
test_sentence = 'ለሽያጭ ስማርትፎን በ1000 ብር በአዲስ አበባ'
results = ner_pipeline(test_sentence)
print('Test Sentence:', test_sentence)
print('NER Results:')
for entity in results:
    print(f"Token: {entity['word']}, Entity: {entity['entity']}, Score: {entity['score']:.4f}")