<a href="https://colab.research.google.com/github/metasebiya/ethiomart-llm-challenge-week4/blob/task-3/fine_tune_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Step 1: Environment Setup and Library Installation ---
# Run this cell first in Google Colab

# Install or upgrade the Hugging Face stack used below. `-U` upgrades each
# package to its newest release, and `-qq` suppresses pip's normal output.
!pip install transformers datasets seqeval evaluate accelerate -U -qq

# NOTE(review): os, json, re and pandas are imported but not referenced in the
# visible part of this notebook; they are kept in case other cells use them.
import os
import json
import re
import pandas as pd
from datasets import load_dataset, Features, Value, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

# Reaching this line means every import above succeeded.
print("Libraries installed and imported successfully!")

# --- Step 2: Prepare and Load the Labeled Dataset (CoNLL Format) ---
# IMPORTANT: You need to upload your 'labeled_dataset.txt' to your Colab environment
# You can do this by clicking the folder icon on the left sidebar in Colab, then
# clicking the 'Upload to session storage' icon and selecting your file.

# A. Define a function to parse CoNLL format
def parse_conll_file(file_path):
    """
    Read a CoNLL-style file and group its lines into sentences.

    Every non-blank line must be exactly ``token<TAB>label``. Blank lines
    separate sentences. Lines that do not contain exactly one tab are
    reported with a printed warning and ignored.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file to read.

    Returns:
        A list of dicts, one per sentence, each with a 'tokens' list and a
        parallel 'ner_tags' list of label strings.
    """
    parsed = []
    words, tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            text = raw.strip()

            if not text:
                # A blank line closes the sentence collected so far (if any).
                if words:
                    parsed.append({"tokens": words, "ner_tags": tags})
                    words, tags = [], []
                continue

            # Exactly one tab means exactly two fields: token and label.
            if text.count('\t') != 1:
                print(f"Warning: Skipping malformed line in CoNLL: '{text}'")
                continue

            word, _, tag = text.partition('\t')
            words.append(word)
            tags.append(tag)

    # The file may end without a trailing blank line; keep the last sentence.
    if words:
        parsed.append({"tokens": words, "ner_tags": tags})
    return parsed

# Upload your labeled_dataset.txt to Colab session storage
conll_file_path = 'labeled_dataset.txt'

# Parse the CoNLL file
print(f"Parsing CoNLL file from: {conll_file_path}")
raw_data = parse_conll_file(conll_file_path)

# Verify a few samples
print(f"Loaded {len(raw_data)} sentences/messages.")
if not raw_data:
    # Fail fast: everything below (label discovery, dataset creation, the
    # train/test split) needs at least one parsed sentence, and would otherwise
    # fail later with a less helpful error.
    raise ValueError("No data parsed. Check your CoNLL file format.")

print("\nFirst sample:")
print(raw_data[0])
print("\nSecond sample:")
print(raw_data[1] if len(raw_data) > 1 else "Only one sample available.")

# Collect every distinct label so the ClassLabel feature can be defined.
# Sorting makes the label -> id assignment deterministic across runs.
all_labels = sorted({label for sentence in raw_data for label in sentence['ner_tags']})
print(f"\nUnique NER labels found: {all_labels}")

# Map labels to IDs (and back); both mappings are handed to the model config later.
label_to_id = {label: i for i, label in enumerate(all_labels)}
id_to_label = dict(enumerate(all_labels))
num_labels = len(all_labels)

print(f"Label to ID mapping: {label_to_id}")

# Convert string labels to numerical IDs (in place, sentence by sentence)
for sentence in raw_data:
    sentence['ner_tags'] = [label_to_id[label] for label in sentence['ner_tags']]

# Define the features for the dataset: a list of string tokens and a parallel
# list of class ids whose names are the labels discovered above.
features = Features({
    "tokens": Sequence(Value(dtype='string')),
    "ner_tags": Sequence(ClassLabel(names=all_labels))
})

# Load raw_data into a Hugging Face Dataset
from datasets import Dataset
dataset = Dataset.from_list(raw_data, features=features)

# Split the dataset into training and testing (and validation if desired)
# For simplicity, we'll use a train-test split. For larger datasets, train-val-test is better.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42) # 20% for testing

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"\nTraining dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

# --- Step 3: Load Pre-trained Model and Tokenizer ---

# Choose your pre-trained model:
# Option 1: "Davlan/bert-tiny-amharic" (smaller, faster) - TODO confirm this identifier exists on the Hub
# Option 2: "Davlan/afro-xlmr-large-amharic" (larger XLM-R) - NOTE: a recorded run of this notebook
#           failed with "OSError: ... is not a valid model identifier" for this name, so it is
#           not usable as written; look up the exact repository name on the Hub first.
# Option 3: "xlm-roberta-base" (multilingual, general purpose, might need more fine-tuning)

# model_checkpoint = "Davlan/afro-xlmr-large-amharic" # Raised OSError (invalid Hub identifier) in a recorded run
# model_checkpoint = "Davlan/bert-tiny-amharic"
model_checkpoint = "xlm-roberta-base" # `num_labels` is passed explicitly below, so the classification head matches this task's labels

# The tokenizer and model are loaded from the same checkpoint. The label
# count and both id<->label mappings come from the dataset-preparation step.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)

print(f"\nLoaded tokenizer and model from: {model_checkpoint}")
print(f"Model has {model.config.num_labels} labels configured.")

# --- Step 4: Tokenize Data and Align Labels ---

def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and align word labels to subwords.

    Each sentence in ``examples["tokens"]`` is tokenized with truncation and
    padding to the maximum length. For every resulting position, the label is:
      * -100 for positions with no source word (word id is None),
      * the word's label for the first subword of each word,
      * -100 for every further subword of the same word.
    The value -100 marks positions to be ignored by the loss.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (parallel list of label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry, one label list per
        sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
    )

    aligned = []
    for batch_pos, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_pos):
            # Only the first subword of a real word carries that word's label.
            starts_new_word = current_word is not None and current_word != last_word
            row.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Apply the tokenization and alignment function to the datasets.
# batched=True hands the function a batch of examples per call, which is the
# shape tokenize_and_align_labels iterates over.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

print("\nData tokenization and label alignment complete!")
print("Example of tokenized and aligned labels (first sample):")
# Example of tokenized input and aligned labels
# Note: -100 indicates tokens that should be ignored in loss calculation.
# Because the tokenizer is called with padding="max_length", the printed list
# also contains the padding positions, each paired with -100.
if tokenized_train_dataset:
    first_sample_input_ids = tokenized_train_dataset[0]["input_ids"]
    first_sample_tokens = tokenizer.convert_ids_to_tokens(first_sample_input_ids)
    first_sample_labels = tokenized_train_dataset[0]["labels"]
    print(list(zip(first_sample_tokens, first_sample_labels)))

# --- Step 5: Set Up Training Arguments ---

# Define evaluation metrics: load the seqeval metric, used by compute_metrics below.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """
    Turn raw model outputs into overall seqeval scores.

    Args:
        p: A (predictions, labels) pair. The predictions are argmax-ed over
            axis 2 to obtain one class id per position; labels hold the
            reference ids, with -100 marking positions to skip.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    scores_per_class, gold_ids = p
    predicted_ids = np.argmax(scores_per_class, axis=2)

    # Reference tag names, dropping every ignored (-100) position.
    true_labels = []
    for gold_row in gold_ids:
        kept = []
        for gold_id in gold_row:
            if gold_id != -100:
                kept.append(id_to_label[gold_id])
        true_labels.append(kept)

    # Predicted tag names at exactly the positions whose reference is kept.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id != -100:
                kept.append(id_to_label[pred_id])
        true_predictions.append(kept)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Expose only the overall scores under short names.
    wanted = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short: results[full] for short, full in wanted}

# Define training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and reject the old keyword. Step 1 installs with `-U`, so the installed
# version is not fixed; pick whichever name this version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",               # Directory to save checkpoints and logs
    learning_rate=2e-5,                   # Learning rate
    per_device_train_batch_size=16,       # Batch size for training
    per_device_eval_batch_size=16,        # Batch size for evaluation
    num_train_epochs=3,                   # Number of training epochs
    weight_decay=0.01,                    # Weight decay for regularization
    logging_dir="./logs",                 # Directory for storing logs
    logging_steps=500,                    # Log every 500 steps (a short run may finish before the first log)
    save_strategy="epoch",                # Save at the end of each epoch; must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,          # Load the best model after training
    metric_for_best_model="f1",           # Metric to monitor for best model selection
    greater_is_better=True,               # Higher F1 is better
    push_to_hub=False,                    # Set to True if you want to push to Hugging Face Hub
    report_to="none",                     # Disable wandb, mlflow etc. reporting if not needed
    **{_eval_strategy_key: "epoch"},      # Evaluate at the end of each epoch
)

print("\nTraining arguments configured.")

# --- Step 6: Fine-tune the Model with Trainer API ---

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Step 1 installs with `-U`, so choose
# the keyword the installed Trainer actually declares.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,      # Called at each evaluation to produce precision/recall/f1/accuracy
    **{_tokenizer_key: tokenizer},        # Lets the Trainer save the tokenizer alongside checkpoints
)

print("\nStarting model training...")
train_result = trainer.train()

print("\nTraining complete!")

# --- Step 7: Evaluate the Fine-tuned Model ---

# Runs evaluation on the eval dataset given to the Trainer and returns a dict of metrics.
metrics = trainer.evaluate()
print("\nEvaluation Metrics:")
print(metrics)

# --- Step 8: Save the Fine-tuned Model ---

# Define a directory to save your model
model_save_path = "./fine_tuned_ner_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path) # Save tokenizer with the model

print(f"\nModel saved to: {model_save_path}")
print("You can download this folder from Colab's file browser (left sidebar) to your local machine.")
# Current transformers releases write weights as 'model.safetensors' by default;
# older ones wrote 'pytorch_model.bin'. Name both so the message is accurate.
print("The folder will contain the model weights ('model.safetensors' or 'pytorch_model.bin'), 'config.json', and the tokenizer files.")

Libraries installed and imported successfully!
Parsing CoNLL file from: labeled_dataset.txt
Loaded 1053 sentences/messages.

First sample:
{'tokens': ['የኛ', 'እጣ', 'ባለ', 'እድለኛ', 'nndlc', 'espresso', 'coffee', 'maker', 'ባለእድለኛ', 'የሆነችውን', 'ከተወሰነ', 'በኋላ', 'እናሳውቃለን'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'B-PRODUCT', 'I-PRODUCT', 'O', 'O', 'O', 'O', 'O']}

Second sample:
{'tokens': ['ኛnnእጣ', 'ባለ', 'እድለኛ'], 'ner_tags': ['O', 'O', 'O']}

Unique NER labels found: ['B-LOC', 'B-PRODUCT', 'I-LOC', 'I-PRODUCT', 'O']
Label to ID mapping: {'B-LOC': 0, 'B-PRODUCT': 1, 'I-LOC': 2, 'I-PRODUCT': 3, 'O': 4}

Training dataset size: 842
Evaluation dataset size: 211


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: Davlan/afro-xlmr-large-amharic is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`