In [1]:
# Hide every CUDA device from this process so all later work runs on the CPU.
# The variable is set in the first cell, before any deep-learning library is imported.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# Hugging Face Hub identifiers used throughout the notebook.
dataset_name = "nilc-nlp/mac_morpho"  # dataset handed to load_dataset
model_name = "neuralmind/bert-base-portuguese-cased"  # checkpoint for the tokenizer and the model

# Prepare dataset

## Load dataset from Hugging Face

In [None]:
from datasets import load_dataset

# Fetch every split of the dataset named in the configuration cell.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to run locally — confirm the repository is trusted.
dataset = load_dataset(
    path=dataset_name,
    trust_remote_code=True,
)

## Prepare dataset

### Label mapping

In [None]:
# Tag names, in the order declared by the "pos_tags" feature of the train split
labels = dataset["train"].features["pos_tags"].feature.names

# Name -> id uses each tag's position in that list; id -> name is the inverse
label2id = dict(zip(labels, range(len(labels))))
id2label = {tag_id: tag_name for tag_name, tag_id in label2id.items()}

label2id

## Prepare tokenizer

### Load from Hugging Face

In [5]:
from transformers import AutoTokenizer

# Tokenizer that matches the checkpoint chosen in the configuration cell
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Tokenize dataset

In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align POS tags to the tokens.

    Args:
        examples: Batch with "tokens" (one list of words per sentence) and
            "pos_tags" (one tag id per word).

    Returns:
        The tokenizer output with an added "labels" entry. Each label row holds
        the tag id on the first sub-token of every word and -100 on special
        tokens and on the remaining sub-tokens of a word.
    """
    # NOTE(review): padding=True pads to the longest sentence of each mapped
    # batch; the token-classification collator could pad per training batch instead.
    encoded = tokenizer(
        examples["tokens"], truncation=True, padding=True, is_split_into_words=True
    )

    aligned_rows = []
    for row, word_tags in enumerate(examples["pos_tags"]):
        row_word_ids = encoded.word_ids(batch_index=row)  # token position -> word index
        # Pair each word id with the one just before it (None ahead of the first token)
        preceding = [None] + list(row_word_ids[:-1])
        aligned_rows.append(
            [
                word_tags[current]
                if current is not None and current != before
                else -100
                for current, before in zip(row_word_ids, preceding)
            ]
        )

    encoded["labels"] = aligned_rows
    return encoded


# Run the tokenize-and-align step over every split, one batch of examples at a time
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

In [7]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer to assemble token-classification batches
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Fine-tune model

## Download model

In [None]:
from transformers import AutoModelForTokenClassification


# Token-classification model built from the chosen checkpoint, sized to the tag set
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

## Fine-tune model

### Defining custom metrics to evaluate model

In [9]:
from sklearn.metrics import precision_score
import numpy as np

def compute_metrics(p):
    """Compute macro, weighted and per-tag precision for a batch of predictions.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores on the last axis; ``labels`` holds the gold tag ids, with
            -100 marking positions that must be ignored.

    Returns:
        Dict with "macro_precision", "weighted_precision" and
        "per_class_precision" (tag name -> precision for every tag that occurs
        in the gold labels or in the predictions).
    """
    predictions, labels = p

    # Predicted tag id = index of the highest score
    predictions = np.argmax(predictions, axis=-1)

    # Flatten and drop ignored positions (-100)
    true_labels = labels.flatten()
    pred_labels = predictions.flatten()
    mask = true_labels != -100
    true_labels = true_labels[mask]
    pred_labels = pred_labels[mask]

    macro_precision = precision_score(
        true_labels, pred_labels, average="macro", zero_division=0
    )
    weighted_precision = precision_score(
        true_labels, pred_labels, average="weighted", zero_division=0
    )

    # Per-class scores are reported for the sorted union of gold and predicted
    # classes. Pass that union explicitly and pair names with it, so a class that
    # is only predicted (never gold) cannot shift scores onto the wrong tag.
    present_classes = np.union1d(true_labels, pred_labels)
    per_class_precision = precision_score(
        true_labels,
        pred_labels,
        labels=present_classes,
        average=None,
        zero_division=0,
    )
    per_class_precision_dict = {
        id2label[int(tag)]: float(score)
        for tag, score in zip(present_classes, per_class_precision)
    }

    return {
        "macro_precision": macro_precision,
        "weighted_precision": weighted_precision,
        "per_class_precision": per_class_precision_dict,
    }


In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, CPU only.
# `eval_strategy` replaces the deprecated `evaluation_strategy` argument name.
training_args = TrainingArguments(
    output_dir=".results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="epoch",
    use_cpu=True,
)

# Trainer wiring the model, the tokenized splits, the collator and the custom metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [11]:
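# NOTE: training is disabled here, so the evaluation and the model saved below
# reflect the model as loaded above, not a fine-tuned one. Uncomment to fine-tune.
# trainer.train()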

# Evaluation

## Against test data

In [None]:
# Score the model on the held-out test split with the custom precision metrics
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(results)

# Persist model and tokenizer side by side so they can be reloaded together
save_dir = "./fine_tuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

## Against another model trained on the same dataset

In [None]:
# Load `lisaterumi/postagger-portuguese` from hugging face and store in a `competitor_model` variable

from transformers import BertForTokenClassification

# Both the competitor model and its tokenizer come from the same Hub repository
competitor_checkpoint = "lisaterumi/postagger-portuguese"
competitor_model = BertForTokenClassification.from_pretrained(competitor_checkpoint)
competitor_tokenizer = AutoTokenizer.from_pretrained(competitor_checkpoint)

# Keep the competitor on the CPU, like the rest of the notebook
competitor_model.to("cpu")

In [None]:
def tokenize_and_align_labels_for_comparison(examples):
    """Re-tokenize a batch with the competitor tokenizer and align POS tags to its tokens.

    Args:
        examples: Batch with "tokens" (one list of words per sentence) and
            "pos_tags" (one tag id per word).

    Returns:
        The competitor tokenizer's output with an added "labels" entry: the tag
        id on the first sub-token of each word and -100 on special/padding
        tokens and on the remaining sub-tokens, matching the labelling scheme of
        ``tokenize_and_align_labels`` so both models are scored on the same positions.
    """
    tokenized_inputs = competitor_tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
    )

    # Word-level tags must come from "pos_tags". A "labels" column already present
    # on the input is aligned to another tokenizer's tokens, so indexing it by
    # word id would pick the wrong entries.
    # NOTE(review): assumes the competitor model uses the same tag-id mapping as
    # the dataset's pos_tags — confirm against competitor_model.config.id2label.
    labels = []
    for i, word_tags in enumerate(examples["pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                aligned_labels.append(-100)  # special/padding token or later sub-token
            else:
                aligned_labels.append(word_tags[word_id])  # first sub-token of a word
            previous_word_id = word_id
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Re-tokenize every split with the competitor tokenizer, one batch at a time
tokenized_test_dataset_comparison = tokenized_dataset.map(
    function=tokenize_and_align_labels_for_comparison,
    batched=True,
)

In [None]:
# Evaluate the `competitor_model` with the same `compute_metrics` function on the
# splits re-tokenized for it (`tokenized_test_dataset_comparison`)

# Evaluation-only arguments: no training and no scheduled evaluation, CPU only.
# `eval_strategy` replaces the deprecated `evaluation_strategy` argument name.
dummy_training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    do_train=False,
    do_eval=True,
    eval_strategy="no",
    use_cpu=True,
)

# Create a Trainer for the comparison model. It must receive the evaluation-only
# arguments defined above, not the fine-tuning `training_args`.
competitor_trainer = Trainer(
    model=competitor_model,
    args=dummy_training_args,
    eval_dataset=tokenized_test_dataset_comparison["validation"],
    processing_class=competitor_tokenizer,
    compute_metrics=compute_metrics,  # Use the same custom metric function
)

In [None]:
# Score the competitor on the test split that was re-tokenized for it
competitor_test_split = tokenized_test_dataset_comparison["test"]
eval_results_comparison = competitor_trainer.evaluate(eval_dataset=competitor_test_split)

print(eval_results_comparison)