In [1]:
dataset_name = "nilc-nlp/mac_morpho"
model_name = "neuralmind/bert-base-portuguese-cased"

# Prepare dataset

## Load dataset from HuggingFace

In [2]:
from datasets import load_dataset

dataset = load_dataset(dataset_name, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


## Prepare dataset

### Label mapping

In [3]:
# Get unique labels
labels = dataset["train"].features["pos_tags"].feature.names

# Create a mapping
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

label2id

{'PREP+PROADJ': 0,
 'IN': 1,
 'PREP+PRO-KS': 2,
 'NPROP': 3,
 'PREP+PROSUB': 4,
 'KC': 5,
 'PROPESS': 6,
 'NUM': 7,
 'PROADJ': 8,
 'PREP+ART': 9,
 'KS': 10,
 'PRO-KS': 11,
 'ADJ': 12,
 'ADV-KS': 13,
 'N': 14,
 'PREP': 15,
 'PROSUB': 16,
 'PREP+PROPESS': 17,
 'PDEN': 18,
 'V': 19,
 'PREP+ADV': 20,
 'PCP': 21,
 'CUR': 22,
 'ADV': 23,
 'PU': 24,
 'ART': 25}

## Prepare tokenizer

### Load from HuggingFace

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

### Tokenize dataset

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, padding=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        aligned_labels = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Ignore special tokens
            elif word_id != previous_word_id:  # Start of a new word
                aligned_labels.append(label[word_id])
            else:
                aligned_labels.append(-100)  # Ignore subword tokens
            previous_word_id = word_id
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [6]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine tune model

## Download model

In [7]:
from transformers import AutoModelForTokenClassification


model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine tune model

### Defining custom metric to evaluate model

In [8]:
from sklearn.metrics import accuracy_score
import numpy as np


def compute_metrics(p):
    predictions, labels = p
    # Remove ignored index (e.g., -100 used for padding)
    predictions = np.argmax(predictions, axis=-1)

    # Flatten the lists of predictions and labels
    true_labels = labels.flatten()
    pred_labels = predictions.flatten()

    # Mask the -100 labels (ignore them)
    mask = true_labels != -100
    true_labels = true_labels[mask]
    pred_labels = pred_labels[mask]

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(true_labels, pred_labels)

    # Calculate per-tag accuracy
    unique_tags = np.unique(true_labels)
    per_tag_accuracy = {}
    for tag in unique_tags:
        # Mask for the current tag
        tag_mask = true_labels == tag
        tag_true = true_labels[tag_mask]
        tag_pred = pred_labels[tag_mask]

        # Calculate accuracy for this tag
        tag_accuracy = accuracy_score(tag_true, tag_pred)
        per_tag_accuracy[id2label[tag]] = tag_accuracy

    # Return the overall accuracy and per-tag accuracy
    return {"overall_accuracy": overall_accuracy, "per_tag_accuracy": per_tag_accuracy}

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



In [10]:
# trainer.train()

# Evaluate on test data

In [11]:
results = trainer.evaluate(
    tokenized_dataset["test"].shuffle(seed=42).select(range(0, 150))
)
print(results)

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

{'eval_loss': 3.260129690170288, 'eval_model_preparation_time': 0.0043, 'eval_overall_accuracy': 0.04455081001472754, 'eval_per_tag_accuracy': {'PREP+PROADJ': 0.0, 'NPROP': 0.005154639175257732, 'PREP+PROSUB': 0.0, 'KC': 0.013513513513513514, 'PROPESS': 0.07272727272727272, 'NUM': 0.03125, 'PROADJ': 0.0, 'PREP+ART': 0.0, 'KS': 0.06382978723404255, 'PRO-KS': 0.030303030303030304, 'ADJ': 0.007692307692307693, 'ADV-KS': 0.0, 'N': 0.0954861111111111, 'PREP': 0.0035971223021582736, 'PROSUB': 0.16666666666666666, 'PREP+PROPESS': 0.0, 'PDEN': 0.08333333333333333, 'V': 0.056962025316455694, 'PREP+ADV': 0.0, 'PCP': 0.0, 'CUR': 0.2, 'ADV': 0.06666666666666667, 'PU': 0.05912596401028278, 'ART': 0.004807692307692308}, 'eval_runtime': 39.2288, 'eval_samples_per_second': 3.824, 'eval_steps_per_second': 0.484}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')