In [1]:
# Set value of CUDA_VISIBLE_DEVICES="" to disable GPU
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
dataset_name = "nilc-nlp/mac_morpho"
model_name = "neuralmind/bert-base-portuguese-cased"

# Prepare dataset

## Load dataset from HuggingFace

In [3]:
from datasets import load_dataset

dataset = load_dataset(dataset_name, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


## Prepare dataset

### Label mapping

In [4]:
# Get unique labels
labels = dataset["train"].features["pos_tags"].feature.names

# Create a mapping
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

label2id

{'PREP+PROADJ': 0,
 'IN': 1,
 'PREP+PRO-KS': 2,
 'NPROP': 3,
 'PREP+PROSUB': 4,
 'KC': 5,
 'PROPESS': 6,
 'NUM': 7,
 'PROADJ': 8,
 'PREP+ART': 9,
 'KS': 10,
 'PRO-KS': 11,
 'ADJ': 12,
 'ADV-KS': 13,
 'N': 14,
 'PREP': 15,
 'PROSUB': 16,
 'PREP+PROPESS': 17,
 'PDEN': 18,
 'V': 19,
 'PREP+ADV': 20,
 'PCP': 21,
 'CUR': 22,
 'ADV': 23,
 'PU': 24,
 'ART': 25}

## Prepare tokenizer

### Load from HuggingFace

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

### Tokenize dataset

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, padding=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        aligned_labels = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Ignore special tokens
            elif word_id != previous_word_id:  # Start of a new word
                aligned_labels.append(label[word_id])
            else:
                aligned_labels.append(-100)  # Ignore subword tokens
            previous_word_id = word_id
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [7]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine tune model

## Download model

In [8]:
from transformers import AutoModelForTokenClassification


model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine tune model

### Defining custom metric to evaluate model

In [9]:
from sklearn.metrics import accuracy_score
import numpy as np


def compute_metrics(p):
    predictions, labels = p
    # Remove ignored index (e.g., -100 used for padding)
    predictions = np.argmax(predictions, axis=-1)

    # Flatten the lists of predictions and labels
    true_labels = labels.flatten()
    pred_labels = predictions.flatten()

    # Mask the -100 labels (ignore them)
    mask = true_labels != -100
    true_labels = true_labels[mask]
    pred_labels = pred_labels[mask]

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(true_labels, pred_labels)

    # Calculate per-tag accuracy
    unique_tags = np.unique(true_labels)
    per_tag_accuracy = {}
    for tag in unique_tags:
        # Mask for the current tag
        tag_mask = true_labels == tag
        tag_true = true_labels[tag_mask]
        tag_pred = pred_labels[tag_mask]

        # Calculate accuracy for this tag
        tag_accuracy = accuracy_score(tag_true, tag_pred)
        per_tag_accuracy[id2label[tag]] = tag_accuracy

    # Return the overall accuracy and per-tag accuracy
    return {"overall_accuracy": overall_accuracy, "per_tag_accuracy": per_tag_accuracy}

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=".results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="epoch",
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



In [11]:
# trainer.train()

# Evaluation

## Against test data

In [12]:
results = trainer.evaluate(
    # tokenized_dataset["test"].shuffle(seed=42).select(range(0, 150))
    tokenized_dataset["test"]
)
print(results)

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

{'eval_loss': 3.3340036869049072, 'eval_model_preparation_time': 0.0067, 'eval_overall_accuracy': 0.02604093668884865, 'eval_per_tag_accuracy': {'PREP+PROADJ': 0.11003236245954692, 'IN': 0.0, 'PREP+PRO-KS': 0.0, 'NPROP': 0.03978413654618474, 'PREP+PROSUB': 0.0, 'KC': 0.0006621054954756124, 'PROPESS': 0.014603616133518776, 'NUM': 0.01652892561983471, 'PROADJ': 0.00116993272886809, 'PREP+ART': 0.1415989822878951, 'KS': 0.02955082742316785, 'PRO-KS': 0.006378132118451025, 'ADJ': 0.013911620294599018, 'ADV-KS': 0.05217391304347826, 'N': 0.013299764654370314, 'PREP': 0.010966742162355466, 'PROSUB': 0.05679642629227824, 'PREP+PROPESS': 0.047619047619047616, 'PDEN': 0.03021978021978022, 'V': 0.017604383339252194, 'PREP+ADV': 0.0, 'PCP': 0.2662087912087912, 'CUR': 0.0, 'ADV': 0.018912963643040764, 'PU': 7.43383883437407e-05, 'ART': 0.0}, 'eval_runtime': 1539.395, 'eval_samples_per_second': 6.488, 'eval_steps_per_second': 0.811}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

## Against another model trained on the same dataset

In [13]:
# Load `lisaterumi/postagger-portuguese` from hugging face and store in a `competitor_model` variable

from transformers import BertForTokenClassification

competitor_model = BertForTokenClassification.from_pretrained(
    "lisaterumi/postagger-portuguese"
)
competitor_tokenizer = AutoTokenizer.from_pretrained("lisaterumi/postagger-portuguese")

competitor_model.to("cpu")

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [14]:
def tokenize_and_align_labels_for_comparison(examples):
    tokenized_inputs = competitor_tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
    )

    # Align labels with tokenized inputs
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [
            -100 if word_id is None else label[word_id] for word_id in word_ids
        ]
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_test_dataset_comparison = tokenized_dataset.map(
    tokenize_and_align_labels_for_comparison, batched=True
)

Map: 100%|██████████| 37948/37948 [00:07<00:00, 5115.60 examples/s]
Map: 100%|██████████| 9987/9987 [00:02<00:00, 4796.42 examples/s]
Map: 100%|██████████| 1997/1997 [00:00<00:00, 4906.03 examples/s]


In [15]:
# Evaluate the `competitor_model` using the same `compute_metrics` function and the same `tokenized_dataset["test"]` dataset

# Define dummy training arguments (just for evaluation)
dummy_training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    do_train=False,
    do_eval=True,
    evaluation_strategy="no",
)

# Create a Trainer for the comparison model
competitor_trainer = Trainer(
    model=competitor_model,
    args=training_args,
    eval_dataset=tokenized_test_dataset_comparison["validation"],
    processing_class=competitor_tokenizer,
    compute_metrics=compute_metrics,  # Use the same custom metric function
)



In [16]:
eval_results_comparison = competitor_trainer.evaluate(
    tokenized_test_dataset_comparison["test"]
)

print(eval_results_comparison)

{'eval_loss': 10.260217666625977, 'eval_model_preparation_time': 0.0072, 'eval_overall_accuracy': 0.05898346889038964, 'eval_per_tag_accuracy': {'PREP+PROADJ': 0.0, 'IN': 0.011904761904761904, 'PREP+PRO-KS': 0.0, 'NPROP': 0.0008249777890595254, 'PREP+PROSUB': 0.07692307692307693, 'KC': 0.011541072640868975, 'PROPESS': 0.013399813025864755, 'NUM': 0.02203742203742204, 'PROADJ': 0.018421052631578946, 'PREP+ART': 0.12725667189952905, 'KS': 0.04515225761288064, 'PRO-KS': 0.041666666666666664, 'ADJ': 0.011773940345368918, 'ADV-KS': 0.09210526315789473, 'N': 0.15725369638413117, 'PREP': 0.011693500998942296, 'PROSUB': 0.006422607578676943, 'PREP+PROPESS': 0.0, 'PDEN': 0.04713493530499076, 'V': 0.00038580246913580245, 'PREP+ADV': 0.0, 'PCP': 0.001984689537850865, 'CUR': 0.0, 'ADV': 0.0014207068016338128, 'PU': 0.1326202645028716, 'ART': 0.0}, 'eval_runtime': 1361.2956, 'eval_samples_per_second': 7.336, 'eval_steps_per_second': 0.918}
