In [1]:
# Set value of CUDA_VISIBLE_DEVICES="" to disable GPU
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
dataset_name = "nilc-nlp/mac_morpho"
model_name = "neuralmind/bert-base-portuguese-cased"

# Prepare dataset

## Load dataset from HuggingFace

In [3]:
from datasets import load_dataset

dataset = load_dataset(dataset_name, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


## Prepare dataset

### Label mapping

In [4]:
# Get unique labels
labels = dataset["train"].features["pos_tags"].feature.names

# Create a mapping
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

label2id

{'PREP+PROADJ': 0,
 'IN': 1,
 'PREP+PRO-KS': 2,
 'NPROP': 3,
 'PREP+PROSUB': 4,
 'KC': 5,
 'PROPESS': 6,
 'NUM': 7,
 'PROADJ': 8,
 'PREP+ART': 9,
 'KS': 10,
 'PRO-KS': 11,
 'ADJ': 12,
 'ADV-KS': 13,
 'N': 14,
 'PREP': 15,
 'PROSUB': 16,
 'PREP+PROPESS': 17,
 'PDEN': 18,
 'V': 19,
 'PREP+ADV': 20,
 'PCP': 21,
 'CUR': 22,
 'ADV': 23,
 'PU': 24,
 'ART': 25}

## Prepare tokenizer

### Load from HuggingFace

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

### Tokenize dataset

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, padding=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["pos_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        aligned_labels = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Ignore special tokens
            elif word_id != previous_word_id:  # Start of a new word
                aligned_labels.append(label[word_id])
            else:
                aligned_labels.append(-100)  # Ignore subword tokens
            previous_word_id = word_id
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [7]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine tune model

## Download model

In [8]:
from transformers import AutoModelForTokenClassification


model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine tune model

### Defining custom metrics to evaluate model

In [9]:
from sklearn.metrics import precision_score
import numpy as np

def compute_metrics(p):
    predictions, labels = p

    # Get predicted labels by taking argmax
    predictions = np.argmax(predictions, axis=-1)
    
    # Flatten and filter out ignored tokens (-100)
    true_labels = labels.flatten()
    pred_labels = predictions.flatten()
    mask = true_labels != -100
    true_labels = true_labels[mask]
    pred_labels = pred_labels[mask]
    
    # Compute precision
    macro_precision = precision_score(true_labels, pred_labels, average="macro", zero_division=0)
    weighted_precision = precision_score(true_labels, pred_labels, average="weighted", zero_division=0)
    per_class_precision = precision_score(true_labels, pred_labels, average=None, zero_division=0)

    # Map class indices to precision values
    unique_tags = np.unique(true_labels)
    per_class_precision_dict = {id2label[int(tag)]: float(per_class_precision[i]) for i, tag in enumerate(unique_tags)}
    
    return {
        "macro_precision": macro_precision,
        "weighted_precision": weighted_precision,
        "per_class_precision": per_class_precision_dict,
    }


In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=".results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="epoch",
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



In [11]:
# trainer.train()

# Evaluation

## Against test data

In [12]:
results = trainer.evaluate(
    # tokenized_dataset["test"].shuffle(seed=42).select(range(0, 10))
    tokenized_dataset["test"]
)
print(results)

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

{'eval_loss': 3.2472450733184814, 'eval_model_preparation_time': 0.004, 'eval_macro_precision': 0.039374073146533455, 'eval_weighted_precision': 0.1169947038889422, 'eval_per_class_precision': {'PREP+PROADJ': 0.001557026080186843, 'IN': 0.00022401433691756272, 'PREP+PRO-KS': 0.0009269988412514484, 'NPROP': 0.03873574144486692, 'PREP+PROSUB': 0.0015748031496062992, 'KC': 0.04230694887406978, 'PROPESS': 0.0006574621959237344, 'NUM': 0.038202247191011236, 'PROADJ': 0.005723630417007359, 'PREP+ART': 0.03877908431323493, 'KS': 0.0026536930561698365, 'PRO-KS': 0.018703898840885143, 'ADJ': 0.09069981583793739, 'ADV-KS': 0.0004393673110720562, 'N': 0.3588334742180896, 'PREP': 0.12414823112106786, 'PROSUB': 0.017066233238520925, 'PREP+PROPESS': 0.002097535395909806, 'PDEN': 0.0, 'V': 0.12671905697445973, 'PREP+ADV': 0.00044583147570218456, 'PCP': 0.015246015246015246, 'CUR': 0.0, 'ADV': 0.06662553979025293, 'PU': 0.013074969429028313, 'ART': 0.01828428303068253}, 'eval_runtime': 1656.9818, 'eva

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

## Against another model trained on the same dataset

In [13]:
# Load `lisaterumi/postagger-portuguese` from hugging face and store in a `competitor_model` variable

from transformers import BertForTokenClassification

competitor_model = BertForTokenClassification.from_pretrained(
    "lisaterumi/postagger-portuguese"
)

competitor_tokenizer = AutoTokenizer.from_pretrained("lisaterumi/postagger-portuguese")

competitor_model.to("cpu")

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [14]:
def tokenize_and_align_labels_for_comparison(examples):
    tokenized_inputs = competitor_tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
    )

    # Align labels with tokenized inputs
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [
            -100 if word_id is None else label[word_id] for word_id in word_ids
        ]
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_test_dataset_comparison = tokenized_dataset.map(
    tokenize_and_align_labels_for_comparison, batched=True
)

Map: 100%|██████████| 1997/1997 [00:00<00:00, 4166.36 examples/s]


In [15]:
# Evaluate the `competitor_model` using the same `compute_metrics` function and the same `tokenized_dataset["test"]` dataset

# Define dummy training arguments (just for evaluation)
dummy_training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    do_train=False,
    do_eval=True,
    evaluation_strategy="no",
)

# Create a Trainer for the comparison model
competitor_trainer = Trainer(
    model=competitor_model,
    args=training_args,
    eval_dataset=tokenized_test_dataset_comparison["validation"],
    processing_class=competitor_tokenizer,
    compute_metrics=compute_metrics,  # Use the same custom metric function
)



In [16]:
eval_results_comparison = competitor_trainer.evaluate(
    tokenized_test_dataset_comparison["test"]
)

print(eval_results_comparison)

{'eval_loss': 10.260217666625977, 'eval_model_preparation_time': 0.0035, 'eval_macro_precision': 0.034122055046705034, 'eval_weighted_precision': 0.07739469736230463, 'eval_per_class_precision': {'PREP+PROADJ': 0.0, 'IN': 0.0005425935973955507, 'PREP+PRO-KS': 0.0, 'NPROP': 0.06532663316582915, 'PREP+PROSUB': 0.0005937654626422563, 'KC': 0.05448717948717949, 'PROPESS': 0.01650038372985418, 'NUM': 0.011442141623488774, 'PROADJ': 0.018442622950819672, 'PREP+ART': 0.05470727180698498, 'KS': 0.009498564170532362, 'PRO-KS': 0.022401433691756272, 'ADJ': 0.049019607843137254, 'ADV-KS': 0.002431681333950903, 'N': 0.1430054848500985, 'PREP': 0.08588692274492879, 'PROSUB': 0.08333333333333333, 'PREP+PROPESS': 0.0, 'PDEN': 0.0050009805844283195, 'V': 0.0761904761904762, 'PREP+ADV': 0.0, 'PCP': 0.006422018348623854, 'CUR': 0.0, 'ADV': 0.058823529411764705, 'PU': 0.12311680688710624, 'ART': 0.0}, 'eval_runtime': 1371.498, 'eval_samples_per_second': 7.282, 'eval_steps_per_second': 0.911}
