### Install modules
Note: when training starts you might also be prompted to enter an API key from [Weights & Biases](https://wandb.ai/login).

In [1]:
!pip install transformers datasets evaluate accelerate peft Pillow

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.1

### Imports

In [2]:
import torch
from transformers import RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, AutoPeftModelForSequenceClassification
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

### Model names

In [3]:
# Names used when loading and saving models.
base_model = 'roberta-base'                     # pretrained checkpoint the classifier starts from
modified_base = 'roberta-portuguese-modified'   # where the tokenizer is saved and later reloaded
peft_model_name = 'roberta-portuguese-peft'     # where the LoRA model is saved and later reloaded

### Config Parameters

In [4]:
# LoRA adapter hyperparameters (forwarded to LoraConfig)
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Optimisation hyperparameters
learning_rate = 5e-5
batch_size = 16   # per-device training batch size; also reused by the evaluation DataLoader
n_epochs = 20

### Get Dataset in Portuguese

In [5]:
# Load dataset
# Fetches the LIACC Emakhuwa-Portuguese news corpus. The download log below
# shows three splits: train (17403), validation (964) and test (993) examples.
dataset = load_dataset('LIACC/Emakhuwa-Portuguese-News-MT')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

dev.jsonl:   0%|          | 0.00/654k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/662k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17403 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/964 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/993 [00:00<?, ? examples/s]

### Example of an instance in the dataset (we'll use "pt" as the input and "category" as the label)

In [6]:
# Show the first raw training record so the available fields are visible.
dataset['train'][0]

{'seg_index': 2,
 'pt': 'Matias Guente, do Canal de Moçambique, vence Prémio Internacional de Liberdade de Imprensa',
 'vmw': 'Matias Guene, ooKanaale ya Mocampiikhi, oolola e peremiyo internasionaale ya woopowa wa imprensa',
 'source': 'a_matias-guente-do-canal-de-moçambique-vence-prémio-internacional-de-liberdade-de-imprensa_5930109.txt',
 'project_title': 'desporto-cultura-1',
 'category': 'cultura',
 'domain': 'news',
 'writting_style': 'standard',
 'job_id': '6121524-e1d3e4d73a0f',
 'translators': 'Raja,benedito',
 'project_id': 6121524.0,
 'segment_id': 2591966325.0,
 'i_segment_id': 612152459301090.0}

### Merge all sets (train, test, validation) and filter by labels

In [7]:
# Define selected labels
selected_labels = {'cultura', 'desporto', 'economia', 'mundo', 'saude'}
num_labels = len(selected_labels)

# Convert category names to numeric labels. Sorting first makes the mapping
# independent of set iteration order.
label2id = {label: i for i, label in enumerate(sorted(selected_labels))}
id2label = {i: label for label, i in label2id.items()}

# Merge all dataset splits into a single pair of parallel lists, in the order
# train -> validation -> test. `extend` accepts any iterable, so this does not
# rely on the column objects supporting `+`.
# NOTE(review): each record carries a 'source' field, which looks like the
# originating article. Pooling the splits and re-splitting at random may put
# segments of one article in both train and test - confirm this is acceptable.
all_data = {'pt': [], 'category': []}
for split_name in ('train', 'validation', 'test'):
    for column in all_data:
        all_data[column].extend(dataset[split_name][column])

# Filter dataset to only include selected labels, attaching the numeric label
filtered_data = {
    'pt': [],
    'category': [],
    'labels': []
}
for text, category in zip(all_data['pt'], all_data['category']):
    if category not in selected_labels:
        continue
    filtered_data['pt'].append(text)
    filtered_data['category'].append(category)
    filtered_data['labels'].append(label2id[category])

# Fail early (with a clear message) if the label names no longer match the data
assert filtered_data['pt'], "No examples matched selected_labels"

# Convert to Hugging Face Dataset
full_dataset = Dataset.from_dict(filtered_data)

# Shuffle dataset
full_dataset = full_dataset.shuffle(seed=42)

### Re-split the dataset into train, test and validation

In [8]:
# Target proportions: 80% train, 10% validation, 10% test
train_size, val_size, test_size = 0.8, 0.1, 0.1

# Step 1: carve off the combined validation+test portion; the rest is training data.
holdout_fraction = val_size + test_size
train_test_split = full_dataset.train_test_split(test_size=holdout_fraction, seed=42)
train_dataset = train_test_split['train']

# Step 2: divide the held-out portion between validation and test.
val_test_split = train_test_split['test'].train_test_split(
    test_size=(test_size / holdout_fraction),
    seed=42,
)
eval_dataset, test_dataset = val_test_split['train'], val_test_split['test']

### Tokenize the datasets

In [9]:
# Initialize tokenizer from the same checkpoint the model is loaded from
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Tokenization function
def preprocess(examples):
    """Tokenize one batch of examples for `Dataset.map(batched=True)`.

    Args:
        examples: batch mapping whose 'pt' entry holds the input texts.

    Returns:
        The tokenizer output for those texts, truncated but NOT padded.
    """
    # No padding here: `map` would pad every example to the longest text in its
    # (large) map chunk. Padding is applied per training/eval batch by the
    # DataCollatorWithPadding configured for the Trainer and the DataLoader.
    return tokenizer(examples['pt'], truncation=True)

# Tokenize the datasets and remove unnecessary columns (the raw text and the
# string category; the numeric 'labels' column is kept)
text_columns = ['pt', 'category']
train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=text_columns)
eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=text_columns)
test_dataset = test_dataset.map(preprocess, batched=True, remove_columns=text_columns)

# Print dataset sizes
print("\nDataset sizes:")
print(f"Train set: {len(train_dataset)}")
print(f"Eval set: {len(eval_dataset)}")
print(f"Test set: {len(test_dataset)}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/11478 [00:00<?, ? examples/s]

Map:   0%|          | 0/1435 [00:00<?, ? examples/s]

Map:   0%|          | 0/1435 [00:00<?, ? examples/s]


Dataset sizes:
Train set: 11478
Eval set: 1435
Test set: 1435


### Config Trainer

In [10]:
# Prepare data collator (pads each batch to its own longest sequence)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # No `eval_steps` is set; the training log shows an evaluation every 500 steps.
    eval_strategy='steps',
    learning_rate=learning_rate,
    num_train_epochs=n_epochs,
    per_device_train_batch_size=batch_size,
)

def get_trainer(model):
    """Build a Trainer for `model` using the notebook-level configuration.

    Args:
        model: the model to train (here, the LoRA-wrapped classifier).

    Returns:
        A `Trainer` wired to `training_args`, `train_dataset`, `eval_dataset`
        and `data_collator` as defined at module level.
    """
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )



### Initialize model

In [11]:
# Initialize the base model
# Loads the `base_model` checkpoint with a sequence-classification head sized to
# `num_labels`. The load log below reports the classifier weights as newly
# initialized, i.e. the head is untrained at this point.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Config LoRA

In [12]:
# Configure and create PEFT model
# No `target_modules` is given, so the choice of adapted layers is left to the
# library - NOTE(review): confirm the defaults are the intended layers.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout
)
# Wrap the base classifier with the LoRA configuration.
peft_model = get_peft_model(model, peft_config)
print('PEFT Model')
# Report trainable vs. total parameter counts (about 2.3% trainable in the run below).
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 2,953,733 || all params: 127,603,210 || trainable%: 2.3148


### Train and Save

In [13]:
# Fine-tune the LoRA-wrapped model.
peft_lora_finetuning_trainer = get_trainer(peft_model)
peft_lora_finetuning_trainer.train()

# Keep and show the final validation metrics. Previously the return value of
# evaluate() was discarded, since it was not the last expression of the cell.
final_eval_metrics = peft_lora_finetuning_trainer.evaluate()
print(final_eval_metrics)

# Save the model and tokenizer
tokenizer.save_pretrained(modified_base)
peft_model.save_pretrained(peft_model_name)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33molijacklu[0m ([33molijacklu-cole-normale-sup-rieure-paris-saclay[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
500,1.3166,0.98459
1000,1.0221,0.913876
1500,0.9749,0.858731
2000,0.9234,0.843835
2500,0.8982,0.825224
3000,0.8804,0.818557
3500,0.8632,0.796267
4000,0.8491,0.801545
4500,0.834,0.781278
5000,0.8142,0.775486


### Evaluate

In [14]:
# Inference functions
def load_model_for_inference():
    """Reload the saved LoRA classifier and its tokenizer for inference.

    Reads the module-level names `peft_model_name`, `modified_base`,
    `id2label` and `label2id`.

    Returns:
        Tuple `(inference_model, tokenizer)`.
    """
    inference_model = AutoPeftModelForSequenceClassification.from_pretrained(
        peft_model_name,
        id2label=id2label,
        # Pass both mappings so the reloaded config matches the training-time model.
        label2id=label2id,
    )
    # Local name chosen so it does not shadow the module-level `tokenizer`.
    saved_tokenizer = RobertaTokenizer.from_pretrained(modified_base)
    return inference_model, saved_tokenizer

def classify(text, inference_model, tokenizer, gold):
    """Classify one text and print the prediction next to the gold label.

    Args:
        text: the input string to classify.
        inference_model: sequence-classification model whose output has `.logits`.
        tokenizer: callable turning `text` into a mapping of input tensors.
        gold: expected label name, compared against the predicted label.

    Returns:
        None. The text, predicted label, gold label and a ✅/❌ marker are printed.
        Label names are looked up in the module-level `id2label`.

    Side effects:
        Puts `inference_model` into eval mode.
    """
    # Make predictions deterministic even if the model was left in training
    # mode (e.g. straight after training): eval mode disables dropout.
    inference_model.eval()

    device = next(inference_model.parameters()).device  # Get the device the model is on
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = inference_model(**inputs)
    # `.item()` requires a single prediction, i.e. exactly one input text.
    prediction = output.logits.argmax(dim=-1).item()
    predicted_label = id2label[prediction]
    emoji = "✅" if predicted_label == gold else "❌"

    print(f'Text: {text}\nPrediction: {predicted_label}\nGold: {gold}\n{emoji}\n')


# Evaluation function
def evaluate_model(inference_model, dataset):
    """Compute accuracy of `inference_model` over `dataset`.

    Batches are built with the module-level `batch_size` and `data_collator`.
    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode before the loop.

    Args:
        inference_model: model whose forward output exposes `.logits`.
        dataset: tokenized dataset containing a "labels" entry per example.

    Returns:
        The result of the `accuracy` metric's `compute()`.
    """
    accuracy = evaluate.load('accuracy')
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inference_model.to(target_device)
    inference_model.eval()

    for encoded in tqdm(loader):
        # The return value is unused, so this relies on `.to` updating the
        # collated batch in place.
        encoded.to(target_device)
        with torch.no_grad():
            logits = inference_model(**encoded).logits
            predicted_ids = logits.argmax(dim=-1)
        accuracy.add_batch(predictions=predicted_ids, references=encoded["labels"])

    return accuracy.compute()

In [15]:
# Baseline: the pretrained checkpoint with a fresh classification head.
# The load log reports the classifier weights as newly initialized, so this
# baseline's accuracy reflects an untrained head.
original_model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
    # Pass both mappings, matching how the training-time model was configured.
    label2id=label2id,
)
base_perf = evaluate_model(original_model, test_dataset)

# Evaluate LoRA fine-tuned model
# Note: this rebinds the module-level `tokenizer` to the reloaded copy.
inference_model, tokenizer = load_model_for_inference()
lora_perf = evaluate_model(inference_model, test_dataset)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

100%|██████████| 90/90 [00:09<00:00,  9.65it/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 90/90 [00:09<00:00,  9.18it/s]


### Compare test accuracies: Base vs LoRA

In [16]:
print(f"Base model performance: {base_perf['accuracy']:.3f}")
print(f"LoRA Fine-tuned model performance: {lora_perf['accuracy']:.3f}")

Base model performance: 0.166
LoRA Fine-tuned model performance: 0.734


### Try the models on a few made-up examples written by me (a native speaker)

### Original model

In [17]:
# Test classification: hand-written (text, gold label) pairs run through the
# baseline model, one printed report per pair.
sample_cases = [
    ("O chanceler se encontrou ontem com o Primeiro Ministro britânico para discutir as relações comerciais entre os dois países", "mundo"),
    ("O Banco Central decidiu aumentar novamente a taxa SELIC para tentar combater as altas da inflação", "economia"),
    ("O Cruzeiro anunciou a contratação de um novo treinador para o restante da temporada de 2025", "desporto"),
    ("O filme Ainda Estou aqui, que conta com a atriz Fernanda Torres, foi indicado a três premios, incluindo Melhor Filme", "cultura"),
    ("Jõao Fonseca ganhou ontem seu primeiro torneio na Argentina", "desporto"),
    ("Aumenta o número de casos de catapora e sarampo nas escolas da rede pública do país", "saude"),
    ("O Ministro da Fazenda estuda implementar uma taxa sobre importações vindas da China", "economia"),
    ("Representantes de Rússia e Estados Unidos se reuniram para debater o fim da guerra na Ucrânina", "mundo"),
    ("O surto de casos de dengue em São Paulo fez com que o governo intesificasse a campanha de vacinação", "saude"),
]

for sample_text, gold in sample_cases:
    classify(sample_text, original_model, tokenizer, gold)

Text: O chanceler se encontrou ontem com o Primeiro Ministro britânico para discutir as relações comerciais entre os dois países
Prediction: saude
Gold: mundo
❌

Text: O Banco Central decidiu aumentar novamente a taxa SELIC para tentar combater as altas da inflação
Prediction: saude
Gold: economia
❌

Text: O Cruzeiro anunciou a contratação de um novo treinador para o restante da temporada de 2025
Prediction: saude
Gold: desporto
❌

Text: O filme Ainda Estou aqui, que conta com a atriz Fernanda Torres, foi indicado a três premios, incluindo Melhor Filme
Prediction: saude
Gold: cultura
❌

Text: Jõao Fonseca ganhou ontem seu primeiro torneio na Argentina
Prediction: saude
Gold: desporto
❌

Text: Aumenta o número de casos de catapora e sarampo nas escolas da rede pública do país
Prediction: saude
Gold: saude
✅

Text: O Ministro da Fazenda estuda implementar uma taxa sobre importações vindas da China
Prediction: saude
Gold: economia
❌

Text: Representantes de Rússia e Estados Unidos se reun

### LoRA model

In [18]:
# Test classification: the same hand-written (text, gold label) pairs, this
# time run through the LoRA fine-tuned model.
sample_cases = [
    ("O chanceler se encontrou ontem com o Primeiro Ministro britânico para discutir as relações comerciais entre os dois países", "mundo"),
    ("O Banco Central decidiu aumentar novamente a taxa SELIC para tentar combater as altas da inflação", "economia"),
    ("O Cruzeiro anunciou a contratação de um novo treinador para o restante da temporada de 2025", "desporto"),
    ("O filme Ainda Estou aqui, que conta com a atriz Fernanda Torres, foi indicado a três premios, incluindo Melhor Filme", "cultura"),
    ("Jõao Fonseca ganhou ontem seu primeiro torneio na Argentina", "desporto"),
    ("Aumenta o número de casos de catapora e sarampo nas escolas da rede pública do país", "saude"),
    ("O Ministro da Fazenda estuda implementar uma taxa sobre importações vindas da China", "economia"),
    ("Representantes de Rússia e Estados Unidos se reuniram para debater o fim da guerra na Ucrânina", "mundo"),
    ("O surto de casos de dengue em São Paulo fez com que o governo intesificasse a campanha de vacinação", "saude"),
]

for sample_text, gold in sample_cases:
    classify(sample_text, inference_model, tokenizer, gold)

Text: O chanceler se encontrou ontem com o Primeiro Ministro britânico para discutir as relações comerciais entre os dois países
Prediction: mundo
Gold: mundo
✅

Text: O Banco Central decidiu aumentar novamente a taxa SELIC para tentar combater as altas da inflação
Prediction: economia
Gold: economia
✅

Text: O Cruzeiro anunciou a contratação de um novo treinador para o restante da temporada de 2025
Prediction: desporto
Gold: desporto
✅

Text: O filme Ainda Estou aqui, que conta com a atriz Fernanda Torres, foi indicado a três premios, incluindo Melhor Filme
Prediction: cultura
Gold: cultura
✅

Text: Jõao Fonseca ganhou ontem seu primeiro torneio na Argentina
Prediction: desporto
Gold: desporto
✅

Text: Aumenta o número de casos de catapora e sarampo nas escolas da rede pública do país
Prediction: saude
Gold: saude
✅

Text: O Ministro da Fazenda estuda implementar uma taxa sobre importações vindas da China
Prediction: economia
Gold: economia
✅

Text: Representantes de Rússia e Estados 