In [None]:
# Install the `adapters` package with pip (no version is pinned here).
# NOTE(review): the visible cells below import from `peft`, not `adapters`, and the
# recorded output shows this install replacing transformers 4.48.3 with 4.47.1 --
# confirm this dependency is actually needed before keeping the cell.
!pip install adapters

Collecting adapters
  Downloading adapters-1.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers~=4.47.1 (from adapters)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading adapters-1.1.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m128.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, adapters
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed adapters-1.1.0 transformers-4.47.1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, AdamW
from peft import get_peft_model, LoraConfig, TaskType

# Define a model that wraps BERT and applies an adaptor layer.
class TADA(nn.Module):
    """BERT encoder wrapped with LoRA adapters that yields one vector per sequence.

    The pretrained encoder is loaded by name, all of its parameters are frozen,
    and PEFT LoRA adapters (rank 8, alpha 16, dropout 0.1) are attached to the
    modules named "query" and "value".
    """

    def __init__(self, model_name='dccuchile/bert-base-spanish-wwm-uncased'):
        """Load ``model_name``, freeze it and wrap it with the LoRA configuration.

        Args:
            model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        # Switch off gradients for every pretrained weight before adapters are added.
        encoder.requires_grad_(False)

        adapter_settings = dict(
            task_type=TaskType.FEATURE_EXTRACTION,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"],
        )
        self.bert = get_peft_model(encoder, LoraConfig(**adapter_settings))

    def forward(self, input_ids, attention_mask):
        """Run the wrapped encoder and return the hidden state at position 0.

        Args:
            input_ids: Token ids forwarded to the encoder as ``input_ids``.
            attention_mask: Mask forwarded to the encoder as ``attention_mask``.

        Returns:
            ``last_hidden_state[:, 0, :]`` -- the first-token ([CLS]) representation
            of every sequence in the batch.
        """
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return hidden_states[:, 0, :]

In [None]:
# Instantiate the tokenizer and model.
# A single checkpoint name is shared so the tokenizer vocabulary and the encoder
# weights cannot drift apart if one of them is changed later.
MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = TADA(model_name=MODEL_NAME)

# Define optimizer and loss function.
# Only parameters that still require gradients are handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW`. The explicit
# eps / weight_decay values reproduce the defaults of the deprecated class so the
# optimisation hyper-parameters stay the same as before.
optimizer = optim.AdamW(trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0)
# Mean squared error between the two embeddings of each text pair.
criterion = nn.MSELoss()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import Dataset, DataLoader
# Custom Dataset to hold text pairs.
class TextPairDataset(Dataset):
    """Map-style dataset over (original_text, transformed_text) pairs."""

    def __init__(self, data_pairs):
        """Keep a reference to the supplied pairs (no copy is made).

        data_pairs: List of tuples (original_text, transformed_text)
        """
        self.data = data_pairs

    def __len__(self):
        """Number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as a dict with keys 'text1' and 'text2'."""
        first, second = self.data[idx]
        return dict(text1=first, text2=second)

# Collate function to batch and tokenize samples.
def collate_fn(batch, tokenizer, max_length=512):
    """Tokenize both sides of a batch of text pairs.

    Args:
        batch: Iterable of dicts, each holding the keys 'text1' and 'text2'.
        tokenizer: Callable invoked once per side with the list of texts plus
            ``return_tensors="pt"``, ``padding=True``, ``truncation=True`` and
            ``max_length``.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        Tuple ``(encoding_for_text1, encoding_for_text2)`` of tokenizer outputs.
    """
    left_texts, right_texts = [], []
    for sample in batch:
        left_texts.append(sample['text1'])
        right_texts.append(sample['text2'])

    # Both sides are tokenized with exactly the same options.
    shared_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": max_length,
    }
    left_encoding = tokenizer(left_texts, **shared_options)
    right_encoding = tokenizer(right_texts, **shared_options)
    return left_encoding, right_encoding

In [None]:
import json
import random
# Load JSON dataset.
SPLIT_SEED = 42        # fixed seed so the train/eval split is identical on every run
TRAIN_FRACTION = 0.8   # share of the pairs that goes to the training set

with open('top_n_chilean_examples.json', 'r', encoding='utf-8') as f:
    data_json = json.load(f)

# Extract paired texts: use the "original_text" (or the key as a fallback) and
# "transformed_text". Entries without a "transformed_text" are skipped.
data_pairs = [
    (value.get('original_text', key), value['transformed_text'])
    for key, value in data_json.items()
    if value.get('transformed_text') is not None
]

# Shuffle with a private, seeded generator: the split is reproducible and the
# global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(data_pairs)
split_idx = int(len(data_pairs) * TRAIN_FRACTION)
train_pairs = data_pairs[:split_idx]
eval_pairs = data_pairs[split_idx:]

# Fail early with a clear message: an empty split would otherwise only surface
# later as a division by zero when the per-epoch losses are averaged.
if not train_pairs or not eval_pairs:
    raise ValueError(
        f"Need at least 2 usable text pairs to build train/eval splits, got {len(data_pairs)}."
    )

# Create dataset objects.
train_dataset = TextPairDataset(train_pairs)
eval_dataset = TextPairDataset(eval_pairs)

In [None]:
# Create DataLoaders.
batch_size = 8


def _collate_with_tokenizer(batch):
    """Collate a list of samples using the notebook-level ``tokenizer``."""
    return collate_fn(batch, tokenizer)


# Training batches are reshuffled every epoch; evaluation keeps dataset order.
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=_collate_with_tokenizer
)
eval_loader = DataLoader(
    eval_dataset, batch_size=batch_size, shuffle=False, collate_fn=_collate_with_tokenizer
)

In [None]:
# Prefer CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Kept as the cell's last expression, so the moved model's repr is displayed.
model.to(device)

TADA(
  (bert): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(31002, 768, padding_idx=1)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out

In [None]:
num_epochs = 3


def _paired_cls_loss(enc_a, enc_b):
    """Compute ``criterion`` between the model outputs of two tokenized batches.

    Both encodings are moved to ``device`` and passed through the same ``model``
    (first ``enc_a``, then ``enc_b``).
    """
    ids_a = enc_a['input_ids'].to(device)
    mask_a = enc_a['attention_mask'].to(device)
    ids_b = enc_b['input_ids'].to(device)
    mask_b = enc_b['attention_mask'].to(device)
    out_a = model(ids_a, mask_a)
    out_b = model(ids_b, mask_b)
    return criterion(out_a, out_b)


# NOTE(review): both embeddings come from the same trainable model and the loss is
# their MSE, so nothing visible here stops the model from mapping every input to
# the same vector -- confirm that the very small losses reflect real alignment.
for epoch in range(num_epochs):
    # ---- training pass: adapters are updated after every batch ----
    model.train()
    train_loss_sum = 0.0
    for enc_a, enc_b in train_loader:
        optimizer.zero_grad()
        loss = _paired_cls_loss(enc_a, enc_b)
        loss.backward()
        optimizer.step()
        train_loss_sum += loss.item()
    # Average over the number of batches, not the number of samples.
    avg_train_loss = train_loss_sum / len(train_loader)

    # ---- evaluation pass: no gradients, eval mode ----
    model.eval()
    eval_loss_sum = 0.0
    with torch.no_grad():
        for enc_a, enc_b in eval_loader:
            eval_loss_sum += _paired_cls_loss(enc_a, enc_b).item()
    avg_eval_loss = eval_loss_sum / len(eval_loader)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}")

    # One checkpoint per epoch, holding the wrapper's complete state_dict.
    torch.save(model.state_dict(), f'tada_adaptor_epoch_{epoch+1}.pt')
    print(f"Model saved to tada_adaptor_epoch_{epoch+1}.pt")

Epoch 1/3 - Train Loss: 0.0537, Eval Loss: 0.0018
Model saved to tada_adaptor_epoch_1.pt
Epoch 2/3 - Train Loss: 0.0070, Eval Loss: 0.0004
Model saved to tada_adaptor_epoch_2.pt
Epoch 3/3 - Train Loss: 0.0044, Eval Loss: 0.0002
Model saved to tada_adaptor_epoch_3.pt
