In [1]:
!pip install adapters

Collecting adapters
  Downloading adapters-1.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers~=4.47.1 (from adapters)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading adapters-1.1.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, adapters
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed adapters-1.1.0 transformers-4.47.1


In [16]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, BertConfig, AdamW
from peft import get_peft_model, LoraConfig, TaskType

class SentimentBERT(nn.Module):
    def __init__(self, model_name="dccuchile/bert-base-spanish-wwm-uncased"):
        super().__init__()
        base_model = BertModel.from_pretrained(model_name)

        # Freeze all base model params
        for param in base_model.parameters():
            param.requires_grad = False

        lora_config = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"]
        )
        self.bert = get_peft_model(base_model, lora_config)

        self.regressor = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, 1)  # Predict sentiment score from 1 to 5
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # CLS token representation
        return self.regressor(pooled_output).squeeze(-1)

In [7]:
import json
from torch.utils.data import Dataset

class JSONSentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        title = sample.get("title", "")
        review = sample.get("review", "")
        rating = sample.get("rating", 0.0)

        combined_text = f"{title}. {review}"  # or customize formatting here

        encoding = self.tokenizer(
            combined_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'score': torch.tensor(rating / 5.0, dtype=torch.float)
        }

In [8]:
def balanced_sample_json(json_path, samples_per_class):
    data = json.load(open(json_path, "r", encoding="utf-8"))

    grouped_reviews = defaultdict(list)
    for review in data:
        rating = review.get("rating")
        if rating is not None:
            grouped_reviews[rating].append(review)

    sampled_reviews = []
    for rating in grouped_reviews:
        sampled = random.sample(
            grouped_reviews[rating],
            min(samples_per_class, len(grouped_reviews[rating]))
        )
        sampled_reviews.extend(sampled)

    random.shuffle(sampled_reviews)
    return sampled_reviews

In [9]:
def prepare_dataloaders(json_path, tokenizer, samples_per_class, batch_size=8, max_len=128):
    sampled_data = balanced_sample_json(json_path, samples_per_class)

    dataset = JSONSentimentDataset(sampled_data, tokenizer, max_len)

    total_size = len(dataset)
    train_size = int(0.8 * total_size)
    val_size = int(0.1 * total_size)
    test_size = total_size - train_size - val_size

    train_dataset, val_dataset, test_dataset = random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(42)
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    print(f"Train loader length (batches): {len(train_loader)}")
    print(f"Validation loader length (batches): {len(val_loader)}")
    print(f"Validation loader length (batches): {len(test_loader)}")

    return train_loader, val_loader, test_loader

In [10]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['score'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


def eval_epoch(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['score'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [14]:
def run_training(json_path, samples_per_class=100, batch_size=8, num_epochs=5):
    tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
    train_loader, val_loader, test_loader = prepare_dataloaders(
        json_path, tokenizer, samples_per_class, batch_size=batch_size
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentimentBERT().to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = eval_epoch(model, val_loader, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    test_loss = eval_epoch(model, test_loader, criterion, device)
    print(f"\nFinal Test Loss: {test_loss:.4f}")

    return model

In [None]:
new_model = run_training("labeled_data.json", samples_per_class=2000, batch_size=8, num_epochs=3)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 | Train Loss: 0.0728 | Val Loss: 0.0485


In [None]:
import json

def score_and_save_predictions(model, dataloader, device, output_json="predictions.json"):
    model.eval()
    results = []

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['score'].to(device)  # normalized [0, 1]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            preds = outputs * 5.0  # denormalize to [0, 5]
            targets = labels * 5.0

            preds_rounded = torch.clamp(preds.round(), 0, 5)
            targets_rounded = torch.clamp(targets.round(), 0, 5)

            correct += (preds_rounded == targets_rounded).sum().item()
            total += targets.size(0)

            # Save individual results (for exporting to JSON)
            for i in range(len(preds)):
                results.append({
                    "predicted_score": preds[i].item(),
                    "predicted_score_rounded": preds_rounded[i].item(),
                    "true_score": targets[i].item(),
                    "true_score_rounded": targets_rounded[i].item()
                })

    accuracy = correct / total
    print(f"Test Accuracy (rounded, range 0-5): {accuracy * 100:.2f}%")

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print(f"Predictions saved to {output_json}")
    return accuracy

In [None]:
score_and_save_predictions(trained_model, test_loader)

Testing Accuracy: 100%|██████████| 686/686 [00:40<00:00, 16.91it/s, batch_acc=0.562]


Test Accuracy: 60.36%
Saved 10976 predictions to contrastive_model_pairwise_predictions.json
