In [1]:
import json
from datasets import Dataset
from transformers import RobertaTokenizer, DataCollatorWithPadding

# Load your dataset
with open("train_data.json", "r") as f:
    data = json.load(f)["data"]

# Convert the dataset to the Hugging Face datasets format
dataset_dict = {
    "text": [entry["displayed_text"] for entry in data],
    "scores_labels": [[0.0 if v is None else float(v) for v in [
        entry["correctness_score"], entry["logic_score"], entry["truthfulness_score"],
        entry["confidence_score"], entry["calculation_error"], entry["hallucination_error"],
        entry["omission_error"], entry["irrelevant_error"], entry["logic_error"], entry["everything_okay"]
    ]] for entry in data]
}

dataset = Dataset.from_dict(dataset_dict)

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenization function with padding and truncation
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)
    tokenized_inputs["labels"] = examples["scores_labels"]
    return tokenized_inputs

# Apply the tokenization function
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split dataset into training and evaluation sets
split_dataset = tokenized_datasets.train_test_split(test_size=0.1)  # 10% for validation
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Remove redundant columns to avoid confusion
train_dataset = train_dataset.remove_columns(['text', 'scores_labels'])
eval_dataset = eval_dataset.remove_columns(['text', 'scores_labels'])

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer)




  0%|          | 0/1 [00:00<?, ?ba/s]

In [2]:
from torch.utils.data import DataLoader

# Data loaders
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, collate_fn=data_collator)

# Print a batch from train_dataloader to verify
for batch in train_dataloader:
    print("Batch keys:", batch.keys())
    print("Input IDs shape:", batch["input_ids"].shape)
    print("Attention Mask shape:", batch["attention_mask"].shape)
    print("Labels shape:", batch["labels"].shape)
    
    # Optionally print the actual tensor contents
    print("Input IDs:", batch["input_ids"])
    print("Attention Mask:", batch["attention_mask"])
    print("Labels:", batch["labels"])
    break  # Print only the first batch


Batch keys: dict_keys(['attention_mask', 'input_ids', 'labels'])
Input IDs shape: torch.Size([8, 256])
Attention Mask shape: torch.Size([8, 256])
Labels shape: torch.Size([8, 10])
Input IDs: tensor([[    0, 45641,    35,  ...,     7,  1591,     2],
        [    0, 45641,    35,  ...,  3838, 11124,     2],
        [    0, 45641,    35,  ..., 31566,  9713,     2],
        ...,
        [    0, 45641,    35,  ...,   322,   407,     2],
        [    0, 45641,    35,  ...,     5,  2557,     2],
        [    0, 45641,    35,  ...,  3226, 44128,     2]])
Attention Mask: tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
Labels: tensor([[4., 5., 5., 5., 0., 0., 1., 0., 0., 0.],
        [3., 5., 5., 5., 0., 0., 0., 0., 0., 1.],
        [5., 5., 5., 5., 0., 0., 0., 0., 0., 1.],
        [5., 5., 5., 4., 0., 0., 0., 0., 0., 1.],
     

In [3]:
import torch
from torch import nn
from transformers import TrainingArguments, RobertaModel

# Define loss function
class CustomWeightedLoss(nn.Module):
    def __init__(self, primary_weight=1.0, secondary_weight=0.1):
        super(CustomWeightedLoss, self).__init__()
        self.primary_weight = primary_weight
        self.secondary_weight = secondary_weight
        self.loss_fn = nn.SmoothL1Loss(reduction='none')

    def forward(self, logits, labels):
        # Compute loss for the first three values with a higher weight
        primary_loss = self.loss_fn(logits[:, :3], labels[:, :3]) * self.primary_weight
        # Compute loss for the remaining values with a smaller weight
        secondary_loss = self.loss_fn(logits[:, 3:], labels[:, 3:]) * self.secondary_weight
        # Combine the losses
        loss = torch.cat([primary_loss, secondary_loss], dim=1).mean()
        return loss

# Define the model
class RobertaForMultilabelRegression(nn.Module):
    def __init__(self, roberta_model_name, num_labels, primary_weight=1.0, secondary_weight=0.1):
        super(RobertaForMultilabelRegression, self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, num_labels)
        self.loss_fn = CustomWeightedLoss(primary_weight, secondary_weight)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, 0, :]  # Take <s> token (equiv. to [CLS])
        logits = self.regressor(sequence_output)
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return (loss, logits) if loss is not None else logits

    def save_model(self, save_directory):
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)
        torch.save(self.state_dict(), os.path.join(save_directory, 'pytorch_model.bin'))
        with open(os.path.join(save_directory, 'config.json'), 'w') as f:
            f.write(self.roberta.config.to_json_string())

    @classmethod
    def load_model(cls, save_directory, roberta_model_name, num_labels, primary_weight=1.0, secondary_weight=0.1):
        model = cls(roberta_model_name, num_labels, primary_weight, secondary_weight)
        model.load_state_dict(torch.load(os.path.join(save_directory, 'pytorch_model.bin')))
        return model

# Initialize the model
model = RobertaForMultilabelRegression('roberta-base', num_labels=10) 


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import TrainingArguments, Trainer
from torch.optim.lr_scheduler import StepLR
# Initialize the model
model = RobertaForMultilabelRegression('roberta-base', num_labels=10, primary_weight=1.0, secondary_weight=0.1)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.002,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',  # Directory for logging
    logging_steps=10,  # Log every 10 steps
)

device = torch.device("mps" if torch.has_mps else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)
scheduler = StepLR(optimizer, step_size=30, gamma=0.9)

for epoch in range(training_args.num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(training_args.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    for step, batch in enumerate(eval_dataloader):
        batch = {k: v.to(training_args.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            eval_loss += outputs[0].item()
    eval_loss /= len(eval_dataloader)
    print(f"Epoch {epoch + 1}/{training_args.num_train_epochs}, Evaluation Loss: {eval_loss}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  device = torch.device("mps" if torch.has_mps else "cpu")


Epoch 1/10, Evaluation Loss: 0.20008041337132454
Epoch 2/10, Evaluation Loss: 0.1779889091849327
Epoch 3/10, Evaluation Loss: 0.19569828659296035
Epoch 4/10, Evaluation Loss: 0.17517112493515014
Epoch 5/10, Evaluation Loss: 0.17760688662528992
Epoch 6/10, Evaluation Loss: 0.17577214017510415
Epoch 7/10, Evaluation Loss: 0.17948233410716058
Epoch 8/10, Evaluation Loss: 0.18440624549984933
Epoch 9/10, Evaluation Loss: 0.17658943831920623
Epoch 10/10, Evaluation Loss: 0.17892688065767287


In [5]:
# Save the model
import os
model.save_model("fine-tuned-roberta")
tokenizer.save_pretrained("fine-tuned-roberta")

('fine-tuned-roberta/tokenizer_config.json',
 'fine-tuned-roberta/special_tokens_map.json',
 'fine-tuned-roberta/vocab.json',
 'fine-tuned-roberta/merges.txt',
 'fine-tuned-roberta/added_tokens.json')