In [1]:
import json
from datasets import Dataset
from transformers import RobertaTokenizer, DataCollatorWithPadding

# Load your dataset
with open("train_data.json", "r") as f:
    data = json.load(f)["data"]

# Convert the dataset to the Hugging Face datasets format
dataset_dict = {
    "text": [entry["displayed_text"] for entry in data],
    "scores_labels": [[0.0 if v is None else float(v) for v in [
        entry["correctness_score"], entry["logic_score"], entry["truthfulness_score"],
        entry["confidence_score"], entry["calculation_error"], entry["hallucination_error"],
        entry["omission_error"], entry["irrelevant_error"], entry["logic_error"], entry["everything_okay"]
    ]] for entry in data]
}

dataset = Dataset.from_dict(dataset_dict)

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenization function with padding and truncation
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    tokenized_inputs["labels"] = examples["scores_labels"]
    return tokenized_inputs

# Apply the tokenization function
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split dataset into training and evaluation sets
split_dataset = tokenized_datasets.train_test_split(test_size=0.1)  # 10% for validation
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Remove redundant columns to avoid confusion
train_dataset = train_dataset.remove_columns(['text', 'scores_labels'])
eval_dataset = eval_dataset.remove_columns(['text', 'scores_labels'])

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer)




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

In [2]:
from torch.utils.data import DataLoader

# Data loaders
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, collate_fn=data_collator)

# Print a batch from train_dataloader to verify
for batch in train_dataloader:
    print("Batch keys:", batch.keys())
    print("Input IDs shape:", batch["input_ids"].shape)
    print("Attention Mask shape:", batch["attention_mask"].shape)
    print("Labels shape:", batch["labels"].shape)
    
    # Optionally print the actual tensor contents
    print("Input IDs:", batch["input_ids"])
    print("Attention Mask:", batch["attention_mask"])
    print("Labels:", batch["labels"])
    break  # Print only the first batch


Batch keys: dict_keys(['attention_mask', 'input_ids', 'labels'])
Input IDs shape: torch.Size([8, 128])
Attention Mask shape: torch.Size([8, 128])
Labels shape: torch.Size([8, 10])
Input IDs: tensor([[    0, 45641,    35,  ...,   338, 14982,     2],
        [    0, 45641,    35,  ...,     9,    86,     2],
        [    0, 45641,    35,  ...,    75,    28,     2],
        ...,
        [    0, 45641,    35,  ...,    42,    35,     2],
        [    0, 45641,    35,  ...,    16,     5,     2],
        [    0, 45641,    35,  ...,  1258,  6680,     2]])
Attention Mask: tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
Labels: tensor([[5., 5., 5., 5., 0., 0., 0., 0., 0., 1.],
        [5., 5., 5., 5., 0., 0., 0., 0., 0., 0.],
        [4., 4., 4., 5., 0., 1., 1., 0., 0., 0.],
        [3., 4., 3., 0., 0., 1., 0., 0., 1., 0.],
     

In [3]:
import torch
from torch import nn
from transformers import TrainingArguments, RobertaModel

# Define loss function
class CustomWeightedLoss(nn.Module):
    def __init__(self, primary_weight=1.0, secondary_weight=0.1):
        super(CustomWeightedLoss, self).__init__()
        self.primary_weight = primary_weight
        self.secondary_weight = secondary_weight
        self.loss_fn = nn.SmoothL1Loss(reduction='none')

    def forward(self, logits, labels):
        # Compute loss for the first three values with a higher weight
        primary_loss = self.loss_fn(logits[:, :3], labels[:, :3]) * self.primary_weight
        # Compute loss for the remaining values with a smaller weight
        secondary_loss = self.loss_fn(logits[:, 3:], labels[:, 3:]) * self.secondary_weight
        # Combine the losses
        loss = torch.cat([primary_loss, secondary_loss], dim=1).mean()
        return loss

# Define the model
class RobertaForMultilabelRegression(nn.Module):
    def __init__(self, roberta_model_name, num_labels, primary_weight=1.0, secondary_weight=0.1):
        super(RobertaForMultilabelRegression, self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, num_labels)
        self.loss_fn = CustomWeightedLoss(primary_weight, secondary_weight)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, 0, :]  # Take <s> token (equiv. to [CLS])
        logits = self.regressor(sequence_output)
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return (loss, logits) if loss is not None else logits

# Initialize the model
model = RobertaForMultilabelRegression('roberta-base', num_labels=10) 




model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import TrainingArguments, Trainer

# Initialize the model
model = RobertaForMultilabelRegression('roberta-base', num_labels=10, primary_weight=1.0, secondary_weight=0.1)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.002,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',  # Directory for logging
    logging_steps=10,  # Log every 10 steps
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`