In [1]:
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

# Define custom weighted loss (assume you have implemented this class)
class CustomWeightedLoss(nn.Module):
    def __init__(self, primary_weight, secondary_weight):
        super(CustomWeightedLoss, self).__init__()
        self.primary_weight = primary_weight
        self.secondary_weight = secondary_weight

    def forward(self, logits, labels):
        # Implement your custom loss calculation here
        pass

# Define the model
class RobertaForMultilabelRegression(nn.Module):
    def __init__(self, roberta_model_name, num_labels, primary_weight=1.0, secondary_weight=0.1):
        super(RobertaForMultilabelRegression, self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, num_labels)
        self.loss_fn = CustomWeightedLoss(primary_weight, secondary_weight)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, 0, :]  # Take <s> token (equiv. to [CLS])
        logits = self.regressor(sequence_output)
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return (loss, logits) if loss is not None else logits

    @classmethod
    def load_model(cls, save_directory, roberta_model_name, num_labels, primary_weight=1.0, secondary_weight=0.1):
        model = cls(roberta_model_name, num_labels, primary_weight, secondary_weight)
        model.load_state_dict(torch.load(f"{save_directory}/pytorch_model.bin"))
        return model

# Load the model
model = RobertaForMultilabelRegression.load_model("fine-tuned-roberta", 'roberta-base', num_labels=10)
model.eval()  # Set the model to evaluation mode

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("fine-tuned-roberta")
true_avg_scores = {'correctness_score': [], 'logic_score': [], 'truthfulness_score': []}
true_std_scores = {'correctness_score': [], 'logic_score': [], 'truthfulness_score': []}
predicted_scores = {'correctness_score': [], 'logic_score': [], 'truthfulness_score': []}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Example usage: Perform inference
import json
# Load your dataset
# Load the JSON files
with open('test_data.json', 'r') as test_file, open('score_dis.json', 'r') as score_dis_file:
    test_data = json.load(test_file)
    score_dis_data = json.load(score_dis_file)

# Create dictionaries for grader statistics
grader_stats = {}
for grader in score_dis_data:
    grader_name = grader['grader']
    grader_stats[grader_name] = {
        'correctness_score': grader['correctness_score'],
        'logic_score': grader['logic_score'],
        'truthfulness_score': grader['truthfulness_score']
    }

# Initialize dictionaries to hold the scores
score_sums = {}
score_counts = {}

# Initialize dictionaries to hold the standardized score sums
standardized_sums = {}
standardized_counts = {}

# Iterate over each item in the data
for item in test_data['data']:
    displayed_text = item['displayed_text']
    grader = item['grader']
    scores = ['correctness_score', 'logic_score', 'truthfulness_score']
    
    if displayed_text not in score_sums:
        score_sums[displayed_text] = {score: 0 for score in scores}
        score_counts[displayed_text] = {score: 0 for score in scores}
        standardized_sums[displayed_text] = {score: 0 for score in scores}
        standardized_counts[displayed_text] = {score: 0 for score in scores}

    for score in scores:
        score_sums[displayed_text][score] += item[score]
        score_counts[displayed_text][score] += 1
        
        mean_score = grader_stats[grader][score]['mean']
        std_dev_score = grader_stats[grader][score]['std_dev']
        standardized_score = (item[score] - mean_score) / std_dev_score
        standardized_sums[displayed_text][score] += standardized_score
        standardized_counts[displayed_text][score] += 1

# Compute the average scores and rescale the standardized scores
average_scores = {}
rescaled_scores = {}
for text in score_sums:
    average_scores[text] = {score: score_sums[text][score] / score_counts[text][score] for score in score_sums[text]}
    rescaled_scores[text] = {
        score: (
            (standardized_sums[text][score] / standardized_counts[text][score]) * grader_stats['abram'][score]['std_dev']
            + grader_stats['abram'][score]['mean']
        ) for score in standardized_sums[text]
    }
    # Get model predictions
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs)

    # Assuming the first three labels are correctness, logic, and truthfulness scores
    predicted_scores['correctness_score'].append(logits[0, 0].item())
    predicted_scores['logic_score'].append(logits[0, 1].item())
    predicted_scores['truthfulness_score'].append(logits[0, 2].item())
    
    # Append true scores for MSE calculation
    true_avg_scores['correctness_score'].append(average_scores[text]['correctness_score'])
    true_avg_scores['logic_score'].append(average_scores[text]['logic_score'])
    true_avg_scores['truthfulness_score'].append(average_scores[text]['truthfulness_score'])
    true_std_scores['correctness_score'].append(rescaled_scores[text]['correctness_score'])
    true_std_scores['logic_score'].append(rescaled_scores[text]['logic_score'])
    true_std_scores['truthfulness_score'].append(rescaled_scores[text]['truthfulness_score'])
print(predicted_scores)
"""# Print the average and rescaled scores
for text in average_scores:
    print(f"Displayed Text: {text}")
    print("  Average Scores:")
    for score, avg in average_scores[text].items():
        print(f"    {score}: {avg}")
    print("  Rescaled Standardized Scores:")
    for score, rescaled in rescaled_scores[text].items():
        print(f"    {score}: {rescaled}")
    print()"""

Token indices sequence length is longer than the specified maximum sequence length for this model (668 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The expanded size of the tensor (668) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 668].  Tensor sizes: [1, 514]

In [None]:
# Calculate MSE for average and standardized scores
mse_avg = {score: mean_squared_error(true_avg_scores[score], predicted_scores[score]) for score in true_avg_scores}
mse_std = {score: mean_squared_error(true_std_scores[score], predicted_scores[score]) for score in true_std_scores}

# Print MSE results
print("MSE for average scores:")
for score, mse in mse_avg.items():
    print(f"  {score}: {mse}")

print("\nMSE for standardized scores:")
for score, mse in mse_std.items():
    print(f"  {score}: {mse}")
