In [1]:
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

graders=['ryan', 'haoran', 'abram', 'ziwei', 'louis', 'mikeli']
class RobertaForMultilabelRegression(nn.Module):
    def __init__(self, roberta_model_name: str, num_labels: int):
        super(RobertaForMultilabelRegression, self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, num_labels)
        self.loss_fn = nn.SmoothL1Loss(reduction='mean')  # or 'none' if you prefer

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, 0, :]  # Take <s> token (equiv. to [CLS])
        logits = self.regressor(sequence_output)
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return (loss, logits) if loss is not None else logits

    def save_model(self, save_directory: str):
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)
        torch.save(self.state_dict(), os.path.join(save_directory, 'pytorch_model.bin'))
        with open(os.path.join(save_directory, 'config.json'), 'w') as f:
            f.write(self.roberta.config.to_json_string())

    @classmethod
    def load_model(cls, save_directory: str, roberta_model_name: str, num_labels: int):
        model = cls(roberta_model_name, num_labels)
        model.load_state_dict(torch.load(os.path.join(save_directory, 'pytorch_model.bin')))
        return model


In [2]:
import json
import os
from openai import OpenAI
client = OpenAI()
def generate_score(text):
    response=client.chat.completions.create(
        model="gpt-4",
        temperature=0.8,
        max_tokens=800,
        messages=[
            {"role": "system", "content": """You are reviewing response to physics questions. You will be given a question, a response, 
            and a ground truth. Using the ground truth as reference, comment on the response in terms of the presence of calculation error, 
            hallucination error, irrelevancy, and logic error. Be objective and comprehensive, but keep it concise. You must keep your 
            response within 200 words at most."""},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content
    
with open('test_data.json', 'r') as test_file:
    test_data = json.load(test_file)

models={}
for grader in graders:
    models[grader]=RobertaForMultilabelRegression.load_model("fine-tuned-roberta_"+grader, 'roberta-base', num_labels=3)
    models[grader].eval()
tokenizer = RobertaTokenizer.from_pretrained("fine-tuned-roberta")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

In [3]:
import pandas as pd
# Initialize dictionaries to hold the scores
score_sums = {}
score_counts = {}
pred_sums={}
texts=[]
model_mse={'ryan':{'correctness_score':0.0, 'logic_score':0.0, 'truthfulness_score':0.0},
           'haoran':{'correctness_score':0.0, 'logic_score':0.0, 'truthfulness_score':0.0},
           'abram':{'correctness_score':0.0, 'logic_score':0.0, 'truthfulness_score':0.0},
           'ziwei':{'correctness_score':0.0, 'logic_score':0.0, 'truthfulness_score':0.0},
           'louis':{'correctness_score':0.0, 'logic_score':0.0, 'truthfulness_score':0.0},
           'mikeli':{'correctness_score':0.0, 'logic_score':0.0, 'truthfulness_score':0.0}}
grader_count={'ryan':0, 'haoran':0, 'abram':0, 'ziwei':0, 'louis':0, 'mikeli':0}

# Iterate over each item in the data
for item in test_data['data']:
    displayed_text = item['displayed_text']
    grader = item['grader']
    grader_count[grader]+=1
    scores = ['correctness_score', 'logic_score', 'truthfulness_score']
    text=generate_score(displayed_text)
    texts.append(text)
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        logits = models[grader](**inputs)
    
    pred={'correctness_score':logits[0, 0].item(), 'logic_score': logits[0, 1].item(), 'truthfulness_score': logits[0, 2].item()}
    for score in scores:
        model_mse[grader][score]+=((pred[score]-item[score])**2)
    
    if displayed_text not in score_sums:
        score_sums[displayed_text] = {score: 0 for score in scores}
        score_counts[displayed_text] = {score: 0 for score in scores}
        pred_sums[displayed_text] = {score: 0 for score in scores}

    for score in scores:
        score_sums[displayed_text][score] += item[score]
        score_counts[displayed_text][score] += 1
        pred_sums[displayed_text][score]+= pred[score]

for grader in graders:
    for key in model_mse[grader]:
        model_mse[grader][key]/=grader_count[grader]

df = pd.DataFrame(model_mse).T  # Transpose to switch rows and columns
print(df)

# Dump the output gpt-4 comments
with open('gpt_comment_test.json', 'w') as test_output:
    json.dump(texts, test_output, indent=4)

        correctness_score  logic_score  truthfulness_score
ryan             1.246545     1.682893            1.694764
haoran           1.559733     0.640667            1.921097
abram            1.409028     0.417089            0.985830
ziwei            2.287383     1.543196            1.505417
louis            1.121188     0.556699            0.351259
mikeli           0.590309     0.499870            0.094581


In [6]:
# Compute the average scores and rescale the standardized scores
average_scores = {}
avg_pred = {}

for text in score_sums:
    average_scores[text] = {score: score_sums[text][score] / score_counts[text][score] for score in score_sums[text]}
    avg_pred[text]= {score: pred_sums[text][score] / score_counts[text][score] for score in pred_sums[text]}

mse={"correctness_score":0.0, "logic_score":0.0, "truthfulness_score":0.0}
count=0
for text in score_sums:
    for score in score_sums[text]:
        mse[score]+=(avg_pred[text][score]-average_scores[text][score])**2
        count+=1

for key in mse:
    mse[key]/=count
print(mse)

{'correctness_score': 0.2415935747882112, 'logic_score': 0.14646256179636497, 'truthfulness_score': 0.17154164194284982}
