In [1]:
import json
from datasets import Dataset
from transformers import RobertaTokenizer, DataCollatorWithPadding
from openai import OpenAI
# OpenAI client; the GPT grading helper further down sends its requests through it.
client = OpenAI()

# Read the raw training records, stored under the top-level "data" key of the JSON file.
with open("train_data.json", "r") as train_file:
    data = json.load(train_file)["data"]

# The three per-entry score fields, in the order they appear in each label vector.
_SCORE_FIELDS = ("correctness_score", "logic_score", "truthfulness_score")


def _score_vector(entry):
    """Return the entry's three scores as floats, with a missing (None) score stored as 0.0."""
    vector = []
    for field in _SCORE_FIELDS:
        value = entry[field]
        vector.append(float(value) if value is not None else 0.0)
    return vector


# Re-shape the records into parallel columns (the layout `Dataset.from_dict` is given later).
dataset_dict = {
    "text": [record["displayed_text"] for record in data],
    "scores_labels": [_score_vector(record) for record in data],
    "grader": [record["grader"] for record in data],
}



In [None]:
"""
# Define function chatgpt grading
def generate_score(text):
    response=client.chat.completions.create(
        model="gpt-4",
        temperature=0.8,
        max_tokens=800,
        messages=[
            {"role": "system", "content": """You are reviewing response to physics questions. You will be given a question, a response, 
            and a ground truth. Using the ground truth as reference, comment on the response in terms of the presence of calculation error, 
            hallucination error, irrelevancy, and logic error. Be objective and comprehensive, but keep it concise. You must keep your 
            response within 200 words at most."""},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content

comment_dict={}

for entry in dataset_dict["text"]:
    if entry in comment_dict:
        continue
    else:
        response=generate_score(entry)
        comment_dict[entry]=response
"""

In [2]:
"""with open("gpt-comment.json", 'w') as file:
    json.dump(comment_dict, file, indent=4)"""
# Load the cached GPT comments from disk instead of regenerating them;
# `tokenize_function` later looks comments up in this dict by text.
with open("gpt-comment.json", 'r') as cache_file:
    comment_dict = json.loads(cache_file.read())

In [3]:
# Distinct grader identifiers present in the data.
unique_graders = set(dataset_dict["grader"])

# One empty column-dict per grader, with the same three columns as `dataset_dict`.
datasets_by_grader = {}
for grader in unique_graders:
    datasets_by_grader[grader] = {"text": [], "scores_labels": [], "grader": []}

# Walk the three columns in lock-step and route each row to its grader's bucket.
rows = zip(dataset_dict["text"], dataset_dict["scores_labels"], dataset_dict["grader"])
for text, scores, grader in rows:
    bucket = datasets_by_grader[grader]
    bucket["text"].append(text)
    bucket["scores_labels"].append(scores)
    bucket["grader"].append(grader)

In [4]:
import torch
from torch import nn
from transformers import TrainingArguments, RobertaModel

# Define the model
class RobertaForMultilabelRegression(nn.Module):
    """RoBERTa encoder followed by one linear layer that regresses `num_labels` values.

    The hidden state of the first token (`<s>`) is fed to the linear head, and
    Smooth-L1 loss is used when labels are supplied.
    """

    def __init__(self, roberta_model_name: str, num_labels: int):
        """Load the pretrained encoder and create the regression head.

        Args:
            roberta_model_name: Name or path passed to `RobertaModel.from_pretrained`.
            num_labels: Number of values predicted per input.
        """
        super().__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, num_labels)
        self.loss_fn = nn.SmoothL1Loss(reduction='mean')  # or 'none' if you prefer

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the regression head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional attention mask forwarded to the encoder.
            labels: Optional regression targets; when given, the loss is computed
                against the predictions.

        Returns:
            `(loss, logits)` when `labels` is provided, otherwise `logits` alone.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, 0, :]  # Take <s> token (equiv. to [CLS])
        logits = self.regressor(sequence_output)

        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits

    def save_model(self, save_directory: str):
        """Write the weights and the encoder config into `save_directory`.

        Creates the directory (and missing parents) if needed, then writes
        `pytorch_model.bin` (the full state dict, encoder and head) and
        `config.json` (the encoder's configuration only).

        Note: relies on the module-level `os` import, which this notebook performs
        in the training cell.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, 'pytorch_model.bin'))
        with open(os.path.join(save_directory, 'config.json'), 'w') as f:
            f.write(self.roberta.config.to_json_string())

    @classmethod
    def load_model(cls, save_directory: str, roberta_model_name: str, num_labels: int):
        """Rebuild a model and restore the weights written by `save_model`.

        Args:
            save_directory: Directory containing `pytorch_model.bin`.
            roberta_model_name: Encoder name/path used to rebuild the architecture.
            num_labels: Size of the regression head; must match the saved weights.

        Returns:
            The restored model, with the checkpoint tensors loaded on CPU; move it
            to another device with `.to(device)` if required.
        """
        model = cls(roberta_model_name, num_labels)
        weights_path = os.path.join(save_directory, 'pytorch_model.bin')
        # map_location="cpu" lets a checkpoint saved from a CUDA model be loaded on
        # a machine without a GPU; without it the tensors are restored to the
        # device they were saved from.
        # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
        state_dict = torch.load(weights_path, map_location="cpu")
        model.load_state_dict(state_dict)
        return model

In [5]:
import os
import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from torch.optim.lr_scheduler import StepLR

def tokenize_function(examples):
    """Tokenize the GPT comment attached to each text of a batch.

    Each text in `examples["text"]` is used as a key into the module-level
    `comment_dict`; a text without a comment is tokenized as an empty string.
    Comments are cut to their first 500 characters, then tokenized with
    truncation and padding to 512 tokens. The batch's `scores_labels` are
    attached to the result under the "labels" key.
    """
    comments = []
    for key in examples["text"]:
        comment = comment_dict.get(key, "")
        comments.append(comment[:500])

    encoded = tokenizer(comments, padding="max_length", truncation=True, max_length=512)
    encoded["labels"] = examples["scores_labels"]
    return encoded

# Shared tokenizer: `tokenize_function` reads this module-level name, and every
# per-grader model below starts from the same 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Train, evaluate and save one independent regression model per grader.
for grader in unique_graders:
    # Build this grader's dataset and tokenize the GPT comment attached to each text.
    dataset = Dataset.from_dict(datasets_by_grader[grader])
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    # NOTE(review): no seed is passed here, so the train/validation split is
    # presumably different on every run -- confirm whether that is intended.
    split_dataset = tokenized_datasets.train_test_split(test_size=0.1)  # 10% for validation

    # Drop the raw columns so that only the tokenizer outputs and "labels" reach the collator.
    train_dataset = split_dataset["train"].remove_columns(['text', 'grader', 'scores_labels'])
    eval_dataset = split_dataset["test"].remove_columns(['text', 'grader', 'scores_labels'])
    # Batches are returned as PyTorch tensors; the inputs were already padded to
    # 512 tokens inside `tokenize_function`.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
    eval_dataloader = DataLoader(eval_dataset, batch_size=8, collate_fn=data_collator)


    # Fresh model per grader: three outputs, matching the three scores in each label vector.
    model = RobertaForMultilabelRegression('roberta-base', num_labels=3)
    # No `Trainer` is created in this cell: this object only serves as a container
    # for `learning_rate` and `num_train_epochs`, which the manual loop below reads.
    # NOTE(review): learning_rate=0.001 is far above the 1e-5..5e-5 range commonly
    # used when fine-tuning all RoBERTa weights, and the evaluation losses recorded
    # in this notebook's output do not decrease steadily -- confirm this value.
    training_args = TrainingArguments(
        output_dir=f"./results_{grader}",
        evaluation_strategy="epoch",
        learning_rate=0.001,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=6,
        weight_decay=0.01,
        logging_dir=f'./logs_{grader}',
        logging_steps=10,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Only the learning rate is taken from `training_args`; the `weight_decay`
    # configured above is not passed to the optimizer.
    optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)
    # `scheduler.step()` is called once per batch below, so the learning rate is
    # multiplied by 0.8 every 30 optimizer steps (batches), not every 30 epochs.
    scheduler = StepLR(optimizer, step_size=30, gamma=0.8)

    # Training loop: one pass over the training batches, then one evaluation pass, per epoch.
    for epoch in range(training_args.num_train_epochs):
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # The batch contains "labels", so the model returns (loss, logits).
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        model.eval()
        eval_loss = 0
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
                eval_loss += outputs[0].item()
        # Mean of the per-batch losses: every batch weighs the same, including a
        # smaller final batch.
        eval_loss /= len(eval_dataloader)
        print(f"Epoch {epoch + 1}/{training_args.num_train_epochs}, Evaluation Loss: {eval_loss}")

    # Saves the weights as they are after the last epoch (no best-epoch selection).
    model.save_model(f"fine-tuned-roberta_{grader}")




  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Evaluation Loss: 0.893435001373291
Epoch 2/6, Evaluation Loss: 1.001727283000946
Epoch 3/6, Evaluation Loss: 0.9255439639091492
Epoch 4/6, Evaluation Loss: 1.010292410850525
Epoch 5/6, Evaluation Loss: 0.98626509308815
Epoch 6/6, Evaluation Loss: 0.9446495771408081


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Evaluation Loss: 0.695886492729187
Epoch 2/6, Evaluation Loss: 1.0352841019630432
Epoch 3/6, Evaluation Loss: 0.7124559283256531
Epoch 4/6, Evaluation Loss: 0.6837186217308044
Epoch 5/6, Evaluation Loss: 0.6802806854248047
Epoch 6/6, Evaluation Loss: 0.8791805505752563


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Evaluation Loss: 0.8284439146518707
Epoch 2/6, Evaluation Loss: 0.7024035155773163
Epoch 3/6, Evaluation Loss: 0.723159521818161
Epoch 4/6, Evaluation Loss: 0.6798948496580124
Epoch 5/6, Evaluation Loss: 0.715583473443985
Epoch 6/6, Evaluation Loss: 0.7225950956344604


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Evaluation Loss: 0.6494656950235367
Epoch 2/6, Evaluation Loss: 0.6053258031606674
Epoch 3/6, Evaluation Loss: 0.5281317383050919
Epoch 4/6, Evaluation Loss: 0.5143789798021317
Epoch 5/6, Evaluation Loss: 0.5616019368171692
Epoch 6/6, Evaluation Loss: 0.5403618514537811


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Evaluation Loss: 0.804334819316864
Epoch 2/6, Evaluation Loss: 0.6968740671873093
Epoch 3/6, Evaluation Loss: 0.6437824815511703
Epoch 4/6, Evaluation Loss: 0.6646958738565445
Epoch 5/6, Evaluation Loss: 0.6389547288417816
Epoch 6/6, Evaluation Loss: 0.6614431887865067


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6, Evaluation Loss: 0.2492600530385971
Epoch 2/6, Evaluation Loss: 0.3338034152984619
Epoch 3/6, Evaluation Loss: 0.31757013499736786
Epoch 4/6, Evaluation Loss: 0.23966208845376968
Epoch 5/6, Evaluation Loss: 0.23144873976707458
Epoch 6/6, Evaluation Loss: 0.2575795277953148


In [7]:
# Save the tokenizer files once, in their own "fine-tuned-roberta" directory:
# all per-grader models were trained with this same 'roberta-base' tokenizer.
tokenizer.save_pretrained("fine-tuned-roberta")

('fine-tuned-roberta/tokenizer_config.json',
 'fine-tuned-roberta/special_tokens_map.json',
 'fine-tuned-roberta/vocab.json',
 'fine-tuned-roberta/merges.txt',
 'fine-tuned-roberta/added_tokens.json')