In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

# Load the source dataset by its Hugging Face repository id.
# The code below reads its "train" and "test" splits and, from every row,
# the "prompt", "chosen", and "rejected" fields.
dataset = load_dataset("Muadil/cleaned_openai_summary_comparisons")

# Dataset processing: expand each preference row into two labeled conversations
def convert_to_new_format(example):
    """
    Expand one preference record into two labeled chat conversations.

    The prompt is wrapped in a fixed summarization instruction and used as
    the user turn; the assistant turn is the chosen summary in the first
    conversation and the rejected summary in the second.

    Args:
        example (dict): A mapping with 'prompt', 'chosen', and 'rejected' keys.

    Returns:
        list[dict]: Two entries, chosen first. Each has a 'messages' list
            (user turn, then assistant turn) and a boolean 'label' that is
            True for the chosen summary and False for the rejected one.
    """
    document = example["prompt"]
    candidates = ((example["chosen"], True), (example["rejected"], False))
    user_text = f"System: I want you to summarize this text\nDocument: {document}"

    # A fresh user-message dict is built per conversation so the two records
    # never share mutable state.
    return [
        {
            "messages": [
                {"content": user_text, "role": "user"},
                {"content": summary, "role": "assistant"},
            ],
            "label": is_preferred,
        }
        for summary, is_preferred in candidates
    ]


# Flatten both splits: every source row contributes two labeled conversations
# (one for the chosen summary, one for the rejected summary).
formatted_train_data = []
formatted_test_data = []
for split_name, bucket in (
    ("train", formatted_train_data),
    ("test", formatted_test_data),
):
    for example in dataset[split_name]:
        bucket.extend(convert_to_new_format(example))

# Rebuild one Dataset per split from the flattened records
new_train_dataset = Dataset.from_list(formatted_train_data)
new_test_dataset = Dataset.from_list(formatted_test_data)

# Bundle the splits under their original names
new_dataset = DatasetDict({"train": new_train_dataset, "test": new_test_dataset})

# Upload the converted dataset to the Hugging Face Hub
# (replace the placeholder repository id with a real one before running)
new_dataset.push_to_hub("username/repository")
