# Creating a synthetic RAG dataset with Distilabel

In this notebook we accomplish the following:
* Load a 5,000-row subset of the `train` split of the [rajpurkar/squad](https://huggingface.co/datasets/rajpurkar/squad) dataset
* Use the `context` and `question` fields to generate positive (accurate) and negative (inaccurate) synthetic answers with [Distilabel](https://github.com/argilla-io/distilabel) and `gpt-4o`, producing a triplet dataset for fine-tuning retrieval and embedding models
* Save the resulting dataset to the Hugging Face Hub for later analysis

In [1]:
import os
import distilabel

from datasets import Dataset, load_dataset_builder

from distilabel.llms import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import GenerateSentencePair

  from distilabel.llms import OpenAILLM


In [2]:
import os
import sys

# Resolve the OpenAI API key from whichever secret store the current runtime offers.
if 'google.colab' in sys.modules:
    # Running in Google Colab: the key is read from the notebook's secrets.
    from google.colab import userdata

    try:
        # Colab exposes secrets through `userdata.get`, which raises
        # `SecretNotFoundError` for an undefined secret instead of returning an
        # empty value.
        # NOTE: the Colab secret is spelled OPENAI_APIKEY (no second underscore),
        # unlike the OPENAI_API_KEY environment variable used outside Colab.
        openai_api_key = userdata.get("OPENAI_APIKEY")
    except userdata.SecretNotFoundError:
        # Treat an undefined secret like an empty one so the check below reports it.
        openai_api_key = None
    if not openai_api_key:
        raise ValueError("API key not found in Google Colab secrets. Please set the 'OPENAI_APIKEY' secret.")
else:
    # Not running in Google Colab: fall back to the process environment.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("API key not found in environment variables. Please set the OPENAI_API_KEY environment variable.")

# If we've reached this point, we have a valid (non-empty) API key.
# Only a confirmation is printed; the key itself is never echoed into the output.
print("API key successfully loaded.")


API key successfully loaded.


In [3]:
# Source dataset on the Hub that supplies the questions and contexts
original_repo = "rajpurkar/squad"

# Destination repo on the Hub for the generated synthetic dataset
synthetic_repo = "m-newhauser/rag-synthetic-distilabel"

## Create the triplet dataset

In [4]:
# Create a dataset builder for the source repo so its metadata can be read from
# code; `builder.info.config_name` is later passed as `config` to LoadDataFromHub.
builder = load_dataset_builder(original_repo)
# Debug aid (uncomment to inspect the value): print(builder.info.config_name)

In [5]:
# LLM client used by the generation step, authenticated with the key loaded above.
llm = OpenAILLM(
    model="gpt-4o",
    api_key=openai_api_key,
)

# Two-step pipeline: load rows from the Hub, then generate a sentence pair per row.
# The step objects are kept in variables because the run cell looks up their
# `.name` to address runtime parameters.
with Pipeline(name="generate_embedding_queries") as pipeline:
    load_dataset = LoadDataFromHub(
        repo_id=original_repo,
        split="train",
        num_examples=5000,
        output_mappings={"question": "anchor"},  # expose the `question` column as `anchor` downstream
        config=builder.info.config_name # dataset config name, read from the dataset builder
    )
    generate_sentence_pair = GenerateSentencePair(
        triplet=True,  # `False` to generate only positive
        action="answer",
        llm=llm,
        input_batch_size=10,
        # NOTE(review): this passes the literal string "context". If the intent is
        # to feed each row's `context` column to the LLM, confirm in the
        # GenerateSentencePair docs that this argument is a column name and not
        # free-text context.
        context="context",
    )

    # Route the loader's output into the generation task.
    load_dataset >> generate_sentence_pair

In [None]:
# Runtime parameters for the loading step (same repo and split as in its definition).
loader_params = {
    "repo_id": original_repo,
    "split": "train",
}

# Runtime parameters for the generation step: sampling settings for the LLM.
generation_params = {
    "llm": {
        "generation_kwargs": {
            "temperature": 0.7,
            "max_new_tokens": 512,
        }
    }
}

# Run the pipeline; parameters are addressed by each step's name.
distiset = pipeline.run(
    parameters={
        load_dataset.name: loader_params,
        generate_sentence_pair.name: generation_params,
    },
    use_cache=False,  # caching disabled; remove this argument for demos
)

In [None]:
def _first_answer_text(answer):
    """Return the first entry of ``answer["text"]``, or ``None`` when there is none."""
    texts = answer["text"]
    # `len()` rather than truthiness: after `to_pandas()` the texts may be a NumPy
    # array, whose truth value is ambiguous when it holds several answers.
    return texts[0] if len(texts) > 0 else None


def postprocess_distiset(distiset: "distilabel.distiset.Distiset") -> "Dataset":
    """Flatten a pipeline result into a deduplicated dataset of answer triplets.

    Takes the ``train`` split of the ``default`` configuration, keeps only the
    first human answer of each row, removes bookkeeping columns, and renames the
    answer columns so their origin (human vs. synthetic) is explicit.

    Args:
        distiset: Pipeline output exposing ``distiset["default"]["train"]`` with
            at least ``anchor``, ``answers``, ``positive`` and ``negative`` columns.

    Returns:
        A ``Dataset`` with ``human_positive``, ``synthetic_positive`` and
        ``synthetic_negative`` columns (plus any other untouched columns), one
        row per unique (``anchor``, ``human_positive``) pair.

    Raises:
        KeyError: If the ``default``/``train`` split or the ``answers`` column is missing.
    """
    # Annotations are quoted so that defining this function does not depend on
    # which distilabel submodules happen to be loaded.
    triplets = (
        distiset["default"]["train"]
        .to_pandas()
        # Column-wise map instead of a row-wise apply: cheaper, and it still
        # yields a Series when the frame is empty.
        .assign(answers=lambda frame: frame["answers"].map(_first_answer_text))
        # Bookkeeping columns vary between runs; ignore the ones that are absent.
        .drop(columns=["id", "title", "distilabel_metadata", "model_name"], errors="ignore")
        .rename(columns={"positive": "synthetic_positive", "negative": "synthetic_negative", "answers": "human_positive"})
        .drop_duplicates(subset=["anchor", "human_positive"])
        # Dropping rows leaves gaps in the index; without a reset it would be
        # written out as a spurious `__index_level_0__` column.
        .reset_index(drop=True)
    )
    return Dataset.from_pandas(triplets, preserve_index=False)

# Convert the pipeline output into the final dataset; the bare name on the last
# line makes the notebook display it.
ds_synthetic = postprocess_distiset(distiset)
ds_synthetic

In [18]:
# Publish the post-processed synthetic dataset to its repository on the Hub
ds_synthetic.push_to_hub(synthetic_repo)