# Variables, Constants and Libraries definition

In [1]:
from dotenv import load_dotenv # requires python-dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider #requires azure-identity
from openai import AzureOpenAI
import openai, os, json, time, glob, kagglehub # openai is used for error catching, otherwise we use AzureOpenAI
import pandas as pd

# Load the Azure OpenAI settings from the local .env file into the process environment
load_dotenv("./../config/credentials_my.env")

# Required settings: a missing key raises KeyError here, before any API call is attempted
openai_endpoint       = os.environ["azure_openai_endpoint"]
openai_api_key        = os.environ["azure_openai_api_key"]
openai_api_version    = os.environ["openai_api_version"]
azure_deployment_name = os.environ["azure_openai_chat_deployment_name"]

# A single credential object, shared with the bearer-token provider below
credential            = DefaultAzureCredential()

# Callable that yields bearer tokens for the Cognitive Services scope
token_provider        = get_bearer_token_provider(
    credential, "https://cognitiveservices.azure.com/.default")

# All files generated by this notebook are written under this folder
folder_path = "./enrichment"

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Path prefix (no extension) shared by the exported dataset files
output_file_name = os.path.join(folder_path, "questions_enriched")

# Get the current user's home directory
home_dir = os.path.expanduser("~")

# [Single-Topic RAG Evaluation Dataset](https://www.kaggle.com/datasets/samuelmatsuoharris/single-topic-rag-evaluation-dataset) retrieval

In [2]:
# Expected location of the dataset inside the kagglehub cache, relative to the home directory
dataset_relative_path = ".cache/kagglehub/datasets/samuelmatsuoharris/single-topic-rag-evaluation-dataset/versions/3"

dataset_path = os.path.join(home_dir, dataset_relative_path)

# Only contact Kaggle when the cached copy is not already on disk
if not os.path.exists(dataset_path):
    dataset_path = kagglehub.dataset_download("samuelmatsuoharris/single-topic-rag-evaluation-dataset") # Download latest version

print("Path to dataset files:", dataset_path)

# Find all CSV files in the directory.
# glob gives no ordering guarantee, so sort to keep positions in `dfs` stable across machines.
csv_files = sorted(glob.glob(os.path.join(dataset_path, "*.csv")))

# Fail here with a clear message instead of with an IndexError in a later cell
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in dataset folder: {dataset_path}")

# Load each CSV file into a DataFrame and store (base file name, DataFrame) pairs in a list
dfs = [(os.path.splitext(os.path.basename(file))[0], pd.read_csv(file)) for file in csv_files]

# Preview the first rows of every table
for i, (name, frame) in enumerate(dfs):
    print(f"\n\ndataframe #{i} <{name}>:")
    display(frame.head())

Path to dataset files: C:\Users\mauromi\.cache/kagglehub/datasets/samuelmatsuoharris/single-topic-rag-evaluation-dataset/versions/3


dataframe #0 <documents>:


Unnamed: 0,index,source_url,text
0,0,https://enterthegungeon.fandom.com/wiki/Bullet...,Bullet Kin\nBullet Kin are one of the most com...
1,1,https://www.dropbox.com/scl/fi/ljtdg6eaucrbf1a...,---The Paths through the Underground/Underdark...
2,2,https://bytes-and-nibbles.web.app/bytes/stici-...,Semantic and Textual Inference Chatbot Interfa...
3,3,https://github.com/llmware-ai/llmware,llmware\n\nBuilding Enterprise RAG Pipelines w...
4,4,https://docs.marimo.io/recipes.html,Recipes\nThis page includes code snippets or “...




dataframe #1 <multi_passage_answer_questions>:


Unnamed: 0,document_index,question,answer
0,0,Which enemy types wield an AK-47?,Assault-rifle wielding Bullet and Tankers wiel...
1,0,What makes jammed enemies different?,"Jammed Keybullet Kin drop 2 keys instead of 1,..."
2,1,What enemies are encountered in the second enc...,26 kobolds and 1 kobold inventor are encounter...
3,1,What monsters are encountered in this journey?,"Ropers, kobolds, kobold inventors, fire giants..."
4,2,What framework was chosen to execute the RAG p...,The LangChain framework was used to orchestrat...




dataframe #2 <no_answer_questions>:


Unnamed: 0,document_index,question
0,0,How much health does the Mutant Bullet Kin have?
1,0,Where can bishops be found?
2,1,What happened on day 10?
3,1,What did the goblins say?
4,2,Why was the H100 GPU chosen for computation?




dataframe #3 <single_passage_answer_questions>:


Unnamed: 0,document_index,question,answer
0,0,What do keybullet kin drop?,Keybullet kin drop a key upon death.
1,0,What kind of gun does the bandana bullet kin use?,The bandana bullet kin wields a machine pistol.
2,1,What do the giants look like?,"One giant is burly, grey-skinned, and 20 feet ..."
3,1,What happens on day 2?,"After a few miles of winding tunnel, you emerg..."
4,2,What were the requirements for the project?,The tool had the following requirements:\n- Ch...


In [3]:
# `dfs` holds (csv base name, DataFrame) pairs; look tables up by name so the
# selection does not depend on the order in which the files were listed.
frames_by_name = dict(dfs)

docs_df      = frames_by_name["documents"]
ko_df        = frames_by_name["no_answer_questions"]
# Answerable questions: single-passage first, then multi-passage
questions_df = pd.concat([
    frames_by_name["single_passage_answer_questions"],
    frames_by_name["multi_passage_answer_questions"]])
# Dump the raw document texts to a plain-text file in the output folder
docs_df["text"].to_csv(f"{folder_path}/documents.txt", index=False, header=False, sep="\n")

# OpenAI client via the Azure OpenAI SDK (using the Responses API)

In [4]:
# Build the client with Entra ID (bearer token) authentication.
# The endpoint is passed explicitly: without it the SDK falls back to the
# AZURE_OPENAI_ENDPOINT environment variable, which the lower-case key loaded
# from the .env file only satisfies on platforms with case-insensitive
# environment variable names.
client = AzureOpenAI(
    azure_endpoint = openai_endpoint,
    azure_ad_token_provider = token_provider,
    api_version = openai_api_version)

# Smoke test: one short request to confirm endpoint, auth and deployment work
response = client.responses.create(
    model = azure_deployment_name,
    input="how can I make a good pizza?")

# Only the first 30 characters are shown to keep the cell output short
print(f"Response: {response.output_text[:30]}...")

Response: Making a good pizza at home is...


# Create full evaluation dataset

In [5]:
def enrich_dataset(content: dict) -> dict:
    """Ask the model for one alternative correct answer and one plausible wrong answer.

    Args:
        content: JSON-serialisable dictionary sent as the user message. The
            prompt describes it as having the keys "context", "query" and
            "ground_truth".

    Returns:
        The model output parsed from JSON. The requested schema declares two
        required string fields, "response_correct" and "response_wrong".
    """
    instructions = """
    given a json dictionary whose keys are "context", "query", "ground_truth", please generate two outputs:
    1) "response_correct": another correct answer, possibly shorter and simpler, equivalent to the original one but with different words.
    2) "response_wrong": a wrong answer that is contextualized and may look correct to a person who doesn't know the document well, using the same style as the ground truth and the second truth
    """

    # JSON schema the structured output is requested to follow
    output_schema = {
        "type": "object",
        "properties": {
            "response_correct": {"type": "string"},
            "response_wrong": {"type": "string"}
        },
        "required": ["response_correct", "response_wrong"],
        "additionalProperties": False
    }

    completion = client.responses.create(
        model = azure_deployment_name,
        # System prompt first, then the record to enrich serialised as JSON
        input = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": json.dumps(content)}
        ],
        text  = {
            "format": {
                "type": "json_schema",
                "name": "enriched_dataset",
                "schema": output_schema,
                "strict": True
            }
        }
    )

    return json.loads(completion.output_text)

# Consolidate the source dataframes into a single one called `consolidated_df`

In [6]:
# Pair every question with its source document (questions_df.document_index == docs_df["index"]),
# then rename, prune and order the columns in a single chain.
consolidated_df = (
    questions_df
    .merge(docs_df, left_on='document_index', right_on='index')
    .rename(columns={'text': 'context', 'question': 'query', 'answer': 'ground_truth'})
    .drop(columns=['index', 'source_url'])
    [['document_index', 'context', 'query', 'ground_truth']]
)

consolidated_df.head()

Unnamed: 0,document_index,context,query,ground_truth
0,0,Bullet Kin\nBullet Kin are one of the most com...,What do keybullet kin drop?,Keybullet kin drop a key upon death.
1,0,Bullet Kin\nBullet Kin are one of the most com...,What kind of gun does the bandana bullet kin use?,The bandana bullet kin wields a machine pistol.
2,1,---The Paths through the Underground/Underdark...,What do the giants look like?,"One giant is burly, grey-skinned, and 20 feet ..."
3,1,---The Paths through the Underground/Underdark...,What happens on day 2?,"After a few miles of winding tunnel, you emerg..."
4,2,Semantic and Textual Inference Chatbot Interfa...,What were the requirements for the project?,The tool had the following requirements:\n- Ch...


# Enrich `consolidated_df` into `enriched_df`

In [9]:
# Retry policy for the enrichment calls
max_attempts = 10             # upper bound on calls per row, so a persistent failure cannot loop forever
rate_limit_wait_seconds = 90  # pause after a rate-limit error before trying again

# Collected model outputs, one entry per row of `enriched_df`
response_correct_list = []
response_wrong_list = []

enriched_df = consolidated_df.copy()

# Highest row position, used as the denominator in the progress messages
total_rows = len(enriched_df)-1

# Enrich the rows one by one; `i` is the row position used for progress reporting
for i, (_, row) in enumerate(enriched_df.iterrows()):
    for attempt in range(1, max_attempts + 1):
        try:
            # Call enrich_dataset() for each row
            enriched_data = enrich_dataset({
                "context": row['context'],
                "query": row['query'],
                "ground_truth": row['ground_truth']
            })
        except openai.RateLimitError:
            print(f"Rate limit exceeded. Waiting for {rate_limit_wait_seconds} seconds before retrying...\n")
            time.sleep(rate_limit_wait_seconds)  # Wait before retrying
            continue

        # Ensure the expected keys exist
        if 'response_correct' in enriched_data and 'response_wrong' in enriched_data:
            response_correct_list.append(enriched_data['response_correct'])
            response_wrong_list.append(enriched_data['response_wrong'])

            print(f"Index {i}/{total_rows}:\n- second_truth: <{response_correct_list[i]}>\n- wrong_answer: <{response_wrong_list[i]}>\n")
            break  # Exit loop on success

        print("Missing keys in enriched_data, retrying...\n")
    else:
        # Every attempt failed: stop instead of leaving the lists misaligned with the rows
        raise RuntimeError(f"Row {i}: no valid enrichment obtained after {max_attempts} attempts")


# Assign the lists as new columns in the DataFrame
enriched_df['response_correct'] = response_correct_list
enriched_df['response_wrong'] = response_wrong_list

# Display the updated DataFrame
display(enriched_df.head())

Index 0/79:
- second_truth: <Keybullet kin release a key when defeated.>
- wrong_answer: <Keybullet kin do not drop any items upon death and simply disappear.>

Index 1/79:
- second_truth: <The Bandana Bullet Kin uses a machine pistol.>
- wrong_answer: <The Bandana Bullet Kin uses an AK-47.>

Index 2/79:
- second_truth: <One giant has grey skin, is very muscular and about 20 feet tall. It's wearing black iron armor with spikes and holds two large, spiked shields. The other giant wears lighter armor and holds a giant maul, appearing uninterested.>
- wrong_answer: <The giants are small, standing only 6 feet tall, and have brightly colored clothing. They carry simple wooden clubs and look curious and approachable.>

Index 3/79:
- second_truth: <You find yourself in a damp grotto after walking the tunnels for miles and come across two ropers.>
- wrong_answer: <Day 2 involves crossing paths with friendly dwarves who guide you safely through the underground rivers.>

Index 4/79:
- second_tru

Unnamed: 0,document_index,context,query,ground_truth,response_correct,response_wrong
0,0,Bullet Kin\nBullet Kin are one of the most com...,What do keybullet kin drop?,Keybullet kin drop a key upon death.,Keybullet kin release a key when defeated.,Keybullet kin do not drop any items upon death...
1,0,Bullet Kin\nBullet Kin are one of the most com...,What kind of gun does the bandana bullet kin use?,The bandana bullet kin wields a machine pistol.,The Bandana Bullet Kin uses a machine pistol.,The Bandana Bullet Kin uses an AK-47.
2,1,---The Paths through the Underground/Underdark...,What do the giants look like?,"One giant is burly, grey-skinned, and 20 feet ...","One giant has grey skin, is very muscular and ...","The giants are small, standing only 6 feet tal..."
3,1,---The Paths through the Underground/Underdark...,What happens on day 2?,"After a few miles of winding tunnel, you emerg...",You find yourself in a damp grotto after walki...,Day 2 involves crossing paths with friendly dw...
4,2,Semantic and Textual Inference Chatbot Interfa...,What were the requirements for the project?,The tool had the following requirements:\n- Ch...,The project requirements were:\n- A chatbot th...,The requirements included:\n- A chatbot capabl...


In [10]:
# Flat CSV export, without the positional index column
enriched_df.to_csv(f"{output_file_name}.csv", index=False)

# JSON Lines export: one record per line
enriched_df.to_json(f"{output_file_name}_records.jsonl", orient="records", lines=True)

# One JSON file per remaining pandas orientation, each suffixed with its orient name
for json_orient in ("split", "table", "index"):
    enriched_df.to_json(f"{output_file_name}_{json_orient}.json", orient=json_orient)