In [1]:
import json
import os

from react_agent.src.util.llm_proxy import LLM_PROXY

# Dimensions to label. Each name is used both as the stem of a
# `<name>.json` file under `resources_dir` and as the key written onto
# every labeled row.
dimensions = ["product", "category", "source", "persona", "activity", "country"]

# All paths are resolved against the current working directory, i.e. the
# directory the notebook kernel was started in.
notebook_dir = os.path.abspath(os.curdir)
resources_dir = os.path.join(notebook_dir, "resources")

# Destination file for the labeled data set.
labeled_set = os.path.join(resources_dir, "labeled_set.json")

In [2]:
# Prompt sent to the LLM once per (question, dimension) pair.
# It is filled in with str.format, so any literal brace added to the text
# must be doubled ("{{" / "}}").
# Placeholders:
#   {question}  - the "question" field of the data-set row being labeled
#   {dimension} - the text produced by create_dimension_string() for the
#                 dimension being labeled (one VALUE/DESCRIPTION line per entry)
prompt_template = """## Role
You are an expert in labeling questions in Electronic Document Processing, with deep domain knowledge in SAP Document and Reporting Compliance, Peppol, UBL, and eInvoicing standards.

## Goal
Your goal is to assign exactly one label to the question based on the provided dimension. The label should be the most relevant one that accurately describes the question.

## Instructions
Respond with the label only. Do not include any additional text or explanations.

## Question
{question}

## Dimension
{dimension}
"""

In [3]:
# Location of the unlabeled input data set.
data_set_path = os.path.abspath(os.path.join(resources_dir, "data_set.json"))

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# file is not decoded with the platform's locale default (e.g. cp1252 on
# Windows), which would corrupt or reject non-ASCII characters.
with open(data_set_path, encoding="utf-8") as f:
    # The labeling loop iterates over this object and reads row["question"],
    # so it is expected to be a JSON array of objects.
    data_set = json.load(f)

In [4]:
def create_dimension_string(dimension: str) -> str:
    """Render the entries of one labeling dimension as prompt text.

    Loads ``<dimension>.json`` from ``resources_dir`` and emits one line per
    entry, in file order. Each line consists of ``VALUE: <value>``, a space
    and a tab, then ``DESCRIPTION: <description>``, followed by a newline.

    Args:
        dimension: Name of the dimension; also the stem of its JSON file.
            The file is expected to contain a JSON array of objects, each
            with ``"value"`` and ``"description"`` keys.

    Returns:
        The concatenated lines, or an empty string when the file contains
        an empty array.

    Raises:
        FileNotFoundError: If no JSON file exists for ``dimension``.
        KeyError: If an entry lacks ``"value"`` or ``"description"``.
    """
    dimension_path = os.path.join(resources_dir, f"{dimension}.json")
    # Explicit UTF-8: JSON is UTF-8 by specification, while open() would
    # otherwise fall back to the platform's locale encoding.
    with open(dimension_path, encoding="utf-8") as f:
        # Kept in its own name so the `dimension` argument is not shadowed.
        entries = json.load(f)

    # Single quotes inside the replacement fields: reusing the f-string's own
    # quote character only parses on Python 3.12+ and is a SyntaxError before.
    return "".join(
        f"VALUE: {entry['value']} \tDESCRIPTION: {entry['description']}\n"
        for entry in entries
    )

In [None]:
# The rendered text for a dimension does not depend on the row, so build it
# once per dimension instead of re-reading and re-parsing the JSON file for
# every (row, dimension) pair.
dimension_strings = {
    dimension: create_dimension_string(dimension) for dimension in dimensions
}

# Ask the LLM for one label per dimension and store it on the row under the
# dimension's name. Rows are mutated in place.
for row in data_set:
    for dimension, dimension_string in dimension_strings.items():
        prompt = prompt_template.format(
            question=row["question"],
            dimension=dimension_string,
        )
        # NOTE(review): the value returned by LLM_PROXY.invoke is stored as-is
        # and later passed to json.dump; confirm it is JSON-serializable
        # (e.g. a plain str) rather than a response object.
        dimension_label = LLM_PROXY.invoke(prompt)

        row[dimension] = dimension_label

In [6]:
# Serialize before opening the output file: if any label is not
# JSON-serializable the error is raised here, before the destination is
# truncated, instead of leaving a partially written file behind.
labeled_json = json.dumps(data_set, indent=4)

# Explicit encoding so the file is written the same way on every platform.
with open(labeled_set, "w", encoding="utf-8") as f:
    f.write(labeled_json)