In [None]:
import os
from dotenv import load_dotenv

# Try to pull settings from a .env file; load_dotenv()'s return value tells us
# whether that worked, and we only report the outcome here.
if load_dotenv():
    print(".env found! Loading environement variables.")
else:
    print("No .env file found, was this intentional?")

# Environment variables this notebook checks for before doing any work.
env_vars = [
    "ACS_API_KEY",
    "AZURE_SEARCH_SERVICE_ENDPOINT",
    "AOAI_BASE_ENDPOINT",
    "AOAI_API_KEY",
]

# Collect every missing variable first so they are all reported in a single
# run, instead of stopping at the first one.
missing_env_vars = [env_var for env_var in env_vars if os.getenv(env_var) is None]
for env_var in missing_env_vars:
    print(f"Missing environment variable: {env_var}")

if missing_env_vars:
    # Raise SystemExit explicitly rather than calling exit(): exit/quit are
    # interactive-shell helpers that the Python docs say not to use in programs.
    raise SystemExit(1)

In [None]:
# Given a chunk, generate a question-answer pair tagged with its ground-truth source (filename, page_number)
from openai import AzureOpenAI

def _parse_question_answer(content):
    """Split a ``Question|Answer`` completion into a ``(question, answer)`` tuple.

    Returns None when ``content`` is not a string, does not contain exactly one
    ``|`` separator, or has a blank question or answer after trimming whitespace.
    """
    if not isinstance(content, str):
        return None

    parts = content.split("|")
    if len(parts) != 2:
        return None

    question, answer = (part.strip() for part in parts)
    if not question or not answer:
        return None

    return question, answer


def create_query(chunk, filename, page_number):
    """Generate one question-answer pair for ``chunk`` with an Azure OpenAI chat model.

    The endpoint and key are read from the ``AOAI_BASE_ENDPOINT`` and
    ``AOAI_API_KEY`` environment variables.

    Args:
        chunk: Text inserted into the prompt as the context for the question.
        filename: Stored as the ground-truth source file of the chunk.
        page_number: Stored as the ground-truth source page of the chunk.

    Returns:
        A dict with ``query``, ``answer`` and a one-element ``sources`` list
        holding ``filename`` and ``page_number``; or None (after printing a
        message) when the completion cannot be parsed as ``Question|Answer``.
        The question and answer are returned with surrounding whitespace removed.

    Raises:
        Errors raised by the client while requesting the completion are not
        caught here and propagate to the caller.
    """
    api_key = os.environ.get("AOAI_API_KEY")
    api_endpoint = os.environ.get("AOAI_BASE_ENDPOINT")
    api_version = "2023-07-01-preview"

    GPT_MODEL_DEPLOYMENT = "gpt-35-turbo"

    TEMPERATURE = 0.2
    MAX_TOKENS = 2000

    GENERATE_QUESTION_PROMPT = """You are an AI assistant.
    Your job is to generate one relevant question-answer pair based only on the current context:

    ```
    {current_context}
    ```

    - Use this format: Question|Answer
    - Make sure to output the content in the Question and Answer fields are on the same line.
    """

    prompt = GENERATE_QUESTION_PROMPT.format(current_context=chunk)
    messages = [{"role": "system", "content": prompt}]

    client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=api_endpoint)

    response = client.chat.completions.create(
        model=GPT_MODEL_DEPLOYMENT,
        messages=messages,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    # Guard against an empty choices list so the failure path below cannot
    # itself raise while building its message.
    content = response.choices[0].message.content if response.choices else None

    parsed = _parse_question_answer(content)
    if parsed is None:
        print(f"Unable to generate qa from {content}")
        return None

    query, answer = parsed
    return {
        "query": query,
        "answer": answer,
        "sources": [
            {
                "filename": filename,
                "page_number": page_number
            }
        ]
    }

In [None]:
# Smoke test: generate a single QA pair from a toy sentence.
result = create_query("The quick brown fox jumps over the lazy dog.", "test.pdf", 1)
# Bare expression; it is not the cell's last statement.
result

# Sample list holding the same generated pair twice.
results = [result, result]

In [None]:
# Write queries to file
import json

def write_queries_to_file(results, filename):
    """Write ``results`` to ``filename`` as JSON Lines (one JSON document per line).

    Args:
        results: Iterable of JSON-serializable objects; each one becomes a line.
        filename: Destination path. Missing parent directories are created and
            an existing file is overwritten.
    """
    # Create the target directory up front so the write cannot fail on a
    # missing folder. A bare filename has no parent component to create.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(result) + "\n" for result in results)

In [None]:
# Retrieve chunks and create query for each
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents import SearchClient

# Connection settings for the search service; endpoint and key come from the environment.
index_name = "search-eval-index"
acs_key = os.getenv("ACS_API_KEY")
acs_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")

search_indexer_client = SearchIndexerClient(acs_endpoint, AzureKeyCredential(acs_key))
search_client = SearchClient(acs_endpoint, index_name, AzureKeyCredential(acs_key))

# NOTE(review): "*" presumably matches every document in the index - confirm.
search_results = search_client.search(search_text="*")

# Lazily build one QA pair per search hit, then keep only the ones that
# create_query managed to produce (it returns None on failure).
generated_pairs = (
    create_query(hit["content"], hit["filename"], hit["page_number"])
    for hit in search_results
)
query_results = [pair for pair in generated_pairs if pair is not None]

write_queries_to_file(query_results, "./mlops/evaluation/data/queries.jsonl")