In [159]:
import json, random
from openai import OpenAI
from dotenv import dotenv_values

from typing import List

In [160]:
# Input (summaries) and output (queries) JSON files used by this notebook.
summaries_filename = "01_summaries.json"
queries_filename = "02_queries.json"

# Settings parsed by python-dotenv's `dotenv_values()`; both keys are
# required, and a missing one raises KeyError right here.
env = dotenv_values()
API_KEY = env["OPENAI_API_KEY"]
MODEL = env["OPENAI_MODEL"]

In [162]:
# OpenAI API client shared by every cell below, authenticated with API_KEY.
client = OpenAI(api_key=API_KEY)


In [163]:
# Define a helper function to extract the only key-value pair from a dictionary no matter what the key is.

def get_value_from_dict(dict):
    """Return the value stored under the first key of `dict`.

    Intended for single-entry dictionaries whose key name is not known in
    advance. If there are several entries, the value of the first key in
    iteration order is returned. An empty dictionary raises IndexError.
    """
    first_key = list(dict)[0]
    return dict[first_key]


# Pipeline


In [164]:
# Register the assistant used for query generation. Replies are requested in
# JSON-object format, and sampling temperature is set to 0.7.
assistant = client.beta.assistants.create(
    model=MODEL,
    name="asst_bachelor_queries",
    temperature=0.7,
    response_format={"type": "json_object"},
    instructions="You are a helpful assistant that generates keyword-based, Google-like user search queries that retrieve a subset of a specified list of documents.",
)

In [165]:
# Two variants of the user prompt. Previously both were assigned to `prompt`
# back to back, so the first was silently overwritten; they are now named so
# that the choice is explicit.

# Generic variant: queries of various levels of specificity (not used).
PROMPT_GENERIC = "Generate 10 English user search queries that would retrieve a subset of an arbitrary size from the list of documents provided above. Each object contains a document id, title, and a short summary that describes its contents. Return a JSON array of strings, where each string represents a single query. The queries should be keyword-based and should resemble Google search queries of various levels of specificity."

# Keyword variant: 3 to 5 word queries built from domain-specific keywords.
PROMPT_DOMAIN_KEYWORDS = "Generate 10 English user search queries that would retrieve a subset of an arbitrary size from the list of documents provided above. Each object contains a document id, title, and a short summary that describes its contents. Use advanced, domain-specific keywords directly from the document summaries to craft the queries. Return a JSON array of strings, where each string represents a single query. The queries should be 3 to 5 words long, keyword-based and should resemble Google search queries."

# Prompt actually sent with every batch.
prompt = PROMPT_DOMAIN_KEYWORDS

In [166]:
# Load summaries, leaving out those with None/null value, then split them
# into consecutive batches of at most BATCH_SIZE records.

BATCH_SIZE = 40  # maximum number of summary records per batch

# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON.
with open(summaries_filename, "r", encoding="utf-8") as f:
    summaries = [s for s in json.load(f) if s["summary"] is not None]

batches = [summaries[i:i + BATCH_SIZE]
           for i in range(0, len(summaries), BATCH_SIZE)]

In [167]:
def get_queries_for_batch(batch):
    """Ask the assistant for search queries covering one batch of documents.

    The batch is serialised to JSON and sent, followed by the module-level
    `prompt`, as one user message in a freshly created thread. The
    assistant's latest message is parsed as JSON and the list it carries is
    returned. The thread is deleted afterwards, whether or not the call
    succeeded.

    Args:
        batch: JSON-serialisable collection of document records (presumably
            dicts with id, title and summary, as the prompt text describes --
            confirm against the summaries file).

    Returns:
        The list of queries found in the assistant's JSON reply.

    Raises:
        RuntimeError: if the run does not finish with status "completed", or
            the thread contains no message to read.
        ValueError: if the reply does not carry a JSON list.
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    # One throw-away thread per batch.
    thread = client.beta.threads.create()

    try:
        client.beta.threads.messages.create(
            thread_id=thread.id,
            content=[
                {
                    "type": "text",
                    "text": json.dumps(
                        batch,
                        indent=2,
                        ensure_ascii=False
                    )
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ],
            role="user"
        )

        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread.id,
            assistant_id=assistant.id,
        )

        print(run)

        if run.status != "completed":
            raise RuntimeError(
                f"Run failed: {run.status} (last_error={run.last_error})"
            )

        messages_cursor = client.beta.threads.messages.list(
            thread.id,
            limit=1,
            order="desc"
        )

        # Only the newest message is needed, so stop after the first item
        # instead of materialising the whole iterator, which could page
        # through every message in the thread.
        message = next(iter(messages_cursor), None)
        if message is None:
            raise RuntimeError("Run completed but the thread has no messages")

        response_content = message.content[0].text.value
        parsed = json.loads(response_content)

        # The assistant is configured for JSON-object replies, so the list is
        # normally wrapped under a single key of unknown name; a bare list is
        # accepted as well.
        if isinstance(parsed, dict):
            results: List[str] = get_value_from_dict(parsed)
        else:
            results = parsed

        # Guard against e.g. a plain string, which the caller's
        # `queries.extend(...)` would silently split into characters.
        if not isinstance(results, list):
            raise ValueError(
                f"Expected a JSON list of queries, got: {response_content!r}"
            )

        return results
    finally:
        # Always clean up, so failed runs do not leave threads behind.
        client.beta.threads.delete(thread.id)

In [168]:
# Generate queries for each batch of summaries, repeated N_ROUNDS times with
# the summaries reshuffled and re-batched before every round.

N_ROUNDS = 5

queries = []

for _ in range(N_ROUNDS):
    # Shuffling `batches` itself would only change the order in which the
    # same fixed groups of documents are sent; since every call runs in its
    # own thread, that order cannot influence the results. Shuffle the
    # documents and rebuild the batches so each round sees new combinations.
    # random.sample returns a new list, leaving `summaries` untouched.
    shuffled_summaries = random.sample(summaries, k=len(summaries))
    round_batches = [shuffled_summaries[i:i + BATCH_SIZE]
                     for i in range(0, len(shuffled_summaries), BATCH_SIZE)]

    for batch in round_batches:
        batch_queries = get_queries_for_batch(batch)
        queries.extend(batch_queries)

        print(json.dumps(
            batch_queries,
            indent=2,
            ensure_ascii=False
        ))

Run(id='run_6QR4vLAnUl2qU2A3au2WQLMv', assistant_id='asst_SreO4Pk8aTxVftJmDMm2ybHM', cancelled_at=None, completed_at=1736190403, created_at=1736190399, expires_at=None, failed_at=None, incomplete_details=None, instructions='You are a helpful assistant that generates keyword-based, Google-like user search queries that retrieve a subset of a specified list of documents.', last_error=None, max_completion_tokens=None, max_prompt_tokens=None, metadata={}, model='gpt-4o', object='thread.run', parallel_tool_calls=True, required_action=None, response_format=ResponseFormatJSONObject(type='json_object'), started_at=1736190400, status='completed', thread_id='thread_BssWHFKNBqC1Rtjk0W08hID8', tool_choice='auto', tools=[], truncation_strategy=TruncationStrategy(type='auto', last_messages=None), usage=Usage(completion_tokens=92, prompt_tokens=12892, total_tokens=12984, prompt_token_details={'cached_tokens': 0}), temperature=0.7, top_p=1.0, tool_resources={})
[
  "marine protected areas Arctic",
  "R

In [169]:
# Save generated queries to a json file

# ensure_ascii=False writes non-ASCII characters verbatim, so open the file
# as UTF-8 explicitly; the locale default may be unable to encode them and
# would not yield UTF-8 JSON.
with open(queries_filename, 'w', encoding='utf-8') as f:
    json.dump(queries, f, indent=2, ensure_ascii=False)

In [170]:
# Delete the assistant created earlier in this notebook and print the
# deletion result returned by the API.

res = client.beta.assistants.delete(assistant.id)
print(res)

AssistantDeleted(id='asst_SreO4Pk8aTxVftJmDMm2ybHM', deleted=True, object='assistant.deleted')
