In [2]:
from openai import OpenAI
import pandas as pd
import random


In [11]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment so the API key never has to be hardcoded in the notebook.
load_dotenv()

# Shared API client for the notebook. The explicit environment lookup raises
# KeyError right here when OPENAI_API_KEY is missing, rather than failing
# later on the first request.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

In [5]:
# Load the preprocessed Danish Wikipedia passages from a local Parquet file.
# The file is expected to sit next to the notebook (relative path).
df = pd.read_parquet("processed_danish_wikipedia.parquet")

In [6]:
df.head()

Unnamed: 0,article_id,title,url,positive,negatives
0,20231101.da_28930_0,2-modulusdeler,https://da.wikipedia.org/wiki/2-modulusdeler,Men har man anvendt en fast fordeler (eng. pre...,[Indenfor den digitale elektronik bruges beteg...
1,20231101.da_720900_0,A.O. Andersen (skibsreder),https://da.wikipedia.org/wiki/A.O.%20Andersen%...,Fra 1919 havde Andersen sæde i Privatbankens b...,[A.O. Andersen fik 1. juli 1890 ansættelse i s...
2,20231101.da_43159_0,ASA Film,https://da.wikipedia.org/wiki/ASA%20Film,A/S Filmatelieret ASA (Oprindelige navn: Aktie...,[ASA-film A/S blev grundlagt i 1936 af Lau Lau...
3,20231101.da_13868_0,Adolf Hitler,https://da.wikipedia.org/wiki/Adolf%20Hitler,Den nationalsocialistiske udenrigspolitik vist...,"[Hitler førte den samme udenrigspolitisk, men ..."
4,20231101.da_996759_0,Amanda Nunes,https://da.wikipedia.org/wiki/Amanda%20Nunes,Nunes var planlagt til at møde Kaitlin Young p...,"[Nunes vandt fem kampe i træk, alle ved en for..."


In [12]:
def generate_prompt(language, positive):
    """Build the instruction prompt used to generate a search query.

    The prompt lists the constraints the generated question must satisfy,
    embeds the passage between ``<document>`` tags, and ends with the
    ``Search query:`` cue so that a completion model continues with the
    question itself.

    Args:
        language: Name of the language the question must be written in
            (e.g. ``"Danish"``).
        positive: Text of the passage the question must be answerable from.

    Returns:
        The complete prompt as a single string.
    """
    instructions = [
        "Your task is to anticipate possible search queries by users in the form of a question for a given document.",
        f"- The question must be written in {language}",
        "- The question should be formulated concretely and precisely and relate to the information from the given document",
        "- The question must be coherent and should make sense without knowing the document",
        "- The question must be answerable by the document",
        "- The question should focus on one aspect and avoid using subclauses connected with 'and'",
        "- The question should not be overly specific and should mimic a request of a user who is just starting to research the given topic",
        "- Do not draw on your prior knowledge",
    ]
    # The empty strings become the blank lines separating the three prompt
    # sections once everything is joined with newlines.
    document_section = [
        "",
        f"Generate a question in {language} for the following document:",
        "<document>",
        f"{positive}",
        "</document>",
        "",
        "Search query:",
    ]
    return "\n".join(instructions + document_section)

In [22]:
def request_query_generation(
    prompt,
    api_client=None,
    model="gpt-3.5-turbo-instruct",
    max_tokens=512,
    temperature=0.0,
):
    """Send a prompt to the completions endpoint and return the generated text.

    Args:
        prompt: Full prompt text to complete.
        api_client: Object exposing ``completions.create(...)``. When omitted,
            the notebook-level ``client`` is reused instead of constructing a
            new client (and a new HTTP connection pool) for every request.
        model: Name of the completion model to query.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API; defaults to 0.0.

    Returns:
        The text of the first returned choice with surrounding whitespace
        stripped.

    Raises:
        NameError: If ``api_client`` is omitted and the notebook-level
            ``client`` has not been created yet.
        IndexError: If the response contains no choices.
    """
    if api_client is None:
        # Reuse the shared client configured at the top of the notebook.
        api_client = client

    response = api_client.completions.create(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].text.strip()

In [23]:
language = "Danish"

# Build one record per article: the identifying columns, the positive
# passage, and the search query the LLM produced for that passage.
queries = []
for _, article in df.iterrows():
    passage = article['positive']

    # One API request per row; the query is generated before the record is
    # assembled.
    generated = request_query_generation(generate_prompt(language, passage))

    record = {field: article[field] for field in ('article_id', 'title', 'url')}
    record['positive'] = passage
    record['generated_query'] = generated
    queries.append(record)

In [24]:

# Collect the per-article records into a single frame (one row per article).
queries_df = pd.DataFrame(data=queries)

# Persist the results so the paid API calls do not need to be repeated;
# index=False leaves the DataFrame index out of the written file.
queries_df.to_parquet(path="positive_queries_with_prompt.parquet", index=False)

Query generation complete. Results saved to positive_queries_with_prompt.parquet


In [26]:
queries_df.shape

(100, 5)

In [28]:
queries_df.head()

Unnamed: 0,article_id,title,url,positive,generated_query
0,20231101.da_28930_0,2-modulusdeler,https://da.wikipedia.org/wiki/2-modulusdeler,Men har man anvendt en fast fordeler (eng. pre...,Hvad er N-værdien for en fast fordeler på 16?
1,20231101.da_720900_0,A.O. Andersen (skibsreder),https://da.wikipedia.org/wiki/A.O.%20Andersen%...,Fra 1919 havde Andersen sæde i Privatbankens b...,Hvordan hjalp Andersen med at rekonstruere Pri...
2,20231101.da_43159_0,ASA Film,https://da.wikipedia.org/wiki/ASA%20Film,A/S Filmatelieret ASA (Oprindelige navn: Aktie...,Hvem var de centrale personer på A/S Filmateli...
3,20231101.da_13868_0,Adolf Hitler,https://da.wikipedia.org/wiki/Adolf%20Hitler,Den nationalsocialistiske udenrigspolitik vist...,Hvordan viste Tysklands udenrigspolitik sit kl...
4,20231101.da_996759_0,Amanda Nunes,https://da.wikipedia.org/wiki/Amanda%20Nunes,Nunes var planlagt til at møde Kaitlin Young p...,Hvem skulle Nunes møde på Invicta FC 5 og hvor...
