In [1]:
import pandas as pd
import os 
from dotenv import load_dotenv


In [3]:

# Populate the process environment via python-dotenv before reading from it.
load_dotenv()

# Look up the API key; the result is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [4]:
from openai import OpenAI

# No credentials are passed explicitly here. NOTE(review): the SDK presumably
# reads OPENAI_API_KEY from the environment populated by load_dotenv() — confirm.
client = OpenAI()

In [6]:
# Read the games dataset; it is comma-separated, which is read_csv's default.
df = pd.read_csv("../data/game-dataset.csv")
# Row count, shown as the cell output.
df.shape[0]

590

In [7]:
# One dict per dataset row, keyed by column name, so a row can be expanded
# into the prompt template with ** later on.
documents = df.to_dict(orient='records')

In [9]:
# Prompt sent to the model for each game record. The {name} placeholders are
# filled with str.format(**record), so every record must provide all of these
# keys; the doubled braces in the JSON example render as literal braces.
# NOTE(review): the wording has typos ("a provided games", "as fewer words")
# and a stray comma after {level}; left untouched here because editing the
# text changes what is sent to the model.
prompt_template = """
You emulate a user of our game assistant application.
Formulate 5 questions this user might ask based on a provided games.
Make the questions specific to this game.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

 gameName: {gameName}
 alternateNames: {alternateNames}
 subcategory: {subcategory}
 level: {level},
 description: {description}
 playersMax: {playersMax}
 ageRange: {ageRange}
 duration: {duration}
 equipmentNeeded: {equipmentNeeded}
 objective: {objective}
 skillsDeveloped: {skillsDeveloped}
 setupTime: {setupTime}
 place: {place}
 physicalIntensityLevel: {physicalIntensityLevel}
 educationalBenefits: {educationalBenefits}
 category: {category}


Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [10]:
# Render the template for the first record only, as a trial input for the model.
prompt = prompt_template.format(**documents[0])

In [11]:
def llm(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text.

    The request goes through the notebook-level ``client``. Only the message
    content of the first returned choice is handed back.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [12]:
# Trial run: send the first record's prompt and keep the raw reply text.
questions = llm(prompt)

In [13]:
import json

In [14]:
# Check that the raw reply parses as JSON; the parsed value is the cell output.
json.loads(questions)

{'questions': ['What is the primary objective of playing soccer, and how does a team achieve this during a match?',
  'Can you describe the typical duration of a soccer game and what setup time is generally required before play begins?',
  'What essential equipment is necessary for participating in a soccer match, and why is each item important?',
  'At what age range can players start enjoying soccer, and how does the sport accommodate different skill levels from beginner to professional?',
  "What skills are developed through playing soccer, and how do these skills contribute to a player's overall fitness and teamwork abilities?"]}

In [15]:
def generate_questions(doc):
    """Render the question-generation prompt for one record and return the raw reply.

    Args:
        doc: Mapping that provides every placeholder named in
            ``prompt_template`` (it is expanded with ``**doc``).

    Returns:
        The model's reply text, unparsed. The generation loop in this
        notebook runs ``json.loads`` on it.

    Raises:
        KeyError: If ``doc`` lacks a field referenced by ``prompt_template``.
    """
    # Delegate to llm() so the model name and request shape are defined once.
    return llm(prompt_template.format(**doc))

In [16]:
from tqdm.auto import tqdm

In [17]:
# Maps gameId -> list of generated questions. It lives in its own cell, so
# re-running the generation loop cell does not reset what was already collected.
results = {}

In [18]:
# Generate questions for every record that is not already in `results`, so
# re-running this cell resumes an interrupted run instead of starting over.
failed_ids = []  # gameIds whose reply could not be used; retried on the next run

for doc in tqdm(documents):
    doc_id = doc['gameId']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        # JSON is only requested through the prompt wording, so a malformed or
        # differently shaped reply is possible. Record it and keep going rather
        # than aborting the remaining requests; the id stays out of `results`
        # and is therefore picked up again when the cell is re-run.
        failed_ids.append(doc_id)
        print(f"Skipping gameId {doc_id}: unusable reply ({type(err).__name__}: {err})")

if failed_ids:
    print(f"{len(failed_ids)} record(s) need a retry, re-run this cell: {failed_ids}")

  0%|          | 0/590 [00:00<?, ?it/s]

In [19]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) tuple per question.
final_results = [
    (game_id, text)
    for game_id, generated in results.items()
    for text in generated
]

In [20]:

# Spot-check the first (doc_id, question) pair.
final_results[0]

(1, 'What is the primary objective of playing soccer during a match?')

In [21]:
# One row per generated question. NOTE(review): 'q_id' holds the source
# record's gameId (repeated for each of its questions), not a per-question id.
df_results = pd.DataFrame(final_results, columns=['q_id', 'question'])

In [22]:
# Persist the ground-truth pairs; index=False leaves the DataFrame index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [23]:
# Shell out to `head` to eyeball the first lines of the file that was just written.
!head ../data/ground-truth-retrieval.csv

q_id,question
1,What is the primary objective of playing soccer during a match?
1,How many players are allowed on the field from both teams combined?
1,What age group is suitable for participating in soccer games?
1,What equipment do I need to play soccer safely and effectively?
1,How long does a typical soccer match last?
2,What is the maximum number of players that can participate in an Ultimate Frisbee game?
2,Can you describe the primary objective of playing Ultimate Frisbee?
2,How long does a typical game of Ultimate Frisbee last?
2,What equipment do I need to play Ultimate Frisbee?
