In [48]:
import pandas as pd

## Ingestion

In [49]:
df = pd.read_csv('../data/game-dataset.csv')

In [50]:
documents = df.to_dict(orient='records')

In [51]:
df.columns

Index(['gameId', 'gameName', 'alternateNames', 'subcategory', 'level',
       'description', 'playersMax', 'ageRange', 'duration', 'equipmentNeeded',
       'objective', 'skillsDeveloped', 'setupTime', 'place',
       'physicalIntensityLevel', 'educationalBenefits', 'category'],
      dtype='object')

In [52]:
import minsearch

# Initialize the Index
index = minsearch.Index(    
    text_fields=[
        'gameName', 'subcategory', 'level', 'description', 'ageRange',
        'duration', 'objective', 'skillsDeveloped', 
        'place', 'physicalIntensityLevel', 'educationalBenefits', 'category','playersMax',
    ],
    keyword_fields=['gameId', 'setupTime', 'equipmentNeeded', 'alternateNames']


)

documents = df.to_dict(orient='records')

# Fit the index with documents
index.fit(documents)

<minsearch.Index at 0x7ac1a22862d0>

## RAG flow

In [53]:
import os 
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Retrieve environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


In [54]:
from openai import OpenAI

client = OpenAI()

In [55]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [56]:
prompt_template = """
You're a game event organizer. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
 gameName: {gameName}
 alternateNames: {alternateNames}
 subcategory: {subcategory}
 level: {level},
 description: {description}
 playersMax: {playersMax}
 ageRange: {ageRange}
 duration: {duration}
 equipmentNeeded: {equipmentNeeded}
 objective: {objective},
 skillsDeveloped: {skillsDeveloped},
 setupTime: {setupTime},
 place: {place}
 physicalIntensityLevel: {physicalIntensityLevel}
 educationalBenefits: {educationalBenefits}
 category: {category}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [57]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [58]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    print(prompt)
    answer = llm(prompt, model=model)
    return answer

## RAG evaluation

In [59]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [60]:
df_question.head()

Unnamed: 0,q_id,question
0,1,What is the primary objective of playing socce...
1,1,How many players are allowed on the field from...
2,1,What age group is suitable for participating i...
3,1,What equipment do I need to play soccer safely...
4,1,How long does a typical soccer match last?


In [61]:
ground_truth = df_question.to_dict(orient='records')

In [62]:
ground_truth[0]

{'q_id': 1,
 'question': 'What is the primary objective of playing soccer during a match?'}

## RAG evaluation

In [63]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [64]:
len(ground_truth)

2950

In [65]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

You're a game event organizer. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: What is the primary objective of playing soccer during a match?

CONTEXT:
gameName: Dungeons & Dragons
 alternateNames: D&D
 subcategory: Role-Playing Game
 level: Beginner to Advanced,
 description: A tabletop role-playing game where players create characters and embark on imaginary adventures.
 playersMax: Unlimited
 ageRange: 12+
 duration: Variable
 equipmentNeeded: Rulebooks; Dice; Character sheets
 objective: Collaboratively tell a story and overcome challenges set by the Dungeon Master.,
 skillsDeveloped: Creativity; Teamwork; Problem-solving,
 setupTime: 15 minutes,
 place: Tabletop
 physicalIntensityLevel: Low
 educationalBenefits: Enhances creativity, storytelling, and social skills
 category: Role-Playing Games

gameName: Dungeons & Dragons (USA)
 alternateNames: D&D
 subcategory: Role-Playing Game
 l

In [66]:
print(answer_llm)

The primary objective of playing soccer during a match is to score more goals than the opposing team within the allotted time.


In [67]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the primary objective of playing soccer during a match?
Generated Answer: The primary objective of playing soccer during a match is to score more goals than the opposing team within the allotted time.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [68]:
import json

In [69]:
df_sample = df_question.sample(n=200, random_state=1)

In [70]:
sample = df_sample.to_dict(orient='records')

In [71]:
from tqdm.auto import tqdm

In [72]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

You're a game event organizer. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: Can you explain how the scoring system works in Tetris and any strategies to maximize points?

CONTEXT:
gameName: Orienteering
 alternateNames: nan
 subcategory: Navigation Sport
 level: Beginner to Advanced,
 description: A sport where participants find their way to various checkpoints using a map and compass.
 playersMax: Not specified
 ageRange: 8+
 duration: 30 minutes to several hours
 equipmentNeeded: Map; Compass; Running shoes
 objective: Locate all checkpoints in the shortest possible time.,
 skillsDeveloped: Navigation; Decision-making; Endurance,
 setupTime: Varies (setting up the course),
 place: Forests, parks, wilderness areas
 physicalIntensityLevel: Moderate to High
 educationalBenefits: Improves navigation skills, decision-making, and physical fitness
 category: Outdoor Sports and Games

gameNam

In [73]:
evaluations

[({'q_id': 518,
   'question': 'Can you explain how the scoring system works in Tetris and any strategies to maximize points?'},
  "I'm sorry, but the provided context does not include information about the scoring system in Tetris or any strategies related to it. Would you like to know about the scoring systems or strategies for any of the games listed in the context?",
  {'Relevance': 'NON_RELEVANT',
   'Explanation': 'The generated answer does not address the question about the scoring system in Tetris or any strategies to maximize points. Instead, it states that the information is not included in the provided context and does not provide any relevant details or alternatives related to Tetris.'}),
 ({'q_id': 163,
   'question': 'How long can players expect to spend completing Elden Ring?'},
  'Players can expect to spend 60 or more hours completing Elden Ring.',
  {'Relevance': 'RELEVANT',
   'Explanation': 'The generated answer directly addresses the question by providing a specifi

In [74]:
df_eval_mini = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval_mini['q_id'] = df_eval_mini.record.apply(lambda d: d['q_id'])
df_eval_mini['question'] = df_eval_mini.record.apply(lambda d: d['question'])

df_eval_mini['relevance'] = df_eval_mini.evaluation.apply(lambda d: d['Relevance'])
df_eval_mini['explanation'] = df_eval_mini.evaluation.apply(lambda d: d['Explanation'])

del df_eval_mini['record']
del df_eval_mini['evaluation']

In [75]:
df_eval_mini.relevance.value_counts()

relevance
RELEVANT           113
NON_RELEVANT        63
PARTLY_RELEVANT     24
Name: count, dtype: int64

In [76]:
df_eval_mini.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.565
NON_RELEVANT       0.315
PARTLY_RELEVANT    0.120
Name: proportion, dtype: float64

In [77]:
df_eval_mini.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [78]:
df_eval_mini[df_eval_mini.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,q_id,question,relevance,explanation
0,"I'm sorry, but the provided context does not i...",518,Can you explain how the scoring system works i...,NON_RELEVANT,The generated answer does not address the ques...
2,The provided context does not contain informat...,568,How long does a typical game of The Quacks of ...,NON_RELEVANT,The generated answer states that it cannot pro...
20,The main objective of Carcassonne during gamep...,121,What is the main objective of Carcassonne duri...,NON_RELEVANT,The generated answer does not address the ques...
23,The provided context does not include the game...,571,How long does a typical game of Scythe take to...,NON_RELEVANT,The generated answer does not address the ques...
24,The context provided does not include informat...,97,How long does a typical game of Yahtzee take t...,NON_RELEVANT,The generated answer states that there is no i...
...,...,...,...,...,...
185,The context provided does not include any info...,309,How many players can participate in a game of ...,NON_RELEVANT,The generated answer fails to address the ques...
187,The context does not provide information about...,533,How long does a typical game of KerPlunk last?,NON_RELEVANT,The generated answer does not provide any info...
193,The context does not provide information about...,465,What essential equipment do I need to play Mys...,NON_RELEVANT,The generated answer states that it cannot pro...
194,The provided context does not include informat...,505,What is the main objective of the game Monopol...,NON_RELEVANT,The generated answer states that it does not h...


In [79]:
df_eval_mini[df_eval_mini.relevance == 'NON_RELEVANT'].to_csv('../data/gpt-4o-mini-nonrelevant.csv', index=False)

In [80]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

You're a game event organizer. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: Can you explain how the scoring system works in Tetris and any strategies to maximize points?

CONTEXT:
gameName: Orienteering
 alternateNames: nan
 subcategory: Navigation Sport
 level: Beginner to Advanced,
 description: A sport where participants find their way to various checkpoints using a map and compass.
 playersMax: Not specified
 ageRange: 8+
 duration: 30 minutes to several hours
 equipmentNeeded: Map; Compass; Running shoes
 objective: Locate all checkpoints in the shortest possible time.,
 skillsDeveloped: Navigation; Decision-making; Endurance,
 setupTime: Varies (setting up the course),
 place: Forests, parks, wilderness areas
 physicalIntensityLevel: Moderate to High
 educationalBenefits: Improves navigation skills, decision-making, and physical fitness
 category: Outdoor Sports and Games

gameNam

In [81]:
df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_eval['q_id'] = df_eval.record.apply(lambda d: d['q_id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [82]:
df_eval.relevance.value_counts()

relevance
RELEVANT           116
NON_RELEVANT        59
PARTLY_RELEVANT     25
Name: count, dtype: int64

In [83]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.580
NON_RELEVANT       0.295
PARTLY_RELEVANT    0.125
Name: proportion, dtype: float64

In [85]:
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

In [86]:
df_eval[df_eval.relevance == 'NON_RELEVANT'].to_csv('../data/gpt-4o-nonrelevant.csv', index=False)