In [44]:
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from groq import Groq
import openai
import json
from tqdm.auto import tqdm

In [2]:
import minsearch

## Ingestion

In [3]:
df = pd.read_csv('../data/data.csv')

In [4]:
documents = df.to_dict(orient='records')

In [5]:
index = minsearch.Index(
    text_fields=['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part',
       'type', 'muscle_groups_activated', 'instructions'],
    keyword_fields=['id']
)

In [6]:
index.fit(documents)

<minsearch.Index at 0x7fb4376c81f0>

## RAG Flow

In [7]:
load_dotenv()

True

In [8]:
client = OpenAI()

In [9]:
# client = Groq(
#     api_key=os.environ.get("GROQ_API_KEY"),
# )

In [10]:
def search(query):
    """Run ``query`` against the module-level ``index``.

    The search uses an empty filter dict and an empty boost dict and asks
    for 10 results; whatever ``index.search`` returns is passed back as is.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [11]:
prompt_template = """
You're a fitness instructor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

entry_template = """ 
exercise_name : {exercise_name}
type_of_activity : {type_of_activity}
type_of_equipment : {type_of_equipment}
body_part : {body_part}
type : {type}
muscle_groups_activated : {muscle_groups_activated}
instructions : {instructions}
""".strip()
    
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Every document in ``search_results`` is formatted with ``entry_template``
    and followed by a blank line. The concatenated entries fill the
    ``{context}`` slot of ``prompt_template`` and ``query`` fills
    ``{question}``. Leading and trailing whitespace is stripped from the
    final prompt.
    """
    entries = [entry_template.format(**doc) + "\n\n" for doc in search_results]
    filled = prompt_template.format(question=query, context="".join(entries))
    return filled.strip()


In [12]:
query =  "Can you explain how to do a Glute Bridge, I'm not sure about the movement."

In [13]:
search_results = search(query)
prompt = build_prompt(query, search_results)

In [53]:
def llm(prompt, model="gpt-4o-mini"):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client``; ``model`` selects
    the chat model. Only the content of the first returned choice is used.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [54]:
def rag(query, model="gpt-4o-mini"):
    """Answer ``query`` with the retrieve-then-generate pipeline.

    Documents are fetched with ``search``, rendered into a prompt by
    ``build_prompt`` and the prompt is sent to ``llm`` using ``model``.
    The text returned by ``llm`` is the result.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [16]:
answer = rag("What specific muscle groups are predominantly activated during the Cable Face Pull exercise?")
print(answer)

The specific muscle groups predominantly activated during the Cable Face Pull exercise are the Rear Delts, Traps, and Upper Back.


### Retrieval Evaluation

In [17]:
df_questions = pd.read_csv('../data/ground_truth_retrieval.csv')

In [38]:
df_questions.head()

Unnamed: 0,id,question
0,0,What is the starting position for a push-up?
1,0,Which muscle groups are activated during push-...
2,0,What type of exercise is a push-up classified as?
3,0,Do I need any equipment to perform push-ups?
4,0,What is the correct form for lowering my body ...


In [19]:
ground_truth = df_questions.to_dict(orient="records")

In [20]:
ground_truth[0]

{'id': 0, 'question': 'What is the starting position for a push-up?'}

In [21]:
def hit_rate(relevance_input):
    """Fraction of queries whose result list contains at least one hit.

    Args:
        relevance_input: one list per query, holding a boolean per returned
            result; ``True`` marks a result that matched the expected
            document.

    Returns:
        The number of lists containing ``True`` divided by the number of
        lists, or ``0.0`` when ``relevance_input`` is empty.
    """
    if len(relevance_input) == 0:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_input if True in line)
    return hits / len(relevance_input)

In [22]:
def mrr(relavance_input):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relavance_input: one list per query, holding a boolean per returned
            result in rank order. The parameter keeps its original spelling
            so existing keyword callers continue to work.

    Returns:
        The average over all queries of ``1 / rank`` of the first relevant
        result (ranks start at 1). A query without any relevant result
        contributes 0. Returns ``0.0`` when ``relavance_input`` is empty.
    """
    if len(relavance_input) == 0:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relavance_input:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a list with
                # several matches would add more than one reciprocal rank and
                # could push the score above 1.
                break

    return total_score / len(relavance_input)

In [23]:
def minsearch_search(query, boost=None):
    """Run ``query`` against the module-level ``index``, requesting 10 results.

    Args:
        query: the text to search for.
        boost: optional dict of per-field boosts forwarded as ``boost_dict``.
            ``None`` (the default) sends an empty dict. The parameter is
            optional so that callers passing boosts keep working no matter
            which definition of this function is currently live in the
            kernel.

    Returns:
        Whatever ``index.search`` returns.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [24]:
def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth records.

    Args:
        ground_truth: iterable of records with an ``'id'`` key. Each record
            is handed to ``search_function`` whole.
        search_function: callable taking one record and returning the
            retrieved documents, each of which has an ``'id'`` key.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over the per-query
        relevance lists.
    """
    def relevance_for(record):
        # Flag each retrieved document that is the one the question was written for.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [44]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.9346153846153846, 'mrr': 0.7993296703296706}

#### Finding the best parameters

In [25]:
# Validation subset used for boost tuning: the first 100 rows of df_questions.
# NOTE(review): the rows shown by df_questions.head() all share id 0, so this
# slice presumably covers only the first few exercises rather than a random
# sample - confirm that is intended.
df_Validation = df_questions.iloc[:100]

In [26]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximises ``objective_function``.

    Args:
        param_ranges: mapping of parameter name to ``(min_val, max_val)``.
            When both bounds are ``int`` the value is drawn with ``randint``
            (both ends inclusive); otherwise it is drawn with ``uniform``.
        objective_function: callable that receives the sampled parameter
            dict and returns a score; higher is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed for a private random generator, which makes the
            search reproducible. ``None`` (the default) keeps drawing from
            the shared ``random`` module state, as before.

    Returns:
        ``(best_params, best_score)``. If no iteration beats the initial
        score (for example when ``n_iterations`` is 0) the result is
        ``(None, float('-inf'))``.
    """
    rng = random if seed is None else random.Random(seed)
    best_params = None
    best_score = float('-inf')

    for _ in range(n_iterations):
        # Draw one random candidate inside the given ranges.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Maximising: keep a candidate only when it is strictly better, so
        # ties keep the earlier candidate.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [27]:
gt_val = df_Validation.to_dict(orient="records")

In [28]:
def minsearch_search(query, boost=None):
    """Run ``query`` against the module-level ``index``, requesting 10 results.

    ``boost`` is an optional dict of per-field boosts forwarded as
    ``boost_dict``; when it is ``None`` an empty dict is sent instead. No
    keyword filters are applied.
    """
    boost_dict = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [29]:
# Search space for the boost tuning: every text field gets a float weight
# range of (0.0, 3.0).
param_ranges = {
    field: (0.0, 3.0)
    for field in (
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    )
}

def objective(boost_params):
    """Score one set of field boosts on the validation records.

    Evaluates ``minsearch_search`` with ``boost_params`` over ``gt_val`` and
    returns the resulting hit rate.
    """
    metrics = evaluate(
        gt_val,
        lambda record: minsearch_search(record['question'], boost_params),
    )
    # Hit rate is the tuning target; return metrics['mrr'] instead to tune for MRR.
    return metrics['hit_rate']

In [30]:
simple_optimize(param_ranges, objective, n_iterations =10)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.748814222971305,
  'type_of_activity': 0.45441801967546935,
  'type_of_equipment': 1.0662915487547175,
  'body_part': 0.09483258455313393,
  'type': 2.404566536752043,
  'muscle_groups_activated': 0.5806360412150631,
  'instructions': 1.666090730933937},
 0.97)

 ({'exercise_name': 3,  
  'type_of_activity': 0,  
  'type_of_equipment': 0,  
  'body_part': 2,  
  'type': 1,  
  'muscle_groups_activated': 3,  
  'instructions': 0},  
 0.8690833333333333)  

In [34]:
def minsearch_improved(query):
    """Run ``query`` against ``index`` with fixed per-field boosts.

    The weights are truncated copies of the best parameters printed by the
    ``simple_optimize`` run in this notebook. No keyword filters are applied
    and 10 results are requested.
    """
    tuned_boost = dict(
        exercise_name=2.74,
        type_of_activity=0.45,
        type_of_equipment=1.06,
        body_part=0.094,
        type=2.40,
        muscle_groups_activated=0.58,
        instructions=1.66,
    )

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.9453846153846154, 'mrr': 0.8690955433455434}

In [30]:
{'hit_rate': 0.946923076923077, 'mrr': 0.868453296703297}

{'hit_rate': 0.946923076923077, 'mrr': 0.868453296703297}

### RAG Evaluation
#### LLM as a Judge

In [31]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [34]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [35]:
print(answer_llm)

The starting position for a push-up is to begin in a plank position.


In [36]:
prompt = prompt2_template.format(question = question, answer_llm= answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the starting position for a push-up?
Generated Answer: The starting position for a push-up is to begin in a plank position.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [37]:
llm(prompt)

'{\n  "Relevance": "RELEVANT",\n  "Explanation": "The generated answer accurately describes the starting position for a push-up as a plank position, which directly addresses the question."\n}'

In [50]:
df_sample = df_questions.sample(n=200, random_state =1)

In [51]:
sample = df_sample.to_dict(orient="records")

In [52]:
# Answer every sampled question with the default model and let the LLM judge
# rate each answer. Each entry is (record, answer, parsed judge verdict).
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm,
    )
    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        # The judge is only asked, not forced, to reply with JSON. Keep the
        # record under a sentinel label so one malformed reply neither aborts
        # the loop nor discards the API calls already made.
        evaluation = {"Relevance": "PARSE_ERROR", "Explanation": raw_evaluation}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [55]:
df_eval = pd.DataFrame(evaluations, columns = ['record', 'answer', 'evaluation'])

In [60]:
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

In [62]:
del df_eval['record']
del df_eval['evaluation']

In [66]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.845
PARTLY_RELEVANT    0.150
NON_RELEVANT       0.005
Name: proportion, dtype: float64

In [65]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
55,The Banded Lateral Walk is a push exercise type.,105,Is the Banded Lateral Walk a push or pull exer...,NON_RELEVANT,The generated answer incorrectly classifies th...


In [67]:
# Repeat the judged evaluation with gpt-4.1-mini on the same sample.
# NOTE(review): both the answering model and the judging model are set to
# gpt-4.1-mini here, so the relevance shares may not be directly comparable
# with a run that was judged by a different model - confirm this is intended.
evaluations_gpt_4_1 = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4.1-mini')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm,
    )
    raw_evaluation = llm(prompt, model='gpt-4.1-mini')
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        # The judge is only asked, not forced, to reply with JSON. Keep the
        # record under a sentinel label so one malformed reply neither aborts
        # the loop nor discards the API calls already made.
        evaluation = {"Relevance": "PARSE_ERROR", "Explanation": raw_evaluation}

    evaluations_gpt_4_1.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [68]:
df_eval_2 = pd.DataFrame(evaluations_gpt_4_1, columns = ['record', 'answer', 'evaluation'])

In [69]:
df_eval_2['id'] = df_eval_2.record.apply(lambda d: d['id'])
df_eval_2['question'] = df_eval_2.record.apply(lambda d: d['question'])

df_eval_2['relevance'] = df_eval_2.evaluation.apply(lambda d: d['Relevance'])
df_eval_2['explanation'] = df_eval_2.evaluation.apply(lambda d: d['Explanation'])

In [70]:
del df_eval_2['record']
del df_eval_2['evaluation']

In [71]:
df_eval_2.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.910
PARTLY_RELEVANT    0.085
NON_RELEVANT       0.005
Name: proportion, dtype: float64