In [29]:
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from groq import Groq
import openai
from tqdm.auto import tqdm

In [30]:
import minsearch

## Ingestion

In [31]:
df = pd.read_csv('../data/data.csv')

In [32]:
documents = df.to_dict(orient='records')

In [33]:
# Build the search index over the exercise records: the descriptive columns are
# registered as text fields and 'id' as a keyword field.
# NOTE(review): presumably keyword fields are matched exactly through filter_dict
# rather than scored as text — confirm against the minsearch documentation.
index = minsearch.Index(
    text_fields=['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part',
       'type', 'muscle_groups_activated', 'instructions'],
    keyword_fields=['id']
)

In [34]:
index.fit(documents)

<minsearch.Index at 0x779fb3438560>

## RAG Flow

In [35]:
load_dotenv()

True

In [36]:
client = OpenAI()

In [37]:
# client = Groq(
#     api_key=os.environ.get("GROQ_API_KEY"),
# )

In [38]:
def search(query):
    """Query the module-level index for ``query``.

    No filters and no field boosts are applied; the index is asked for
    10 results. Returns whatever ``index.search`` returns.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [39]:
# Outer prompt: instructs the model to answer using only the supplied context.
prompt_template = """
You're a fitness instructor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

# One rendered block per retrieved exercise record.
entry_template = """ 
exercise_name : {exercise_name}
type_of_activity : {type_of_activity}
type_of_equipment : {type_of_equipment}
body_part : {body_part}
type : {type}
muscle_groups_activated : {muscle_groups_activated}
instructions : {instructions}
""".strip()


def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved records.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        search_results: Iterable of dicts providing every field named in
            ``entry_template`` (extra keys are ignored by ``str.format``).

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    # Every entry is followed by a blank line; the final strip() removes the
    # separator left after the last entry.
    rendered_entries = [
        entry_template.format(**record) + "\n\n" for record in search_results
    ]
    context = "".join(rendered_entries)
    return prompt_template.format(question=query, context=context).strip()


In [40]:
query =  "Can you explain how to do a Glute Bridge, I'm not sure about the movement."

In [41]:
search_results = search(query)
prompt = build_prompt(query, search_results)

In [42]:
def llm(prompt):
    """Send ``prompt`` as a single user message to gpt-4o-mini via the
    module-level ``client`` and return the content of the first choice."""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [43]:
def rag(query):
    """Answer ``query`` end to end: retrieve records with ``search``, render
    them into a prompt with ``build_prompt``, and return the ``llm`` reply."""
    retrieved = search(query)
    rendered_prompt = build_prompt(query, retrieved)
    return llm(rendered_prompt)

In [44]:
answer = rag("What specific muscle groups are predominantly activated during the Cable Face Pull exercise?")
print(answer)

The specific muscle groups predominantly activated during the Cable Face Pull exercise are the Rear Delts, Traps, and Upper Back.


### Retrieval Evaluation

In [45]:
df_questions = pd.read_csv('../data/ground_truth_retrieval.csv')

In [46]:
df_questions

Unnamed: 0,id,question
0,0,What is the starting position for a push-up?
1,0,Which muscle groups are activated during push-...
2,0,What type of exercise is a push-up classified as?
3,0,Do I need any equipment to perform push-ups?
4,0,What is the correct form for lowering my body ...
...,...,...
1295,259,What is the name of the exercise that targets ...
1296,259,What type of equipment do I need for the Supin...
1297,259,How do I properly perform the Supine Hamstring...
1298,259,What body part is primarily involved in the Su...


In [47]:
ground_truth = df_questions.to_dict(orient="records")

In [48]:
ground_truth[0]

{'id': 0, 'question': 'What is the starting position for a push-up?'}

In [49]:
def hit_rate(relevance_input):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_input: Sequence with one entry per query; each entry is a
            sequence of boolean relevance flags in rank order.

    Returns:
        A float in [0, 1]. Returns 0.0 when ``relevance_input`` is empty
        instead of raising ZeroDivisionError.
    """
    if len(relevance_input) == 0:
        return 0.0

    queries_with_hit = sum(1 for line in relevance_input if any(line))
    return queries_with_hit / len(relevance_input)

In [50]:
def mrr(relavance_input):
    """Mean reciprocal rank of the first relevant hit per query.

    Args:
        relavance_input: Sequence with one entry per query; each entry is a
            sequence of boolean relevance flags in rank order. (The parameter
            name keeps its original spelling so keyword callers still work.)

    Returns:
        A float in [0, 1]: the mean over queries of 1 / rank of the first
        relevant result, counting 0 for queries with no relevant result.
        Returns 0.0 when ``relavance_input`` is empty.
    """
    if len(relavance_input) == 0:
        return 0.0

    total_score = 0.0
    for line in relavance_input:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several matches contributes more than 1.
                break

    return total_score / len(relavance_input)

In [51]:
def minsearch_search(query, boost=None):
    """Query the module-level index for ``query``.

    Args:
        query: Free-text query string.
        boost: Optional dict passed to the index as ``boost_dict``. Defaults
            to no boosts, which matches the original one-argument behaviour.

    Returns:
        Whatever ``index.search`` returns when asked for 10 results with no
        filters.
    """
    # Build the default per call rather than in the signature so no dict is
    # shared between calls.
    if boost is None:
        boost = {}

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

In [52]:
def evaluate(ground_truth, search_function):
    """Score a retrieval function against ground-truth records.

    Args:
        ground_truth: Iterable of records, each with an 'id' key holding the
            expected document id. The whole record is passed to
            ``search_function``.
        search_function: Callable taking one ground-truth record and
            returning an iterable of result dicts that each have an 'id' key.

    Returns:
        Dict with 'hit_rate' and 'mrr' computed over all records.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        hits = search_function(record)
        # One boolean per returned document, in rank order.
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [44]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1300 [00:00<?, ?it/s]

{'hit_rate': 0.9346153846153846, 'mrr': 0.7993296703296706}

#### Finding the best parameters

In [53]:
# Validation subset for boost tuning: the first 100 rows of df_questions in file order.
# NOTE(review): the df_questions preview shows consecutive rows sharing the same id,
# so this slice likely covers only the first few documents — confirm it is
# representative, or draw a random sample instead.
df_Validation = df_questions[:100]

In [54]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [84]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: Dict mapping parameter name to a ``(min_val, max_val)``
            pair. When both bounds are ints the value is drawn with
            ``randint`` (inclusive); otherwise with ``uniform``.
        objective_function: Callable taking a dict of sampled parameters and
            returning a score, where higher is better.
        n_iterations: Number of random candidates to evaluate.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the search is reproducible. When
            omitted, the module-level ``random`` generator is used, exactly
            as before.

    Returns:
        ``(best_params, best_score)``. ``best_params`` is None and
        ``best_score`` is ``-inf`` if no candidate scored above ``-inf``
        (for example when ``n_iterations`` is 0).
    """
    # A private generator keeps seeded runs reproducible without disturbing
    # the global random state used elsewhere.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')

    for _ in range(n_iterations):
        # Draw one random candidate.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Maximizing: keep the candidate only when it strictly beats the best so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [85]:
gt_val = df_Validation.to_dict(orient="records")

In [86]:
def minsearch_search(query, boost=None):
    """Query the module-level index for ``query`` with optional field boosts.

    ``boost`` is forwarded as ``boost_dict``; when it is None an empty dict
    is used. No filters are applied and 10 results are requested.
    """
    field_boosts = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        num_results=10,
    )

In [87]:
# Search space for boost tuning: every text field gets the same float range,
# so simple_optimize samples each weight with random.uniform.
_boost_fields = [
    'exercise_name',
    'type_of_activity',
    'type_of_equipment',
    'body_part',
    'type',
    'muscle_groups_activated',
    'instructions',
]
param_ranges = {field: (0.0, 3.0) for field in _boost_fields}

def objective(boost_params):
    """Score one set of boost weights: run ``evaluate`` over ``gt_val`` using
    ``minsearch_search`` with ``boost_params`` and return the resulting MRR."""
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return metrics['mrr']

In [88]:
simple_optimize(param_ranges, objective, n_iterations =20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.772783357349355,
  'type_of_activity': 2.492075223531411,
  'type_of_equipment': 1.4130731740223967,
  'body_part': 0.6746770192988303,
  'type': 2.4213127428405747,
  'muscle_groups_activated': 1.0617426409802255,
  'instructions': 1.1005264566993094},
 0.8947619047619049)

 ({'exercise_name': 3,  
  'type_of_activity': 0,  
  'type_of_equipment': 0,  
  'body_part': 2,  
  'type': 1,  
  'muscle_groups_activated': 3,  
  'instructions': 0},  
 0.8690833333333333)  

In [None]:
def minsearch_improved(query, boost=None):
    """Query the module-level index using the tuned field boosts.

    Args:
        query: Free-text query string.
        boost: Optional dict of per-field boosts forwarded as ``boost_dict``.
            When omitted, the rounded weights from the random-search run
            above are used. Previously this argument was required but then
            overwritten, so the one-argument call below raised TypeError.

    Returns:
        Whatever ``index.search`` returns when asked for 10 results with no
        filters.
    """
    if boost is None:
        # Rounded from the best parameters reported by simple_optimize.
        boost = {
            'exercise_name': 2.7,
            'type_of_activity': 2.4,
            'type_of_equipment': 1.4,
            'body_part': 0.67,
            'type': 2.42,
            'muscle_groups_activated': 1.06,
            'instructions': 1.1
        }

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))