In [138]:
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from openai import OpenAI
import polars as pl
import json
import sys
import os

In [106]:
load_dotenv()
os.environ["OPENAI_MODEL_NAME"] = "llama-3.1-8b-instant" # To be use as query rewrite

In [107]:
sys.path.append(os.path.abspath('../core'))

In [108]:
from retrival import VectorSearcher, HybridSearcher
from rag import query_rewrite 

In [116]:
OPENAI_API_URL = os.environ["OPENAI_API_URL"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
j_client = OpenAI(base_url=f"{OPENAI_API_URL}/v1", api_key=OPENAI_API_KEY)

In [157]:
def judge(question, answer, mode="retrival"):
    if mode=="retrival":
        user_msg = f"""
You are an expert evaluator for a Vector Database retrival that answer a recipe based on the user question.
Your task is to analyze the relevance of the retrived answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Definitions:
- NON_RELEVANT: The answer does not relate to the user’s question.
- PARTLY_RELEVANT: The answer addresses some aspects but omits or misinterprets key parts of the question.
- RELEVANT: The answer fully addresses the question with correct, useful information.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}

Here is the data for evaluation:
QUESTION:
{question}

RETRIVED ANSWER:
{answer}
""".strip()

    elif mode=="rag":
        user_msg = f"""
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system that answer creative and detailed cooking suggestions for a single recipe idea, and instructions.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Definitions:
- NON_RELEVANT: The answer does not relate to the user’s question.
- PARTLY_RELEVANT: The answer addresses some aspects but omits or misinterprets key parts of the question.
- RELEVANT: The answer fully addresses the question with correct, useful information.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}

Here is the data for evaluation:
QUESTION:
{question}

GENERATED ANSWER:
{answer}
""".strip()
    else:
        raise Exception("Wrong mode")
    
    verdict = j_client.chat.completions.create(
        model="llama-3.1-70b-versatile",
        messages=[{"role": "user", "content": user_msg}],
    )

    content = verdict.choices[0].message.content
    try:
        return json.loads(content)
    except:
        raise Exception(content)


## Get questions

In [109]:
question_list = pl.read_csv("./dataset/synthetic-questions.csv").to_dicts()

In [110]:
question_list[:3]

[{'question_id': 1, 'question': 'How do I make a lemon herb baked salmon?'},
 {'question_id': 2, 'question': 'How do I make homemade hummus?'},
 {'question_id': 3, 'question': 'What can I make with polenta and mushrooms?'}]

# Retrival eval

## Generate answers

In [115]:
def generate_answers(client, question_list, q_rewrite_func=False, **kwargs):
    answers = []
    no_answers = []

    for question in tqdm(question_list):
        qid, q = list(question.values())[:2]
        final_q = q
        
        if callable(q_rewrite_func):
            q_decision = q_rewrite_func(q)
            if q_decision["search"] == "yes":
                final_q = q_decision["query"]
            elif q_decision["search"] == "no":
                no_answers.append({'question_id': qid, 'question': q, "rank": 99,"answer": "NO QUERY"})
                continue
        
        results = client.search(final_q)
        results = list(enumerate(results, start=1))
        
        if not results:
            no_answers.append({'question_id': qid, 'question': q, "rank": 98,"answer": "NO ANSWER"})
            continue
        
        for r, a in results:
            answers.append({'question_id': qid, 'question': q, "rank": r,"answer": a})

    # When using query rewrite the LLM can sometime at first reply as seach as no even it should be yes
    if "attempt" in kwargs and "max_attempts" in kwargs and no_answers:
        attempt = kwargs["attempt"]
        max_attempts = kwargs["max_attempts"]
        if not attempt > max_attempts:
            print(f"No answers: {len(no_answers)} | Attempt num: {attempt}")
            attempt_answers, final_no_answers = generate_answers(client, no_answers, q_rewrite_func=False, attempt=attempt+1, max_attempts=max_attempts)
            answers += attempt_answers
            no_answers = final_no_answers
    
    return answers, no_answers

### VectorSearcher

In [70]:
vector_client = VectorSearcher()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [71]:
vector_answers, vector_no_answers = generate_answers(vector_client, question_list)

print(len(vector_no_answers))

  0%|          | 0/450 [00:00<?, ?it/s]

0


In [72]:
pl.from_dicts(vector_answers).write_csv("./dataset/vector-answers.csv")

### HybridSearcher

In [73]:
hybrid_client = HybridSearcher()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [75]:
hybrid_answers, hybrid_no_answers = generate_answers(hybrid_client, question_list)

print(len(hybrid_no_answers))

  0%|          | 0/450 [00:00<?, ?it/s]

0


In [77]:
pl.from_dicts(hybrid_answers).write_csv("./dataset/hybrid-answers.csv")

### Query rewrite + HybridSearcher

In [84]:
print(question_list[0]['question'])
print(query_rewrite(question_list[0]['question']))

How do I make a lemon herb baked salmon?
{'search': 'yes', 'query': 'Ingredients: salmon\nlemon\nherb\nDirections: bake'}


In [103]:
qrewrite_answers, qrewrite_no_answers = generate_answers(vector_client, question_list, query_rewrite, attempt=1, max_attempts=5)

  0%|          | 0/450 [00:00<?, ?it/s]

No answers: 52 | Attempt num: 1


  0%|          | 0/52 [00:00<?, ?it/s]

In [121]:
print(qrewrite_no_answers)

[]


In [114]:
pl.from_dicts(qrewrite_answers).write_csv("./dataset/qrewrite-answers.csv")

## Judge

In [158]:
hybrid_answers[0]['question']

'How do I make a lemon herb baked salmon?'

In [159]:
print(hybrid_answers[0]['answer'])

Title: Easy Herbed Grilled Salmon
Ingredients:
1/2 lb. salmon filet
1 Tbsp. butter or margarine
1/2 lemon
2 Tbsp. white wine
1/2 tsp. salt (optional)
1/2 tsp. onion powder
1/2 tsp. garlic powder
1/2 tsp. lemon pepper
1 tsp. oregano
1/2 tsp. dill weed
1/2 tsp. parsley flakes
1/4 tsp. paprika
Directions:
Preheat grill. Make a tray out of heavy-duty foil by folding a long piece in half and folding up all 4 sides with the dull side up. Spray the bottom of the foil tray with cooking spray. Place fish filet in the tray, skin side down. Smear a thin line of butter on filet. Squeeze lemon juice liberally over filet and then a splash of white wine. Sprinkle remaining seasonings lightly over filet and transfer the foil tray to the hot grill. Cover. Cook for 10 minutes per inch of thickness of filet. DO NOT overcook or it will be dry and unpalatable. Turning is not necessary. Salmon is done when it turns a light pink color throughout.


In [160]:
judge(hybrid_answers[0]['question'], hybrid_answers[0]['answer'], mode="retrival")

{'Relevance': 'PARTLY_RELEVANT',
 'Explanation': "The retrieved answer is related to cooking salmon with herbs, but it focuses on grilling instead of baking and doesn't directly address lemon herb baked salmon. Some of the ingredients, such as lemon and herbs, match the question, but the cooking method differs."}

In [168]:
def generate_veredicts(answers, mode="retrival"):
    
    for ans in tqdm(answers):
        q = ans['question']
        a = ans['answer']
        v = judge(q, a, mode)
        ans["relevance"] = v["Relevance"] 
        ans["explanation"] = v["Explanation"]

    return answers

### VectorSearcher

In [169]:
vector_veredict = generate_veredicts(vector_answers)

  0%|          | 0/2250 [00:00<?, ?it/s]

Exception: "Relevance": "RELEVANT", 
"Explanation": "The retrieved answer directly addresses the user's request for a vegetarian lasagna recipe by providing a title, ingredients, and detailed directions for preparation. The answer fully matches the question, providing a complete and useful response."

In [None]:
pl.from_dicts(vector_veredict).write_csv("./dataset/vector-veredict.csv")

### HybridSearcher

In [None]:
hybrid_veredict = generate_veredicts(hybrid_answers)

In [None]:
pl.from_dicts(hybrid_veredict).write_csv("./dataset/hybrid-veredict.csv")

### Query rewrite + HybridSearcher

In [None]:
qrewrite_veredict = generate_veredicts(qrewrite_answers)

In [None]:
pl.from_dicts(qrewrite_veredict).write_csv("./dataset/qrewrite-veredict.csv")