In [None]:
from concurrent.futures import ThreadPoolExecutor
from json.decoder import JSONDecodeError
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from openai import OpenAI
import nest_asyncio
import polars as pl
import asyncio
import shelve
import json
import sys
import os

In [None]:
nest_asyncio.apply()

In [None]:
load_dotenv()

In [None]:
sys.path.append(os.path.abspath('../core'))

In [None]:
from retrival import VectorSearcher, HybridSearcher
from generation import LLM
from rag import ChatGourmet

## Get questions

In [None]:
question_list = pl.read_csv("./dataset/synthetic-questions.csv").to_dicts()

In [None]:
question_list[:3]

# Generate answers

## Retrieval

In [None]:
def generate_answers(client, question_list, q_rewrite_func=False, attempt=1, max_attempts=5):
    """Run every question through ``client.search`` and collect the ranked results.

    Args:
        client: Object exposing ``search(query)`` that returns an iterable of answers.
        question_list: List of dicts whose first two values are the question id
            and the question text (in that order).
        q_rewrite_func: Optional callable ``f(question) -> dict``. The dict is read
            through its ``"search"`` key (``"yes"``/``"no"``) and, when ``"yes"``,
            its ``"query"`` key, which replaces the question as the search query.
            Any non-callable value (the default ``False``) disables rewriting.
        attempt: Number of the current pass (1-based); used by the retry recursion.
        max_attempts: Retries are started only while ``attempt <= max_attempts``.

    Returns:
        Tuple ``(answers, no_answers)``. Both are lists of dicts with the keys
        ``question_id``, ``question``, ``rank`` and ``answer``. Unanswered entries
        use rank 99 / ``"NO QUERY"`` when the rewrite declined to search and
        rank 98 / ``"NO ANSWER"`` when the search returned nothing.
    """
    # Note: the rewrite step itself does not double-check its decision, because in the
    # chat flow not every user message needs a search (some are plain comments).
    # The retry below compensates for that during evaluation.
    answers = []
    no_answers = []

    for question in tqdm(question_list):
        # Positional unpacking: also works for the `no_answers` dicts fed back on retry,
        # whose first two keys are question_id and question.
        qid, q = list(question.values())[:2]
        final_q = q

        if callable(q_rewrite_func):
            q_decision = q_rewrite_func(q)
            if q_decision["search"] == "yes":
                final_q = q_decision["query"]
            elif q_decision["search"] == "no":
                no_answers.append({'question_id': qid, 'question': q, "rank": 99,"answer": "NO QUERY"})
                continue

        results = list(enumerate(client.search(final_q), start=1))

        if not results:
            no_answers.append({'question_id': qid, 'question': q, "rank": 98,"answer": "NO ANSWER"})
            continue

        for r, a in results:
            answers.append({'question_id': qid, 'question': q, "rank": r,"answer": a})

    # With query rewrite the LLM can answer "no" to a search on the first try even when
    # it should be "yes", so the unanswered questions are retried. Skip the retry when
    # nothing is left, otherwise the recursion would run empty passes up to max_attempts.
    if callable(q_rewrite_func) and no_answers and attempt <= max_attempts:
        print(f"No answers: {len(no_answers)} | Attempt num: {attempt}")
        attempt_answers, final_no_answers = generate_answers(client, no_answers, q_rewrite_func, attempt=attempt+1, max_attempts=max_attempts)
        answers += attempt_answers
        no_answers = final_no_answers

    return answers, no_answers

### VectorSearcher

In [None]:
vector_client = VectorSearcher()

In [None]:
vector_answers, vector_no_answers = generate_answers(vector_client, question_list)

print(len(vector_no_answers))

In [None]:
pl.from_dicts(vector_answers).write_csv("./dataset/vector-answers.csv")

### HybridSearcher

In [None]:
hybrid_client = HybridSearcher()

In [None]:
hybrid_answers, hybrid_no_answers = generate_answers(hybrid_client, question_list)

print(len(hybrid_no_answers))

In [None]:
pl.from_dicts(hybrid_answers).write_csv("./dataset/hybrid-answers.csv")

### Query rewrite + HybridSearcher

# NEED TO RERUN FOR NEW CHANGES

In [None]:
cg = ChatGourmet()

In [None]:
def query_rewrite(q):
    """Synchronous wrapper that runs ``cg._query_rewrite(q)`` to completion and returns its result."""
    rewrite_coro = cg._query_rewrite(q)
    return asyncio.run(rewrite_coro)

In [None]:
print(question_list[0]['question'])
print(query_rewrite(question_list[0]['question']))

In [None]:
# This section is titled "Query rewrite + HybridSearcher", so search with the hybrid client.
qrewrite_answers, qrewrite_no_answers = generate_answers(hybrid_client, question_list, query_rewrite, attempt=1, max_attempts=5)

In [None]:
print(qrewrite_no_answers)

In [None]:
pl.from_dicts(qrewrite_answers).write_csv("./dataset/qrewrite-answers.csv")

## RAG

In [None]:
class Rag:
    """Thin chat client for the OpenAI-compatible endpoint at ``http://localhost:8000/v1``."""

    def __init__(self):
        self._client = OpenAI(base_url="http://localhost:8000/v1", api_key=None)

    def chat(self, question):
        """Send ``question['question']`` as a single user message and attach the reply.

        Args:
            question: Dict with at least a ``'question'`` key. It is updated in place
                with an ``'answer'`` key on success.

        Returns:
            The same ``question`` dict (now containing ``'answer'``), or ``None`` when
            the request or the response handling fails, so callers can retry later.
        """
        try:
            q = question['question']
            response = self._client.chat.completions.create(
                # NOTE(review): no model is named here; per the notebook text the model
                # is whatever the local server is running — confirm the server accepts this.
                model=None,
                messages=[
                    {"role": "user", "content": q}
                ]
            )
            question['answer'] = response.choices[0].message.content
            return question
        except Exception:
            # Best effort: failures are reported as None. Catching Exception (not a bare
            # except) lets KeyboardInterrupt / SystemExit still stop the notebook.
            return None

rag = Rag()

In [None]:
rag.chat({"question":"How do I make a lemon herb baked salmon?"})

In [None]:
def generate_rag_answers(question_list, db_name, num_threads=4, batch_size=50, attempt=1, max_attempts=3):
    """Answer questions with ``rag.chat`` in a thread pool, checkpointing progress in a shelve DB.

    Progress lives in ``./shelve/<db_name>`` under two keys: ``'processed'`` (answered
    question dicts) and ``'remaining'`` (questions still to answer). If the DB already
    holds a ``'remaining'`` key, work resumes from the stored state and ``question_list``
    is ignored.

    Args:
        question_list: List of question dicts; each needs a ``'question_id'`` key and
            whatever ``rag.chat`` reads.
        db_name: File name of the shelve DB inside ``./shelve``.
        num_threads: Number of worker threads.
        batch_size: Write a checkpoint every ``batch_size`` successful answers
            (0 disables the intermediate checkpoints).
        attempt: Number of the current pass (1-based); used by the retry recursion.
        max_attempts: A retry pass is started only while ``attempt <= max_attempts``.

    Returns:
        None. Results are read back from the shelve DB.
    """
    db_path = f'./shelve/{db_name}'
    # shelve does not create missing parent directories.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    with shelve.open(db_path) as db:
        if 'remaining' not in db:
            db['processed'] = []
            db['remaining'] = question_list

        processed_question = db['processed']
        remaining_question = db['remaining']

    # Keyed by id so answered questions can be dropped as results arrive.
    remaining_question_dict = {str(ans['question_id']): ans for ans in remaining_question}

    def _checkpoint():
        # Persist the current progress; `remaining` is rebuilt from the pruned dict.
        with shelve.open(db_path) as db:
            db['processed'] = processed_question
            db['remaining'] = list(remaining_question_dict.values())

    counter = 0
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(rag.chat, question) for question in remaining_question]

        for future in tqdm(futures):
            result = future.result()
            if result:
                processed_question.append(result)
                remaining_question_dict.pop(str(result['question_id']), None)

                counter += 1
                if batch_size and counter % batch_size == 0:
                    _checkpoint()

    _checkpoint()
    remaining_question = list(remaining_question_dict.values())

    if remaining_question and attempt <= max_attempts:
        print(f"Remaining questions: {len(remaining_question)} | Attempt num: {attempt}")
        # Forward every tuning parameter so retries honour the caller's settings.
        generate_rag_answers(
            remaining_question,
            db_name,
            num_threads=num_threads,
            batch_size=batch_size,
            attempt=attempt+1,
            max_attempts=max_attempts,
        )

### RAG - meta-llama/Meta-Llama-3.1-8B-Instruct

The model is selected based on what is running on `vllm-serve` in Kaggle

In [None]:
generate_rag_answers(question_list, "Llama-3.1-8B-answer")

In [None]:
with shelve.open('./shelve/Llama-3.1-8B-answer') as db:
    llama_31_8b_answers = db['processed']
    llama_31_8b_no_answers  = db['remaining']

In [None]:
pl.from_dicts(llama_31_8b_answers).write_csv("./dataset/Llama-3.1-8B-answer.csv")

### RAG - microsoft/Phi-3.5-mini-instruct

The model is selected based on what is running on `vllm-serve` in Kaggle

In [None]:
generate_rag_answers(question_list, "Phi-3.5-mini-answer", batch_size=5)

In [None]:
with shelve.open('./shelve/Phi-3.5-mini-answer') as db:
    phi_35_mini_answers = db['processed']
    phi_35_mini_no_answers  = db['remaining']

In [None]:
pl.from_dicts(phi_35_mini_answers).write_csv("./dataset/Phi-3.5-mini-answer.csv")

### RAG - neuralmagic/Phi-3-medium-128k-instruct-quantized.w8a8

The model is selected based on what is running on `vllm-serve` in Kaggle

# Judge

# JUDGE NEEDS A REFACTOR TO USE ASYNC

In [None]:
your_honor = LLM()

In [None]:
async def judge(question, answer, mode="retrieval"):
    """Ask the judge LLM (``your_honor``) to rate how relevant ``answer`` is to ``question``.

    Args:
        question: The user question, interpolated into the prompt.
        answer: The retrieved or generated answer, interpolated into the prompt.
        mode: ``"retrieval"`` or ``"rag"``; selects the system prompt.

    Returns:
        The judge's reply parsed with ``json.loads``. The prompt asks for an object
        with the keys ``"Relevance"`` and ``"Explanation"``.

    Raises:
        ValueError: If ``mode`` is neither ``"retrieval"`` nor ``"rag"``.
        JSONDecodeError: If the reply cannot be parsed, neither as is nor wrapped in braces.
    """
    if mode=="retrieval":
        system_msg = """
You are an expert evaluator for a Vector Database retrieval that answer a recipe based on the user question.
Your task is to analyze the relevance of the retrieved answer to the given question.
Based on the relevance of the retrieved answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Definitions:
- NON_RELEVANT: The answer does not relate to the user’s question.
- PARTLY_RELEVANT: The answer addresses some aspects but omits or misinterprets key parts of the question.
- RELEVANT: The answer fully addresses the question with correct, useful information.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}
""".strip()

    elif mode=="rag":
        system_msg = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system that answer creative and detailed cooking suggestions for a single recipe idea, and instructions.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Definitions:
- NON_RELEVANT: The answer does not relate to the user’s question.
- PARTLY_RELEVANT: The answer addresses some aspects but omits or misinterprets key parts of the question.
- RELEVANT: The answer fully addresses the question with correct, useful information.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}
""".strip()
    else:
        # ValueError is still an Exception, so existing `except Exception` callers keep working.
        raise ValueError("Wrong mode")

    user_msg = f"""
Here is the data for evaluation:

QUESTION:
{question}

ANSWER:
{answer}
""".strip()

    verdict = await your_honor.chat(
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}
        ],
    )

    # Parse the reply as is; if that fails, retry once with the reply wrapped in braces
    # (fallback for replies that come back without the outer `{ }`).
    last_error = None
    for candidate in (verdict, f"{{{verdict}}}"):
        try:
            return json.loads(candidate)
        except JSONDecodeError as err:
            last_error = err
    raise JSONDecodeError("Failed to decode JSON after retry", verdict, 0) from last_error


In [None]:
def process_single_verdict(ans, mode="retrieval"):
    """Judge one answer dict synchronously and attach the verdict to it.

    Args:
        ans: Dict with ``'question'`` and ``'answer'`` keys. On success it is updated
            in place with ``'relevance'`` and ``'explanation'``.
        mode: Passed through to ``judge``.

    Returns:
        The same ``ans`` dict on success, or ``None`` if judging failed or the verdict
        lacked an expected key; ``ans`` is left untouched in that case.
    """
    try:
        v = asyncio.run(judge(ans['question'], ans['answer'], mode))
        # Read both fields before writing either, so a partial verdict never
        # leaves `ans` half-updated.
        relevance, explanation = v["Relevance"], v["Explanation"]
    except Exception:
        # Best effort: failures are reported as None so the caller can retry. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return None

    ans["relevance"] = relevance
    ans["explanation"] = explanation
    return ans

In [None]:
mock_question = {
    "question": "How do I make a lemon herb baked salmon?",
    "answer": "Title: Easy Herbed Grilled Salmon\nIngredients:\n1/2 lb. salmon filet\n1 Tbsp. butter or margarine\n1/2 lemon\n2 Tbsp. white wine\n1/2 tsp. salt (optional)\n1/2 tsp. onion powder\n1/2 tsp. garlic powder\n1/2 tsp. lemon pepper\n1 tsp. oregano\n1/2 tsp. dill weed\n1/2 tsp. parsley flakes\n1/4 tsp. paprika\nDirections:\nPreheat grill. Make a tray out of heavy-duty foil by folding a long piece in half and folding up all 4 sides with the dull side up. Spray the bottom of the foil tray with cooking spray. Place fish filet in the tray, skin side down. Smear a thin line of butter on filet. Squeeze lemon juice liberally over filet and then a splash of white wine. Sprinkle remaining seasonings lightly over filet and transfer the foil tray to the hot grill. Cover. Cook for 10 minutes per inch of thickness of filet. DO NOT overcook or it will be dry and unpalatable. Turning is not necessary. Salmon is done when it turns a light pink color throughout."
}

In [None]:
process_single_verdict(mock_question)

In [None]:
def generate_veredicts(answers, db_name, mode="retrieval", num_threads=4, batch_size=250, attempt=1, max_attempts=3):
    """Judge answers with ``process_single_verdict`` in a thread pool, checkpointing in a shelve DB.

    Progress lives in ``./shelve/<db_name>`` under two keys: ``'processed'`` (judged
    answer dicts) and ``'remaining'`` (answers still to judge). If the DB already holds
    a ``'remaining'`` key, work resumes from the stored state and ``answers`` is ignored.

    Args:
        answers: List of answer dicts. Each needs ``'question_id'`` (plus ``'rank'`` in
            retrieval mode) and whatever ``process_single_verdict`` reads.
        db_name: File name of the shelve DB inside ``./shelve``.
        mode: ``"retrieval"`` identifies an answer by ``question_id`` and ``rank``; any
            other value identifies it by ``question_id`` only. Also forwarded to
            ``process_single_verdict``.
        num_threads: Number of worker threads.
        batch_size: Write a checkpoint every ``batch_size`` successful verdicts
            (0 disables the intermediate checkpoints).
        attempt: Number of the current pass (1-based); used by the retry recursion.
        max_attempts: A retry pass is started only while ``attempt <= max_attempts``.

    Returns:
        None. Results are read back from the shelve DB.
    """
    db_path = f'./shelve/{db_name}'
    # shelve does not create missing parent directories.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    def _key(ans):
        # Retrieval yields several ranked answers per question, so rank is part of the key.
        if mode=="retrieval":
            return f"{ans['question_id']}_{ans['rank']}"
        return str(ans['question_id'])

    with shelve.open(db_path) as db:
        if 'remaining' not in db:
            db['processed'] = []
            db['remaining'] = answers

        processed_answers = db['processed']
        remaining_answers = db['remaining']

    remaining_answers_dict = {_key(ans): ans for ans in remaining_answers}

    def _checkpoint():
        # Persist the current progress; `remaining` is rebuilt from the pruned dict.
        with shelve.open(db_path) as db:
            db['processed'] = processed_answers
            db['remaining'] = list(remaining_answers_dict.values())

    counter = 0
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_single_verdict, ans, mode) for ans in remaining_answers]

        for future in tqdm(futures):
            result = future.result()
            if result:
                processed_answers.append(result)
                remaining_answers_dict.pop(_key(result), None)

                counter += 1
                if batch_size and counter % batch_size == 0:
                    _checkpoint()

    _checkpoint()
    remaining_answers = list(remaining_answers_dict.values())

    if remaining_answers and attempt <= max_attempts:
        print(f"Remaining answers: {len(remaining_answers)} | Attempt num: {attempt}")
        # Forward every tuning parameter so retries honour the caller's settings.
        generate_veredicts(
            remaining_answers,
            db_name,
            mode,
            num_threads=num_threads,
            batch_size=batch_size,
            attempt=attempt+1,
            max_attempts=max_attempts,
        )


## Retrieval

#### Get answers

In [None]:
vector_answers = pl.read_csv("./dataset/vector-answers.csv").to_dicts()
hybrid_answers = pl.read_csv("./dataset/hybrid-answers.csv").to_dicts()
qrewrite_answers = pl.read_csv("./dataset/qrewrite-answers.csv").to_dicts()

### VectorSearcher

In [None]:
generate_veredicts(vector_answers, 'vector_answers') #1:50:23 running in serie

In [None]:
with shelve.open('./shelve/vector_answers') as db:
    vector_veredict = db['processed']
    vector_no_veredict  = db['remaining']

In [None]:
pl.from_dicts(vector_veredict).write_csv("./dataset/vector-veredict.csv")

### HybridSearcher

In [None]:
generate_veredicts(hybrid_answers, 'hybrid_answers') # 34:58 running in parallel

In [None]:
with shelve.open('./shelve/hybrid_answers') as db:
    hybrid_veredict = db['processed']
    hybrid_no_veredict  = db['remaining']

In [None]:
pl.from_dicts(hybrid_veredict).write_csv("./dataset/hybrid-veredict.csv")

### Query rewrite + HybridSearcher

In [None]:
generate_veredicts(qrewrite_answers, 'qrewrite_answers')

In [None]:
with shelve.open('./shelve/qrewrite_answers') as db:
    qrewrite_veredict = db['processed']
    qrewrite_no_veredict  = db['remaining']

In [None]:
pl.from_dicts(qrewrite_veredict).write_csv("./dataset/qrewrite-veredict.csv")