In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/clean_data.csv')

In [3]:
documents = df.to_dict(orient='records')

## Minsearch indexing

In [4]:
import minsearch

In [5]:
# Text index over the recipe 'Title' and 'Instructions' columns; 'Id' is
# registered as a keyword field.
# NOTE(review): `type=` does not appear to be a parameter of the stock
# minsearch.Index - presumably a project-local minsearch module; confirm.
minsearch_text_index = minsearch.Index(
    text_fields=['Title', 'Instructions'], # 'Cleaned_Ingredients', 'Image_Name'],
    keyword_fields=['Id'],
    type='text'
)

In [6]:
minsearch_text_index.fit(documents)

<minsearch.Index at 0x15789be50>

In [7]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model. The same `model` object is reused further down for
# the hybrid index, the Elasticsearch document vectors and the query vectors.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Vector variant of the index: same fields as the text index, plus the
# embedding model.
# NOTE(review): `model=` / `type=` look like project-local extensions of
# minsearch.Index - confirm against the minsearch module in use.
minsearch_vector_index = minsearch.Index(
    text_fields=['Title', 'Instructions'], # 'Cleaned_Ingredients', 'Image_Name'],
    keyword_fields=['Id'],
    model=model,
    type='vector'
)
minsearch_vector_index.fit(documents)

  0%|          | 0/13501 [00:00<?, ?it/s]

<minsearch.Index at 0x157f00340>

In [19]:
# Hybrid variant of the index (type='hybrid'), built with the same fields and
# the same embedding model as the vector index. This is the index that the
# parameter search and the final RAG flow query.
minsearch_hybrid_index = minsearch.Index(
    text_fields=['Title', 'Instructions'], # 'Cleaned_Ingredients', 'Image_Name'],
    keyword_fields=['Id'],
    model=model,
    type='hybrid'
)
minsearch_hybrid_index.fit(documents)

  0%|          | 0/13501 [00:00<?, ?it/s]

<minsearch.Index at 0x175c68ee0>

In [18]:
# Scratch check of the encoder output: X_en keeps whatever model.encode
# returns, X_en1 is an encode result converted with .tolist().
X_en = model.encode('what do you cook?')
X_en1 = model.encode('How do you cook?').tolist()

In [20]:
# Reshape the encoded query into a single row and display its shape
# (the recorded output is (1, 384)).
X_en = X_en.reshape(1, -1)
X_en.shape

(1, 384)

In [35]:
# Scratch: collect the encoded query in a list; appending the second
# encoding is left commented out.
results = []

results.append(X_en)
#results.append(X_en1)

In [36]:
import numpy as np
# Convert the collected list into an ndarray so its shape can be inspected.
results = np.array(results)


In [37]:
results.shape

(1, 384)

## Search with Elasticsearch

In [7]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

INDEX_NAME = "recipe-questions"

# Index mapping:
#   * the four recipe columns are mapped as "text",
#   * "Id" is mapped as "keyword",
#   * the two embedding fields are 384-dimensional dense vectors indexed with
#     cosine similarity (384 matches the encoder output shape shown earlier).
# Other vector fields that were tried before (title_vector, instr_vector,
# ingr_vector, instr_ingr_vector, title_ingr_vector) are no longer mapped.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{
                text_field: {"type": "text"}
                for text_field in ("Title", "Instructions", "Cleaned_Ingredients", "Image_Name")
            },
            "Id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in ("title_instr_vector", "title_instr_ingr_vector")
            },
        }
    },
}

# Recreate the index from scratch so the cell can be re-run.
es_client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)
es_client.indices.create(index=INDEX_NAME, body=index_settings)
print(f"Elasticsearch index '{INDEX_NAME}' created")

Elasticsearch index 'recipe-questions' created


In [8]:
def index_document(es_client, doc, model, index_name=None):
    """Embed one recipe and index it in Elasticsearch.

    Two embeddings are computed with ``model.encode`` and stored as plain
    lists: ``title_instr_vector`` (title + instructions) and
    ``title_instr_ingr_vector`` (title + instructions + ingredients).

    The vectors are written to a shallow copy of ``doc``; the caller's dict is
    left untouched. The same dict objects are held by the minsearch indexes
    built from ``documents``, so mutating them in place would attach the
    vectors to every minsearch result as well.

    Args:
        es_client: client exposing ``index(index=..., document=...)``.
        doc: recipe record with ``Title``, ``Instructions`` and
            ``Cleaned_Ingredients`` string values.
        model: encoder exposing ``encode(text)`` whose result has ``tolist()``.
        index_name: target index; defaults to the module-level ``INDEX_NAME``.
    """
    title = doc["Title"]
    instructions = doc["Instructions"]
    ingredients = doc["Cleaned_Ingredients"]

    title_and_instructions = title + " " + instructions

    es_doc = dict(doc)
    es_doc["title_instr_vector"] = model.encode(title_and_instructions).tolist()
    es_doc["title_instr_ingr_vector"] = model.encode(
        title_and_instructions + " " + ingredients
    ).tolist()

    # Resolve the default lazily so the global is only needed when no explicit
    # index name is given.
    target_index = INDEX_NAME if index_name is None else index_name
    es_client.index(index=target_index, document=es_doc)

In [9]:
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm

# Embed and index every recipe on 8 worker threads.
# The executor is used as a context manager so its threads are shut down when
# the cell finishes (it was previously left running). It is entered after the
# progress bar so that it is also closed first, while the bar is still open.
with tqdm(total=len(documents)) as progress, ThreadPoolExecutor(8) as executor:
    futures = []
    for doc in documents:
        future = executor.submit(index_document, es_client, doc, model)
        # Advance the progress bar each time a task completes.
        future.add_done_callback(lambda _: progress.update())
        futures.append(future)
    for future in futures:
        # Block until the task is finished; this also re-raises any exception
        # that occurred while indexing the document.
        future.result()

  0%|          | 0/13501 [00:00<?, ?it/s]

### Elasticsearch text search

In [59]:
def elastic_search_text(query, index_name=INDEX_NAME):
    """Keyword search over the ``Title`` and ``Instructions`` fields.

    Sends a ``most_fields`` multi_match query with ``size`` 5 and returns the
    ``_source`` of every hit, in the order Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["Title", "Instructions"],
        "type": "most_fields",
    }
    request_body = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    hits = es_client.search(index=index_name, body=request_body)["hits"]["hits"]

    sources = []
    for hit in hits:
        sources.append(hit["_source"])
    return sources

### Elasticsearch vector search

In [11]:
def elastic_search_knn(field, vector, index_name=INDEX_NAME):
    """kNN search on the dense-vector field ``field`` using ``vector``.

    Requests ``k`` = 5 neighbours out of 10000 candidates and returns the
    ``_source`` of every hit, restricted to the recipe columns and ``Id``.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
        },
        "_source": ["Image_Name", "Title", "Cleaned_Ingredients", "Instructions", "Id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    sources = []
    for hit in response["hits"]["hits"]:
        sources.append(hit["_source"])
    return sources

### Elasticsearch hybrid search

In [63]:
def elastic_search_hybrid(field, query, vector, index_name=INDEX_NAME):
    """Combined kNN + keyword search.

    The request carries a kNN clause on ``field`` (k = 5, 10000 candidates)
    and a ``best_fields`` multi_match over the ingredients, title and
    instructions, each with a boost of 0.5. Returns the ``_source`` of up to
    5 hits.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["Cleaned_Ingredients", "Title", "Instructions"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                }
            }
        },
        "size": 5,
        "_source": ["Image_Name", "Title", "Cleaned_Ingredients", "Instructions", "Id"]
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

## RAG flow

In [92]:
from openai import OpenAI

client = OpenAI()

In [21]:
def minsearch_text_search(query):
    """Query the minsearch text index, requesting 10 results.

    No filter and no per-field boost are applied.
    """
    return minsearch_text_index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )


def minsearch_vector_search(query):
    """Query the minsearch vector index, requesting 10 results.

    No filter is applied and no ``boost_dict`` is passed.
    """
    hits = minsearch_vector_index.search(query=query, filter_dict={}, num_results=10)
    return hits


def minsearch_hybrid_search(query):
    """Query the minsearch hybrid index, requesting 10 results.

    Uses fixed field boosts (Instructions 3.0, Title 2.0) and an equal
    text/vector weighting.
    """
    field_boosts = {'Instructions': 3.0, 'Title': 2.0}
    score_mix = {'text':0.5, 'vector':0.5}

    return minsearch_hybrid_index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        hybrid_boost=score_mix,
        num_results=10,
    )

In [87]:
# System-style instructions for the recipe assistant. {question} and {context}
# are filled in by build_prompt().
prompt_template = """
You're a well-known chef. Your goal is to provide recipes to users who can
give you a list of ingredients or ask you about recipes. Answer the QUESTION based on the CONTEXT from our recipes database.
Use only the facts from the CONTEXT when answering the QUESTION and always cite the title and the Image_Name
Your answer must be in markdown format with an image displaying the food where Image_Name is the url.
The url must have the following form : ../data/Food_Images/Image_Name.jpg

If Image_Name is empty write that the image is not available.

The ingredients must come first before the instructions.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


# One context entry per retrieved recipe; the placeholders are the recipe
# record's column names.
entry_template = """
title: {Title}
instructions: {Instructions}
ingredients: {Cleaned_Ingredients}
image_name: {Image_Name}

""".strip()

def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved recipes.

    Args:
        query: the user's question, inserted as QUESTION.
        search_results: iterable of recipe dicts providing at least ``Title``,
            ``Instructions``, ``Cleaned_Ingredients`` and ``Image_Name``
            (extra keys are ignored).

    Returns:
        The prompt string; context entries are separated by a blank line and
        the result carries no leading or trailing whitespace.

    Raises:
        KeyError: if a result lacks one of the fields used by the entry
            template.
    """
    entries = [entry_template.format(**doc) for doc in search_results]
    context = "\n\n".join(entries)
    return prompt_template.format(question=query, context=context).strip()

In [90]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message through the module-level
    ``client`` and return the text of the first choice."""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [18]:
def rag(query, model='gpt-4o-mini'):
    """Answer ``query``: retrieve with the minsearch text index, build the
    prompt from the hits, and return the LLM's reply."""
    retrieved = minsearch_text_search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [19]:
question = 'I have eggs, oinion, bread what can i cook?'
answer = rag(question)
print(answer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You can make **Hard-Boiled Eggs** with the ingredients you have. Here’s how:

### Ingredients
- 4–8 large eggs

### Instructions
1. Bring a large saucepan of water to a boil over medium-high heat. 
2. Using a slotted spoon, carefully lower eggs into the water one at a time. 
3. Cook for 10 minutes, maintaining a gentle boil. 
4. Carefully transfer the eggs to a bowl of ice water and let cool until just slightly warm, about 2 minutes. 
5. Gently crack the eggs all over and peel, starting from the fat end containing the air pocket. 

Eggs can be cooked and peeled 3 days ahead. Transfer to an airtight container and chill.

![Hard-Boiled Eggs](../data/Food_Images/hard-boiled-eggs-recipe.jpg)


With the ingredients you have (eggs, onion, and bread), you can make **Egg and Onion Toast**. Here’s a simple recipe you can follow:

### Ingredients:
- **Eggs**: 2 (or more depending on how many servings you want)
- **Onion**: 1 small, finely chopped
- **Bread**: 2 slices (your choice, such as sourdough or whole wheat)
- **Salt and pepper**: to taste
- **Butter or oil**: for frying

### Instructions:
1. **Prepare the Ingredients**: Chop the onion finely.
2. **Cook the Onion**: Heat a skillet over medium heat and add butter or oil. Once hot, add the chopped onion and sauté until caramelized and golden brown, about 5-7 minutes.
3. **Cook the Eggs**: In the same skillet, crack the eggs directly over the sautéed onions (you can scramble them or cook them sunny-side up based on your preference). Season with salt and pepper.
4. **Toast the Bread**: While the eggs are cooking, you can toast the bread slices in a separate toaster or in the same skillet if there’s room.
5. **Assemble the Toast**: Once the eggs are cooked to your liking, serve them over the toasted bread topped with the sautéed onions. 

Enjoy your Egg and Onion Toast!

![Egg and Onion Toast](../data/Food_Images/hard-boiled-eggs-recipe.jpg)
Id

## Retrieval evaluation

In [8]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [9]:
df_question.head()

Unnamed: 0,Id,question
0,1,What type of chicken should I use for the Miso...
1,1,How should I prepare the acorn squash before r...
2,1,What herbs are used in the herb butter mixture...
3,1,How long does the chicken need to rest after r...
4,1,What type of wine is recommended for the gravy...


In [10]:
ground_truth = df_question.to_dict(orient='records')

In [11]:
ground_truth[0]

{'Id': 1,
 'question': 'What type of chicken should I use for the Miso-Butter Roast Chicken With Acorn Squash Panzanella recipe?'}

In [12]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query, where ``True`` marks
            a result that matches the expected document.

    Returns:
        A float in [0, 1]; ``0.0`` when ``relevance_total`` is empty (instead
        of raising ``ZeroDivisionError``).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list of booleans per query, ordered by rank.

    Returns:
        The mean over all queries of ``1 / rank`` of the first relevant
        result (0 for a query with none); ``0.0`` when ``relevance_total`` is
        empty.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this break a
                # list with several matches would score more than 1.
                break

    return total_score / total

In [13]:
def evaluate(ground_truth, search_function):
    """Score ``search_function`` sequentially against ``ground_truth``.

    ``search_function`` receives the whole ground-truth record (not just the
    question text). A result is relevant when its ``Id`` equals the record's
    ``Id``. Returns a dict with ``hit_rate`` and ``mrr``.
    """
    def relevance_for(record):
        expected_id = record['Id']
        return [hit['Id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [14]:
def evaluate1(q, search_function):
    """Relevance flags for one ground-truth record.

    Calls ``search_function`` with the record's question text and returns one
    boolean per result: ``True`` where the result's ``Id`` equals the record's
    ``Id``.
    """
    expected_id = q['Id']
    flags = []
    for hit in search_function(q['question']):
        flags.append(hit['Id'] == expected_id)
    return flags

In [73]:
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor
#max_workers=8
pool = ThreadPoolExecutor(max_workers=8)

def map_progress(pool, seq, search_function, f):
    """Evaluate every element of ``seq`` on ``pool`` and score the results.

    For each element ``el`` the call ``f(el, search_function)`` is submitted
    to the thread pool, and a tqdm progress bar advances as tasks complete.
    The results are collected in submission order and summarised with
    ``hit_rate`` and ``mrr``.

    Returns:
        ``{'hit_rate': ..., 'mrr': ...}`` computed over the collected results.
    """
    def advance(_finished):
        # Done-callback: one tick per completed task.
        progress.update()

    with tqdm(total=len(seq)) as progress:
        pending = []
        for item in seq:
            task = pool.submit(f, item, search_function)
            task.add_done_callback(advance)
            pending.append(task)

        # .result() blocks until the task has finished and re-raises any
        # exception raised inside it.
        relevance = [task.result() for task in pending]

    return {
        'hit_rate': hit_rate(relevance),
        'mrr': mrr(relevance),
    }

### Evaluate minsearch

In [24]:
map_progress(pool, ground_truth, minsearch_text_search, evaluate1)

  0%|          | 0/67505 [00:00<?, ?it/s]

{'hit_rate': 0.6206503221983557, 'mrr': 0.4916421299774336}

In [18]:
map_progress(pool, ground_truth, minsearch_vector_search, evaluate1)

  0%|          | 0/67505 [00:00<?, ?it/s]

{'hit_rate': 0.6240722909414117, 'mrr': 0.4914756343739321}

In [23]:
map_progress(pool, ground_truth, minsearch_hybrid_search, evaluate1)

  0%|          | 0/67505 [00:00<?, ?it/s]

{'hit_rate': 0.6722168728242353, 'mrr': 0.5320085754964565}

### Evaluate Elasticsearch text search

In [58]:
# evaluate(ground_truth, lambda q: elastic_search_text(q['question']))
map_progress(pool, ground_truth, elastic_search_text, evaluate1)

  0%|          | 0/67505 [00:00<?, ?it/s]

{'hit_rate': 0.5364047107621658, 'mrr': 0.4308330247142114}

### Evaluate Elasticsearch vector search

In [30]:
# v_i = evaluate(ground_truth, lambda q: elastic_search_knn('instr_vector', model.encode(q['question'])))
# print (f"the search evaluation for the field instr_vector is: \n {v_i}")


# v_i = evaluate(ground_truth, lambda q: elastic_search_knn('ingr_vector', model.encode(q['question'])))
# print (f"the search evaluation for the field ingr_vector is: \n {v_i}")



# v_i = evaluate(ground_truth, lambda q: elastic_search_knn('instr_ingr_vector', model.encode(q['question'])))
# print (f"the search evaluation for the field instr_ingr_vector is: \n {v_i}")

# v_i = evaluate(ground_truth, lambda q: elastic_search_knn('title_vector', model.encode(q['question'])))
# print (f"the search evaluation for the field title_vector is: \n {v_i}")


# v_i = evaluate(ground_truth, lambda q: elastic_search_knn('title_ingr_vector', model.encode(q['question'])))
# print (f"the search evaluation for the field title_ingr_vector is: \n {v_i}")

# Active run: kNN evaluation on 'title_instr_vector'. The query text is
# embedded per question with model.encode. The other field variants are the
# commented-out lines of this cell.
v_i = evaluate(ground_truth, lambda q: elastic_search_knn('title_instr_vector', model.encode(q['question'])))
print (f"the search evaluation for the field title_instr_vector is: \n {v_i}")


# v_i = evaluate(ground_truth, lambda q: elastic_search_knn('title_instr_ingr_vector', model.encode(q['question'])))
# print (f"the search evaluation for the field title_instr_ingr_vector is: \n {v_i}")


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field instr_vector is: 
 {'hit_rate': 0.2995926227686838, 'mrr': 0.21272103300991294}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field ingr_vector is: 
 {'hit_rate': 0.1406118065328494, 'mrr': 0.08949633360491584}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field instr_ingr_vector is: 
 {'hit_rate': 0.3209095622546478, 'mrr': 0.22942152433154606}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field title_vector is: 
 {'hit_rate': 0.5192504258943782, 'mrr': 0.4397284151791178}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field title_ingr_vector is: 
 {'hit_rate': 0.525501814680394, 'mrr': 0.43574401896155385}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field title_instr_vector is: 
 {'hit_rate': 0.5677801644322643, 'mrr': 0.4774441399402419}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field title_instr_ingr_vector is: 
 {'hit_rate': 0.5644026368417154, 'mrr': 0.4744791743821372}


### Evaluate Elasticsearch hybrid search

In [83]:
# v_i = evaluate(ground_truth, lambda q: elastic_search_hybrid('instr_vector',q['question'], model.encode(q['question'])))
# print (f"the search evaluation for the field instr_vector is: \n {v_i}")
# print(_)

# v_i = evaluate(ground_truth, lambda q: elastic_search_hybrid('ingr_vector',q['question'], model.encode(q['question'])))
# print (f"the search evaluation for the field ingr_vector is: \n {v_i}")
# print(_)

# v_i = evaluate(ground_truth, lambda q: elastic_search_hybrid('instr_ingr_vector',q['question'], model.encode(q['question'])))
# print (f"the search evaluation for the field instr_ingr_vector is: \n {v_i}")
# print(_)

# Hybrid (kNN + keyword) evaluation on 'title_instr_vector'. The printed label
# names the field that is actually queried (it previously said
# 'title_instr_ingr_vector').
v_i = evaluate(ground_truth, lambda q: elastic_search_hybrid('title_instr_vector',q['question'], model.encode(q['question'])))
print (f"the search evaluation for the field title_instr_vector is: \n {v_i}")

# v_i = evaluate(ground_truth, lambda q: elastic_search_hybrid('title_instr_ingr_vector',q['question'], model.encode(q['question'])))
# print (f"the search evaluation for the field title_instr_ingr_vector is: \n {v_i}")

  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field instr_vector is: 
 {'hit_rate': 0.3245092956077328, 'mrr': 0.2324835197392945}
{'hit_rate': 0.3214872972372417, 'mrr': 0.22995012715109064}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field ingr_vector is: 
 {'hit_rate': 0.3227020220724391, 'mrr': 0.23125595634892787}
{'hit_rate': 0.3214872972372417, 'mrr': 0.22995012715109064}


  0%|          | 0/67505 [00:00<?, ?it/s]

the search evaluation for the field instr_ingr_vector is: 
 {'hit_rate': 0.32490926598029773, 'mrr': 0.23273683430858574}
{'hit_rate': 0.3214872972372417, 'mrr': 0.22995012715109064}


## Finding the best parameters

In [45]:
# Shuffle once with a fixed seed so the split is reproducible across kernel
# restarts.
df_q_sample = df_question.sample(frac=1, random_state=1)
# The first 30000 shuffled rows are used to tune the search parameters.
df_validation = df_q_sample[:30000]
# The remaining shuffled rows form the held-out set. They were previously
# sliced from the unshuffled df_question, which overlapped the validation rows.
# NOTE(review): the split point is assumed to be 30000 (the old slice started
# at 20000) - confirm the intended sizes.
df_test = df_q_sample[30000:]

In [56]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMISES ``objective_function``.

    Args:
        param_ranges: mapping ``name -> (min_val, max_val)``. When both bounds
            are ``int`` the value is drawn with ``randint`` (inclusive);
            otherwise with ``uniform``.
        objective_function: callable taking the sampled parameter dict and
            returning a score; higher is better.
        n_iterations: number of random parameter sets to evaluate.
        seed: optional seed for a private random generator, making the search
            reproducible. With the default ``None`` the module-level
            ``random`` state is used, as before.

    Returns:
        ``(best_params, best_score)``. If no iteration produced a score above
        ``-inf`` (for example ``n_iterations=0``) this is
        ``(None, float('-inf'))``.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximising: any real score beats this

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the highest-scoring parameter set seen so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [48]:
gt_val = df_validation.to_dict(orient='records')

In [49]:
def minsearch_hybrid_search(query, boost=None, hybrid_boost=None):
    """Query the minsearch hybrid index with tunable weights, requesting 10 results.

    Args:
        query: the question text.
        boost: per-field boost dict; an empty dict is used when ``None``.
        hybrid_boost: text/vector weighting; ``{'text': 0.5, 'vector': 0.5}``
            is used when ``None``.
    """
    field_boosts = {} if boost is None else boost
    score_mix = {'text':0.5, 'vector':0.5} if hybrid_boost is None else hybrid_boost

    return minsearch_hybrid_index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        hybrid_boost=score_mix,
        num_results=10,
    )

In [50]:
def evaluate2(q, search_function):
    """Relevance flags for one ground-truth record.

    Unlike ``evaluate1``, ``search_function`` is called with the whole record
    ``q``. Returns one boolean per result: ``True`` where the result's ``Id``
    equals the record's ``Id``.
    """
    expected_id = q['Id']
    flags = []
    for hit in search_function(q):
        flags.append(hit['Id'] == expected_id)
    return flags

In [69]:
from typing import Dict
# Search space for the random search: (min, max) per tunable weight.
# 'Title' and 'Instructions' are passed on as field boosts; 'text' is popped
# by objective() and used as the text share of the hybrid score (the vector
# share is 1 - text).
param_ranges = {
    'Title': (0.0, 3.0),
    'Instructions': (0.0, 3.0),
    'text': (0.0, 1.0),
    # 'Cleaned_Ingredients': (0.0, 3.0),
    # 'Image_Name': (0.0, 3.0),
}

def objective(boost_params:Dict[str, float]):
    """Score one sampled parameter set on the validation questions.

    The ``'text'`` entry is removed from a copy of ``boost_params`` and used
    as the text weight of the hybrid score (vector weight = 1 - text); the
    remaining entries are passed as field boosts. Returns the MRR reported by
    ``map_progress`` over ``gt_val``.
    """
    field_boosts = dict(boost_params)
    text_weight = field_boosts.pop('text')

    def search_function(record):
        score_mix = {'text': text_weight, 'vector': 1-text_weight}
        return minsearch_hybrid_search(record['question'], field_boosts, hybrid_boost=score_mix)

    scores = map_progress(pool, gt_val, search_function, evaluate2)
    return scores['mrr']

In [72]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

({'Title': 1.301766512843531,
  'Instructions': 0.7387164179150028,
  'text': 0.36555723393033346},
 0.5693462169312256)

In [74]:
def minsearch_hybrid_improved(query):
    """Hybrid minsearch query using the weights returned by the random search.

    Queries ``minsearch_hybrid_index`` and requests 10 results.
    """
    # Values copied from the simple_optimize output recorded in this notebook.
    text_weight = 0.36555723393033346
    field_boosts = {
        'Title': 1.301766512843531,
        'Instructions': 0.7387164179150028,
    }

    return minsearch_hybrid_index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        hybrid_boost={'text':text_weight, 'vector': 1-text_weight},
        num_results=10,
    )

map_progress(pool, ground_truth, minsearch_hybrid_improved, evaluate1)
#evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/67505 [00:00<?, ?it/s]

{'hit_rate': 0.6847640915487742, 'mrr': 0.5694318527845589}

In [84]:
import pickle

# Persist the fitted hybrid index to 'index.pkl' in the current working
# directory, using the highest pickle protocol available.
with open('index.pkl', 'wb') as ind:
    pickle.dump(minsearch_hybrid_index, ind, protocol=pickle.HIGHEST_PROTOCOL)

In [85]:
# Load the pickled index back into `index`.
# NOTE: pickle.load can execute arbitrary code from the file - only load an
# 'index.pkl' that you produced yourself.
with open('index.pkl', 'rb') as ind:
    index = pickle.load(ind)

def minsearch_hybrid_improved(query):
    """Hybrid minsearch query against the unpickled ``index``.

    Uses the weights returned by the random search and requests 5 results.
    """
    # Values copied from the simple_optimize output recorded in this notebook.
    text_weight = 0.36555723393033346
    field_boosts = {
        'Title': 1.301766512843531,
        'Instructions': 0.7387164179150028,
    }

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        hybrid_boost={'text':text_weight, 'vector': 1-text_weight},
        num_results=5,
    )

In [78]:
minsearch_hybrid_improved('How to cook boiled egg?')

[{'Id': 11281,
  'Title': 'Hard-Boiled Eggs',
  'Instructions': '1. \x07  Put eggs into a 1-quart saucepan, then add enough cold water to cover them by 1/2 inch. Bring water to a boil over high heat, then reduce heat to moderately high and cook eggs at a gentle boil, uncovered, 10 minutes. Pour off hot water. If using eggs right away, shake pan gently so eggs bump into one another (to crack shells). Run cold water into pot to stop cooking. Let eggs stand in cold water 15 minutes, adding more cold water or ice to keep water cold.\n\n',
  'Ingredients': '*\x08  4 large eggs\n',
  'Cleaned_Ingredients': '*\x08  4 large eggs\n',
  'Image_Name': 'hard-boiled-eggs-236719'},
 {'Id': 2092,
  'Title': 'Hard-Boiled Eggs',
  'Instructions': '1. \x07  Bring a large saucepan of water to a boil over medium-high heat. Using a slotted spoon, carefully lower eggs into water one at a time. Cook 10 minutes, maintaining a gentle boil. Carefully transfer eggs to a bowl of ice water and let cool until just 

## RAG evaluation

In [79]:
# Judge prompt for the RAG evaluation. {question} and {answer_llm} are filled
# with str.format; the doubled braces render as literal braces, so the model
# is shown the JSON shape it must return (keys "Relevance" and "Explanation",
# which save_evaluation reads).
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [80]:
len(ground_truth)

67505

In [81]:
record = ground_truth[0]


In [88]:
def rag(query, model='gpt-4o-mini'):
    """Answer ``query``: retrieve with ``minsearch_hybrid_improved``, build
    the prompt from the hits, and return the LLM's reply."""
    retrieved = minsearch_hybrid_improved(query)
    return llm(build_prompt(query, retrieved), model=model)

In [97]:
question = 'How to cook boiled egg?'
answer_llm = rag(question)

In [98]:
print(answer_llm)

To cook boiled eggs, you can choose from a few different methods. Below is a simple recipe for hard-boiled eggs.

### Ingredients
- 4 large eggs

### Instructions
1. Put the eggs into a 1-quart saucepan, then add enough cold water to cover them by 1/2 inch. 
2. Bring the water to a boil over high heat, then reduce heat to moderately high and cook the eggs at a gentle boil, uncovered, for 10 minutes. 
3. Pour off the hot water. If using the eggs right away, shake the pan gently so the eggs bump into one another (to crack the shells).
4. Run cold water into the pot to stop the cooking process. Let the eggs stand in the cold water for 15 minutes, adding more cold water or ice to keep the water cold.

![Hard-Boiled Eggs](../data/Food_Images/hard-boiled-eggs-236719.jpg) 

This recipe is titled "Hard-Boiled Eggs", and you can find the image [here](../data/Food_Images/hard-boiled-eggs-236719.jpg).


In [99]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: How to cook boiled egg?
Generated Answer: To cook boiled eggs, you can choose from a few different methods. Below is a simple recipe for hard-boiled eggs.

### Ingredients
- 4 large eggs

### Instructions
1. Put the eggs into a 1-quart saucepan, then add enough cold water to cover them by 1/2 inch. 
2. Bring the water to a boil over high heat, then reduce heat to moderately high and cook the eggs at a gentle boil, uncovered, for 10 minutes. 
3. Pour off the hot water. If using the eggs right away, shake the pan gently so the eggs bump into one another (to crack the shells).
4. Run cold water into the pot to stop the cooking process. Let the eggs stand in the cold water for 15 minutes, adding m

In [131]:
import json

In [134]:
df_sample = df_question.sample(n=500, random_state=1)

In [135]:
sample = df_sample.to_dict(orient='records')

In [133]:
from groq import Groq
from openai import OpenAI

client_groq = Groq()
client_openai = OpenAI()

def llm(prompt, client, model='gemma2-9b-it'):
    """Send ``prompt`` as a single user message through ``client`` and return
    the text of the first choice.

    ``client`` must expose ``chat.completions.create(model=..., messages=...)``.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

def rag(query, client, model='gpt-4o-mini'):
    """Answer ``query`` with the given LLM ``client``: retrieve with
    ``minsearch_hybrid_improved``, build the prompt from the hits, and return
    the model's reply."""
    retrieved = minsearch_hybrid_improved(query)
    return llm(build_prompt(query, retrieved), client, model=model)

In [149]:
def _parse_json_reply(text):
    """Parse a JSON reply from an LLM, tolerating a surrounding Markdown code fence."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        stripped = text.strip()
        if not stripped.startswith("```"):
            raise
        # Drop the opening fence line (``` or ```json) and the closing fence,
        # then parse what is left; a second failure propagates to the caller.
        body = stripped.split("\n", 1)[1] if "\n" in stripped else ""
        body = body.rsplit("```", 1)[0]
        return json.loads(body)


def build_evaluation(record, prompt, client_llm, client_eval, model_llm, model_eval):
    """Generate an answer for one ground-truth record and have it judged.

    Args:
        record: ground-truth dict; its ``'question'`` is sent through ``rag``.
        prompt: judge prompt template with ``{question}`` and ``{answer_llm}``
            placeholders.
        client_llm, model_llm: client and model used to generate the answer.
        client_eval, model_eval: client and model used to judge the answer.

    Returns:
        ``(record, answer_llm, evaluation)`` where ``evaluation`` is the
        judge's reply parsed from JSON.

    Raises:
        json.JSONDecodeError: if the judge's reply is not valid JSON, even
            after removing a surrounding Markdown code fence.
    """
    question = record['question']
    answer_llm = rag(question, client_llm, model_llm)

    judge_prompt = prompt.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(judge_prompt, client_eval, model_eval)
    # The judge is asked not to use code blocks, but a fenced reply should not
    # abort a long evaluation run.
    evaluation = _parse_json_reply(raw_evaluation)

    return (record, answer_llm, evaluation)

# evaluations = []


def evaluation_func (documents, prompt, client_llm, client_eval, model_llm, model_eval):
    """Run ``build_evaluation`` for every record on 8 worker threads.

    Args:
        documents: sequence of ground-truth records (each with ``'question'``).
        prompt: judge prompt template passed to ``build_evaluation``.
        client_llm, model_llm: client and model that generate the answers.
        client_eval, model_eval: client and model that judge the answers.

    Returns:
        List of ``build_evaluation`` results, in the order of ``documents``.
        An exception raised by any task is re-raised here.
    """
    # The executor is context-managed so its threads are released on return;
    # previously a new pool was created per call and never shut down. It is
    # entered after the progress bar so that it is also closed first.
    with tqdm(total=len(documents)) as progress, ThreadPoolExecutor(8) as executor:
        futures = []
        for doc in documents:
            future = executor.submit(build_evaluation, doc, prompt, client_llm, client_eval, model_llm, model_eval)
            # Advance the progress bar each time a task completes.
            future.add_done_callback(lambda _: progress.update())
            futures.append(future)
        return [future.result() for future in futures]

# for record in tqdm(sample):
#     question = record['question']
#     answer_llm = rag(question)

#     prompt = prompt2_template.format(
#         question=question,
#         answer_llm=answer_llm
#     )

#     evaluation = llm(prompt)
#     evaluation = json.loads(evaluation)

#     evaluations.append((record, answer_llm, evaluation))

In [None]:
evaluations_gpt_o_mini = evaluation_func (sample, prompt2_template, client_llm=client_openai, client_eval=client_openai, model_llm='gpt-4o-mini', model_eval='gpt-4o-mini')

In [152]:
def save_evaluation(evaluations, model_name, output_dir='../data'):
    """Flatten evaluation tuples into a DataFrame, print the relevance
    distribution and write the table to CSV.

    Args:
        evaluations: iterable of ``(record, answer, evaluation)`` tuples, where
            ``record`` has ``'Id'`` and ``'question'`` and ``evaluation`` has
            ``'Relevance'`` and ``'Explanation'``.
        model_name: used in the printed header and in the output file name.
        output_dir: directory for the CSV; the file is written to
            ``{output_dir}/rag-eval-{model_name}.csv``.

    Returns:
        DataFrame with columns ``answer``, ``id``, ``question``, ``relevance``
        and ``explanation``.
    """
    df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

    df_eval['id'] = df_eval.record.apply(lambda d: d['Id'])
    df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

    df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
    df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

    # The nested dicts have been flattened into columns; drop the originals.
    df_eval = df_eval.drop(columns=['record', 'evaluation'])

    print (f" Evaluation of {model_name} \n : {df_eval.relevance.value_counts(normalize=True)}")
    df_eval.to_csv(f'{output_dir}/rag-eval-{model_name}.csv', index=False)
    return df_eval

In [None]:
# evaluations = evaluation_func (sample, prompt2_template,'gemma2-9b-it')
# Answers are generated by 'mixtral-8x7b-32768' through the Groq client and
# judged by 'gpt-4o-mini' through the OpenAI client. The records are processed
# one at a time here rather than through evaluation_func's thread pool.
evaluations_mixtral_8x7b_32768 = []
for record in tqdm(sample):
    result = build_evaluation(record, prompt2_template, client_llm=client_groq, client_eval=client_openai, model_llm='mixtral-8x7b-32768', model_eval='gpt-4o-mini')

    evaluations_mixtral_8x7b_32768.append(result)

# Prints the relevance distribution and writes ../data/rag-eval-mixtral-8x7b-32768.csv
save_evaluation(evaluations_mixtral_8x7b_32768, 'mixtral-8x7b-32768')