In [21]:
# %pip install cohere

In [22]:
# %pip install diskcache

In [3]:
from rich import inspect as rinspect
from rich import print as rprint

# Context

In `make_synthetic_questions.ipynb`, we generated synthetic questions to bootstrap evaluation of the retrieval system in our hardware store's Q&A system.

This notebook shows the first step in calculating precision and recall with different retrieval parameters. We will run more advanced experiments in future notebooks after we have these baseline scores.

## Data

Here is a brief review of the data.

In [4]:
import json
import lancedb
import os
import pandas as pd
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor

pd.set_option("display.max_colwidth", 160)

db = lancedb.connect("./lancedb")
reviews_table = db.open_table("reviews")
reviews_table.to_pandas().head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,product_title,product_description,review,vector
0,0,Cordless Drill,"This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched ve...","I've been using this cordless drill for the past 6 months, and it's been a game-changer for my DIY projects. The 20 torque settings allow me to adjust the p...","[-0.0038200605, -0.009364537, -0.026297344, -0.020058312, 0.0376258, 0.0038050916, -0.000636552, 0.06768333, 0.023818497, -0.0054426882, 0.011939186, 0.0186..."
1,1,Cordless Drill,"This powerful cordless drill features an ergonomic design perfect for all-day use. With 20 torque settings and a lithium-ion battery, it offers unmatched ve...","Purchased this cordless drill a year ago and it has not disappointed. The 20 torque settings provide great control and precision, especially on delicate tas...","[-0.023377769, -0.01479075, -0.014503318, -0.029437784, 0.029629406, -0.0010449336, -0.022671165, 0.08718758, -0.0075630434, -0.0136050945, 0.041390147, 0.0..."
2,2,Cordless Drill,Our lightweight cordless drill comes equipped with a flexible LED work light to illuminate your workspace. The 18V battery provides ample power for tough ma...,"I've been using this cordless drill for the past six months on various projects around the house, and I am thoroughly impressed. The 18V battery provides in...","[-0.015202215, 0.0038308129, 0.0022391798, -0.008841734, 0.008781216, 0.008865941, -0.007449812, 0.061050933, 0.045049876, -0.01014288, 0.037472975, -0.0050..."
3,3,Cordless Drill,Our lightweight cordless drill comes equipped with a flexible LED work light to illuminate your workspace. The 18V battery provides ample power for tough ma...,"I purchased this cordless drill about a year ago for use in my small woodworking shop, and it has exceeded my expectations. The 18V lithium-ion battery prov...","[-0.025799386, -0.0032572313, -0.018315684, -0.022368867, 0.03334183, -0.0056832666, -0.02441308, 0.06597876, 0.019596254, -0.023543702, 0.05545223, 0.00619..."
4,4,Cordless Drill,"Engineered for precision, this cordless drill has a compact design that allows for maximum maneuverability in tight spaces. It includes a built-in battery i...","I've been using the Cordless Drill for about six months now, and it has exceeded my expectations. The compact design makes it easy to use in tight spaces, w...","[-0.009312706, 0.01556987, 0.0027725082, -0.0062510776, 0.012757799, 0.015630737, 0.0029307632, 0.058724828, 0.06598022, -0.004050723, 0.014145574, -0.00955..."


In [5]:
with open("synthetic_eval_dataset.json", "r") as f:
    synthetic_questions = json.load(f)
synthetic_questions[:5]

[{'question': 'How good is the battery life on this cordless drill?',
  'answer': 'It comes with two included batteries, ensuring that you never run out of power on the job.',
  'chunk_id': '0'},
 {'question': 'Is this cordless drill easy to handle for long tasks?',
  'answer': 'Yes, its lightweight design makes it easy to use for extended periods without fatigue.',
  'chunk_id': '0'},
 {'question': 'How powerful is the motor in this cordless drill?',
  'answer': 'The cordless drill features a powerful motor that exceeds expectations for professional use.',
  'chunk_id': '1'},
 {'question': 'What design features make this drill suitable for overhead tasks?',
  'answer': 'The cordless drill has a lightweight design and ergonomic build, making it perfect for overhead tasks.',
  'chunk_id': '1'},
 {'question': 'How durable are the batteries for this cordless drill?',
  'answer': 'The batteries charge quickly and last a long time, which is a huge plus.',
  'chunk_id': '2'}]

## Set Up Evaluation

Load the evaluation questions into a structured format.

In [6]:
from pydantic import BaseModel


class EvalQuestion(BaseModel):
    question: str
    answer: str
    chunk_id: str


In [7]:
eval_questions = [EvalQuestion(**question) for question in synthetic_questions]

In [8]:
eval_questions[:5]

[EvalQuestion(question='How good is the battery life on this cordless drill?', answer='It comes with two included batteries, ensuring that you never run out of power on the job.', chunk_id='0'),
 EvalQuestion(question='Is this cordless drill easy to handle for long tasks?', answer='Yes, its lightweight design makes it easy to use for extended periods without fatigue.', chunk_id='0'),
 EvalQuestion(question='How powerful is the motor in this cordless drill?', answer='The cordless drill features a powerful motor that exceeds expectations for professional use.', chunk_id='1'),
 EvalQuestion(question='What design features make this drill suitable for overhead tasks?', answer='The cordless drill has a lightweight design and ergonomic build, making it perfect for overhead tasks.', chunk_id='1'),
 EvalQuestion(question='How durable are the batteries for this cordless drill?', answer='The batteries charge quickly and last a long time, which is a huge plus.', chunk_id='2')]

In [9]:
from pydantic import BaseModel


class EvalQuestion(BaseModel):
    question: str
    answer: str
    chunk_id: str


eval_questions = [EvalQuestion(**question) for question in synthetic_questions]

Build a simple search function

In [23]:
eval_questions[:5]

[EvalQuestion(question='How good is the battery life on this cordless drill?', answer='It comes with two included batteries, ensuring that you never run out of power on the job.', chunk_id='0'),
 EvalQuestion(question='Is this cordless drill easy to handle for long tasks?', answer='Yes, its lightweight design makes it easy to use for extended periods without fatigue.', chunk_id='0'),
 EvalQuestion(question='How powerful is the motor in this cordless drill?', answer='The cordless drill features a powerful motor that exceeds expectations for professional use.', chunk_id='1'),
 EvalQuestion(question='What design features make this drill suitable for overhead tasks?', answer='The cordless drill has a lightweight design and ergonomic build, making it perfect for overhead tasks.', chunk_id='1'),
 EvalQuestion(question='How durable are the batteries for this cordless drill?', answer='The batteries charge quickly and last a long time, which is a huge plus.', chunk_id='2')]

In [10]:
q = eval_questions[0]
n_return_vals = 5

In [11]:
rinspect(reviews_table.search, help=True)

In [12]:
q.question

'How good is the battery life on this cordless drill?'

In [13]:
reviews_table.search(q.question, n_return_vals).select(['id', 'review']).limit(n_return_vals).to_pandas()

Unnamed: 0,id,review,score
0,3,"I purchased this cordless drill about a year ago for use in my small woodworking shop, and it has exceeded my expectations. The 18V lithium-ion battery prov...",7.032466
1,4,"I've been using the Cordless Drill for about six months now, and it has exceeded my expectations. The compact design makes it easy to use in tight spaces, w...",6.986802
2,2,"I've been using this cordless drill for the past six months on various projects around the house, and I am thoroughly impressed. The 18V battery provides in...",6.564123
3,1,"Purchased this cordless drill a year ago and it has not disappointed. The 20 torque settings provide great control and precision, especially on delicate tas...",4.909324
4,5,"I purchased this Cordless Drill a year ago, and it has quickly become one of my most trusted tools. The precision is top-notch, allowing me to drill perfect...",4.146757


In [14]:
results = (reviews_table.search(q.question, n_return_vals).select(['id', 'review']).limit(n_return_vals).to_pandas())

In [15]:
results

Unnamed: 0,id,review,score
0,3,"I purchased this cordless drill about a year ago for use in my small woodworking shop, and it has exceeded my expectations. The 18V lithium-ion battery prov...",7.032466
1,4,"I've been using the Cordless Drill for about six months now, and it has exceeded my expectations. The compact design makes it easy to use in tight spaces, w...",6.986802
2,2,"I've been using this cordless drill for the past six months on various projects around the house, and I am thoroughly impressed. The 18V battery provides in...",6.564123
3,1,"Purchased this cordless drill a year ago and it has not disappointed. The 20 torque settings provide great control and precision, especially on delicate tas...",4.909324
4,5,"I purchased this Cordless Drill a year ago, and it has quickly become one of my most trusted tools. The precision is top-notch, allowing me to drill perfect...",4.146757


In [16]:
results = (reviews_table.search(q.question, n_return_vals).select(['id', 'review']).limit(n_return_vals).to_list())

In [17]:
results[0]['id'], q.chunk_id

('3', '0')

In [18]:
[str(q.chunk_id) == str(r["id"]) for r in results]

[False, False, False, False, False]

In [19]:
def run_simple_request(q: EvalQuestion, n_return_vals=5):
    results = (
        reviews_table.search(q.question).select(["id"]).limit(n_return_vals).to_list()
    )
    return [str(q.chunk_id) == str(r["id"]) for r in results]

Now do the benchmarking. For simplicity, we just compare retrieval sizes with a simple semantic search in this cell.

Precision = Did we retrieve only the relevant documents?  (Out of all the documents retrieved how many are relevant)

Recall = Did we retrieve all the relevant documents? (Out of all the relevant documents how many did we retrieve)

See https://www.perplexity.ai/search/explain-this-code-with-an-exam-vlIBEgF.QnWY_nbPnIeMbw

In [20]:
def score(hits):
    # This implementation assumes
    n_retrieval_requests = len(hits)
    total_retrievals = sum(len(l) for l in hits)
    true_positives = sum(sum(sublist) for sublist in hits)
    precision = true_positives / total_retrievals if total_retrievals > 0 else 0
    recall = true_positives / n_retrieval_requests if n_retrieval_requests > 0 else 0
    return {"precision": precision, "recall": recall}


In [35]:
sample_eval_questions = eval_questions[:10]

In [36]:
def score_simple_search(n_to_retrieve: List[int]) -> Dict[str, float]:
    # parallelize to speed this up 5-10X
    with ThreadPoolExecutor() as executor:
        hits = list(
            executor.map(lambda q: run_simple_request(q, n_to_retrieve), sample_eval_questions)
        )
    return score(hits)

In [39]:
k_to_retrieve = [5, 10, 20]
scores = pd.DataFrame([score_simple_search(n) for n in k_to_retrieve])
scores["n_retrieved"] = k_to_retrieve
scores

Unnamed: 0,precision,recall,n_retrieved
0,0.18,0.9,5
1,0.1,1.0,10
2,0.055556,1.0,20


In [5]:
def score(hits):
    # This implementation assumes
    n_retrieval_requests = len(hits)
    total_retrievals = sum(len(l) for l in hits)
    true_positives = sum(sum(sublist) for sublist in hits)
    precision = true_positives / total_retrievals if total_retrievals > 0 else 0
    recall = true_positives / n_retrieval_requests if n_retrieval_requests > 0 else 0
    return {"precision": precision, "recall": recall}


def score_simple_search(n_to_retrieve: List[int]) -> Dict[str, float]:
    # parallelize to speed this up 5-10X
    with ThreadPoolExecutor() as executor:
        hits = list(
            executor.map(lambda q: run_simple_request(q, n_to_retrieve), eval_questions)
        )
    return score(hits)


k_to_retrieve = [5, 10, 20]
scores = pd.DataFrame([score_simple_search(n) for n in k_to_retrieve])
scores["n_retrieved"] = k_to_retrieve
scores

Unnamed: 0,precision,recall,n_retrieved
0,0.10141,0.507048,5
1,0.070749,0.707489,10
2,0.044361,0.887225,20


If you have Cohere set up, you can see uf a reranker improves results (we'll talk more about rerankers in the coming weeks).

In [41]:
assert os.environ.get('COHERE_API_KEY')

In [46]:
import cohere
from diskcache import Cache
cohere_api_key = os.environ["COHERE_API_KEY"]

In [47]:
# Use diskcache to reduce re-running in case of error (or addition of new data)
cache = Cache("./cohere_cache")

In [49]:
q, n_return_vals, n_to_rerank = eval_questions[0], 5, 40

In [51]:
reviews_table.search(q.question, query_type='auto').select(['id', 'review']).limit(n_return_vals).to_pandas()

Unnamed: 0,id,review,_distance
0,4,"I've been using the Cordless Drill for about six months now, and it has exceeded my expectations. The compact design makes it easy to use in tight spaces, w...",0.604516
1,2,"I've been using this cordless drill for the past six months on various projects around the house, and I am thoroughly impressed. The 18V battery provides in...",0.609694
2,3,"I purchased this cordless drill about a year ago for use in my small woodworking shop, and it has exceeded my expectations. The 18V lithium-ion battery prov...",0.653324
3,0,"I've been using this cordless drill for the past 6 months, and it's been a game-changer for my DIY projects. The 20 torque settings allow me to adjust the p...",0.659256
4,1,"Purchased this cordless drill a year ago and it has not disappointed. The 20 torque settings provide great control and precision, especially on delicate tas...",0.704209


In [53]:
initial_results = reviews_table.search(q.question, query_type='auto').select(['id', 'review']).limit(n_to_rerank).to_list()

In [63]:
from fastcore.all import L
L(initial_results).itemgot('id')

(#18) ['4','2','3','0','1','5','16','17','8','9'...]

In [54]:
texts = [r["review"] for r in initial_results]

In [55]:
cache_key = f"{q.question}_{n_return_vals}".replace("?", "")

In [56]:
cache_key

'How good is the battery life on this cordless drill_5'

In [57]:
# Try to get the result from cache
cached_result = cache.get(cache_key)
if cached_result is not None:
    print(cached_result)

In [58]:
co = cohere.Client(cohere_api_key)
reranked = co.rerank(
    query=q.question,
    documents=texts,
    top_n=n_return_vals
)

In [59]:
reranked



In [60]:
# Map reranked results back to original IDs
reranked_ids = [initial_results[r.index]["id"] for r in reranked.results]

In [66]:
from fastcore.all import L
L(initial_results).itemgot('id')

(#18) ['4','2','3','0','1','5','16','17','8','9'...]

In [61]:
reranked_ids

['2', '4', '3', '0', '1']

In [67]:
result = [str(q.chunk_id) == str(r) for r in reranked_ids]

In [68]:
result

[False, False, False, True, False]

In [6]:
try:
    import cohere
    from diskcache import Cache
    cohere_api_key = os.environ["COHERE_API_KEY"]

    # Use diskcache to reduce re-running in case of error (or addition of new data)
    cache = Cache("./cohere_cache")
    
    def run_reranked_request(q: EvalQuestion, n_return_vals=5, n_to_rerank=40) -> List[bool]:
        # First, get more results than we need
        initial_results = reviews_table.search(q.question) \
            .select(["id", "review"]) \
            .limit(n_to_rerank) \
            .to_list()
        
        # Prepare texts for reranking
        texts = [r["review"] for r in initial_results]
        
        cache_key = f"{q.question}_{n_return_vals}".replace("?", "")
        # Try to get the result from cache
        cached_result = cache.get(cache_key)
        if cached_result is not None:
            return cached_result
        
        # Rerank using Cohere
        co = cohere.Client(cohere_api_key)
        reranked = co.rerank(
            query=q.question,
            documents=texts,
            top_n=n_return_vals
        )
        
        # Map reranked results back to original IDs
        reranked_ids = [initial_results[r.index]["id"] for r in reranked.results]
        result = [str(q.chunk_id) == str(r) for r in reranked_ids]
        cache.set(cache_key, result)
        return result

    def score_reranked_search(n_to_retrieve: List[int], n_to_rerank: int = 40) -> Dict[str, float]:
        with ThreadPoolExecutor() as executor:
            hits = list(executor.map(
                lambda q: run_reranked_request(q, n_to_retrieve, n_to_rerank), 
                eval_questions
            ))
        return score(hits)

    k_to_retrieve = [5, 10, 20]
    reranked_scores = pd.DataFrame([score_reranked_search(n) for n in k_to_retrieve])
    reranked_scores["n_retrieved"] = k_to_retrieve
    print(reranked_scores)
except Exception as e:
    print(f"Could not run reranker.\n{e}")
    print("Ensure COHERE_API_KEY env is set... and cohere library diskcache are installed.")
    print("Connection reset by peer is likely rate limiting from Cohere")

   precision    recall  n_retrieved
0   0.125198  0.625991            5
1   0.081806  0.818062           10
2   0.046960  0.939207           20
