# Evaluating retrieval in a RAG pipeline

In [1]:
from ranx import Qrels, Run, evaluate, compare

In [2]:
# Ground-truth relevance judgments: query id -> {document id: relevance grade}.
# Each toy query has exactly one relevant document.
qrels_dict = {
  "q_1": { "d_12": 1 },
  "q_2": { "d_11": 1 },
  "q_3": { "d_10": 1 }
}

In [3]:
# Wrap the judgments in ranx's Qrels container, which evaluate() and compare() take below.
qrels = Qrels(qrels_dict)

In [4]:
# Results of a first retriever: query id -> {document id: retrieval score}.
# q_2's relevant document (d_11) is absent from its results, so that query is a miss.
run1_dict = {
  "q_1": { "d_12": 0.9, "d_23": 0.8, "d_25": 0.7 },
  "q_2": { "d_12": 0.9, "d_25": 0.7, "d_36": 0.6},
  "q_3": { "d_10": 1.0}
}

In [5]:
# No name is given here; the compare() report further down labels this run "run_1".
run1 = Run(run1_dict)

In [6]:
# hit_rate: fraction of queries with at least one relevant document among their results.
evaluate(qrels, run1, ["hit_rate"])

np.float64(0.6666666666666666)

In [7]:
# Results of a second retriever. Only q_2 contains its relevant document (d_11),
# and there is no entry at all for q_3.
run2_dict = {
  "q_1": { "d_32": 0.5, "d_35": 0.4},
  "q_2": { "d_12": 0.9, "d_11": 0.8, "d_35": 0.4},
}

In [8]:
# No name is given here; the compare() report further down labels this run "run_2".
run2 = Run(run2_dict)

In [9]:
# run2 has no results for q_3, so its query ids do not match qrels and ranx
# raises. The failure is the point of this cell, so catch and display it
# instead of letting it abort a Restart & Run All.
try:
  evaluate(qrels, run2, ["hit_rate"])
except AssertionError as err:
  print(f"{type(err).__name__}: {err}")

AssertionError: Qrels and Run query ids do not match. Pass `make_comparable=True` to add empty results for queries missing from the run and remove those not appearing in qrels.

In [10]:
# make_comparable=True adds empty results for the query missing from the run
# (q_3, per the assertion message above), so q_3 is scored as a miss.
evaluate(qrels, run2, ["hit_rate"], make_comparable=True)

np.float64(0.3333333333333333)

In [11]:
# Score both toy runs against the same judgments and tabulate them side by side.
# run2 has no results for q_3, so make_comparable=True is passed explicitly:
# the cell then does not rely on the earlier
# evaluate(..., make_comparable=True) call having (presumably) padded run2 in place.
compare(
    qrels,
    runs=[run1, run2],
    metrics=["hit_rate"],
    make_comparable=True,
)


#    Model      Hit Rate
---  -------  ----------
a    run_1         0.667
b    run_2         0.333

## Importing the DuckDB database

In [12]:
import duckdb

In [13]:
# Local DuckDB database file; the remote Olympics database is attached to this connection below.
con = duckdb.connect("evaluate_rag.duckdb")

In [14]:
# httpfs lets DuckDB read files over HTTP(S); it is needed for the remote ATTACH below.
con.sql("INSTALL httpfs")
con.sql("LOAD httpfs")

In [15]:
# Attach the pre-built Olympics database hosted on GitHub under the alias `olympics`.
# IF NOT EXISTS keeps the cell re-runnable once the alias is already attached.
con.sql("""
ATTACH IF NOT EXISTS
'https://raw.githubusercontent.com/mneedham/LearnDataWithMark/main/evaluate-rag/olympics.duckdb' AS olympics
""")

In [16]:
# Make the attached database the default so unqualified table names resolve against it.
con.sql("USE olympics")

In [17]:
# Inspect the schema of the `olympics` table: text chunks with their url/title and an embedding vector.
con.sql("DESCRIBE olympics")


┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ index       │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ embeddings  │ FLOAT[1m[[0m[1;36m1024[0m[1m][0m │ YES     │ NULL    │ NULL    │ NULL    │
│ text        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ url         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ title       │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [18]:
# Peek at a few rows; `index` is the document id that the evaluation runs are keyed on later.
con.sql("SELECT index, text FROM olympics LIMIT 10")


┌───────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ index │                                                     text                                                     │
│ int64 │                                                   varchar                                                    │
├───────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│     [1;36m0[0m │ The [1;36m2024[0m Olympics opened in Paris in spectacular style with thousands of athletes sailing along the River …  │
│     [1;36m1[0m │ Swapping a stadium for a waterway for the first time to open the [32m"greatest show on Earth"[0m, the near four-h…  │
│     [1;36m2[0m │ Blue, white and red fireworks had raised the Tricolore above Austerlitz Bridge before [1;36m6[0m,[1;36m800[0m athletes from …  │
│     [1;36m3[0m │ There were surprise performances through the ceremony, i

## Evaluating against real questions

In [19]:
# Rebinds `qrels`: from here on it holds the judgments for the real questions loaded
# from disk, replacing the toy judgments defined at the top of the notebook.
qrels = Qrels.from_file("data/questions.json")

In [20]:
# Search comes from the local `search` module and is built on the DuckDB connection;
# its fts, vector_search and hybrid methods are the retrieval techniques compared below.
from search import Search
s = Search(con)

In [21]:
# (retrieval callable, display name) pairs; the name labels each run in the reports below.
functions = [
  (s.fts, "Full-Text"), 
  (s.vector_search, "Vector"), 
  (s.hybrid, "Hybrid")
]

In [22]:
def create_run(qrels, retrieval_fn, name):
  """Build a named ``Run`` by calling ``retrieval_fn`` on every question in ``qrels``.

  Args:
    qrels: Object whose ``to_dict()`` yields a mapping keyed by question text.
    retrieval_fn: Callable taking a question and returning a relation that
      supports ``.select("index, score").fetchall()``.
    name: Label passed to ``Run`` as its ``name``.

  Returns:
    A ``Run`` mapping each question to ``{str(index): score}`` for the rows
    the retrieval function returned.
  """
  def scored_documents(question):
    # Each fetched row is an (index, score) pair; the index is stringified
    # before being used as the document id.
    rows = retrieval_fn(question).select("index, score").fetchall()
    return {str(index): score for index, score in rows}

  run_dict = {}
  for question in qrels.to_dict():
    run_dict[question] = scored_documents(question)
  return Run(run_dict, name=name)

In [23]:
# One Run per retrieval technique, in the same order as `functions`.
runs = [
   create_run(qrels, fn, name)
   for fn, name in functions
]

In [24]:
# Compare the three techniques on the real questions.
# NOTE(review): later cells read run.scores['hit_rate']; presumably this call is
# what stores those per-query scores on each run, so run it first — confirm.
compare(
    qrels,
    runs=runs,
    metrics=["hit_rate"],
)


#    Model        Hit Rate
---  ---------  ----------
a    Full-Text        0.7
b    Vector           0.75
c    Hybrid           0.9

In [25]:
# rich renders the per-question breakdowns below; `c` is the console shared by those cells.
from rich.table import Table
from rich.console import Console
c = Console()

## Which ones did we get wrong?

In [26]:
# Build a question x technique grid: a tick where the run's per-question
# hit_rate is 1.0, a cross otherwise.
table = Table(title="Comparing Retrieval Techniques")
table.add_column("Question")
for run in runs:
  table.add_column(run.name, justify="center")

cols = [col.header for col in table.columns][1:]

# Resolve each column's run once instead of rescanning `runs` for every cell.
# Iterating in reverse means the first run with a given name wins, matching
# the "first match" pick of the per-cell scan.
runs_by_name = {r.name: r for r in reversed(runs)}

for question in qrels.to_dict():
  row = [question]
  for col in cols:
    score = runs_by_name[col].scores['hit_rate'][question]
    row.append("✅" if score == 1.0 else "❌")
  table.add_row(*row)

In [27]:
# Show the table through the console's pager; styles=True asks the pager to keep rich styling.
with c.pager(styles=True):
  c.print(table)

[3m                            Comparing Retrieval Techniques                            [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
┃[1m [0m[1mQuestion                                            [0m[1m [0m┃[1m [0m[1mFull-Text[0m[1m [0m┃[1m [0m[1mVector[0m[1m [0m┃[1m [0m[1mHybrid[0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
│ How many competitors are there?                      │    ❌     │   ✅   │   ✅   │
│ How many medals will be won on Saturday?             │    ✅     │   ✅   │   ✅   │
│ How many times has Paris hosted the Olympic games?   │    ✅     │   ✅   │   ✅   │
│ What colors were the fireworks during the ceremony?  │    ✅     │   ✅   │   ✅   │
│ What started the day of the opening ceremony?        │    ✅     │   ❌   │   ❌   │
│ What things went wrong?                              │    ❌     │   ❌   │   ❌   │
│ What was Serena's role?                   

## What wrong answers did we return?

In [28]:
def get_text(ids):
  """Return the ``text`` of every ``olympics`` row whose ``index`` is in ``ids``.

  Args:
    ids: List of document ids; it is bound as the ``$ids`` parameter and
      cast to ``BIGINT[]`` inside the query.

  Returns:
    A list with the first column (``text``) of each matching row. The query
    has no ORDER BY, so the order is whatever the database returns.
  """
  # Runs on the notebook-level `con` connection.
  query = """SELECT text
      FROM olympics 
      WHERE list_contains($ids::BIGINT[], index)
      """
  rows = con.sql(query, params={"ids": ids}).fetchall()

  texts = []
  for row in rows:
    texts.append(row[0])
  return texts

In [29]:
# For every question the Hybrid run missed, print the expected passage(s)
# next to what the run actually retrieved.
run = [r for r in runs if r.name == "Hybrid"][0]

# Both conversions are loop-invariant, so do them once instead of per question.
qrels_by_question = qrels.to_dict()
results_by_question = run.to_dict()

with c.pager(styles=True):
  for question, score in run.scores['hit_rate'].items():
    if score != 0.0:
      continue  # only report the misses

    c.print(question, style="bold")

    correct_ids = list(qrels_by_question[question])
    c.print("Correct Answer", style="Green italic")
    c.print("\n".join(get_text(correct_ids)), style="Green")

    answer_ids = list(results_by_question[question])
    c.print("Run Answer", style="Yellow italic")
    # No retrieved text joins to "", so fall back to a visible "None".
    c.print("\n".join(get_text(answer_ids)) or "None", style="Yellow")

    c.print()

[1mWhat started the day of the opening ceremony?[0m
[3;32mCorrect Answer[0m
[32mThe day had started with major disruption when the French train network was hit by arson attacks and heavy rain in [0m
[32mthe evening put paid to the original plan by artistic director Thomas Jolly to use the Parisian sun to [0m[32m"make the [0m
[32mwater sparkle"[0m[32m. [0m
[3;33mRun Answer[0m
[33mThe [0m[1;36m2024[0m[33m Olympics opened in Paris in spectacular style with thousands of athletes sailing along the River Seine [0m
[33mpast lively performers on bridges, banks and rooftops in an ambitious take on an opening ceremony.   [0m
[33mGiven the miserable weather after what had been a sunny week in Paris until now, it seemed fitting that the [0m
[33mstoryline at the start of the ceremony was about the arrival of the Olympic flame in Paris not going according to [0m
[33mplan.[0m
[33mThe peace anthem, part of all Olympic opening ceremonies, is aligned with the message of un