# Evaluation

In [26]:
from script.evaluate import create_qna_dataset, qna_critique, eval_ret, rag, eval_gen, filter_crit

In [8]:
from utils.models import LLM, JudgeLLM
from utils.embeddings import Embedder
from utils.vecdb import lancedb_table, lancedb_setup


from script.parse import parse_dir
from script.ingest import ingest

import os
import json
import pandas as pd
from rerankers import Reranker

Choose between Together and Bedrock as the provider for QnA generation and RAG.
For evaluation, prometheus-eval is used as the judge LLM.

In [9]:
# Models used throughout the notebook:
#   rag_llm   - passed to rag() to generate answers in the RAG section
#   qna_llm   - passed to create_qna_dataset() to generate the synthetic QnA pairs
#   judge_llm - passed to qna_critique() and eval_gen() as the LLM judge
# The commented-out lines are the Together-provider alternatives.
# rag_llm = LLM("llama31-8", "together")
rag_llm = LLM("llama31-8", "bedrock")
qna_llm = LLM("mixtral", "bedrock") # Using mixtral, we can't use system prompt
# qna_llm = LLM("mixtral", "together") # Using mixtral, we can't use system prompt
judge_llm = JudgeLLM()



Here, we generate QnA pairs for only a few files, not all of them.

In [None]:
# Parse only the files in ./data_qna/; these are the source for QnA generation.
dir_path = os.path.abspath('./data_qna/')
# NOTE(review): 1000 and 100 are presumably chunk size and chunk overlap —
# confirm against script.parse.parse_dir.
docs = parse_dir(dir_path, 1000, 100)

Each entry/doc will look something like this 

<img src="./img/00.png" alt="dataset" style="width: 500px;"/>

First, we create a synthetic question-and-answer (QnA) dataset.

In [None]:
# Generate QnA pairs for the whole parsed dataset.
# Lower N_GENERATIONS to generate for only a subset.
N_GENERATIONS = 395

print(f"Generating {N_GENERATIONS} QA couples...")

# `outputs` holds the generated QnA entries and is reused by the following cells.
outputs =  create_qna_dataset(docs, N_GENERATIONS, qna_llm)

One element (e.g. `outputs[0]`) looks as follows:

<img src="./img/01.png" alt="qna-without-eval" style="width: 500px;"/>

In [22]:
# Checkpoint: persist the generated QnA pairs so a later session can
# reload them instead of regenerating.
answer_path = './results/01_qna.json'
with open(answer_path, mode="w") as qna_file:
    json.dump(outputs, fp=qna_file)

In [11]:
# Checkpoint restore: read the previously saved QnA pairs back into `outputs`.
answer_path = './results/01_qna.json'
with open(answer_path, mode="r") as qna_file:
    outputs = json.loads(qna_file.read())

Now we evaluate the quality of this QnA dataset with the judge LLM.

In [5]:
# Have the judge LLM critique every QnA pair.
print("Generating critique for each QA couple...")
qna_critique(outputs,judge_llm) # appends to the existing dataset

Each entry in `outputs` now has an additional pair of fields: the feedback and the score.

<img src="./img/02.png" alt="qna-with-eval" style="width: 500px;"/>

In [17]:
# Checkpoint: persist the QnA pairs together with the judge's critique.
answer_path = './results/02_qna_with_critique.json'
with open(answer_path, mode="w") as critique_file:
    json.dump(outputs, fp=critique_file)

In [12]:
# Checkpoint restore: read the critiqued QnA pairs back into `outputs`.
answer_path = './results/02_qna_with_critique.json'
with open(answer_path, mode="r") as critique_file:
    outputs = json.loads(critique_file.read())

Now we can filter out low-quality questions based on feedback from our llm-as-a-judge



In [13]:
# Tabulate the critiqued QnA entries so they can be filtered as a DataFrame.
qna_with_critique = pd.DataFrame(outputs)

In [14]:
# Drop low-quality questions based on the judge's feedback.
# NOTE(review): the exact filtering rule lives in script.evaluate.filter_crit — confirm there.
qna_with_critique_filtered = filter_crit(qna_with_critique)

In [15]:
# Checkpoint: save the filtered QnA set as CSV (without the index column)
# so it can be reloaded in a later session.
qna_with_critique_filtered.to_csv('./results/03_qna_with_critique_filtered.csv', index=False)

In [16]:
# Checkpoint restore: reload the filtered QnA set saved by the previous cell.
qna_with_critique_filtered = pd.read_csv('./results/03_qna_with_critique_filtered.csv')

## Retrieval Evaluation

We evaluate our retrieval using metrics such as hit rate and MRR (mean reciprocal rank). This step does not generate answers.

Note, this time we will embed our entire dataset.

Our entire dataset contains an additional 100 files (LLM papers) from [Kaggle](https://www.kaggle.com/datasets/ruchi798/100-llm-papers-to-explore). We place all files in the `data` directory.

In [None]:
# Parse the full dataset in ./data/. This rebinds `dir_path` and `docs`,
# which earlier pointed at the smaller ./data_qna/ subset.
dir_path = os.path.abspath('./data/')
# NOTE(review): 1000 and 100 are presumably chunk size and chunk overlap —
# confirm against script.parse.parse_dir.
docs = parse_dir(dir_path, 1000, 100)


In [None]:
# Embedding model used for ingestion, retrieval evaluation and RAG.
model_name = 'BAAI/bge-small-en-v1.5'
emb_model = Embedder(model=model_name, provider='fastembed')

We then set up the database table and add the embedded dataset to it (i.e. perform ingestion).

In [None]:
# NOTE(review): 384 is presumably the vector dimension of the embedding model
# chosen above — confirm against utils.vecdb.lancedb_setup.
table = lancedb_setup(384)

In [21]:
# NOTE(review): this rebinds `table`; presumably it opens an already-created
# table as an alternative to running lancedb_setup again — confirm in utils.vecdb.
table = lancedb_table()

In [None]:
# Row count before ingestion.
table.count_rows()

In [None]:
# Add the parsed documents to the table using the embedding model.
ingest(table, docs, emb_model)

In [None]:
# Row count after ingestion; compare with the count taken before.
table.count_rows()

We create a full-text search (FTS) index on the `text` column so that we can perform text-based search.

In [None]:
# Build the full-text search index on the "text" column; needed for method="text".
table.create_fts_index("text")

We initialize our reranker models

In [None]:
# Cross-encoder reranker backed by the mxbai-rerank-xsmall-v1 model.
reranker_cross = Reranker(
    "mixedbread-ai/mxbai-rerank-xsmall-v1", model_type="cross-encoder"
)

# ColBERT reranker, selected by the "colbert" shorthand.
reranker_colbert = Reranker("colbert")

# Name -> reranker lookup iterated by the evaluation loop;
# the "None" entry is the no-reranker baseline.
reranker = {"None": None, "crossencoder": reranker_cross, "colbert": reranker_colbert}

We evaluate for
- both text and vector search
- with no reranker, the cross-encoder reranker, and the ColBERT reranker

for a total of six configurations.

In [None]:
# Evaluate retrieval for every (search type, reranker) combination.
# NOTE: despite its name, `metrics_dict` is a list with one metrics record per
# configuration; the name is kept because the metrics-saving cell reads it.
metrics_dict = []
for search_type in ["vector", "text"]:
    for reranker_name, reranker_function in reranker.items():
        print(f"Evaluating RAG - Search Type:{search_type}, ReRanker: {reranker_name}")
        output, mrr, hit_rate = eval_ret(
            qna_with_critique_filtered,
            table,
            emb_model,
            reranker=reranker_function,
            reranker_name=reranker_name,
            method=search_type,
        )
        metrics_dict.append(
            {
                "search_type": search_type,
                "reranker": reranker_name,
                "mrr": mrr,
                "hit_rate": hit_rate,
            }
        )
        # Store the per-configuration retrieval output in ./results/, alongside
        # every other artifact this notebook writes, rather than in the
        # current working directory.
        answer_path = (
            f"./results/04_rag_{search_type}_{reranker_name}_mrr_{mrr}_hitrate_{hit_rate}.json"
        )
        with open(answer_path, "w") as f:
            json.dump(output, f)


In [None]:
# Persist the collected per-configuration retrieval metrics.
metrics_path = "./results/04_rag_metrics.json"
with open(metrics_path, mode="w") as metrics_file:
    json.dump(metrics_dict, fp=metrics_file)

This is the final result

For Hit-Rate

<img src="./img/hitrate.png" alt="hitrate" style="width: 500px;"/>


For MRR

<img src="./img/mrr.png" alt="mrr" style="width: 500px;"/>


We see that using text search with the cross-encoder reranker has the highest MRR.
We choose this configuration to proceed further for RAG. 

## RAG

We take each question from the QnA dataset, retrieve the relevant docs, and generate an answer.

As stated previously, we use text search with cross-encoder as the reranker. 

In [None]:
# Run RAG over the filtered QnA set using text search with the
# cross-encoder reranker. This rebinds `outputs`, which earlier held the QnA dataset.
outputs = rag(
    qna_with_critique_filtered,
    table,
    emb_model,
    rag_llm,
    reranker=reranker_cross,
    reranker_name="crossencoder",
    method="text",
)

In [None]:
# Save the RAG outputs as a CSV table (index column omitted).
rag_path = "./results/04_rag.csv"
rag_df = pd.DataFrame(outputs)
rag_df.to_csv(rag_path, index=False)

In [None]:
# Save the same RAG outputs as JSON as well.
rag_path_j = "./results/04_rag.json"
with open(rag_path_j, mode="w") as rag_file:
    json.dump(outputs, fp=rag_file)

## Evaluate

We use Prometheus as the llm-as-judge to evaluate our generated answer against our ground truth answers. We can then average all the scores. We obtain an average score of 4.32 on this technique.

In [None]:
# Judge the generated answers with the judge LLM and keep the average score.
# NOTE(review): eval_gen presumably also adds per-answer feedback/score to
# `outputs` in place (it is saved as "rag_with_eval" below) — confirm in script.evaluate.
avg_score = eval_gen(outputs, judge_llm)

In [None]:
# Report the average judge score computed in the previous cell.
print(f"Average Score is: {avg_score}")

In [32]:
# Persist the final RAG outputs after the evaluation step.
answer_path = './results/05_rag_with_eval.json'
with open(answer_path, mode="w") as eval_file:
    json.dump(outputs, fp=eval_file)

The final result would look something like this

<img src="./img/05.png" alt="RAG-with-eval" style="width: 500px;"/>
