In [None]:
import nest_asyncio

nest_asyncio.apply()

import os
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

# Do not clobber a key that is already exported in the environment: the empty
# placeholder is only installed when OPENAI_API_KEY is not set at all.
# Never commit a real key in this cell.
os.environ.setdefault("OPENAI_API_KEY", "")

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings

# LLM handle that is passed explicitly to the question generator further down.
llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)

# Register the HuggingFace embedding model on the global llama_index Settings.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Load everything under ./data, then split it into chunked nodes
# (chunk_size=512, chunk_overlap=20).
documents = SimpleDirectoryReader("data").load_data()
splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(documents)

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
from llama_index.core.retrievers import QueryFusionRetriever

# Sparse keyword retriever over the chunked nodes (English stemming), top 2 hits.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

# Dense retriever: embeds the same nodes into an in-memory vector index, top 2 hits.
dense_retriever = VectorStoreIndex(nodes).as_retriever(similarity_top_k=2)

# Hybrid retriever that fuses the dense and BM25 result lists.
# - mode="reciprocal_rerank": BM25 scores and embedding similarities are on
#   different scales, so the lists are fused by rank instead of by raw score
#   (the default "simple" mode orders by raw score).
# - similarity_top_k=2: keep the fused list the same size as the two
#   single retrievers so the evaluation compares like with like.
# - llm=llm: use the notebook's LLM rather than the implicit global Settings
#   default; with num_queries=1 no extra queries are generated.
hybrid_retriever = QueryFusionRetriever(
    [
        dense_retriever,
        bm25_retriever,
    ],
    llm=llm,
    mode="reciprocal_rerank",
    similarity_top_k=2,
    num_queries=1,
    use_async=True,
)

In [None]:
from llama_index.core.evaluation import (
    generate_question_context_pairs
)

# Ask the LLM to generate evaluation questions from the chunked nodes, requesting
# one question per chunk (num_questions_per_chunk=1).
# NOTE(review): presumably each question is paired with the node it was generated
# from, which is what the retriever evaluation scores against — confirm in the
# llama_index docs. This call hits the LLM once per node, so it can be slow/costly.
qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=1
)

# View over the values of the dataset's `queries` mapping (the generated questions).
queries = qa_dataset.queries.values()

In [None]:
from llama_index.core.evaluation import RetrieverEvaluator

metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

bm25_retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=bm25_retriever
)
bm25_eval_results = await bm25_retriever_evaluator.aevaluate_dataset(qa_dataset)

dense_retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=dense_retriever
)
dense_eval_results = await dense_retriever_evaluator.aevaluate_dataset(qa_dataset)

hybrid_retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=hybrid_retriever
)
hybrid_eval_results = await hybrid_retriever_evaluator.aevaluate_dataset(qa_dataset)

In [None]:
def display_all_results(results_dict, metrics):
    """Summarise evaluation results as one row of mean metric values per retriever.

    Args:
        results_dict: Mapping of retriever name -> iterable of evaluation
            results. Each result must expose a ``metric_vals_dict`` attribute
            mapping metric name -> numeric value.
        metrics: Sequence of metric names to average; they become the columns
            that follow the "retriever" column, in the given order.

    Returns:
        pandas.DataFrame with a "retriever" column plus one column per metric
        holding that metric's mean over the retriever's results. An empty
        ``results_dict`` yields an empty frame with the same columns, and a
        retriever with no results gets NaN for every metric.

    Raises:
        KeyError: if a requested metric is missing from a retriever's results.
    """
    summary_rows = []

    for name, eval_results in results_dict.items():
        metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]

        if metric_dicts:
            full_df = pd.DataFrame(metric_dicts)
            avg_metrics = {metric: full_df[metric].mean() for metric in metrics}
        else:
            # Nothing was evaluated for this retriever: report NaN rather than
            # failing on a frame that has no metric columns.
            avg_metrics = {metric: float("nan") for metric in metrics}

        summary_rows.append({"retriever": name, **avg_metrics})

    if not summary_rows:
        # Keep the expected column layout even when there is nothing to show.
        return pd.DataFrame(columns=["retriever", *metrics])

    # Build the summary once, after every retriever has been processed.
    return pd.DataFrame(summary_rows)

In [None]:
# Label each retriever's evaluation results; the labels become the values of
# the "retriever" column in the summary table.
retriever_results = dict(
    BM25=bm25_eval_results,
    Dense=dense_eval_results,
    Hybrid=hybrid_eval_results,
)

# Last expression of the cell, so the summary table is rendered as its output.
display_all_results(retriever_results, metrics)