### Evaluate Retrieval
* Modified to rerank with a model served through **OCI Data Science Model Deployment**
* Inspired by: https://docs.llamaindex.ai/en/stable/examples/evaluation/retrieval/retriever_eval.html
* See also: https://blog.llamaindex.ai/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83

In [2]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.node_parser import SimpleNodeParser

# LLM
from llama_index.llms import MistralAI


# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)

from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.schema import NodeWithScore

# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.evaluation import RetrieverEvaluator

import ads

from oci_utils import load_oci_config

# rerankers
from llama_index.postprocessor.cohere_rerank import CohereRerank
from oci_baai_reranker import OCIBAAIReranker
from oci_llama_reranker import OCILLamaReranker
# Embeddings
from ads.llm import GenerativeAIEmbeddings

from typing import List
import pandas as pd

In [3]:
from config_private import COHERE_API_KEY, MISTRAL_API_KEY, COMPARTMENT_OCID, ENDPOINT
from config import EMBED_MODEL, RERANKER_ID

In [4]:
# Fetch arXiv paper 2307.09288 and save it locally as llama2.pdf (the file loaded in the next cell).
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "llama2.pdf"

--2024-01-01 18:01:48--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.3.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘llama2.pdf’


2024-01-01 18:01:54 (2,25 MB/s) - ‘llama2.pdf’ saved [13661300/13661300]



In [5]:
# Load the downloaded PDF into llama_index documents.
documents = SimpleDirectoryReader(input_files=['llama2.pdf']).load_data()

In [25]:
# Number of documents to index, counted after skipping the first two (see the slice below).
# Original author's note: with this PDF the value should be at most 35.
N_PAGES = 20

# Split the selected documents into nodes with chunk_size=512.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
# presumably each element of `documents` is one PDF page, so this skips the first 2 pages — TODO confirm
nodes = node_parser.get_nodes_from_documents(documents[2:N_PAGES+2])

In [26]:
# Prompt template used to generate the evaluation questions.
# Placeholders: {context_str} and {num_questions_per_chunk}.
# It is passed as `qa_generate_prompt_tmpl` to generate_question_context_pairs below.
qa_generate_prompt_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.\
"""

In [27]:
# function to clean the dataset
def filter_qa_dataset(qa_dataset, unwanted_phrases=("Here are 2", "Here are two")):
    """
    Filters out queries from the qa_dataset that contain certain phrases and the corresponding
    entries in the relevant_docs, and creates a new EmbeddingQAFinetuneDataset object with
    the filtered data. The input dataset is not modified; the corpus is passed through as is.

    :param qa_dataset: An object that has 'queries', 'corpus', and 'relevant_docs' attributes.
    :param unwanted_phrases: Substrings (case-sensitive) that mark a query for removal.
        A single string is accepted and treated as one phrase. The default matches the
        phrases the original notebook filtered on (tied to 2 questions per chunk).
    :return: An EmbeddingQAFinetuneDataset object with the filtered queries, corpus and relevant_docs.
    """

    # A bare string would otherwise be iterated character by character.
    if isinstance(unwanted_phrases, str):
        unwanted_phrases = (unwanted_phrases,)

    # Skip empty phrases: `"" in text` is always True and would drop every query.
    phrases = tuple(phrase for phrase in unwanted_phrases if phrase)

    # Keys of the queries (shared with relevant_docs) that need to be removed
    keys_to_remove = {
        key for key, text in qa_dataset.queries.items()
        if any(phrase in text for phrase in phrases)
    }

    # Filter queries and relevant_docs using dictionary comprehensions
    filtered_queries = {
        key: text for key, text in qa_dataset.queries.items()
        if key not in keys_to_remove
    }
    filtered_relevant_docs = {
        key: doc_ids for key, doc_ids in qa_dataset.relevant_docs.items()
        if key not in keys_to_remove
    }

    # Create a new instance of EmbeddingQAFinetuneDataset with the filtered data
    return EmbeddingQAFinetuneDataset(
        queries=filtered_queries,
        corpus=qa_dataset.corpus,
        relevant_docs=filtered_relevant_docs
    )

In [28]:
#
# This LLM is used to generate the QA dataset
#
llm = MistralAI(
            api_key=MISTRAL_API_KEY,
            model="mistral-small",
            temperature=0.2,
            max_tokens=1024,
        )

# Generate question/context pairs from the nodes (2 questions requested per chunk),
# using the custom prompt template defined above.
qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=2, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl
)

100%|██████████████████████████████████████████████████████████████| 58/58 [01:47<00:00,  1.85s/it]


In [29]:
# Filter out generated entries that contain phrases such as
# `Here are 2 questions based on provided context` instead of an actual question.
qa_dataset = filter_qa_dataset(qa_dataset)

In [30]:
# Persist the filtered dataset to qa_dataset.json.
qa_dataset.save_json("qa_dataset.json")

In [31]:
# Reload the dataset from the JSON file saved above.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("qa_dataset.json")

In [32]:
# Number of queries in the (filtered) dataset.
queries = qa_dataset.queries.values()

len(queries)

114

In [33]:
# Show one sample query (the third one).
print(list(queries)[2])

Explain the role of Reinforcement Learning with Human Feedback (RLHF) in aligning Large Language Models with human preferences.


In [34]:
# Set up the reranker client that calls an OCI Data Science Model Deployment.
# ID is the OCID of the Model Deployment (read from config.RERANKER_ID).
ID = RERANKER_ID

oci_config = load_oci_config()

# Wrap the OCI config with ads.auth.api_keys; the result is what gets passed as `auth`
# both here and to GenerativeAIEmbeddings later in the notebook.
api_keys_config = ads.auth.api_keys(oci_config)

# NOTE(review): the region is hard-coded — presumably it must match the deployment's region; confirm.
baai_reranker = OCIBAAIReranker(
    auth=api_keys_config, deployment_id=ID, region="eu-frankfurt-1"
)

2024-01-01 18:19:43,190 - INFO - Created OCI reranker client...
2024-01-01 18:19:43,190 - INFO - Region: eu-frankfurt-1...
2024-01-01 18:19:43,190 - INFO - Deployment id: ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaangencdyaokm6zawt3akgu3lr7u4hm4o4zrr64emfr3vi2qmzw2xa...


In [35]:
# TOP_N: number of nodes kept after reranking (passed as `top_n` to the rerankers, and used
#        by CustomRetriever as the slice size when no reranker is configured).
# TOP_K: number of candidates fetched by the vector retriever (passed as `similarity_top_k`).
TOP_N = 4
TOP_K = 6

# Embedding models to evaluate, keyed by the name shown in the results table.
EMBEDDINGS = {
    "OCICohereV3": GenerativeAIEmbeddings(
        compartment_id=COMPARTMENT_OCID,
        model=EMBED_MODEL,
        auth=api_keys_config,
        # Optionally you can specify keyword arguments for the OCI client
        # e.g. service_endpoint.
        client_kwargs={"service_endpoint": ENDPOINT},
    )
}

# Rerankers to evaluate, keyed by the name shown in the results table.
# The string "None" (not the None object) is the sentinel CustomRetriever checks for "no reranking".
RERANKERS = {
    "NOReranker": "None",
    "CohereRerank": CohereRerank(api_key=COHERE_API_KEY, top_n=TOP_N),
    "OCIBAAReranker": OCILLamaReranker(oci_reranker=baai_reranker, top_n=TOP_N)
}

In [36]:
# taken from: https://docs.llamaindex.ai/en/stable/examples/evaluation/retrieval/retriever_eval.html

def display_results(embedding_name, reranker_name, eval_results):
    """Summarise evaluation results as a one-row DataFrame.

    Each element of ``eval_results`` must expose ``metric_vals_dict`` with
    "hit_rate" and "mrr" entries. The returned frame has the columns
    Embedding, Reranker, hit_rate and mrr, where the two metrics are the
    means over all results. Nothing is printed; the caller gets the frame.
    """
    # One row per evaluated query, one column per metric.
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    summary = {
        "Embedding": [embedding_name],
        "Reranker": [reranker_name],
        "hit_rate": [per_query["hit_rate"].mean()],
        "mrr": [per_query["mrr"].mean()],
    }
    return pd.DataFrame(summary)

In [37]:
class CustomRetriever(BaseRetriever):
    """Custom retriever that adds an optional reranker on top of a vector retriever.

    Candidates come from ``vector_retriever``. If a reranker is configured they are
    passed to its ``postprocess_nodes``; otherwise the first ``TOP_N`` candidates
    (module-level constant) are returned unchanged.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        reranker = None
    ) -> None:
        """Init params.

        :param vector_retriever: retriever that produces the candidate nodes.
        :param reranker: object exposing ``postprocess_nodes(nodes, query_bundle)``.
            ``None`` or the string ``"None"`` (the sentinel used in this notebook)
            disables reranking.
        """
        # NOTE(review): BaseRetriever.__init__ is not called, as in the original; confirm
        # whether the installed llama_index version needs it.
        self._vector_retriever = vector_retriever
        self.reranker = reranker

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

        # Decide from this instance's own reranker. The original compared a notebook-level
        # global named `reranker`, which only worked inside the evaluation loop, and it did
        # not treat the constructor default (None) as "no reranker".
        no_reranker = self.reranker is None or (
            isinstance(self.reranker, str) and self.reranker == 'None'
        )

        if no_reranker:
            # Without a reranker keep the same number of nodes a reranker would return.
            return retrieved_nodes[:TOP_N]

        return self.reranker.postprocess_nodes(retrieved_nodes, query_bundle)

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Asynchronously retrieve nodes given query.

        Delegates to the synchronous ``_retrieve``, so the call is not actually
        concurrent.
        """
        return self._retrieve(query_bundle)

    async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
        """Async entry point: accept a plain string or a QueryBundle."""
        if isinstance(str_or_query_bundle, str):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)
        return await self._aretrieve(str_or_query_bundle)

In [38]:
results_df = pd.DataFrame()

# Loop over embeddings
for embed_name, embed_model in EMBEDDINGS.items():

    service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
    
    # base vector index to which we will add reranking
    vector_index = VectorStoreIndex(nodes, service_context=service_context)
    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=TOP_K, 
                                            service_context=service_context)

    # Loop over rerankers
    for rerank_name, reranker in RERANKERS.items():
        print(f"Running Evaluation for Embedding Model: {embed_name} and Reranker: {rerank_name}")

        # Define Retriever
        custom_retriever = CustomRetriever(vector_retriever, reranker)

        metrics = ["mrr", "hit_rate"]
        
        retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, 
                                                                   retriever=custom_retriever
        )
        # here we do the evaluation on the dataset
        eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

        current_df = display_results(embed_name, rerank_name, eval_results)
        # concat to overall
        results_df = pd.concat([results_df, current_df], ignore_index=True)

LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: OCICohereV3 and Reranker: NOReranker
Running Evaluation for Embedding Model: OCICohereV3 and Reranker: CohereRerank
Running Evaluation for Embedding Model: OCICohereV3 and Reranker: OCIBAAReranker


In [39]:
# Summary table: one row per (embedding, reranker) combination.
results_df

Unnamed: 0,Embedding,Reranker,hit_rate,mrr
0,OCICohereV3,NOReranker,0.842105,0.68348
1,OCICohereV3,CohereRerank,0.894737,0.790205
2,OCICohereV3,OCIBAAReranker,0.894737,0.775585


In [40]:
# Save the summary table to evaluate_reranker.csv (the DataFrame index is written as the first column).
results_df.to_csv("evaluate_reranker.csv")