In [1]:
# %pip install llama-index-postprocessor-cohere-rerank


In [2]:
# Set COHERE API key
# import os
# os.environ['COHERE_API_KEY'] = '...'

In [26]:
import os
from dotenv import load_dotenv
load_dotenv()

from llama_index.core.query_pipeline import QueryPipeline
from llama_index.core import PromptTemplate
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.response_synthesizers import TreeSummarize

from llama_index.core import Settings
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# You need to deploy your own embedding model as well as your own chat completion model
llm = AzureOpenAI(
    deployment_name='gpt-35-turbo16k',
    model='gpt-35-turbo',
    api_key=os.environ['AZURE_OPENAI_API_KEY'],
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
)

# llm = AzureOpenAI(
#     deployment_name='gpt-4o',
#     model='gpt-4o',
#     api_key=os.environ['AZURE_OPENAI_API_KEY'],
#     azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
#     api_version=os.environ['AZURE_OPENAI_API_VERSION'],
# )


embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    api_key=os.environ['AZURE_OPENAI_API_KEY'],
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
)

# global settings
Settings.llm = llm
Settings.embed_model = embed_model

In [4]:
from datasets import load_dataset
import pandas as pd
from llama_index.core import Document


ds = load_dataset("rag-datasets/rag-mini-bioasq", "text-corpus")
ds = ds['passages'].to_pandas().set_index('id', drop=True)
query_set = load_dataset("rag-datasets/rag-mini-bioasq", "question-answer-passages")
queries = query_set['test'].take(5)

# create a subset of the documents for faster testing

passages_required = set()
[ passages_required.update([int(id) for id in ids[1:-1].split(", ")]) for ids in query_set['test'].take(15)['relevant_passage_ids'] ];


docs = [Document(text=ds.loc[id].passage, metadata = {'id' : id}) for id in passages_required]
for x in docs:
    x.doc_id = str(x.metadata['id'])
    x.excluded_llm_metadata_keys = ['id']

In [None]:
from llama_index.core import VectorStoreIndex
 
index = VectorStoreIndex.from_documents(docs)

def prepare_pipeline(new_index, k, only_retriever=False):
    
    k = 2*k # k=10 is a good choice as seen in querying notebook, we take double that, rerank them and take top k

    # converting vector store to retriever for RAG pipeline, response synthesis will be handled by the summarizer node
    retriever = new_index.as_retriever(similarity_top_k=k)  

    prompt_str = '''
    Expand or rephrase the given query for querying a RAG index, if the query is already quite detailed return the input verbatim.  
    QUERY : {query}
    '''

    prompt_tmpl = PromptTemplate(prompt_str)

    reranker = CohereRerank(top_n=k/2)
    summarizer = TreeSummarize()
    p = QueryPipeline(verbose=True)

    # Adding the modules to the pipeline
    p.add_modules(
        {
            "llm": llm,
            "prompt_tmpl": prompt_tmpl,
            "retriever": retriever,
            "reranker": reranker,
        }
    )

    if not only_retriever:
        p.add_modules({"summarizer": summarizer})

    #Define the links between modules
    p.add_link("prompt_tmpl", "llm")
    p.add_link("llm", "retriever")
    p.add_link("retriever", "reranker", dest_key="nodes")
    p.add_link("llm", "reranker", dest_key="query_str")
    if not only_retriever:
        p.add_link("reranker", "summarizer", dest_key="nodes")
        p.add_link("llm", "summarizer", dest_key="query_str")

        # look at summarizer input keys
        print(summarizer.as_query_component().input_keys)


    return p

p = prepare_pipeline(new_index = index, k=10)

required_keys={'query_str', 'nodes'} optional_keys=set()


In [13]:
response, intermediates = p.run_with_intermediates(query=queries[0]['question'])
print(response, response.get_formatted_sources(length = 500))

[1;3;38;2;155;135;227m> Running module prompt_tmpl with input: 
query: Is Hirschsprung disease a mendelian or a multifactorial disorder?

[0m[1;3;38;2;155;135;227m> Running module llm with input: 
messages: 
    Expand or rephrase the given query for querying a RAG index, if the query is already quite detailed return the input verbatim.  
    QUERY : Is Hirschsprung disease a mendelian or a multifactoria...

[0m[1;3;38;2;155;135;227m> Running module retriever with input: 
input: assistant: Can you provide information on whether Hirschsprung disease is classified as a mendelian disorder or a multifactorial disorder?

[0m[1;3;38;2;155;135;227m> Running module reranker with input: 
query_str: assistant: Can you provide information on whether Hirschsprung disease is classified as a mendelian disorder or a multifactorial disorder?
nodes: [NodeWithScore(node=TextNode(id_='972a2f43-ddca-4340-ad71-1b5c00ed6e37', embedding=None, metadata={'id': 23001136}, excluded_embed_metadata_keys=[],

In [14]:
intermediates['summarizer'].inputs

{'query_str': ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Can you provide information on whether Hirschsprung disease is classified as a mendelian disorder or a multifactorial disorder?', additional_kwargs={}), raw=ChatCompletion(id='chatcmpl-Aa6er0YvC5meXArUQOYu1j5GAZAYS', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Can you provide information on whether Hirschsprung disease is classified as a mendelian disorder or a multifactorial disorder?', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), content_filter_results={'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}})], created=1733168765, model='gpt-35-turbo-16k', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage

In [44]:
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.evaluation.retrieval.metrics import resolve_metrics, HitRate, MRR
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter

metric_dict = {}
metrics = ["precision", "recall", "ap", "ndcg"]
metrics = [x() for x in resolve_metrics(metrics)] + [HitRate(use_granular_hit_rate=True), MRR(use_granular_mrr=True)]

results_data = []
splitters = {'sentance_512_0': SentenceSplitter(chunk_size=512, chunk_overlap=0),
             'token_512': TokenTextSplitter(chunk_size=512, chunk_overlap=100)}

for splitter_name in splitters.keys():
    splitter = splitters[splitter_name]
    index = VectorStoreIndex.from_documents(docs, transformations = [splitter])
    
    for k in [5, 10]:
        p = prepare_pipeline(new_index = index, k=k, only_retriever=True) # we only need to eval QR+RR retriever in this loop
        p.verbose = False
        for row in queries:
            row['relevant_passage_ids'] = row['relevant_passage_ids'][1:-1].split(', ')
            query = row['question']
            retrieved_nodes = p.run(query) # without llm it will return the nodes from reranker 
            retrieved_passage_ids = [str(node.metadata['id']) for node in retrieved_nodes]
            
            for metric in metrics:
                eval_result = metric.compute(
                    query, row['relevant_passage_ids'], retrieved_passage_ids,
                )
                metric_dict[metric.metric_name] = eval_result.score
            
            results_data.append({
                'splitter': splitter_name,
                'k': k,
                'query': query,
                'retrieved_ids': retrieved_passage_ids,
                'relevant_ids': row['relevant_passage_ids'],
                **metric_dict
            })

results_df = pd.DataFrame(results_data)
results_df.drop(['query', 'retrieved_ids', 'relevant_ids'], axis=1).groupby(['k','splitter']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,ap,ndcg,hit_rate,mrr
k,splitter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,sentance_512_0,0.92,0.484722,0.484722,0.947518,0.484722,0.482333
5,token_512,0.92,0.464722,0.484722,0.947518,0.484722,0.482333
10,sentance_512_0,0.593333,0.567222,0.587222,0.71544,0.587222,0.432328
10,token_512,0.622222,0.567222,0.607222,0.73112,0.607222,0.422661


In [45]:
from ragas.dataset_schema import SingleTurnSample
                            
from ragas.metrics import (LLMContextRecall,LLMContextPrecisionWithReference, Faithfulness, 
                            SemanticSimilarity, NonLLMContextRecall, answer_correctness, FactualCorrectness)
from ragas import evaluate, EvaluationDataset
from langchain_openai import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# evaluator_llm = LangchainLLMWrapper(AzureChatOpenAI(
#                 openai_api_version=os.environ['AZURE_OPENAI_API_VERSION'],
#                 azure_deployment='gpt-4o',
#                 model='gpt-4o',
#             ))

evaluator_llm = LangchainLLMWrapper(AzureChatOpenAI(
                openai_api_version=os.environ['AZURE_OPENAI_API_VERSION'],
                azure_deployment='gpt-35-turbo16k',
                model='gpt-35-turbo',
            ))

evaluator_embeddings = LangchainEmbeddingsWrapper( AzureOpenAIEmbeddings(
                openai_api_version=os.environ['AZURE_OPENAI_API_VERSION'],
                azure_deployment='text-embedding-ada-002',
                model='text-embedding-ada-002',
))


metrics = [
    LLMContextRecall(), # Recall based on claims made in response vs those in reference, uses LLM
    LLMContextPrecisionWithReference(), # Precision based on claims made in response vs those in reference, uses LLM
    FactualCorrectness(), # F1-Score of claims made in response vs those in reference
    SemanticSimilarity(), # embedding based similarity between generated answer and ground truth
    answer_correctness,
    Faithfulness()
]

results_data = []
splitters = {'sentance_512_0': SentenceSplitter(chunk_size=512, chunk_overlap=0),
             'token_512': TokenTextSplitter(chunk_size=512, chunk_overlap=100)}

for splitter_name in splitters.keys():
    splitter = splitters[splitter_name]
    index = VectorStoreIndex.from_documents(docs, transformations = [splitter])
    
    for k in [5, 10]:
        p = prepare_pipeline(new_index = index, k=k) # we only need to eval QR+RR retriever in this loop
        p.verbose = False
        samples = []
        for row in queries:
            query = row['question']
            response = p.run(query)
            retrieved_nodes = response.source_nodes
            retrieved_passage_ids = [node.metadata['id'] for node in retrieved_nodes]
            retrieved_passages = [ ds.loc[int(id)].passage for id in retrieved_passage_ids ]
            relevant_passages = [ ds.loc[int(id)].passage for id in row["relevant_passage_ids"][1:-1].split(', ') ]
            
            sample = SingleTurnSample(
                user_input=query,
                reference=row["answer"],
                response=response.response,
                retrieved_contexts=retrieved_passages,
                reference_contexts=relevant_passages,
            )
            samples.append(sample)

        eval_dataset = EvaluationDataset(samples = samples)
        results = evaluate(dataset=eval_dataset, metrics=metrics, llm = evaluator_llm, embeddings = evaluator_embeddings)
        df = results.to_pandas()
        df['k'] = k
        df['splitter'] = splitter_name
        results_data.append(df)

results_df = pd.concat(results_data).reset_index(drop=True)
results_df.drop(['user_input','retrieved_contexts','reference_contexts','response','reference'], axis=1).groupby(['k','splitter']).mean()

  evaluator_llm = LangchainLLMWrapper(AzureChatOpenAI(


required_keys={'query_str', 'nodes'} optional_keys=set()


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

required_keys={'query_str', 'nodes'} optional_keys=set()


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

required_keys={'query_str', 'nodes'} optional_keys=set()


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

required_keys={'query_str', 'nodes'} optional_keys=set()


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,context_recall,llm_context_precision_with_reference,factual_correctness,semantic_similarity,answer_correctness,faithfulness
k,splitter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,sentance_512_0,1.0,0.98,0.634,0.919111,0.494778,0.571429
5,token_512,1.0,0.98,0.706,0.917222,0.510972,0.571429
10,sentance_512_0,0.9,0.918255,0.656,0.91628,0.511913,0.483333
10,token_512,1.0,0.913872,0.694,0.919359,0.476268,0.571429
