<a href="https://colab.research.google.com/github/nguyenkien1402/llamaindex-practices/blob/main/evaluation-pipeline-rag/rag_evaluation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive
# Mount Google Drive so the notebook can read its inputs and persist artifacts.
MOUNTPOINT = '/content/gdrive'
# Project folder on Drive. Every path used by this notebook lives under
# 'llm-doc', so DATADIR points at that same folder.
DATADIR = os.path.join(MOUNTPOINT, 'MyDrive', 'llm-doc')
drive.mount(MOUNTPOINT)

Mounted at /content/gdrive


In [2]:
!pip install -r gdrive/MyDrive/llm-doc/requirements.txt

Collecting llama-index==0.8.28 (from -r gdrive/MyDrive/llm-doc/requirements.txt (line 1))
  Downloading llama_index-0.8.28-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.3/806.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.0.292 (from -r gdrive/MyDrive/llm-doc/requirements.txt (line 2))
  Downloading langchain-0.0.292-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.33.2 (from -r gdrive/MyDrive/llm-doc/requirements.txt (line 3))
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m124.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai==0.28.0 (from -r gdrive/MyDrive/llm-doc/requirements.txt (line 4))
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
from llama_index import download_loader
from llama_index import SimpleDirectoryReader

# Obtain the 'UnstructuredReader' loader through download_loader; the returned
# object is instantiated later as the extractor for ".pdf" files.
UnstructuredReader = download_loader('UnstructuredReader')

In [2]:
# Load everything under the data folder; files ending in ".pdf" are routed
# through an UnstructuredReader instance.
pdf_extractors = {".pdf": UnstructuredReader()}
dir_reader = SimpleDirectoryReader(
    'gdrive/MyDrive/llm-doc/data',
    file_extractor=pdf_extractors,
)
documents = dir_reader.load_data()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
import pandas as pd
from llama_index.evaluation import DatasetGenerator, RelevancyEvaluator, ResponseEvaluator, FaithfulnessEvaluator, QueryResponseEvaluator
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    LLMPredictor,
    Response,
    StorageContext,
    load_index_from_storage,
    SummaryIndex
)
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.prompts import Prompt
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.schema import IndexNode
from llama_index.agent import OpenAIAgent

# define recursive retriever
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer


import chromadb
from llama_index.vector_stores import ChromaVectorStore

import openai
from llama_index.llms import OpenAI

import time
import asyncio
import nest_asyncio
nest_asyncio.apply()

# openai.api_key = ""


# **Create Embedding Model**


In [4]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

import torch

model_name = "BAAI/bge-base-en"
# Run the embedding model on the GPU when the runtime has one, and fall back
# to the CPU otherwise instead of failing on a CPU-only runtime.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Embedding model shared by every index in this notebook. The query
# instruction is prepended to queries (not to documents) before embedding.
model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages: finance, investments, economics, real estate"
)

Downloading (…)f5f20/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)6fb9cf5f20/README.md:   0%|          | 0.00/88.8k [00:00<?, ?B/s]

Downloading (…)b9cf5f20/config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)f5f20/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)6fb9cf5f20/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)9cf5f20/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

# **Initialize Service Context**

In [6]:
# Persistent Chroma client and the "finance" collection (created on first use)
chroma_client = chromadb.PersistentClient(path='gdrive/MyDrive/llm-doc/vector_store/')
chroma_collection = chroma_client.get_or_create_collection("finance")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Chunking settings used when documents are split into nodes
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)

# Service context bundling the answering LLM, the embedding model and the parser
answer_llm = OpenAI(model="gpt-3.5-turbo-16k", max_tokens=512, temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=answer_llm,
    embed_model=model_norm,
    node_parser=node_parser,
)


# **Approach 1: Naive Approach with VectorIndex Only**

In [7]:
indexid = 'finance_index'
index_path = 'gdrive/MyDrive/llm-doc/index'
try:
  # Reuse a previously persisted index when one exists.
  print(f"Load {indexid} from local path")
  storage_context = StorageContext.from_defaults(vector_store=vector_store,
                                                  persist_dir=index_path)
  # Pass the service context on reload as well: without it the loaded index
  # falls back to the library defaults, so queries would be embedded with a
  # different model than the one used to build the stored vectors.
  index = load_index_from_storage(storage_context=storage_context,
                                  index_id=indexid,
                                  service_context=service_context)
except Exception as e:
  # Any failure to load (missing persist dir, unknown index id, ...) is
  # reported and answered by building the index from the documents instead.
  print(str(e))
  print("Creating new index")
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
  index = VectorStoreIndex.from_documents(documents, service_context=service_context, storage_context=storage_context)
  index.set_index_id(indexid)
  index.storage_context.persist(persist_dir=index_path)

Load finance_index from local path
Failed to load index with ID finance_index
Creating new index


In [8]:
## smoke-test the index with a single hand-written question
sanity_question = "How does the Nvidia's revenue in this report compare to the previous fiscal year, and what factors contributed to any changes?"
query_engine = index.as_query_engine()
reponse = query_engine.query(sanity_question)
reponse.response

"NVIDIA's revenue in this report has significantly increased compared to the previous fiscal year. In the second quarter of fiscal year 2024, revenue was up 101% and for the first half of fiscal year 2024, revenue was up 38%. The increase in revenue was primarily driven by higher Data Center revenue in the Compute & Networking segment, with Compute GPUs growing 208% year-on-year and 112% compared to the first half of fiscal year 2023. This growth was due to the demand for NVIDIA's HGX platform based on the Hopper and Ampere GPU architecture for large language models and generative AI. Additionally, strong growth in InfiniBand infrastructure to support the HGX platform contributed to the increase in revenue. In the Graphics segment, the increase in revenue in the second quarter of fiscal year 2024 was primarily due to growth in Gaming GPUs, specifically the demand for the GeForce RTX 40 Series GPUs based on the NVIDIA Ada Lovelace architecture following normalization of channel inventor

**Step 1: Question Generation**

In [None]:
# Prompt wrapped around every document sample: the sample text fills
# {context_str} and the generation instruction fills {query_str}.
question_template = Prompt(
    "A sample from the documents is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Using the documentation sample, carefully follow the instructions below:\n"
    "{query_str}"
)

# Instruction telling the LLM which kind of questions to produce.
question_instruction = (
    "You are a search pipeline evaluator. Using the papers provided, "
    "you must create a list of summary questions and question/answer questions. "
    "Limit the queries to the information supplied in the context.\n"
    "Question: "
)

data_generator = DatasetGenerator.from_documents(
    documents,
    text_question_template=question_template,
    question_gen_query=question_instruction,
    service_context=service_context,
)

In [None]:
# Ask the generator for 50 questions and report how many came back.
generated_questions  = data_generator.generate_questions_from_nodes(num=50)
print(f"Generated {len(generated_questions)} questions.")

# Persist the questions, one per line, so later runs can reuse them.
with open("gdrive/MyDrive/llm-doc/questions.txt", "w") as f:
  f.writelines(f"{question.strip()}\n" for question in generated_questions)

In [13]:
# Reload the saved questions (one per line). Blank lines are skipped so that
# an empty string can never be sampled as an evaluation question.
with open('gdrive/MyDrive/llm-doc/questions.txt', 'r') as f:
    generated_questions = [line.rstrip() for line in f if line.strip()]

In [14]:
import random

# Draw the evaluation subset with a dedicated, seeded generator so the same
# questions are selected on every run and both approaches are compared on an
# identical set. The size is capped because random.sample raises ValueError
# when asked for more items than the population holds.
EVAL_SAMPLE_SIZE = 20
EVAL_SEED = 42
eval_questions = random.Random(EVAL_SEED).sample(
    generated_questions,
    min(EVAL_SAMPLE_SIZE, len(generated_questions)),
)
eval_questions

['What are some trends indicating that short rates are close to their peak?',
 'How have infrastructure, real estate, and healthcare performed compared to financial assets during periods of high inflation over the past 20 years?',
 "What is Microsoft's approach to security?",
 'What is the current official cash rate in Australia?',
 "Provide an overview of any significant legal or regulatory issues mentioned in Nvidia's",
 'How have retail sales, CPI, and cash rates changed over time?',
 'What resources and support does Microsoft provide to higher education institutions for cybersecurity training?',
 "How has Australia's headline inflation rate changed from December 2022 to February 2023?",
 'Did Microsoft meet its goal to double the percentage of transaction volumes with Black- and African American-owned financial institutions by 2023?',
 'What is the forecasted population growth rate in Australia over the next five years?',
 'What areas of technology does Microsoft aim to lead in?',


**Step 2: Evaluation Testing**

In [15]:
# define jupyter display function
def display_eval_df(query: str, response: Response, eval_result) -> None:
  """Render one query/response/evaluation triple as a styled one-row table.

  Args:
    query: The question that was asked.
    response: The query-engine response; its string form is shown together
      with the first 500 characters of its first source node.
    eval_result: Evaluation result object; its ``feedback`` attribute is
      shown in the "Evaluation Result" column.

  Returns:
    None. The table is shown through the notebook's ``display``.
  """
  # A response can come back without any source node; show a placeholder
  # instead of failing with an IndexError on source_nodes[0].
  source_nodes = response.source_nodes
  if source_nodes:
    source_text = source_nodes[0].node.get_content()[:500] + "..."
  else:
    source_text = "(no source nodes)"

  eval_df = pd.DataFrame(
      {
          "Query": str(query),
          "Response": str(response),
          "Source": source_text,
          "Evaluation Result": eval_result.feedback
      },
      index=[0],
  )
  # Constrain the two long text columns so they wrap instead of stretching.
  eval_df = eval_df.style.set_properties(
      **{
          "inline-size": "600px",
          "overflow-wrap": "break-word",
      },
      subset=["Response", "Source"]
  )
  display(eval_df)

In [18]:
# GPT-4 backed service context, used by the evaluators below
judge_llm = OpenAI(temperature=0.1, max_tokens=512, model="gpt-4")
service_context_gpt4 = ServiceContext.from_defaults(
    llm=judge_llm,
    embed_model=model_norm,
    node_parser=node_parser,
)

# Spot-check a single question: answer it, evaluate the response, show the result
sample_question = eval_questions[3]
query_engine = index.as_query_engine()
evaluator = ResponseEvaluator(service_context=service_context_gpt4)
response_vector = query_engine.query(sample_question)
eval_result = evaluator.evaluate_response(response=response_vector)

display_eval_df(sample_question, response_vector, eval_result)

Unnamed: 0,Query,Response,Source,Evaluation Result
0,What is the current official cash rate in Australia?,The current official cash rate in Australia is 3.6%.,"Australian Real Estate Quarterly Review Q2 2023 Quarter Quay Tower, Sydney Australian Real Estate Quarterly Review | Q2 2023 Page 1 of 14 Inside Page 3 Investment climate Page 4 The case for real assets Page 5 Performance / transactions Page 6 Office Page 7 Office market wrap Page 8 Industrial Page 9 Industrial by region Page 10 Retail indicators Page 11 Retail performance Page 12 Healthcare Australian Real Estate Quarterly Review | Q2 2023 Page 2 of 14 Investment clim...",YES


**Step 3: Evaluating Response Faithfulness**

In [21]:
"""
If your OpenAI API license does not have limit to call the API, then you can use this code to fasten the process
"""
from llama_index.evaluation import BatchEvalRunner

## Method 1
# faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)
# relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)

# runner = BatchEvalRunner(
#     {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
#     workers=8,
# )

# eval_results = await runner.aevaluate_queries(
#     index.as_query_engine(), queries=eval_questions[0:5]
# )

## Method 2
### For async query
async def run_query(query_engine, q):
  """Run one query asynchronously, tolerating failures.

  Args:
    query_engine: Object exposing an async ``aquery(q)`` method.
    q: The question to ask.

  Returns:
    The engine's response, or a placeholder ``Response`` with an error
    message when the query raised an ordinary exception.
  """
  try:
    return await query_engine.aquery(q)
  except Exception as exc:
    # Best effort: one failed query must not abort the whole batch. Only
    # Exception is caught so task cancellation and KeyboardInterrupt still
    # propagate instead of being turned into a fake answer.
    print(f"Query failed for {q!r}: {exc}")
    return Response(response="Error, query failed.")

def async_evaluate_query_engine(evaluator, query_engine, questions, batch_size=5, sleep_seconds=1):
  """Answer the questions in concurrent batches and score every response.

  Args:
    evaluator: Object exposing ``evaluate_response(query=..., response=...)``
      whose result has a ``feedback`` attribute; a response counts as
      correct when that feedback contains "YES".
    query_engine: Engine handed to ``run_query`` for every question.
    questions: Sequence of question strings.
    batch_size: Number of queries issued concurrently per batch.
    sleep_seconds: Pause after each batch (helps avoid rate limits).

  Returns:
    Tuple ``(total_correct, all_results)`` where ``all_results`` holds one
    0/1 score per question, in question order.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")

  async def _run_batch(batch_qs):
    # asyncio.run needs a coroutine; gather() on its own returns a Future,
    # so the gather call is wrapped in this small coroutine.
    return await asyncio.gather(*(run_query(query_engine, q) for q in batch_qs))

  total_correct = 0
  all_results = []
  # Ceiling division, so a final partial batch is included in the total.
  num_batches = -(-len(questions) // batch_size)
  for batch_number, start in enumerate(range(0, len(questions), batch_size), start=1):
    batch_qs = questions[start:start + batch_size]

    responses = asyncio.run(_run_batch(batch_qs))
    print(f"finished batch {batch_number} out of {num_batches}")

    print(f"Use {type(evaluator).__name__}")
    # Every evaluator receives the question too: it is required to judge
    # relevancy, and passing it to other evaluators is harmless.
    for question, response in zip(batch_qs, responses):
      feedback = evaluator.evaluate_response(query=question, response=response).feedback
      eval_result = 1 if "YES" in (feedback or "") else 0
      total_correct += eval_result
      all_results.append(eval_result)
    # helps avoid rate limits
    time.sleep(sleep_seconds)

  return total_correct, all_results


In [25]:
## for normal function query 1 by 1
def evaluate_query_engine(evaluator, query_engine, questions, sleep_seconds=4):
  """Answer the questions one at a time and score every response.

  Args:
    evaluator: Object exposing ``evaluate_response(query=..., response=...)``
      whose result has a ``feedback`` attribute; a response counts as
      correct when that feedback contains "YES".
    query_engine: Object exposing a synchronous ``query(question)`` method.
    questions: Iterable of question strings.
    sleep_seconds: Pause after each question (helps avoid rate limits).

  Returns:
    Tuple ``(total_correct, all_results)`` where ``all_results`` holds one
    0/1 score per question, in question order.
  """
  total_correct = 0
  all_results = []
  print(f"Use {type(evaluator).__name__}")

  for query in questions:
    print(f"Questions: {query}")
    response = query_engine.query(query)
    # Hand the question to the evaluator as well: it is required to judge
    # relevancy, and passing it to other evaluators is harmless.
    feedback = evaluator.evaluate_response(query=query, response=response).feedback
    eval_result = 1 if "YES" in (feedback or "") else 0
    total_correct += eval_result
    all_results.append(eval_result)
    time.sleep(sleep_seconds)

  return total_correct, all_results


In [3]:
# Approach 1: score faithfulness (hallucination check) on the sampled questions
query_engine = index.as_query_engine()
faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt4)
total_correct, all_results = evaluate_query_engine(
    faithfulness_evaluator,
    query_engine,
    eval_questions,
)
print(f"Faithfulness:  Scored {total_correct} out of {len(eval_questions)} questions correctly.")

Faithfulness: Scored 17 out of 20 questions correctly.


**Step 4: Evaluating Response Relevancy (Answer Quality)**

In [4]:
# Approach 1: score relevancy (answer quality) with the query engine defined above
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt4)
total_correct, all_results = evaluate_query_engine(
    relevancy_evaluator,
    query_engine,
    eval_questions,
)
print(f"Relevancy: Scored {total_correct} out of {len(eval_questions)} questions correctly.")

Relevancy: Scored 15 out of 20 questions correctly.


# **Approach 2: With Summary + Recursive Document Agent**

In [None]:
# load all the documents, one entry per PDF keyed by its file name
import glob
import os

files = []
for file in glob.glob("gdrive/MyDrive/llm-doc/data/*.pdf"):
    print(file)
    files.append(file)
all_docs = {}
for title in files:
  # Key = file name without directory and without the final extension.
  # splitext removes only the last suffix, so a name that itself contains
  # dots (e.g. "report.v2.pdf") is kept whole instead of being cut at the
  # first dot, which could make two files share one key.
  doc = os.path.splitext(os.path.basename(title))[0]
  all_docs[doc] = SimpleDirectoryReader(input_files=[title]).load_data()

gdrive/MyDrive/llm-doc/data/australia quarterly real estate report.pdf
gdrive/MyDrive/llm-doc/data/perpetual private quarterly market update.pdf
gdrive/MyDrive/llm-doc/data/Nividia 10k report June 2023.pdf
gdrive/MyDrive/llm-doc/data/Microsoft 10k report June 2023.pdf


**Build Document Agent**

In [None]:
# One agent per document, keyed by document name.
agents = {}

for doc, doc_pages in all_docs.items():
  # two indexes over the same document: vector and summary
  vector_index = VectorStoreIndex.from_documents(doc_pages, service_context=service_context)
  summary_index = SummaryIndex.from_documents(doc_pages, service_context=service_context)

  # a query engine on top of each index
  vector_query_engine = vector_index.as_query_engine()
  summary_query_engine = summary_index.as_query_engine()

  # expose each query engine as a named tool
  vector_tool = QueryEngineTool(
      query_engine=vector_query_engine,
      metadata=ToolMetadata(
          name="vector_tool",
          description=f"Useful for retrieving specific context from {doc} ",
      ),
  )
  summary_tool = QueryEngineTool(
      query_engine=summary_query_engine,
      metadata=ToolMetadata(
          name="summary_tool",
          description=f"Useful for summarization questions related to {doc} ",
      ),
  )
  query_engine_tools = [vector_tool, summary_tool]

  # agent that is given both tools for this document
  function_llm = OpenAI(model = 'gpt-3.5-turbo-16k')
  agent = OpenAIAgent.from_tools(query_engine_tools, llm=function_llm, verbose=True)
  agents[doc] = agent

**Build Recursive Retriever over the Agents**

In [None]:
# One IndexNode per document: a short description as text, with the
# document name as index_id.
nodes = [
    IndexNode(
        text=(
            f"This content contains content about {doc}. "
            f"Use this index if you need to lookup specific facts about {doc}.\n"
        ),
        index_id=doc,
    )
    for doc in all_docs
]

# Top-level index over those nodes; its retriever returns the single best match.
vector_index = VectorStoreIndex(nodes)
vector_retriever = vector_index.as_retriever(similarity_top_k=1)

In [None]:
# Combine the top-level retriever with the per-document agents.
# note: can pass `agents` dict as `query_engine_dict` since every agent can be used as a query engine
# The first argument "vector" matches the only key of `retriever_dict`
# (presumably the id of the retriever to start from - confirm against the
# RecursiveRetriever documentation). The keys of `agents` are the document
# names, the same values used as `index_id` on the top-level IndexNodes.
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever},
    query_engine_dict=agents,
    verbose=True,
)

In [None]:
# Build the Approach 2 query engine: the recursive retriever plus a "compact"
# response synthesizer. This rebinds `query_engine`, so cells run after this
# one evaluate Approach 2 rather than the plain vector index.
# NOTE(review): `service_context` is commented out on the synthesizer, so the
# synthesizer is created without the notebook's configured LLM - confirm
# whether that is intentional, since Approach 1 uses `service_context`.
response_synthesizer = get_response_synthesizer(
    # service_context=service_context,
    response_mode="compact",
)
query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever,
    response_synthesizer=response_synthesizer,
    service_context=service_context,
)

**Evaluating With Approach 2**

**Step 3: Evaluating Response Faithfulness**

In [5]:
# Approach 2: score faithfulness (hallucination check) with the recursive query engine
faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt4)
total_correct, all_results = evaluate_query_engine(
    faithfulness_evaluator,
    query_engine,
    eval_questions,
)
print(f"Faithfulness:  Scored {total_correct} out of {len(eval_questions)} questions correctly.")

Faithfulness:  Scored 18 out of 20 questions correctly.


**Step 4: Evaluating Response Relevancy (Answer Quality)**

In [6]:
# Approach 2: score relevancy (answer quality) with the recursive query engine
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt4)
total_correct, all_results = evaluate_query_engine(
    relevancy_evaluator,
    query_engine,
    eval_questions,
)
print(f"Quality Answer Scored {total_correct} out of {len(eval_questions)} questions correctly.")

Quality Answer Scored 17 out of 20 questions correctly.
