<a href="https://colab.research.google.com/github/nguyenkien1402/llamaindex-practices/blob/main/evaluation-pipeline-rag/rag_evaluation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive
MOUNTPOINT = '/content/gdrive'
DATADIR = os.path.join(MOUNTPOINT, 'MyDrive', 'llm-poc')
drive.mount(MOUNTPOINT)

Mounted at /content/gdrive


In [None]:
!pip install -r gdrive/MyDrive/llm-doc/requirements.txt

In [1]:
from llama_index import download_loader
from llama_index import SimpleDirectoryReader

UnstructuredReader = download_loader('UnstructuredReader')

In [17]:
dir_reader = SimpleDirectoryReader('gdrive/MyDrive/llm-doc/data', file_extractor={
  ".pdf": UnstructuredReader(),
})
documents = dir_reader.load_data()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [88]:
import pandas as pd
from llama_index.evaluation import DatasetGenerator, RelevancyEvaluator, ResponseEvaluator, FaithfulnessEvaluator, QueryResponseEvaluator
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    LLMPredictor,
    Response,
    StorageContext,
    load_index_from_storage,
    SummaryIndex
)
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.prompts import Prompt

import chromadb
from llama_index.vector_stores import ChromaVectorStore

import openai
from llama_index.llms import OpenAI

import time
import asyncio
import nest_asyncio
nest_asyncio.apply()

# openai.api_key = ""


# **Create Embedding Mode**


In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages: finance, investments, economics, real estate"
)

Downloading (…)f5f20/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)6fb9cf5f20/README.md:   0%|          | 0.00/88.8k [00:00<?, ?B/s]

Downloading (…)b9cf5f20/config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)f5f20/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)6fb9cf5f20/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)9cf5f20/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

# **Initialize Service Context**

In [18]:
# create client and a new collection
chroma_client = chromadb.PersistentClient(path='gdrive/MyDrive/llm-doc/vector_store/')
chroma_collection = chroma_client.get_or_create_collection("finance_australia")

node_parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo-16k", max_tokens=512, temperature=0.1),
                                               embed_model=model_norm,
                                               node_parser=node_parser)


# **Approach 1: Naive Approach with VectorIndex Only**

In [19]:
indexid = 'finance_australia_index'
index_path = 'gdrive/MyDrive/llm-doc/index'
try:
  ## load index from storage
  print(f"Load {indexid} from local path")
  storage_context = StorageContext.from_defaults(vector_store=vector_store,
                                                  persist_dir=index_path)
  index = load_index_from_storage(storage_context=storage_context, index_id=indexid)
except Exception as e:
  print(str(e))
  print("Creating new index")
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
  index = VectorStoreIndex.from_documents(documents, service_context=service_context, storage_context=storage_context)
  index.set_index_id(indexid)
  index.storage_context.persist(persist_dir=index_path)

Load finance_australia_index from local path
Failed to load index with ID finance_australia_index
Creating new index


In [10]:
## testing index
query_engine = index.as_query_engine()
reponse = query_engine.query("How does the Nvidia's revenue in this report compare to the previous fiscal year, and what factors contributed to any changes?")
reponse.response

"NVIDIA's revenue in this report has increased compared to the previous fiscal year. The revenue for the second quarter of fiscal year 2024 was $13.51 billion, which is a 101% increase from the same period in the previous year. The revenue for the first half of fiscal year 2024 was $20.70 billion, a 38% increase from the first half of the previous year. \n\nThe increase in revenue can be attributed to several factors. In the Compute & Networking segment, there was higher Data Center revenue, with Compute GPUs growing 208% year-on-year and 112% compared to the first half of fiscal year 2023. This growth was driven by demand for the NVIDIA HGX platform based on the Hopper and Ampere GPU architecture for large language models and generative AI. Networking also saw strong growth, with a 94% year-on-year increase and a 63% increase compared to the first half of the previous year, driven by growth in InfiniBand infrastructure to support the HGX platform.\n\nIn the Graphics segment, the incre

**Step 1: Question Generation**

In [20]:
data_generator = DatasetGenerator.from_documents(
                      documents,
                      text_question_template=Prompt(
                      "A sample from the documents is below.\n"
                      "---------------------\n"
                      "{context_str}\n"
                      "---------------------\n"
                      "Using the documentation sample, carefully follow the instructions below:\n"
                      "{query_str}"
                      ),
                      question_gen_query=(
                          "You are a search pipeline evaluator. Using the papers provided, "
                          "you must create a list of summary questions and question/answer questions. "
                          "Limit the queries to the information supplied in the context.\n"
                          "Question: "
                      ),
                      service_context=service_context)

In [21]:
eval_questions  = data_generator.generate_questions_from_nodes(num=20)
print(f"Generated {len(eval_questions)} questions.")

Generated 15 questions.


In [None]:
# importing random module
import random

random.sample(eval_questions,20)

In [31]:
# importing random module
import random

random.sample(eval_questions,20)

['What is the current official cash rate in Australia?',
 "How has Australia's headline inflation rate changed from December 2022 to February 2023?",
 'What are some trends indicating that short rates are close to their peak?',
 'How did the Australian economy perform in Q4 2022?',
 'What factors suggest a general slowing of business activity in the year ahead?',
 'What is the forecasted population growth rate in Australia over the next five years?',
 'What are the expected changes in real GDP, employment, and business investment in Australia?',
 'How have retail sales, CPI, and cash rates changed over time?',
 'What is the current 10-year bond yield in Australia?',
 'Why should investors consider investing in real assets in 2023?',
 'What has led to increased volatility in pricing across all asset classes?',
 'Why should investors maintain or increase exposure to real assets in the short term?',
 'What are the qualities of real assets that make them more apparent during times of volat

In [29]:
# save the questions into a txt file for resuse later on
with open("gdrive/MyDrive/llm-doc/questions.txt", "w") as f:
  for question in eval_questions:
    f.write(f"{question.strip()}\n")

**Step 2: Evaluation Testing**

In [85]:
# define jupyter display function
def display_eval_df(query: str, response: Response, eval_result: str) -> None:
  eval_df = pd.DataFrame(
      {
          "Query": str(query),
          "Response": str(response),
          "Source": response.source_nodes[0].node.get_content()[:500] + "...",
          "Evaluation Result": eval_result.feedback
      },
      index=[0],
  )
  eval_df = eval_df.style.set_properties(
      **{
          "inline-size": "600px",
          "overflow-wrap": "break-word",
      },
      subset=["Response", "Source"]
  )
  display(eval_df)

In [91]:
# use gpt4 for question generation
service_context_gpt4 = ServiceContext.from_defaults(llm=OpenAI(temperature=0.1, max_tokens=512, model="gpt-4"),
                                                    embed_model=model_norm,
                                                    node_parser=node_parser)
# call ResponseEvaluator to evaluate the responses
query_engine = index.as_query_engine()
evaluator = ResponseEvaluator(service_context=service_context_gpt4)
response_vector = query_engine.query(eval_questions[2])
eval_result = evaluator.evaluate_response(response=response_vector)

display_eval_df(eval_questions[1], response_vector, eval_result)

Unnamed: 0,Query,Response,Source,Evaluation Result
0,How has Australia's headline inflation rate changed from December 2022 to February 2023?,The context information does not provide any specific trends indicating that short rates are close to their peak.,"Average income returns over the past 3 years High interest rates have led to increased volatility in pricing across all asset classes. Understandably, investors are cautious about the outlook. However, there are reasons to maintain or increase exposure to real assets in the short term. It’s more than just ‘buy in gloom, sell in boom’ argument. Quality real assets are by nature tightly held, so when the weight of capital returns, opportunities will become significantly harder to find. It is in ...",YES


**Step 3: Evaluating Response Faithfulness**

In [103]:
async def run_query(query_engine, q):
  try:
      return await query_engine.aquery(q)
  except:
      return Response(response="Error, query failed.")

def async_evaluate_query_engine(evaluator, query_engine, questions):
  total_correct = 0
  all_results = []
  for batch_size in range(0, len(questions), 5):
      batch_qs = questions[batch_size:batch_size+5]

      tasks = [run_query(query_engine, q) for q in batch_qs]
      responses = asyncio.run(asyncio.gather(*tasks))
      print(f"finished batch {(batch_size // 5) + 1} out of {len(questions) // 5}")

      # if eval for hallucination
      if isinstance(evaluator, FaithfulnessEvaluator):
        print("Use FaithfulnessEvaluator")
        for response in responses:
            eval_result = 1 if "YES" in evaluator.evaluate_response(response=response).feedback else 0
            total_correct += eval_result
            all_results.append(eval_result)
      # eval for answer quality
      elif isinstance(evaluator, RelevancyEvaluator):
        print("Use RelevancyEvaluator")
        for question, response in zip(batch_qs, responses):
            eval_result = 1 if "YES" in evaluator.evaluate_response(response=response).feedback else 0
            total_correct += eval_result
            all_results.append(eval_result)
      # helps avoid rate limits
      time.sleep(1)

  return total_correct, all_results

def evaluate_query_engine(evaluator, query_engine, questions):
  total_correct = 0
  all_results = []
  if isinstance(evaluator, FaithfulnessEvaluator):
    print("Use FaithfulnessEvaluator")
  elif isinstance(evaluator, RelevancyEvaluator):
    print("Use RelevancyEvaluator")

  for query in questions:
    response = query_engine.query(query)
    eval_result = 1 if "YES" in evaluator.evaluate_response(response=response).feedback else 0
    total_correct += eval_result
    all_results.append(eval_result)
    time.sleep(1)

  return total_correct, all_results


In [122]:
query_engine = index.as_query_engine()
# faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt4)
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt4)
response_vector = query_engine.query(eval_questions[2])
# eval_result = relevancy_evaluator.evaluate(response=response_vector)

eval_result.feedback



KeyboardInterrupt: ignored

In [104]:
# eval for Faithfulness/hallucination
query_engine = index.as_query_engine()
faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt4)
total_correct, all_results = evaluate_query_engine(faithfulness_evaluator, query_engine, eval_questions)
print(f"Hallucination? Scored {total_correct} out of {len(eval_questions)} questions correctly.")

finished batch 1 out of 6
Use FaithfulnessEvaluator




finished batch 2 out of 6
Use FaithfulnessEvaluator




finished batch 3 out of 6
Use FaithfulnessEvaluator




finished batch 4 out of 6
Use FaithfulnessEvaluator




finished batch 5 out of 6
Use FaithfulnessEvaluator




finished batch 6 out of 6
Use FaithfulnessEvaluator




Hallucination? Scored 0 out of 30 questions correctly.


**Step 4: Evaluating Response for the Quality Answer**

In [105]:
# eval for Faithfulness/hallucination
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt4)
total_correct, all_results = evaluate_query_engine(relevancy_evaluator, query_engine, eval_questions)
print(f"Quality Answer Scored {total_correct} out of {len(eval_questions)} questions correctly.")

finished batch 1 out of 6
Use RelevancyEvaluator


ValueError: ignored

# **Approach 2: With Summary + Recursive Document Agent**

In [127]:
# load all the documents
import glob

files = []
for file in glob.glob("gdrive/MyDrive/llm-doc/data/*.pdf"):
    print(file)
    files.append(file)
all_docs = {}
for title in files:
  doc = title.split('/')[-1].split('.')[0]
  all_docs[doc] = SimpleDirectoryReader(input_files=[title]).load_data()

gdrive/MyDrive/llm-doc/data/australia quarterly real estate report.pdf
gdrive/MyDrive/llm-doc/data/perpetual private quarterly market update.pdf
gdrive/MyDrive/llm-doc/data/Nividia 10k report June 2023.pdf
gdrive/MyDrive/llm-doc/data/Microsoft 10k report June 2023.pdf


**Build Document Agent**

In [None]:
SummaryIndex,
from llama_index.agent import OpenAIAgent

agents = {}

for doc in all_docs:
  # build vector index
  vector_index = VectorStoreIndex.from_documents(all_docs[doc], service_context=service_context)
  # build summary index
  summary_index = SummaryIndex.from_documents(all_docs[doc], service_context=service_context)

  # define query engine
  vector_query_engine = vector_index.as_query_engine()
  summary_query_engine = summary_index.as_query_engine()

  # define tools
  query_engine_tools = [

  ]