# CRM RAG Evaluation

Author: Theodore Mui <thephilmui@gmail.com>


## 1. Setup


In [22]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import nest_asyncio
nest_asyncio.apply()

import warnings
import logging
from IPython.display import clear_output

warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.ERROR)

import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
import sys
from llama_index.core import (
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.response_synthesizers import ResponseMode

from llama_index.core.tools import QueryEngineTool, ToolMetadata

In [10]:
llm = Ollama(model="llama3.2:3b-instruct-q8_0", temperature=0.01)
embedding = OllamaEmbedding(model_name="mxbai-embed-large")

Settings.llm = llm
Settings.embed_model = embedding

hello_embedding = embedding.get_text_embedding("hello")
EMBEDDING_DIM = len(hello_embedding)
print(EMBEDDING_DIM)

1024


## 2. File Ingestion


In [5]:
import os
from pathlib import Path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_cloud_services import LlamaParse

notebook_dir = Path().absolute()

crm_folder = str(notebook_dir / ".." / "data" / "crm-docs")
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
len(LLAMA_CLOUD_API_KEY)

52

In [6]:
%%time
# llama cloud ingestion
parser = LlamaParse(
    num_workers=6,
    result_type="markdown",
    api_key=LLAMA_CLOUD_API_KEY
)
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_dir=crm_folder,
    file_extractor=file_extractor
).load_data(num_workers=10)
len(documents)

Started parsing the file under job_id c7b00490-7685-4ab4-8d17-bf4bde76c05f
CPU times: user 80.7 ms, sys: 70.5 ms, total: 151 ms
Wall time: 7.28 s


155

In [8]:
from pprint import pprint

pprint(documents[0].text[:100])

('# Table of Contents\n'
 '\n'
 '# UNITED STATES\n'
 '\n'
 '# SECURITIES AND EXCHANGE COMMISSION\n'
 '\n'
 '# Washington, D.C. 20549')


In [9]:
pprint(documents[0].metadata)


{'creation_date': '2025-04-05',
 'file_name': 'salesforce-fy24-10k.pdf',
 'file_path': '/Users/pmui/SynologyDrive/research/2025/research2025/projects/advanced-rag/notebook/../data/crm-docs/salesforce-fy24-10k.pdf',
 'file_size': 1516703,
 'file_type': 'application/pdf',
 'last_modified_date': '2025-04-05'}


## 3. Loading Evaluation Datasets


In [11]:
from ragas.testset import Testset

folder_path = notebook_dir / ".." / "data" / "crm-eval"
if not folder_path.exists():
    folder_path.mkdir(parents=True, exist_ok=True)

sdg_folder = str(folder_path)

single_hop_specific_testset = Testset.from_jsonl(f"{sdg_folder}/single_hop_specific_testset.jsonl")
multi_hop_specific_testset = Testset.from_jsonl(f"{sdg_folder}/multi_hop_specific_testset.jsonl")
multi_hop_abstract_testset = Testset.from_jsonl(f"{sdg_folder}/multi_hop_abstract_testset.jsonl")

single_hop_specific_list = single_hop_specific_testset.to_list()
multi_hop_specific_list = multi_hop_specific_testset.to_list()
multi_hop_abstract_list = multi_hop_abstract_testset.to_list()

len(single_hop_specific_list), len(multi_hop_specific_list), len(multi_hop_abstract_list)

(100, 100, 100)

In [13]:
single_hop_specific_list[1]

{'user_input': 'How does Salesforce enhance customer relationship management for businesses?',
 'reference_contexts': ['Overview Salesforce, Inc. (“Salesforce,” the “Company,” “we” or “our”) is a global leader in customer relationship management (“CRM”) technology, enabling companies of every size and industry to connect with their customers through the power of data, artificial intelligence (“AI”), CRM and trust. Founded in 1999, we bring humans together with AI agents to drive customer success on one deeply unified platform. Our AI-powered Salesforce Platform unites our offerings — spanning sales, service, marketing, commerce, collaboration, integration, AI, analytics, automation, industries and more — by connecting customer data across systems, applications and devices to create a complete view of customers. With this single source of customer truth, teams can be more responsive, productive and efficient and deliver intelligent, personalized and automated experiences across every ch

## 4. Constructing RAG Engines


### 4.1 Creating Text Chunkers ("Node Parsers")


In [16]:
from llama_index.core.node_parser import (
    TokenTextSplitter,
    MarkdownNodeParser, MarkdownElementNodeParser,
    SemanticSplitterNodeParser,
    SentenceSplitter, 
    SentenceWindowNodeParser,
)

from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import Callable, List

In [18]:
token_text_splitter = TokenTextSplitter(
    chunk_size=1024,
    chunk_overlap=32,
)

sentence_splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=32,
)

md_node_parser = MarkdownNodeParser.from_defaults(
    include_metadata=True,
    include_prev_next_rel=True,
)

mde_node_parser = MarkdownElementNodeParser(
    llm=llm,
    num_workers=10,
).from_defaults()

semantic_splitter = SemanticSplitterNodeParser(
    breakpoint_percentile_threshold=95,
    embed_model=embedding,
)

### 4.2 Creating Indices & Query Engines


In [19]:
token_text_splitter_index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True,
    node_parser=token_text_splitter,
)
token_text_splitter_query_engine = token_text_splitter_index.as_query_engine()

Parsing nodes: 100%|██████████| 155/155 [00:00<00:00, 233.21it/s]
Generating embeddings: 100%|██████████| 183/183 [00:13<00:00, 13.56it/s]


In [24]:
sentence_splitter_index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True,
    node_parser=sentence_splitter,
    num_workers=10,
)
sentence_splitter_query_engine = sentence_splitter_index.as_query_engine()

Parsing nodes: 100%|██████████| 155/155 [00:00<00:00, 2049.76it/s]
Generating embeddings: 100%|██████████| 183/183 [00:12<00:00, 14.08it/s]


In [21]:
md_nodes = md_node_parser.get_nodes_from_documents(documents)
md_index = VectorStoreIndex(
    nodes=md_nodes, 
    show_progress=True,
    num_workers=10,
)
md_query_engine = md_index.as_query_engine()

Generating embeddings: 100%|██████████| 568/568 [00:23<00:00, 23.74it/s]


In [23]:
mde_nodes = mde_node_parser.get_nodes_from_documents(documents=documents, verbose=False)

print(f"Number of initial mde nodes: {len(mde_nodes)}")

# Get text nodes and object (table) nodes
base_nodes, objects = mde_node_parser.get_nodes_and_objects(nodes=mde_nodes)

print(f"Number of text nodes: {len(base_nodes)}")
print(f"Number of table nodes: {len(objects)}")

# insert the table markdown into the text of each table object
for i in range(len(objects)):
    objects[i].text = objects[i].text[:]

mde_index = VectorStoreIndex(
    nodes=base_nodes + objects,
    show_progress=True,
    num_workers=10,
)
mde_query_engine = mde_index.as_query_engine()

2it [00:00, 50231.19it/s]
1it [00:00, 12633.45it/s]
1it [00:00, 17260.51it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 20460.02it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 15592.21it/s]
1it [00:00, 26379.27it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
5it [00:00, 128659.63it/s]
3it [00:00, 83886.08it/s]
3it [00:00, 49932.19it/s]
2it [00:00, 55188.21it/s

Number of initial mde nodes: 426
Number of text nodes: 244
Number of table nodes: 91


Generating embeddings: 100%|██████████| 335/335 [00:16<00:00, 20.21it/s]


In [25]:
semantic_nodes = semantic_splitter.get_nodes_from_documents(documents=documents)

semantic_index = VectorStoreIndex(
    nodes=semantic_nodes,
    show_progress=True,
    num_workers=10,
)
semantic_query_engine = semantic_index.as_query_engine()


Generating embeddings: 100%|██████████| 358/358 [00:19<00:00, 18.47it/s]


## 5. Evaluating RAG Engines


In [14]:
from ragas import evaluate, EvaluationDataset
from ragas.llms import LlamaIndexLLMWrapper
from ragas.metrics import (
    AnswerAccuracy,
    AnswerCorrectness,
    AnswerRelevancy, 
    ContextRelevance,
    ContextPrecision, 
    ContextRecall,
    Faithfulness, 
    FactualCorrectness, 
    ResponseGroundedness, 
    ResponseRelevancy
)

evaluator_llm = LlamaIndexLLMWrapper(llm)

### 5.1 Single Hop Queries


In [26]:
%%time

from tqdm import tqdm

total_queries = len(single_hop_specific_list)
for test_case in tqdm(single_hop_specific_list, 
                      desc="Responding to queries",
                      total=total_queries,
                      unit="query",
                      position=0,
                      leave=True):
    token_text_splitter_response = token_text_splitter_query_engine.query(test_case["user_input"])
    test_case["response"] = str(token_text_splitter_response.response)
    test_case["retrieved_contexts"] = [node.text for node in token_text_splitter_response.source_nodes]

Responding to queries: 100%|██████████| 100/100 [04:26<00:00,  2.66s/query]

CPU times: user 1.9 s, sys: 148 ms, total: 2.05 s
Wall time: 4min 26s





In [27]:
single_hop_specific_list[0], len(single_hop_specific_list)

({'user_input': 'What CRM do Salesforce do?',
  'reference_contexts': ['Overview Salesforce, Inc. (“Salesforce,” the “Company,” “we” or “our”) is a global leader in customer relationship management (“CRM”) technology, enabling companies of every size and industry to connect with their customers through the power of data, artificial intelligence (“AI”), CRM and trust. Founded in 1999, we bring humans together with AI agents to drive customer success on one deeply unified platform. Our AI-powered Salesforce Platform unites our offerings — spanning sales, service, marketing, commerce, collaboration, integration, AI, analytics, automation, industries and more — by connecting customer data across systems, applications and devices to create a complete view of customers. With this single source of customer truth, teams can be more responsive, productive and efficient and deliver intelligent, personalized and automated experiences across every channel. With Agentforce, the agentic layer of the

In [28]:
single_hop_specific_eval_dataset = EvaluationDataset.from_list(single_hop_specific_list[:5])
len(single_hop_specific_eval_dataset)

5

In [29]:
%%time
single_hop_specific_token_text_splitter_result = evaluate(
    dataset=single_hop_specific_eval_dataset,
    llm=evaluator_llm,
    metrics=[
        AnswerAccuracy(),
        ContextRelevance(),
        ResponseGroundedness(), 

        AnswerCorrectness(),
        AnswerRelevancy(), 
        ContextPrecision(), 
        ContextRecall(),
        Faithfulness(), 
        FactualCorrectness(), 
        ResponseRelevancy(),
    ],
    show_progress=True,
)
single_hop_specific_token_text_splitter_result



An error occurred: . Skipping a sample by assigning it nan score.




An error occurred: . Skipping a sample by assigning it nan score.


Exception raised in Job[7]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[5]: TimeoutError()
Exception raised in Job[13]: TimeoutError()


An error occurred: . Skipping a sample by assigning it nan score.


Exception raised in Job[16]: TimeoutError()
Exception raised in Job[17]: TimeoutError()
Exception raised in Job[18]: TimeoutError()
Exception raised in Job[23]: TimeoutError()
Exception raised in Job[25]: TimeoutError()
Exception raised in Job[26]: TimeoutError()
Exception raised in Job[27]: TimeoutError()
Exception raised in Job[28]: TimeoutError()
Exception raised in Job[29]: TimeoutError()
Exception raised in Job[33]: TimeoutError()
Exception raised in Job[35]: TimeoutError()
Exception raised in Job[36]: TimeoutError()
Exception raised in Job[37]: TimeoutError()
Exception raised in Job[38]: TimeoutError()
Exception raised in Job[43]: TimeoutError()
Exception raised in Job[45]: TimeoutError()
Exception raised in Job[47]: TimeoutError()
Evaluating: 100%|██████████| 50/50 [06:58<00:00,  8.37s/it]


{'nv_accuracy': 1.0000, 'nv_context_relevance': 1.0000, 'nv_response_groundedness': 1.0000, 'answer_correctness': nan, 'answer_relevancy': 0.9526, 'context_precision': 1.0000, 'context_recall': 0.7500, 'faithfulness': nan, 'factual_correctness(mode=f1)': 0.4000}

In [30]:
pprint(single_hop_specific_token_text_splitter_result)

{'nv_accuracy': 1.0000, 'nv_context_relevance': 1.0000, 'nv_response_groundedness': 1.0000, 'answer_correctness': nan, 'answer_relevancy': 0.9526, 'context_precision': 1.0000, 'context_recall': 0.7500, 'faithfulness': nan, 'factual_correctness(mode=f1)': 0.4000}


In [31]:
def process_queries(query_engine, test_cases_list):
    total_queries = len(test_cases_list)
    for test_case in tqdm(test_cases_list, 
                          desc="\tResponding to queries",
                          total=total_queries,
                          unit="query",
                          position=0,
                          leave=True):
        response = query_engine.query(test_case["user_input"])
        test_case["response"] = str(response.response)
        test_case["retrieved_contexts"] = [node.text for node in response.source_nodes]
    
    try:
        eval_dataset = EvaluationDataset.from_list(test_cases_list)
        eval_result = evaluate(
            dataset=eval_dataset,
            llm=evaluator_llm,
            metrics=[
                AnswerCorrectness(),
                AnswerRelevancy(), 
                ContextPrecision(), 
                ContextRecall(),
                Faithfulness(), 
                FactualCorrectness(), 
                ResponseRelevancy(),
            ],
            show_progress=False,
        )
    except Exception as e:
        pass # ignore the exception
    # only select the newly added metrics which are of dtype float64
    return eval_result.to_pandas().select_dtypes(include=['float64'])

query*engines = {
"token": token*,
"markdown": md_query_engine,
"markdown_element": mde_query_engine,
"semantic": semantic_query_engine,
"sentwin": sentwin_query_engine
}

eval_results = {}
