# CRM RAG Evaluation

Author: Theodore Mui <theodoremui@gmail.com>

## 1. Setup

In [30]:
# Load environment variables from the .env file located by find_dotenv().
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# Patch asyncio with nest_asyncio (presumably so async library calls can run
# inside the notebook's already-running event loop -- TODO confirm).
import nest_asyncio
nest_asyncio.apply()

import warnings
import logging
from IPython.display import clear_output

# Keep cell output readable: ignore warnings and raise the root logger to ERROR.
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.ERROR)

import pandas as pd
# Do not truncate columns when DataFrames are displayed.
pd.set_option('display.max_columns', None)

In [41]:
import sys
from llama_index.core import (
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.response_synthesizers import ResponseMode

from llama_index.core.tools import QueryEngineTool, ToolMetadata

In [43]:
# Chat model and embedding model shared by every cell below.
llm = OpenAI(model="gpt-4o-mini", timeout=240, temperature=0.01)
embedding = OpenAIEmbedding(model="text-embedding-3-small")

# Register both on llama_index's `Settings` object.
Settings.llm, Settings.embed_model = llm, embedding

# Embed a short probe string to record (and print) the embedding vector length.
hello_embedding = embedding.get_text_embedding("hello")
EMBEDDING_DIM = len(hello_embedding)
print(EMBEDDING_DIM)

1536


## 2. File Ingestion

In [4]:
import os
from pathlib import Path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_cloud_services import LlamaParse

# Resolve data locations relative to the directory the notebook runs from.
notebook_dir = Path().absolute()

crm_folder = str(notebook_dir / ".." / "data" / "crm-docs")

# The LlamaParse client below is given this key. Fail fast with an actionable
# message instead of the opaque "object of type 'NoneType' has no len()" that
# the final line would otherwise raise when the variable is missing.
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
if not LLAMA_CLOUD_API_KEY:
    raise EnvironmentError(
        "LLAMA_CLOUD_API_KEY is not set; add it to your .env file or environment."
    )

# Confirm a key was loaded without echoing the secret itself.
len(LLAMA_CLOUD_API_KEY)

52

We are now going to read Salesforce's SEC Form 10-K filing: [10-K](https://investor.salesforce.com/financials/sec-filings/sec-filings-details/default.aspx?FilingId=18259118)

In [5]:
%%time

# `.pdf` files are mapped to a LlamaParse parser that returns markdown.
parser = LlamaParse(api_key=LLAMA_CLOUD_API_KEY, result_type="markdown", num_workers=6)
file_extractor = {".pdf": parser}

# Load everything found in the CRM documents folder.
documents = SimpleDirectoryReader(
    file_extractor=file_extractor,
    input_dir=crm_folder,
).load_data(num_workers=10)
len(documents)

CPU times: total: 31.2 ms
Wall time: 8.09 s


155

In [6]:
from pprint import pprint

# Preview the first 100 characters of the first loaded document.
pprint(documents[0].text[:100])

('# Table of Contents\n'
 '\n'
 '# UNITED STATES\n'
 '\n'
 '# SECURITIES AND EXCHANGE COMMISSION\n'
 '\n'
 '# Washington, D.C. 20549')


In [7]:
# Show the metadata attached to the first loaded document.
pprint(documents[0].metadata)

{'creation_date': '2025-04-05',
 'file_name': 'salesforce-fy24-10k.pdf',
 'file_path': 'c:\\Users\\theod\\OneDrive\\dev\\research2025\\projects\\advanced-rag\\notebooks\\..\\data\\crm-docs\\salesforce-fy24-10k.pdf',
 'file_size': 1516703,
 'file_type': 'application/pdf',
 'last_modified_date': '2025-04-05'}


## 3. Loading Evaluation Datasets

In [12]:
from ragas.testset import Testset
import json

# Folder holding the evaluation test sets; it is created when it does not exist yet.
folder_path = notebook_dir / ".." / "data" / "crm-eval"
if not folder_path.exists():
    folder_path.mkdir(parents=True, exist_ok=True)

# String form of the folder, used to build the JSONL file paths.
sdg_folder = str(folder_path)

# Helper function to read JSONL files with UTF-8 encoding
def read_jsonl_with_utf8(file_path):
    """Parse a JSON Lines file into a list of Python objects.

    Args:
        file_path: Path of the ``.jsonl`` file to read (decoded as UTF-8).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (for example a trailing empty line) carry no record;
        # skip them instead of letting json.loads fail on an empty string.
        return [json.loads(line) for line in f if line.strip()]

def _load_testset(stem):
    """Build a ragas ``Testset`` from ``<sdg_folder>/<stem>.jsonl`` (read as UTF-8)."""
    return Testset.from_list(read_jsonl_with_utf8(f"{sdg_folder}/{stem}.jsonl"))


# One test set per query style.
single_hop_specific_testset = _load_testset("single_hop_specific_testset")
multi_hop_specific_testset = _load_testset("multi_hop_specific_testset")
multi_hop_abstract_testset = _load_testset("multi_hop_abstract_testset")

# List form of each test set, used by the query and evaluation cells.
single_hop_specific_list = single_hop_specific_testset.to_list()
multi_hop_specific_list = multi_hop_specific_testset.to_list()
multi_hop_abstract_list = multi_hop_abstract_testset.to_list()

tuple(
    len(cases)
    for cases in (single_hop_specific_list, multi_hop_specific_list, multi_hop_abstract_list)
)

(100, 100, 100)

In [13]:
# Inspect the second single-hop test case.
single_hop_specific_list[1]

{'user_input': 'How does Salesforce enhance customer relationship management for businesses?',
 'reference_contexts': ['Overview Salesforce, Inc. (“Salesforce,” the “Company,” “we” or “our”) is a global leader in customer relationship management (“CRM”) technology, enabling companies of every size and industry to connect with their customers through the power of data, artificial intelligence (“AI”), CRM and trust. Founded in 1999, we bring humans together with AI agents to drive customer success on one deeply unified platform. Our AI-powered Salesforce Platform unites our offerings — spanning sales, service, marketing, commerce, collaboration, integration, AI, analytics, automation, industries and more — by connecting customer data across systems, applications and devices to create a complete view of customers. With this single source of customer truth, teams can be more responsive, productive and efficient and deliver intelligent, personalized and automated experiences across every ch

## 4. Constructing RAG Engines

### 4.1 Creating Text Chunkers ("Node Parsers")

In [15]:
from llama_index.core.node_parser import (
    TokenTextSplitter,
    MarkdownNodeParser, MarkdownElementNodeParser,
    SemanticSplitterNodeParser,
    SentenceSplitter, 
    SentenceWindowNodeParser,
)

from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import Callable, List

In [16]:
# Fixed-size chunkers, both configured with chunk_size=1024 and chunk_overlap=32.
token_text_splitter = TokenTextSplitter(chunk_overlap=32, chunk_size=1024)
sentence_splitter = SentenceSplitter(chunk_overlap=32, chunk_size=1024)

# Markdown parser that keeps metadata and previous/next node relationships.
md_node_parser = MarkdownNodeParser.from_defaults(
    include_prev_next_rel=True,
    include_metadata=True,
)

# Construct the parser directly. `from_defaults()` is a classmethod, so chaining
# it onto an instance builds a second, default-configured parser and silently
# drops the `llm` and `num_workers` arguments given to the constructor.
mde_node_parser = MarkdownElementNodeParser(
    llm=llm,
    num_workers=10,
)

# Semantic chunker driven by the notebook's embedding model, with the
# breakpoint percentile threshold set to 95.
semantic_splitter = SemanticSplitterNodeParser(
    embed_model=embedding,
    breakpoint_percentile_threshold=95,
)

### 4.2 Creating Indices & Query Engines

In [17]:
# Build the index from chunks produced by the token splitter.
# `from_documents` receives its chunkers through `transformations`; a
# `node_parser=` keyword is not one of its parameters and is swallowed by
# **kwargs, so the splitter configured above was never applied and the
# library's default splitter was used instead.
token_text_splitter_index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True,
    transformations=[token_text_splitter],
)
token_text_splitter_query_engine = token_text_splitter_index.as_query_engine()

Parsing nodes: 100%|██████████| 155/155 [00:05<00:00, 26.27it/s]
Generating embeddings: 100%|██████████| 183/183 [00:12<00:00, 14.93it/s]


In [21]:
# Build the index from chunks produced by the sentence splitter.
# `from_documents` receives its chunkers through `transformations`; a
# `node_parser=` keyword is not one of its parameters and is swallowed by
# **kwargs, so the splitter configured above was never applied and the
# library's default splitter was used instead.
sentence_splitter_index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True,
    transformations=[sentence_splitter],
    num_workers=10,
)
sentence_splitter_query_engine = sentence_splitter_index.as_query_engine()

Parsing nodes: 100%|██████████| 155/155 [00:00<00:00, 628.86it/s]
Generating embeddings: 100%|██████████| 183/183 [00:21<00:00,  8.59it/s]


In [18]:
# Chunk the documents with the markdown parser, then embed those nodes into an index.
md_nodes = md_node_parser.get_nodes_from_documents(documents)
md_index = VectorStoreIndex(nodes=md_nodes, num_workers=10, show_progress=True)
md_query_engine = md_index.as_query_engine()

Generating embeddings: 100%|██████████| 568/568 [00:27<00:00, 20.67it/s]


In [20]:
# Parse every document with the markdown-element parser.
mde_nodes = mde_node_parser.get_nodes_from_documents(documents=documents, verbose=False)

print(f"Number of initial mde nodes: {len(mde_nodes)}")

# Get text nodes and object (table) nodes
base_nodes, objects = mde_node_parser.get_nodes_and_objects(nodes=mde_nodes)

print(f"Number of text nodes: {len(base_nodes)}")
print(f"Number of table nodes: {len(objects)}")

# The former loop `objects[i].text = objects[i].text[:]` was dropped: a full
# slice of a string is the same string, so it never changed any node.

# Index text nodes and table nodes together.
mde_index = VectorStoreIndex(
    nodes=base_nodes + objects,
    show_progress=True,
    num_workers=10,
)
mde_query_engine = mde_index.as_query_engine()

2it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
5it [00:00, ?it/s]
3it [00:00, 3043.76it/s]
3it [00:00, ?it/s]
2it [00:00, 556.38it/s]
1it [00:00, 659.38it/s]
2it [00:00, ?it/s]
0it [00:00, ?it

Number of initial mde nodes: 426
Number of text nodes: 244
Number of table nodes: 91


Generating embeddings: 100%|██████████| 335/335 [01:08<00:00,  4.92it/s]


In [22]:
# Chunk with the semantic splitter, then embed the resulting nodes into their own index.
semantic_nodes = semantic_splitter.get_nodes_from_documents(documents=documents)
semantic_index = VectorStoreIndex(nodes=semantic_nodes, num_workers=10, show_progress=True)
semantic_query_engine = semantic_index.as_query_engine()

Generating embeddings: 100%|██████████| 358/358 [00:33<00:00, 10.56it/s]


## 5. Evaluating RAG engines

In [14]:
from ragas import evaluate, EvaluationDataset
from ragas.llms import LlamaIndexLLMWrapper
from ragas.metrics import (
    AnswerAccuracy,
    AnswerCorrectness,
    AnswerRelevancy, 
    ContextRelevance,
    ContextPrecision, 
    ContextRecall,
    Faithfulness, 
    FactualCorrectness, 
    ResponseGroundedness, 
    ResponseRelevancy
)

# Wrap the notebook's LLM so it can be passed to ragas `evaluate` as the evaluator model.
evaluator_llm = LlamaIndexLLMWrapper(llm)

In [24]:
def process_queries(query_engine, test_cases_list):
    """Answer every test case with ``query_engine`` and score the answers with ragas.

    Args:
        query_engine: Object exposing ``query(text)``; the result must provide
            ``.response`` and ``.source_nodes`` (each node exposing ``.text``).
        test_cases_list: List of dicts, each containing at least ``user_input``.
            The dicts are not modified; enriched copies are evaluated instead.

    Returns:
        A pandas DataFrame restricted to the ``float64`` columns of the ragas
        result, i.e. the metric scores added by the evaluation.
    """
    # Imported locally so the function does not rely on a later cell having run.
    from tqdm import tqdm

    # Step 1: use the query_engine to answer each `user_input`
    answered_cases = []
    for test_case in tqdm(test_cases_list,
                          desc="\tResponding to queries",
                          total=len(test_cases_list),
                          unit="query",
                          position=0,
                          leave=True):
        response = query_engine.query(test_case["user_input"])
        # Build a new record rather than writing into the caller's dict, so the
        # same test cases can be reused unchanged for another query engine.
        answered_cases.append({
            **test_case,
            "response": str(response.response),
            "retrieved_contexts": [node.text for node in response.source_nodes],
        })

    # Step 2: evaluate how good the answers are relative to the
    #         `reference` in the test_cases_list (eval dataset)
    eval_dataset = EvaluationDataset.from_list(answered_cases)
    eval_result = evaluate(
        dataset=eval_dataset,
        llm=evaluator_llm,
        metrics=[
            AnswerCorrectness(),
            # ragas exposes this metric under two class names (AnswerRelevancy
            # and ResponseRelevancy), both reported as `answer_relevancy`;
            # listing both only repeated the same scoring work.
            AnswerRelevancy(),
            ContextPrecision(),
            ContextRecall(),
            Faithfulness(),
            FactualCorrectness(),
        ],
        show_progress=False,
    )
    # only select the newly added metrics which are of dtype float64
    return eval_result.to_pandas().select_dtypes(include=['float64'])

### 5.1 Single Hop Queries

### Example using `token_text_splitter_query_engine`

In [23]:
%%time

from tqdm import tqdm

# Answer every single-hop question with the token-splitter query engine.
# NOTE: this stores "response" directly in the dicts of single_hop_specific_list.
total_queries = len(single_hop_specific_list)
for test_case in tqdm(single_hop_specific_list, 
                      desc="Responding to queries",
                      total=total_queries,
                      unit="query",
                      position=0,
                      leave=True):
    token_text_splitter_response = token_text_splitter_query_engine.query(test_case["user_input"])
    # Keep only the answer text as a plain string.
    test_case["response"] = str(token_text_splitter_response.response)

Responding to queries: 100%|██████████| 100/100 [04:47<00:00,  2.88s/query]

CPU times: total: 5.23 s
Wall time: 4min 47s





In [25]:
# Show the first single-hop test case together with the total number of cases.
single_hop_specific_list[0], len(single_hop_specific_list)

({'user_input': 'What CRM do Salesforce do?',
  'reference_contexts': ['Overview Salesforce, Inc. (“Salesforce,” the “Company,” “we” or “our”) is a global leader in customer relationship management (“CRM”) technology, enabling companies of every size and industry to connect with their customers through the power of data, artificial intelligence (“AI”), CRM and trust. Founded in 1999, we bring humans together with AI agents to drive customer success on one deeply unified platform. Our AI-powered Salesforce Platform unites our offerings — spanning sales, service, marketing, commerce, collaboration, integration, AI, analytics, automation, industries and more — by connecting customer data across systems, applications and devices to create a complete view of customers. With this single source of customer truth, teams can be more responsive, productive and efficient and deliver intelligent, personalized and automated experiences across every channel. With Agentforce, the agentic layer of the

### Create a mapping for all query engines 

In [44]:
# Short label -> query engine; the label becomes part of each result key.
query_engine_mapping = dict(
    token=token_text_splitter_query_engine,
    sentence=sentence_splitter_query_engine,
    markdown=md_query_engine,
    markdown_element=mde_query_engine,
    semantic=semantic_query_engine,
)

In [31]:
%%time

# Number of single-hop test cases evaluated per query engine.
NUM_RECORDS = 1

# Maps "single_hop_<engine label>" to the DataFrame returned by process_queries.
eval_results = {}
for query_engine_name, query_engine in query_engine_mapping.items():
    print(f"-----{query_engine_name}-----")
    # `list.copy()` is shallow, so the previous `.copy()[:NUM_RECORDS]` handed
    # every engine the very dicts stored in single_hop_specific_list. Copy each
    # record so in-place changes made while processing one engine cannot leak
    # into the source list or into the next engine's run.
    single_hop = [dict(test_case) for test_case in single_hop_specific_list[:NUM_RECORDS]]
    eval_results["single_hop_" + query_engine_name] = process_queries(query_engine, single_hop)

-----token-----


	Responding to queries: 100%|██████████| 1/1 [00:07<00:00,  7.74s/query]


### Saving and Restoring Evaluation Results

In [32]:
import json
import os
import pandas as pd

# Persist the per-engine score tables as JSON, one list of row records per engine.
os.makedirs('results', exist_ok=True)
serializable_results = {
    model_name: df.to_dict(orient='records')
    for model_name, df in eval_results.items()
}
with open('results/eval_results_basic.json', 'w') as f:
    json.dump(serializable_results, f, indent=4)

In [33]:
# Read the saved JSON results back from disk.
with open('results/eval_results_basic.json', 'r') as f:
    loaded_results = json.load(f)

# Rebuild one DataFrame per saved entry, keyed by the same name.
eval_results_loaded = {
    model_name: pd.DataFrame(data) 
    for model_name, data in loaded_results.items()
}

In [34]:
from pandas.testing import assert_frame_equal

def quick_compare(original, loaded):
    """Report whether two ``{name: DataFrame}`` mappings hold identical frames.

    Args:
        original: Mapping of model name to the reference DataFrame.
        loaded: Mapping of model name to the DataFrame being checked.

    Returns:
        True when both mappings have the same keys and every pair of frames
        passes ``assert_frame_equal``; False otherwise. The outcome is also
        printed.
    """
    # Compare the key sets first: a key missing from `loaded` used to escape as
    # an uncaught KeyError, and an extra key in `loaded` used to go unnoticed.
    missing = sorted(set(original) - set(loaded), key=str)
    unexpected = sorted(set(loaded) - set(original), key=str)
    if missing or unexpected:
        print(f"Differences found: missing models {missing}, unexpected models {unexpected}")
        return False

    try:
        for model in original.keys():
            assert_frame_equal(original[model], loaded[model])
    except AssertionError as e:
        print(f"Differences found: {e}")
        return False

    print("All DataFrames are identical!")
    return True

In [35]:
# Check that the results reloaded from JSON match the in-memory results.
quick_compare(eval_results, eval_results_loaded)

All DataFrames are identical!


True

In [36]:
# Pretty-print the raw per-engine score tables.
pprint(eval_results)

{'single_hop_markdown':    answer_correctness  answer_relevancy  context_precision  context_recall  \
0                 NaN               NaN                NaN             0.5   
1                 NaN               NaN                NaN             NaN   

   faithfulness  factual_correctness(mode=f1)  
0           NaN                           NaN  
1           NaN                           NaN  ,
 'single_hop_markdown_element':    answer_correctness  answer_relevancy  context_precision  context_recall  \
0                 NaN          0.796463                1.0             1.0   
1                 NaN          0.930385                NaN             NaN   

   faithfulness  factual_correctness(mode=f1)  
0           NaN                           NaN  
1           NaN                           NaN  ,
 'single_hop_semantic':    answer_correctness  answer_relevancy  context_precision  context_recall  \
0                 NaN          0.796462                NaN             NaN   
1   

### Display Evaluation Results

In [38]:
# Metric columns summarised by default, in display order.
_DEFAULT_METRIC_COLUMNS = (
    'answer_correctness',
    'answer_relevancy',
    'context_precision',
    'context_recall',
    'faithfulness',
    'factual_correctness(mode=f1)',
)


def get_metrics_df(eval_results: dict, metric_columns: tuple = _DEFAULT_METRIC_COLUMNS) -> pd.DataFrame:
    """Summarise per-model evaluation scores as one row of column means per model.

    Args:
        eval_results: Mapping of model name to a DataFrame of per-test-case
            metric scores (one column per metric).
        metric_columns: Names of the metric columns to summarise, in output
            column order. Defaults to the six metrics used in this notebook.

    Returns:
        A DataFrame indexed by model name with one column per requested metric.
        Each cell is the mean over all rows of that model's frame (NaN scores
        are skipped by pandas), rounded to 3 decimals. A metric whose column is
        absent from a model's frame is reported as NaN instead of raising
        ``KeyError``.
    """
    metrics_dict = {
        model_name: {
            # Average over every test case, not only the first row.
            metric: scores[metric].mean() if metric in scores.columns else float('nan')
            for metric in metric_columns
        }
        for model_name, scores in eval_results.items()
    }

    # Now create the DataFrame
    summary = pd.DataFrame.from_dict(metrics_dict, orient='index')
    return summary.round(3)

# Build the per-engine summary table from the evaluation results.
df = get_metrics_df(eval_results)

In [39]:
# Display the summary table.
df

Unnamed: 0,answer_correctness,answer_relevancy,context_precision,context_recall,faithfulness,factual_correctness(mode=f1)
single_hop_token,,,,0.75,,
single_hop_sentence,,0.942,,0.5,,
single_hop_markdown,,,,0.5,,
single_hop_markdown_element,,0.863,1.0,1.0,,
single_hop_semantic,,0.863,,1.0,,
