### Evaluations - LlamaIndex ###
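- Template for setting up evaluations with LlamaIndex's native evaluators.
- Builds a retrieval query engine over the documents in `data/`, then scores one sample answer with each evaluator.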

In [26]:
# Fetch API keys from config.py (project-local helper); later cells read them
# from the environment, e.g. os.environ["COHERE_API_KEY"].
import os
from config import set_environment 
set_environment()

import logging
import sys
# Uncomment the two lines below to stream INFO-level log output to stdout while debugging.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook use.
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop and the evaluators run async code on top of it — confirm.
import nest_asyncio
nest_asyncio.apply()

In [27]:
from llama_index.core import Settings

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.core import get_response_synthesizer

import pandas as pd

Set the parameters for the run here

In [28]:
# --- Node parser (document chunking) ---
chunk_size, chunk_overlap = 1024, 50

# --- Retriever: how many chunks to fetch per query ---
similarity_top_k = 3

# --- Context post-processors ---
# The keyword lists are only consumed by the (currently disabled) keyword filter.
required_key_words = [""]
excluded_key_words = [""]
similarity_cutoff = 0.2

# --- Response synthesis ---
# "minimal" on its own has worked best for our data sets so far.
response_mode_list = ["minimal"]

# Full sweep (various additional processing of the LLM response).
# It has not improved the answers for our data sets, so it stays disabled:
#response_mode_list = ["minimal", "refine", "compact", "tree_summarize", "simple_summarize", "accumulate", "compact_accumulate"]



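Pick the LLM and embedding model for generation. The next two cells are alternatives (OpenAI or Cohere): run only one, because each overwrites `Settings.llm` and `Settings.embed_model`.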

In [29]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Generation LLM: GPT-4 at temperature 0.
Settings.llm = OpenAI(model="gpt-4", temperature=0)

# Embedding model used for indexing and retrieval, reduced to 512 dimensions.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=512,
)

In [None]:
# Alternative to the OpenAI cell: running this cell replaces both Settings.llm
# and Settings.embed_model with Cohere models.
from llama_index.llms.cohere import Cohere
# NOTE(review): ServiceContext is not referenced anywhere in this cell — confirm whether the import is still needed.
from llama_index.core import ServiceContext
from llama_index.embeddings.cohere import CohereEmbedding

Settings.llm = Cohere(api_key=os.environ["COHERE_API_KEY"], model="command-r")
# NOTE(review): input_type is fixed to "search_query", yet Settings.embed_model is
# also the model handed to VectorStoreIndex to embed the document chunks —
# confirm this input type is intended for indexing as well as querying.
Settings.embed_model = CohereEmbedding(
    cohere_api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-english-v3.0",
    input_type="search_query",
)

Pick the LLM for evaluation

In [30]:
# Record the judge model's name once, then build the evaluation LLM from it so
# the two attributes cannot drift apart.
Settings.eval_model = "gpt-4-0125-preview"
Settings.eval_llm = OpenAI(model=Settings.eval_model, temperature=0)

Read the documents, create chunks, calculate embeddings, store in a vector database

In [31]:
# Read the source documents from the local "data" directory.
reader = SimpleDirectoryReader(input_dir="data")
documents = reader.load_data()

# Split the documents into chunks using the sizes from the parameters cell.
node_parser = SentenceSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
nodes = node_parser.get_nodes_from_documents(documents)

# Give every chunk a constant, position-based id: "node-0", "node-1", ...
for idx, node in enumerate(nodes):
    node.id_ = "node-" + str(idx)

# Embed the chunks and store them in an in-memory vector index.
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=Settings.embed_model,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/186 [00:00<?, ?it/s]

Set up retrieval and response generation

In [32]:
# Fetch the top-k most similar chunks from the vector index.
retriever = VectorIndexRetriever(index=index, similarity_top_k=similarity_top_k)

# Drop retrieved chunks whose similarity score falls below the cutoff.
# A KeywordNodePostprocessor(required_keywords=required_key_words,
# exclude_keywords=excluded_key_words) could be added to this list as well;
# it is currently disabled.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)
node_postprocessors = [similarity_filter]

# The most basic type of response generation: send the retrieved chunks to the
# LLM and return the received response.
query_engine_minimal = RetrieverQueryEngine(
    node_postprocessors=node_postprocessors,
    retriever=retriever,
)

- Set up the query engine(s)

In [33]:
def generate_answer(value, response_mode="minimal"):
    """Answer a question with the retrieval pipeline built in this notebook.

    Args:
        value: The question text to send to the query engine.
        response_mode: Response-synthesis mode. ``"minimal"`` (the default) or
            ``None`` uses ``query_engine_minimal`` unchanged. Any other value
            (e.g. ``"refine"``, ``"tree_summarize"``) is handed to
            ``get_response_synthesizer`` so that the answer is produced with
            that mode instead of being silently ignored.

    Returns:
        The response object returned by the query engine's ``query`` call.
    """
    if response_mode is None or response_mode == "minimal":
        return query_engine_minimal.query(value)

    # Same retriever and post-processors as the minimal engine; only the
    # response synthesizer differs, so results stay comparable across modes.
    mode_engine = RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=node_postprocessors,
        response_synthesizer=get_response_synthesizer(response_mode=response_mode),
    )
    return mode_engine.query(value)
    

- Read a set of questions from an excel file
- Generate responses (answers)

In [None]:
# Workbook holding the evaluation questions; only the "final" sheet is read.
questions_path = "questions/ORCL_UTD_SPD_Questions.xlsx"
df = pd.read_excel(io=questions_path, sheet_name="final")

### Use LlamaIndex built-in evaluators ###

In [34]:
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    EvaluationResult,
)

In [35]:
# Hand-written reference answer that the reference-based evaluators compare against.
reference = "No, aromatherapy is not covered under the Oracle America, Inc. Flexible Benefit Plan. It is listed under the category of alternative treatments that are not covered."

# Single sample question, answered once and reused by every evaluator cell.
user_input = "is aromatherapy covered"
response_object = query_engine_minimal.query(user_input)

In [36]:
def print_evaluation_result(evaluation_result:EvaluationResult):
    """Print every field of an evaluation result, one ``name -> value`` line each.

    Args:
        evaluation_result: The result object whose fields are printed.
    """
    # Field names in the order they are reported.
    field_names = (
        "query",
        "contexts",
        "response",
        "passing",
        "feedback",
        "score",
        "pairwise_source",
        "invalid_result",
        "invalid_reason",
    )
    for field_name in field_names:
        print(field_name + " -> " + str(getattr(evaluation_result, field_name)))

#### Correctness Evaluator ####
- Evaluate the relevance and correctness of a generated answer against a reference answer.

In [37]:
# Judge the generated answer against the reference answer with the evaluation LLM.
evaluator = CorrectnessEvaluator(llm=Settings.eval_llm)
evaluation_result = evaluator.evaluate_response(
    query=user_input,
    response=response_object,
    reference=reference,
)

print_evaluation_result(evaluation_result)

query -> is aromatherapy covered
contexts -> None
response -> No, aromatherapy is not covered.
passing -> True
feedback -> The generated answer directly addresses the user query with a clear and correct response, matching the essential information provided in the reference answer.
score -> 5.0
pairwise_source -> None
invalid_result -> False
invalid_reason -> None


#### Faithfulness Evaluator ####
- Measure if the response from a query engine matches any source nodes.

In [38]:
# Check the generated answer against the source nodes carried by the response object.
evaluator = FaithfulnessEvaluator(llm=Settings.eval_llm)
evaluation_result = evaluator.evaluate_response(
    response=response_object,
    query=user_input,
)

print_evaluation_result(evaluation_result)

query -> None
contexts -> ['2024 ACME  America, Inc. Flexible Benefit Plan Document and SPD                                                                                                                                                                    81  \n      \nALTERNATIVE TREATMENTS  \n• Acupuncture treatment or therapy designed to provide the Covered Person continued well -being, or \nmaintain the optimum state of health, while minimizing recurrence of the clinical status.  Acupuncture \ntreatment to address chronic pain, disease, or injury is  a Covered Health Services. For more information, \nrefer to Acupuncture in Covered Health Services.  \n• Acupressure  \n• Aromatherapy  \n• Christian Science Provider  \n• Controlled Substances (Non -FDA Approved)  \n• Holistic or Homeopathic Care  \n• Hypnotism  \n• Marijuana used for recreational use  \n• Marijuana used for medical use - even if allowed by state law and used to treat a diagnosed medical \ncondition  \n• Massage Therap

#### Relevancy Evaluator ####
- Measure if the response + source nodes match the query.

In [None]:
# Check whether the answer together with its source nodes matches the query.
evaluator = RelevancyEvaluator(llm=Settings.eval_llm)
evaluation_result = evaluator.evaluate_response(
    response=response_object,
    query=user_input,
)

print_evaluation_result(evaluation_result)

#### Semantic Similarity Evaluator ####
- Calculates the similarity score between embeddings of the generated answer and the reference answer.

In [None]:
# Compare the generated answer with the reference answer via embedding similarity.
# No LLM is passed here; the evaluator is built with its defaults.
evaluator = SemanticSimilarityEvaluator()
evaluation_result = evaluator.evaluate_response(
    query=user_input,
    response=response_object,
    reference=reference,
)

print_evaluation_result(evaluation_result)

#### Answer Relevancy Evaluator ####
- Measure the relevancy of the answer to the query

In [None]:
# Score how relevant the generated answer is to the query.
# This evaluator looks only at the query/response pair, so the reference answer
# is not passed: it was accepted as an extra keyword and ignored, which made
# the call read as if the score were reference-based.
evaluator = AnswerRelevancyEvaluator(llm=Settings.eval_llm)
evaluation_result = evaluator.evaluate_response(
    query=user_input, response=response_object
)

print_evaluation_result(evaluation_result=evaluation_result)

#### Context Relevancy Evaluator ####
- Measure the relevancy of the context to the query

In [None]:
# Score how relevant the retrieved context (the response's source nodes) is to the query.
# This evaluator looks only at the query and the contexts, so the reference
# answer is not passed: it was accepted as an extra keyword and ignored, which
# made the call read as if the score were reference-based.
evaluator = ContextRelevancyEvaluator(llm=Settings.eval_llm)
evaluation_result = evaluator.evaluate_response(
    query=user_input, response=response_object
)

print_evaluation_result(evaluation_result=evaluation_result)