### Evaluations - DeepEval ###
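- Template to set up evaluations of a LlamaIndex RAG pipeline using DeepEval evaluators
- Covers the answer relevancy, faithfulness, contextual relevancy, bias and toxicity metrics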

In [None]:
# Fetch API keys from config.py
# NOTE(review): set_environment() presumably exports the keys into os.environ
# (a later cell reads os.environ["COHERE_API_KEY"]) - confirm in config.py.
import os
from config import set_environment 
set_environment()

import logging
import sys
# Optional debugging aid: uncomment the two lines below to send INFO-level
# log records to stdout so they show up in the cell output.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook: the kernel already runs an asyncio event loop, and
# nest_asyncio.apply() patches asyncio so nested event loops are permitted.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import Settings

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.core import get_response_synthesizer

import pandas as pd

Set the parameters for the run here

In [None]:
# --- Node parser -------------------------------------------------------------
# Chunking parameters handed to SentenceSplitter when the documents are split.
chunk_size = 1024
chunk_overlap = 50

# --- Retriever ---------------------------------------------------------------
# Value passed to VectorIndexRetriever as similarity_top_k.
similarity_top_k = 3

# --- Context post-processing -------------------------------------------------
# The two keyword lists are only consumed by KeywordNodePostprocessor, which is
# commented out in the query-engine setup cell.
required_key_words = [""]
excluded_key_words = [""]
# Value passed to SimilarityPostprocessor as similarity_cutoff.
similarity_cutoff = 0.2

# --- Response synthesis ------------------------------------------------------
# "minimal" seems to work best for our data sets so far.
# The full list below adds various extra processing of the LLM response, but it
# does not seem to improve the answers for our data sets. To try it anyway:
#   response_mode_list = ["minimal", "refine", "compact", "tree_summarize",
#                         "simple_summarize", "accumulate", "compact_accumulate"]
response_mode_list = ["minimal"]



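Pick the LLM: run only one of the two cells below (OpenAI or Cohere). Both assign `Settings.llm` and `Settings.embed_model`, so if both are run the Cohere cell overwrites the OpenAI settings.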

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding model: OpenAI text-embedding-3-large, requested with dimensions=512.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=512,
)

# LLM used by the query engine to generate answers.
Settings.llm = OpenAI(model="gpt-4", temperature=0)

# Model name handed to the DeepEval evaluators (model=Settings.eval_model).
Settings.eval_model = "gpt-4-0125-preview"

In [None]:
from llama_index.llms.cohere import Cohere
from llama_index.core import ServiceContext
from llama_index.embeddings.cohere import CohereEmbedding

# Cohere alternative: replaces whatever Settings.llm / Settings.embed_model
# held before this cell ran. The API key is read from the environment.
Settings.llm = Cohere(model="command-r", api_key=os.environ["COHERE_API_KEY"])

# NOTE(review): this same embed model is later passed to VectorStoreIndex to
# embed the document nodes, so input_type="search_query" is applied to the
# documents as well - confirm whether "search_document" is wanted for indexing.
Settings.embed_model = CohereEmbedding(
    model_name="embed-english-v3.0",
    input_type="search_query",
    cohere_api_key=os.environ["COHERE_API_KEY"],
)

Read the documents, create chunks, calculate embeddings, store in a vector database

In [None]:
# Read the source documents from the "data" directory.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()

# Split the documents into nodes (chunks) using the run parameters.
node_parser = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
nodes = node_parser.get_nodes_from_documents(documents)

# Replace the node ids with constant, position-based ones: node-0, node-1, ...
for idx, node in enumerate(nodes):
    node.id_ = f"node-{idx}"

# Embed the nodes with the configured embed model and build the vector index.
index = VectorStoreIndex(
    nodes,
    embed_model=Settings.embed_model,
    show_progress=True,
)

Set up retrieval and response generation

In [None]:
# Retriever over the vector index, limited to similarity_top_k nodes per query.
retriever = VectorIndexRetriever(index=index, similarity_top_k=similarity_top_k)

# Post-processors applied to the retrieved nodes.
node_postprocessors = [
    # Keyword filtering is currently disabled; uncomment to enable it:
    # KeywordNodePostprocessor(
    #     required_keywords=required_key_words, exclude_keywords=excluded_key_words
    # ),
    SimilarityPostprocessor(similarity_cutoff=similarity_cutoff),
]

# This is the most basic type of response generation: send the retrieved
# chunks to the LLM and display the received response.
query_engine_minimal = RetrieverQueryEngine(
    node_postprocessors=node_postprocessors,
    retriever=retriever,
)

- Set up the query engine(s)

In [None]:
def generate_answer(value, response_mode="minimal"):
    """Answer a question with the RAG pipeline using the requested response mode.

    Args:
        value: The question text sent to the query engine.
        response_mode: ``"minimal"`` (the default) or ``None`` uses the shared
            ``query_engine_minimal``. Any other value (for example ``"refine"``,
            ``"compact"`` or ``"tree_summarize"``) is passed to
            ``get_response_synthesizer`` as its ``response_mode``, and a query
            engine is built around the same retriever and node post-processors.

    Returns:
        Whatever the selected query engine's ``query`` call returns.
    """
    if response_mode in (None, "minimal"):
        return query_engine_minimal.query(value)

    # The argument used to be ignored, so every mode silently produced the
    # "minimal" answer. Build an engine that actually uses the requested mode.
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(response_mode=response_mode),
        node_postprocessors=node_postprocessors,
    )
    return query_engine.query(value)

- Read a set of questions from an excel file
- Generate responses (answers)

In [None]:
# Excel workbook that holds the evaluation questions.
questions_path = "questions/ORCL_UTD_SPD_Questions.xlsx"

# Load the worksheet named 'final' into a DataFrame.
df = pd.read_excel(questions_path, sheet_name="final")

### Use DeepEval evaluators that support LlamaIndex ###

In [25]:
from deepeval.integrations.llama_index import (
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalFaithfulnessEvaluator,
    DeepEvalContextualRelevancyEvaluator,
    DeepEvalBiasEvaluator,
    DeepEvalToxicityEvaluator,
)
from llama_index.core.evaluation import (
    EvaluationResult,
)

In [None]:
# Single sample question; it is reused as the `query` argument by every
# evaluator cell below.
# NOTE(review): the "answer like donald trump" suffix presumably exists to
# provoke a styled answer for the bias/toxicity metrics - confirm intent.
user_input = "is dental insurance provided. answer like donald trump"
# Query the pipeline once and keep the response object so all evaluators score
# the same answer.
response_object = query_engine_minimal.query(user_input)

In [None]:
def print_evaluation_result(evaluation_result: EvaluationResult):
    """Print every field of an evaluation result, one ``name -> value`` line each.

    Args:
        evaluation_result: Result object whose attributes are read and printed
            in a fixed order (query, contexts, response, passing, feedback,
            score, pairwise_source, invalid_result, invalid_reason).
    """
    field_names = (
        "query",
        "contexts",
        "response",
        "passing",
        "feedback",
        "score",
        "pairwise_source",
        "invalid_result",
        "invalid_reason",
    )
    for field_name in field_names:
        # !s forces str() conversion, matching plain string concatenation.
        print(f"{field_name} -> {getattr(evaluation_result, field_name)!s}")

### Answer Relevancy Metric ###
- Measures how relevant the actual_output of your LLM application is compared to the provided input. 
- It is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score
- Not sure how useful this is

In [None]:
# Answer relevancy: score the generated answer against the question, asking
# the evaluator to include a reason for its score.
evaluator = DeepEvalAnswerRelevancyEvaluator(
    threshold=0.5,
    model=Settings.eval_model,
    include_reason=True,
)
evaluation_result = evaluator.evaluate_response(query=user_input, response=response_object)

# Dump every field of the result.
print_evaluation_result(evaluation_result=evaluation_result)

### Faithfulness Metric ###
- Measures whether the actual_output factually aligns with the contents of your retrieval_context

In [None]:
# Faithfulness: check the answer against the retrieved context, asking the
# evaluator to include a reason for its score.
evaluator = DeepEvalFaithfulnessEvaluator(
    threshold=0.5,
    model=Settings.eval_model,
    include_reason=True,
)
evaluation_result = evaluator.evaluate_response(query=user_input, response=response_object)

# Dump every field of the result.
print_evaluation_result(evaluation_result=evaluation_result)

#### Contextual Relevancy Metric ####
- Evaluates the overall relevance of the information presented in the retrieval_context for a given input

In [None]:
# Contextual relevancy: score how relevant the retrieved context is to the
# question, asking the evaluator to include a reason for its score.
evaluator = DeepEvalContextualRelevancyEvaluator(
    threshold=0.5,
    model=Settings.eval_model,
    include_reason=True,
)
evaluation_result = evaluator.evaluate_response(query=user_input, response=response_object)

# Dump every field of the result.
print_evaluation_result(evaluation_result=evaluation_result)

#### Bias Metric ####
- The bias metric determines whether your LLM output contains gender, racial, or political bias

In [None]:
# Bias: check the answer for gender, racial or political bias, asking the
# evaluator to include a reason for its score.
evaluator = DeepEvalBiasEvaluator(
    threshold=0.5,
    model=Settings.eval_model,
    include_reason=True,
)
evaluation_result = evaluator.evaluate_response(query=user_input, response=response_object)

# Dump every field of the result.
print_evaluation_result(evaluation_result=evaluation_result)

#### Toxicity Metric ####
- The toxicity metric is another referenceless metric; it evaluates the toxicity of your LLM outputs

In [None]:
# Toxicity: check the answer for toxic content, asking the evaluator to
# include a reason for its score.
evaluator = DeepEvalToxicityEvaluator(
    threshold=0.5,
    model=Settings.eval_model,
    include_reason=True,
)
evaluation_result = evaluator.evaluate_response(query=user_input, response=response_object)

# Dump every field of the result.
print_evaluation_result(evaluation_result=evaluation_result)