### S004 - Fuse results from a basic vector search AND a BM25 keyword search ###
- Supported strategies
    - S004_00 -> Fusion Basic
- Reranking
    - The Fusion retriever comes with a set of re-rankers
        - RECIPROCAL_RANK 
        - RELATIVE_SCORE 
        - DIST_BASED_SCORE 
        - SIMPLE 
    - There does not seem to be the ability to use external re-rankers (like Cohere)


In [1]:
# Fetch settings from config.py.
# set_environment() is a project-local helper; every later cell reads its run
# parameters from os.environ, so this call presumably populates those variables
# (TODO confirm against config.py).
import os
from config import set_environment 
set_environment()

import logging
import sys

# Send INFO-level records to stdout so they appear in the cell output.
# basicConfig() already installs a stdout StreamHandler on the root logger when it has
# none; attaching a second stdout handler on top of it makes every record print twice
# (once with the "LEVEL:logger:" prefix and once raw), so only basicConfig() is used.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Only for notebook: the kernel already runs an asyncio event loop, and
# nest_asyncio.apply() patches asyncio so that loop can be re-entered by library
# code that drives its own async calls (the fusion retriever below uses use_async=True).
import nest_asyncio
nest_asyncio.apply()

In [2]:
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding


from datetime import datetime
import pandas as pd

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


Choose the model for generation

In [3]:
# Generation LLM: provider family and model name are taken from the environment.
generation_llm_family = os.environ["GENERATION_LLM_FAMILY"]
generation_llm_model = os.environ["GENERATION_LLM_MODEL"]

# Both supported providers are created with temperature=0.
if generation_llm_family == "OPENAI":
    Settings.llm = OpenAI(temperature=0, model=generation_llm_model)
elif generation_llm_family == "COHERE":
    Settings.llm = Cohere(
        api_key=os.environ["COHERE_API_KEY"],
        model=generation_llm_model,
        temperature=0,
    )
else:
    # Fail fast: without this branch Settings.llm is never assigned here, while the
    # run id and the result sheets would still be labelled with generation_llm_model.
    raise ValueError(
        f"Unsupported GENERATION_LLM_FAMILY {generation_llm_family!r}; "
        "expected 'OPENAI' or 'COHERE'."
    )


Choose the model for embedding

In [4]:
# Embedding model: provider family, model name and vector size come from the environment.
embedding_llm_family = os.environ["EMBEDDING_LLM_FAMILY"]
embedding_llm_model = os.environ["EMBEDDING_LLM_MODEL"]
# NOTE(review): the variable name is spelled "EMBEDDING_DIMESIONS" (sic). It is kept
# unchanged because it has to match whatever the environment/config actually exports;
# rename both sides together if the spelling is ever fixed.
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])

if embedding_llm_family == "OPENAI":
    Settings.embed_model = OpenAIEmbedding(
        model=embedding_llm_model,
        dimensions=embedding_dimensions,
    )
elif embedding_llm_family == "COHERE":
    # embedding_dimensions is not forwarded on this branch.
    # NOTE(review): this same embed model is later used to index the documents;
    # confirm that input_type="search_query" is what is wanted for the corpus side.
    Settings.embed_model = CohereEmbedding(
        cohere_api_key=os.environ["COHERE_API_KEY"],
        model_name=embedding_llm_model,
        input_type="search_query",
    )
else:
    # Fail fast instead of silently leaving Settings.embed_model unassigned here.
    raise ValueError(
        f"Unsupported EMBEDDING_LLM_FAMILY {embedding_llm_family!r}; "
        "expected 'OPENAI' or 'COHERE'."
    )

Set the parameters for the run here

In [5]:
# Evaluation inputs and outputs (all taken from the environment).
eval_name = os.environ["EVAL_NAME"]
eval_directory = os.environ["EVAL_DIRECTORY"]
eval_file = os.environ["EVAL_FILE"]
eval_questions = os.environ["EVAL_QUESTIONS"]
eval_results_dir = os.environ["EVAL_RESULTS_DIR"]
eval_quick_test = os.environ["EVAL_QUICK_TEST"]

rag_strategy = os.environ["RAG_STRATEGY"]

# Node Parser
chunk_size = int(os.environ["CHUNK_SIZE"])
# 10% overlap, computed with integer arithmetic: the splitter takes a whole number of
# tokens, whereas 0.1 * chunk_size yields a float (e.g. 51.2 for a chunk size of 512).
chunk_overlap = chunk_size // 10

# Retrieval / fusion settings.
similarity_top_k = int(os.environ["SIMILARITY_TOP_K"])
# RETRIEVER_WEIGHTS is an underscore-separated list of numbers, e.g. "0.6_0.4";
# the raw string is kept as well because it is embedded in the run id.
retriever_weights_string = os.environ["RETRIEVER_WEIGHTS"]
retriever_weights = [float(number) for number in retriever_weights_string.split('_')]
fusion_reranker = os.environ["FUSION_RERANKER"]

# Context Post Processor Settings
similarity_cutoff = float(os.environ["SIMILARITY_CUTOFF"])



Pick Strategy

In [6]:
# Map the strategy code to a human-readable description and build the run id that
# names the output workbook.
if rag_strategy == "S004_00":
    rag_strategy_desc = "Fusion_Basic"
    run_id = f"{eval_name}_{rag_strategy}_GM_{generation_llm_model}_EM_{embedding_llm_model}_C_{chunk_size}_K_{similarity_top_k}_W_{retriever_weights_string}_RR_{fusion_reranker}_{datetime.today().strftime('%Y-%m-%d')}"
else:
    # Without this branch run_id / rag_strategy_desc would be undefined (NameError on
    # the next line) or, worse, silently reused from a previous run in the same kernel.
    raise ValueError(
        f"Unsupported RAG_STRATEGY {rag_strategy!r}; this notebook only implements 'S004_00'."
    )

output_file = f"{eval_results_dir}/{run_id}.xlsx"


Set up Token Counting

In [7]:
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

# Tokens are always counted with the gpt-4 tokenizer, whichever model is configured.
_gpt4_encode = tiktoken.encoding_for_model("gpt-4").encode
token_counter = TokenCountingHandler(tokenizer=_gpt4_encode)

# Register the counter globally so LlamaIndex components created afterwards report to it.
Settings.callback_manager = CallbackManager([token_counter])

# One-row frame that collects the token totals of each stage of the run.
tokencount_df = pd.DataFrame()

Read the documents, create chunks, calculate embeddings, store in a vector database

In [8]:
# Load every document found in the evaluation directory.
documents = SimpleDirectoryReader(eval_directory).load_data()

# Split the documents into sentence-aware chunks.
node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
nodes = node_parser.get_nodes_from_documents(documents)

# Give each chunk a stable, position-based id ("node-0", "node-1", ...).
for position, chunk in enumerate(nodes):
    chunk.id_ = f"node-{position}"

# Embed the chunks and build the vector index.
index = VectorStoreIndex(
    nodes,
    embed_model=Settings.embed_model,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/371 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [9]:
# Record the embedding tokens consumed so far, then zero the counter so the next
# stage starts from a clean slate.
embedding_tokens_used = token_counter.total_embedding_token_count
tokencount_df['document_tokens'] = [embedding_tokens_used]
token_counter.reset_counts()

Set up retrieval and response generation

In [10]:
# Dense (embedding) retriever over the vector index.
vector_retriever = index.as_retriever(similarity_top_k=similarity_top_k)

# Sparse keyword retriever (BM25) over the same chunks, read from the index docstore.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=similarity_top_k
)

fused_retrievers = [vector_retriever, bm25_retriever]

# One weight per retriever, in the same order as fused_retrievers. Catch a malformed
# RETRIEVER_WEIGHTS setting here rather than deep inside the fusion step.
if len(retriever_weights) != len(fused_retrievers):
    raise ValueError(
        f"RETRIEVER_WEIGHTS must contain {len(fused_retrievers)} values "
        f"(vector, bm25); got {len(retriever_weights)}: {retriever_weights!r}"
    )

retriever = QueryFusionRetriever(
    fused_retrievers,
    similarity_top_k=similarity_top_k,
    llm=Settings.llm,
    num_queries=1,  # set this to 1 to disable query generation
    mode=fusion_reranker,
    use_async=True,
    verbose=True,
    retriever_weights=retriever_weights,
    # query_gen_prompt="...",  # we could override the query generation prompt here
)

# Query engine that answers from the fused retrieval results.
query_engine = RetrieverQueryEngine.from_args(retriever)



Quick test of query engine

In [11]:
# Smoke test: run one question through the engine and show the answer.
response = query_engine.query(eval_quick_test)
print(f"Question:{eval_quick_test}{chr(10)}")
print(f"Response:{chr(10)}{response.response}{chr(10)}")

# Dump every retrieved chunk with its fused score, text and metadata.
# chr(10) is a newline; it is used because backslashes are not allowed inside
# f-string expressions on older Python versions.
text_md = "".join(
    f"**Node ID:** {hit.node.node_id}{chr(10)}"
    f"**Similarity:** {hit.score}{chr(10)}"
    f"**Text:** {hit.node.get_content()}{chr(10)}"
    f"**Metadata:** {hit.node.metadata}{chr(10)}"
    f"~~~~{chr(10)}"
    for hit in response.source_nodes
)
print(text_md)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Question:Are bifocals covered?

Response:
Yes, bifocals are covered under the Vision Service Plan (VSP).

**Node ID:** node-256
**Similarity:** 0.03333333333333333
**Text:** • Complete the claim form. Make sure You have a copy of Your itemized receipt or statement that includes: o 
 Doctor name or office name o  Name of Patient o  Date of Service o  Each service 
received and the amount paid  
• After completing the claim form, You may attach Your receipt(s) or print and mail copies of Your claim form 
and receipt(s) to:  Vision Service Plan (VSP)   
PO Box 495918  
Cincinnati, OH 45249 -5918  
  
VISION - PLAN EXCLUSIONS (WHAT IS NOT COVERED)  
VSP pays in full

- Read the evaluation question set (along with expected answers)
- This is structured in LlamaIndex's format for batch evaluations
- Also, load into a data frame (which we will write back to an excel file with responses, evaluations etc.)

In [12]:
# Parse the evaluation JSON; only the read itself needs the open file handle.
with open(eval_questions, 'r') as file:
    data = pd.read_json(file)

# The 'queries' and 'responses' columns are both indexed by query id: turn each into
# an (id, text) table and join them on the id.
queries_df = pd.DataFrame(
    list(data['queries'].items()),
    columns=['query_num', 'query'],
)
expected_df = pd.DataFrame(
    list(data['responses'].items()),
    columns=['query_num', 'expected_answer'],
)
responses_df = queries_df.merge(expected_df, on='query_num')


Send questions to engine in bulk

In [13]:
from llama_index.core.evaluation.eval_utils import get_responses
from llama_index.core.evaluation import QueryResponseDataset

# Load the same question file in LlamaIndex's dataset format.
eval_dataset = QueryResponseDataset.from_json(eval_questions)
eval_qs = eval_dataset.questions

# Reference answers: the second element of every (question, reference) pair.
ref_response_strs = [reference for _question, reference in eval_dataset.qr_pairs]

# Run every question through the query engine.
pred_responses = get_responses(eval_qs, query_engine, show_progress=True)
pred_response_strs = list(map(str, pred_responses))

  return cls(**data)
  0%|          | 0/82 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:openai._base_client:Retrying request to /embeddings in 0.935448 seconds
Retrying request to /embeddings in 0.935448 seconds
INFO:openai._base_client:Retrying request to /embeddings in 0.851895 seconds
Retrying request to /embeddings in 0.851895 seconds
INFO:openai._base_client:Retrying request to /embeddings in 0.973528 seconds
Retrying request to /embeddings in 0.973528 seconds
INFO:openai._base_client:Retrying request to /embeddings in 0.944560 seconds
Retrying request to /embeddings in 0.944560 seconds
INFO:openai._base_client:Retrying request to /embeddings in 0.858485 seconds
Retrying request to /embeddings in 0.858485 seconds
INFO:openai._base_client:Retrying req

  1%|          | 1/82 [00:30<41:50, 31.00s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  2%|▏         | 2/82 [00:42<26:14, 19.68s/it]

INFO:openai._base_client:Retrying request to /chat/completions in 1.906895 seconds
Retrying request to /chat/completions in 1.906895 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  4%|▎         | 3/82 [00:45<15:50, 12.03s/it]

INFO:openai._base_client:Retrying request to /chat/completions in 1.929536 seconds
Retrying request to /chat/completions in 1.929536 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.760533 seconds
Retrying request to /chat/completions in 1.760533 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:openai._base_client:Retrying request to /chat/completions in 1.954716 seconds
Retrying request to /chat/completions in 1.954716 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.517230 seconds
Retrying request to /chat/completions in 1.517230 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTT

  7%|▋         | 6/82 [00:45<05:11,  4.10s/it]

INFO:openai._base_client:Retrying request to /chat/completions in 1.705429 seconds
Retrying request to /chat/completions in 1.705429 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.554048 seconds
Retrying request to /chat/completions in 1.554048 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.944635 seconds
Retrying request to /chat/completions in 1.944635 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.619361 seconds
Retrying request to /chat/completions in 1.619361 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.973434 seconds
Retrying request to /chat/completions in 1.973434 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.787233 seconds
Retrying request to /chat/completions in 1.787233 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.545013 seconds
Retrying request to /chat/completions in 1.545013 seconds
INFO:openai._

 10%|▉         | 8/82 [00:46<03:13,  2.62s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 13%|█▎        | 11/82 [00:46<01:43,  1.45s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 18%|█▊        | 15/82 [00:46<00:53,  1.26it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 22%|██▏       | 18/82 [00:46<00:34,  1.84it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 26%|██▌       | 21/82 [00:46<00:23,  2.60it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 29%|██▉       | 24/82 [00:47<00:17,  3.24it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 32%|███▏      | 26/82 [00:47<00:15,  3.54it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 34%|███▍      | 28/82 [00:47<00:12,  4.36it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 37%|███▋      | 30/82 [00:48<00:10,  4.89it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 39%|███▉      | 32/82 [00:48<00:09,  5.31it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 41%|████▏     | 34/82 [00:48<00:07,  6.64it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 44%|████▍     | 36/82 [00:48<00:06,  6.85it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 46%|████▋     | 38/82 [00:49<00:06,  6.77it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 50%|█████     | 41/82 [00:49<00:05,  7.49it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 51%|█████     | 42/82 [00:49<00:05,  7.27it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 52%|█████▏    | 43/82 [00:49<00:07,  5.57it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 55%|█████▍    | 45/82 [00:50<00:06,  5.90it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 57%|█████▋    | 47/82 [00:50<00:07,  4.69it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 60%|█████▉    | 49/82 [00:51<00:06,  5.33it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 61%|██████    | 50/82 [00:51<00:09,  3.29it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 62%|██████▏   | 51/82 [00:53<00:18,  1.64it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 65%|██████▍   | 53/82 [00:53<00:12,  2.41it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 70%|██████▉   | 57/82 [00:54<00:05,  4.53it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 72%|███████▏  | 59/82 [00:54<00:04,  5.14it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 76%|███████▌  | 62/82 [00:54<00:02,  6.94it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 78%|███████▊  | 64/82 [00:54<00:02,  6.67it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 80%|████████  | 66/82 [00:55<00:03,  5.20it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 82%|████████▏ | 67/82 [00:55<00:03,  4.73it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 83%|████████▎ | 68/82 [00:55<00:02,  4.72it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 84%|████████▍ | 69/82 [00:56<00:02,  5.14it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 85%|████████▌ | 70/82 [00:56<00:02,  4.93it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 87%|████████▋ | 71/82 [00:56<00:02,  4.40it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 88%|████████▊ | 72/82 [00:57<00:03,  2.68it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 89%|████████▉ | 73/82 [00:57<00:02,  3.09it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 90%|█████████ | 74/82 [00:57<00:02,  2.99it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 91%|█████████▏| 75/82 [00:59<00:04,  1.50it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 93%|█████████▎| 76/82 [01:00<00:04,  1.36it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 95%|█████████▌| 78/82 [01:00<00:02,  1.95it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 96%|█████████▋| 79/82 [01:01<00:01,  1.64it/s]

INFO:openai._base_client:Retrying request to /chat/completions in 3.300123 seconds
Retrying request to /chat/completions in 3.300123 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 3.656027 seconds
Retrying request to /chat/completions in 3.656027 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 98%|█████████▊| 80/82 [01:07<00:03,  1.96s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 99%|█████████▉| 81/82 [01:09<00:01,  1.90s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████| 82/82 [01:14<00:00,  1.10it/s]


In [14]:
from evaluation_utils import get_eval_results_df, get_summary_scores_df, get_answers_source_nodes

# Split the predicted responses into answer texts and their source-node dumps.
answers, sources = get_answers_source_nodes(pred_responses)
responses_df['generated_answer'] = answers

# One row per question; each source dump is split on the "~~~~" separator into one
# column per retrieved chunk.
source_columns = pd.DataFrame(sources)[0].str.split("~~~~", expand=True)
sources_df = responses_df[['query_num', 'query']].copy().join(source_columns)

# LLM tokens used since the last counter reset.
answer_tokens_used = token_counter.total_llm_token_count
tokencount_df['answer_tokens'] = [answer_tokens_used]
token_counter.reset_counts()

In [15]:
# Checkpoint: save answers, sources and token counts before the evaluation stage.
checkpoint_sheets = (
    ("Responses", responses_df),
    ("Sources", sources_df),
    ("Token Counts", tokencount_df),
)
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in checkpoint_sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

Choose the LLM for evaluations

In [16]:
# Evaluation (judge) LLM: provider family and model name come from the environment.
evaluation_llm_family = os.environ["EVALUATION_LLM_FAMILY"]
evaluation_llm_model = os.environ["EVALUATION_LLM_MODEL"]

# temperature=0 on both branches, matching how the generation LLM is configured.
# NOTE(review): the DeepEval evaluators below only receive the model *name*, not this
# object; confirm they behave as intended when the family is not OPENAI.
if evaluation_llm_family == "OPENAI":
    Settings.eval_llm = OpenAI(temperature=0, model=evaluation_llm_model)
elif evaluation_llm_family == "COHERE":
    Settings.eval_llm = Cohere(
        api_key=os.environ["COHERE_API_KEY"],
        model=evaluation_llm_model,
        temperature=0,
    )
else:
    # Fail fast instead of leaving Settings.eval_llm undefined for the evaluators.
    raise ValueError(
        f"Unsupported EVALUATION_LLM_FAMILY {evaluation_llm_family!r}; "
        "expected 'OPENAI' or 'COHERE'."
    )

Set up Evaluations

In [17]:
from llama_index.core.evaluation import QueryResponseDataset
from llama_index.core.evaluation.eval_utils import (
    get_responses,
)
from llama_index.core.evaluation import BatchEvalRunner

from llama_index.core.evaluation import (
    CorrectnessEvaluator,
)
from deepeval.integrations.llama_index import (
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalFaithfulnessEvaluator,
    DeepEvalContextualRelevancyEvaluator,
    DeepEvalBiasEvaluator,
    DeepEvalToxicityEvaluator,
)

# LlamaIndex correctness evaluator, driven by the evaluation LLM object.
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

# Every DeepEval evaluator shares the same settings: 0.5 threshold, the evaluation
# model referenced by name, and reasons included in the result.
deepeval_options = {
    "threshold": 0.5,
    "model": evaluation_llm_model,
    "include_reason": True,
}
eval_deval_f = DeepEvalFaithfulnessEvaluator(**deepeval_options)
eval_deval_ar = DeepEvalAnswerRelevancyEvaluator(**deepeval_options)
eval_deval_cr = DeepEvalContextualRelevancyEvaluator(**deepeval_options)
eval_deval_b = DeepEvalBiasEvaluator(**deepeval_options)
eval_deval_t = DeepEvalToxicityEvaluator(**deepeval_options)

In [18]:
# For large eval sets (30+ questions)
evaluator_dict_essential = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
}

# For troubleshooting
evaluator_dict_extended = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
    "Context_Relevancy": eval_deval_cr,
}

# For small sets (< 10 questions)
evaluator_dict_full = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
    "Answer_Relevancy": eval_deval_ar,
    "Context_Relevancy": eval_deval_cr,
    "Bias": eval_deval_b,
    "Toxicity": eval_deval_t,
}

# Pick the set of evaluators to run
evaluator_dict = evaluator_dict_essential

# Names of the chosen evaluators, derived from the dict so the two can never drift
# apart (later cells use this list to decide which result frames and sheets to build).
evaluators = list(evaluator_dict)

batch_runner = BatchEvalRunner(evaluator_dict, workers=8)

In [19]:
# Run every selected evaluator over the questions, the generated responses and the
# reference answers. The result is indexed by evaluator name in later cells
# (e.g. eval_results["Correctness"]).
# Top-level `await` is valid here because the cell executes inside the notebook kernel.
eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs,
    responses=pred_responses,
    reference=ref_response_strs,
)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [20]:
from evaluation_utils import get_eval_results_df, get_summary_scores_df

In [21]:
# Aggregate scores for this single run; the helper takes parallel lists of result
# sets and strategy names, so each is wrapped in a one-element list.
mean_df, sum_df = get_summary_scores_df([eval_results], [rag_strategy], evaluators)

In [22]:
def _eval_results_frame(metric_name):
    """Return the per-question results frame for one evaluator.

    Passes the query ids, the expected answers and that evaluator's entry of
    eval_results to get_eval_results_df.
    """
    return get_eval_results_df(
        list(responses_df['query_num']),
        list(responses_df['expected_answer']),
        eval_results[metric_name],
    )


# For each evaluator that was run: keep its detailed frame (written to its own sheet
# later) and copy its score column into the main responses table.
if "Correctness" in evaluators:
    correctness_df = _eval_results_frame("Correctness")
    responses_df['correctness'] = correctness_df['score']

if "Faithfulness" in evaluators:
    faithfulness_df = _eval_results_frame("Faithfulness")
    responses_df['faithfulness'] = faithfulness_df['score']

if "Answer_Relevancy" in evaluators:
    answer_relevancy_df = _eval_results_frame("Answer_Relevancy")
    responses_df['answer_relevancy'] = answer_relevancy_df['score']

if "Context_Relevancy" in evaluators:
    context_relevancy_df = _eval_results_frame("Context_Relevancy")
    responses_df['context_relevancy'] = context_relevancy_df['score']

if "Bias" in evaluators:
    bias_df = _eval_results_frame("Bias")
    responses_df['bias'] = bias_df['score']

if "Toxicity" in evaluators:
    toxicity_df = _eval_results_frame("Toxicity")
    responses_df['toxicity'] = toxicity_df['score']



In [23]:
# Constant columns describing this run, stamped onto every response row.
# Dict order is the column order in the output sheet.
run_metadata = {
    'rag_strategy': rag_strategy,
    'rag_strategy_desc': rag_strategy_desc,
    'parameter_1': chunk_size,
    'parameter_2': similarity_top_k,
    'parameter_3': retriever_weights_string,
    'parameter_4': "",
    'parameter_5': "",
    'model': generation_llm_model,
    'embed_model': embedding_llm_model,
    'eval_model': evaluation_llm_model,
    'embed_dimensions': embedding_dimensions,
    'reranker': fusion_reranker,
    'run_date': datetime.today().strftime('%Y-%m-%d'),
    'eval_name': eval_name,
}
for column_name, constant_value in run_metadata.items():
    responses_df[column_name] = constant_value

In [24]:
# LLM tokens the counter has seen since the previous reset.
# NOTE(review): confirm the evaluator LLM calls are actually routed through this
# counter; calls it cannot see are not included in the total.
eval_tokens_used = token_counter.total_llm_token_count
tokencount_df['eval_tokens'] = [eval_tokens_used]
token_counter.reset_counts()

In [25]:
# Final workbook: rewrites output_file with the responses, sources, summary scores,
# one sheet per evaluator that was run, and the token counts.
with pd.ExcelWriter(output_file) as writer:

    def _to_sheet(frame, sheet, **placement):
        """Write one frame to the named sheet without its index."""
        frame.to_excel(writer, sheet_name=sheet, index=False, **placement)

    _to_sheet(responses_df, "Responses")
    _to_sheet(sources_df, "Sources")

    # Both summary tables share one sheet: sums at the top, means starting at row 5.
    _to_sheet(sum_df, "Summary", startrow=0, startcol=0)
    _to_sheet(mean_df, "Summary", startrow=5, startcol=0)

    # A detail frame only exists for an evaluator that was selected, hence the guards.
    if "Correctness" in evaluators:
        _to_sheet(correctness_df, "Correctness")

    if "Faithfulness" in evaluators:
        _to_sheet(faithfulness_df, "Faithfulness")

    if "Context_Relevancy" in evaluators:
        _to_sheet(context_relevancy_df, "Context_Relevancy")

    if "Answer_Relevancy" in evaluators:
        _to_sheet(answer_relevancy_df, "Answer_Relevancy")

    if "Bias" in evaluators:
        _to_sheet(bias_df, "Bias")

    if "Toxicity" in evaluators:
        _to_sheet(toxicity_df, "Toxicity")

    _to_sheet(tokencount_df, "Token Counts")