### Recursive Retriever ###
- Embed chunks in a hierarchy. This facilitates matching on small chunks for retrieval, while using larger parent chunks for generation
- Supported strategies
    - S003_00 -> Recursive Retriever
    - S003_01 -> Recursive Retriever + Rerank

In [1]:
# Bootstrap: run the project's environment setup before any later cell reads os.environ.
# NOTE(review): set_environment lives in the project-local `config` module; presumably it
# exports the API keys and run parameters read below — confirm there.
import os
from config import set_environment 
set_environment()

import logging
import sys
# Uncomment the two lines below to send INFO-level logs to stdout while debugging.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook: allow asyncio code to run inside the notebook's already-running event loop
import nest_asyncio
nest_asyncio.apply()

In [2]:
from llama_index.core import Settings
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.postprocessor.cohere_rerank import CohereRerank

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding


import json
from datetime import datetime
import pandas as pd
import numpy as np

Choose the model for generation

In [3]:
# Select the LLM used for answer generation from the environment settings.
generation_llm_family = os.environ["GENERATION_LLM_FAMILY"]
generation_llm_model = os.environ["GENERATION_LLM_MODEL"]

# Both providers are configured with temperature=0.
if generation_llm_family == "OPENAI":
    Settings.llm = OpenAI(temperature=0, model=generation_llm_model)
elif generation_llm_family == "COHERE":
    Settings.llm = Cohere(
        api_key=os.environ["COHERE_API_KEY"],
        model=generation_llm_model,
        temperature=0,
    )
else:
    # Fail fast: without this, Settings.llm would stay unset here while the results
    # are still labelled with generation_llm_model.
    raise ValueError(
        f"Unsupported GENERATION_LLM_FAMILY {generation_llm_family!r}; "
        "expected 'OPENAI' or 'COHERE'"
    )


Choose the model for embedding

In [4]:
# Select the embedding model from the environment settings.
embedding_llm_family = os.environ["EMBEDDING_LLM_FAMILY"]
embedding_llm_model = os.environ["EMBEDDING_LLM_MODEL"]
# NOTE(review): the variable name is spelled "DIMESIONS"; kept as-is because it has to
# match whatever populates the environment — confirm before renaming.
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])

if embedding_llm_family == "OPENAI":
    Settings.embed_model = OpenAIEmbedding(
        model=embedding_llm_model,
        dimensions=embedding_dimensions,
    )
elif embedding_llm_family == "COHERE":
    # embedding_dimensions is not forwarded to the Cohere embedder.
    Settings.embed_model = CohereEmbedding(
        cohere_api_key=os.environ["COHERE_API_KEY"],
        model_name=embedding_llm_model,
        input_type="search_query",
    )
else:
    # Fail fast: without this, Settings.embed_model would stay unset here while the
    # results are still labelled with embedding_llm_model.
    raise ValueError(
        f"Unsupported EMBEDDING_LLM_FAMILY {embedding_llm_family!r}; "
        "expected 'OPENAI' or 'COHERE'"
    )

Set the parameters for the run here

In [5]:
# All run parameters are read from the environment; a missing variable raises KeyError.
_env = os.environ

# Evaluation inputs and outputs
eval_name = _env["EVAL_NAME"]
eval_directory = _env["EVAL_DIRECTORY"]
eval_file = _env["EVAL_FILE"]
eval_questions = _env["EVAL_QUESTIONS"]
eval_results_dir = _env["EVAL_RESULTS_DIR"]
eval_quick_test = _env["EVAL_QUICK_TEST"]

# Strategy code (S003_00 or S003_01)
rag_strategy = _env["RAG_STRATEGY"]

# Retrieval depth
similarity_top_k = int(_env["SIMILARITY_TOP_K"])

# Context post-processor: similarity cutoff handed to SimilarityPostprocessor
similarity_cutoff = float(_env["SIMILARITY_CUTOFF"])

# Node parser: parent chunk size plus underscore-separated sub-chunk sizes, e.g. "128_256"
parent_chunk_size = int(_env["PARENT_CHUNK_SIZE"])
sub_chunk_sizes_string = _env["SUB_CHUNK_SIZES"]
sub_chunk_sizes = list(map(int, sub_chunk_sizes_string.split('_')))



Pick the strategy

In [6]:
# Build the run identifier (also used as the results file name) for the chosen strategy.
# The date is formatted once so every part of the identifier uses the same value.
run_date = datetime.today().strftime('%Y-%m-%d')
run_id_prefix = (
    f"{eval_name}_{rag_strategy}_GM_{generation_llm_model}_EM_{embedding_llm_model}"
    f"_P_{parent_chunk_size}_K_{similarity_top_k}"
)

if rag_strategy == "S003_00":
    rag_strategy_desc = "Recursive_Basic"
    run_id = f"{run_id_prefix}_{run_date}"
elif rag_strategy == "S003_01":
    rag_strategy_desc = "Recursive_Rerank"
    # The reranker settings are only required (and only read) for the rerank strategy.
    reranker = os.environ["RERANKER"]
    rerank_top_n = int(os.environ["RERANK_TOP_N"])
    run_id = f"{run_id_prefix}_RR_{reranker}_N_{rerank_top_n}_{run_date}"
else:
    # Previously an unknown strategy surfaced as a NameError on run_id below.
    raise ValueError(
        f"Unsupported RAG_STRATEGY {rag_strategy!r}; expected 'S003_00' or 'S003_01'"
    )

output_file = f"{eval_results_dir}/{run_id}.xlsx"

Set up Token Counting

In [7]:
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

# Tokens are always counted with the gpt-4 tokenizer, whichever models were selected above.
gpt4_encoding = tiktoken.encoding_for_model("gpt-4")
token_counter = TokenCountingHandler(tokenizer=gpt4_encoding.encode)

# Register the counter globally through Settings.
Settings.callback_manager = CallbackManager([token_counter])

# Token totals collected at each stage; filled in by later cells.
tokencount_df = pd.DataFrame()

Read the documents, create chunks, calculate embeddings, store in a vector database

In [8]:
# Load the PDF and merge everything the reader returns into one Document.
loader = PDFReader()
docs0 = loader.load_data(file=Path(eval_file))
doc_text = "\n\n".join(page.get_content() for page in docs0)
docs = [Document(text=doc_text)]

# Parent chunks, given stable ids ("node-0", "node-1", ...).
node_parser = SentenceSplitter(chunk_size=parent_chunk_size)
base_nodes = node_parser.get_nodes_from_documents(docs)
for position, parent in enumerate(base_nodes):
    parent.id_ = f"node-{position}"

# One splitter per configured sub-chunk size.
sub_node_parsers = [
    SentenceSplitter(chunk_size=size, chunk_overlap=20) for size in sub_chunk_sizes
]

all_nodes = []
for base_node in base_nodes:
    # Every sub-chunk becomes an IndexNode that references its parent's node id.
    for splitter in sub_node_parsers:
        all_nodes.extend(
            IndexNode.from_text_node(child, base_node.node_id)
            for child in splitter.get_nodes_from_documents([base_node])
        )

    # The parent chunk itself is indexed too, after its sub-chunks.
    all_nodes.append(IndexNode.from_text_node(base_node, base_node.node_id))

# Lookup table from node id to node, passed to the recursive retriever later.
all_nodes_dict = {}
for index_node in all_nodes:
    all_nodes_dict[index_node.node_id] = index_node

vector_index_chunk = VectorStoreIndex(all_nodes, embed_model=Settings.embed_model)

In [9]:
# Record the embedding tokens counted so far, i.e. those spent while building the index.
tokencount_df['document_tokens'] = [token_counter.total_embedding_token_count]
# Reset so the next measurement starts from zero.
token_counter.reset_counts()

Set up retrieval and response generation

In [10]:
# Top-k vector retriever over every indexed node (sub-chunks and parents).
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=similarity_top_k)

# Recursive retriever rooted at the "vector" retriever; node_dict supplies the node lookup.
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=False,
)

# Both strategies apply the similarity cutoff first; S003_01 then reranks with Cohere.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)
if rag_strategy == "S003_00":
    node_postprocessors = [similarity_filter]
elif rag_strategy == "S003_01":
    cohere_rerank = CohereRerank(api_key=os.environ["COHERE_API_KEY"], top_n=rerank_top_n)
    node_postprocessors = [similarity_filter, cohere_rerank]

query_engine = RetrieverQueryEngine.from_args(
    retriever_chunk,
    llm=Settings.llm,
    node_postprocessors=node_postprocessors,
)

Quick test of query engine

In [11]:
# Smoke test: run one question through the engine and print the answer with its sources.
response = query_engine.query(eval_quick_test)
print(f"Question:{eval_quick_test}{chr(10)}")
print(f"Response:{chr(10)}{response.response}{chr(10)}")

# One section per source node, each terminated by a "~~~~" line.
text_md = "".join(
    f"**Node ID:** {hit.node.node_id}{chr(10)}"
    f"**Similarity:** {hit.score}{chr(10)}"
    f"**Text:** {hit.node.get_content()}{chr(10)}"
    f"**Metadata:** {hit.node.metadata}{chr(10)}"
    f"~~~~{chr(10)}"
    for hit in response.source_nodes
)
print(text_md)

Question:Are bifocals covered?

Response:
Yes, bifocals are covered.

**Node ID:** node-17
**Similarity:** 0.6348324389459428
**Text:** Health and Well-Being 26
Trying to decide whether the Standard Plan 
or Premier Plan is right for you? Talk to ALEX 
to discuss your options and get a personal 
recommendation based on your needs and 
your budget. See page 3  for step-by-step 
instructions to get started with ALEX.Ask  
Vision 
Vision coverage is offered through VSP. You can see 
any provider, but if you see an out-of-network provider, 
the plan will reimburse you up to a certain amount. 
The Premier Plan  includes a higher allowance for 
frames and contacts. You can also receive frames every 
calendar year with the Premier Plan  instead of every 
other calendar year with the Standard Plan .Plan participants are eligible for a variety of savings 
through VSP, including discounts on additional pairs 
of eyeglasses, sunglasses and LASIK surgery.
Need to find a VSP vision provider?
Contac

- Read the evaluation question set (along with expected answers)
- This is structured in LlamaIndex's format for batch evaluations
- Also, load into a data frame (which we will write back to an excel file with responses, evaluations etc.)

In [12]:
# Load the question set; only the read needs the file handle.
with open(eval_questions, 'r') as file:
    data = pd.read_json(file)

# The 'queries' and 'responses' columns are both keyed by query id.
queries_df = pd.DataFrame(
    list(data['queries'].items()),
    columns=['query_num', 'query'],
)
expected_df = pd.DataFrame(
    list(data['responses'].items()),
    columns=['query_num', 'expected_answer'],
)

# One row per question: id, question text and expected answer.
responses_df = queries_df.merge(expected_df, on='query_num')


In [13]:
from llama_index.core.evaluation.eval_utils import (
    get_responses,
)
from llama_index.core.evaluation import QueryResponseDataset

In [14]:
# Load the same question file in LlamaIndex's dataset format.
eval_dataset = QueryResponseDataset.from_json(eval_questions)
eval_qs = eval_dataset.questions

# The reference answer is the second element of each (query, response) pair.
ref_response_strs = [pair[1] for pair in eval_dataset.qr_pairs]

# Run every question through the query engine.
pred_responses = get_responses(eval_qs, query_engine, show_progress=True)
pred_response_strs = list(map(str, pred_responses))

  return cls(**data)
100%|██████████| 36/36 [00:22<00:00,  1.61it/s]


In [15]:
from evaluation_utils import get_eval_results_df, get_summary_scores_df, get_answers_source_nodes

# Separate the generated answers from their source-node text.
answers, sources = get_answers_source_nodes(pred_responses)
responses_df['generated_answer'] = answers

# Split the source text on the "~~~~" separator into one column per piece,
# then attach the pieces to the question id and text.
source_columns = pd.DataFrame(sources)[0].str.split("~~~~", expand=True)
sources_df = responses_df[['query_num', 'query']].copy().join(source_columns)

# LLM tokens counted since the last reset are recorded as answer-generation tokens.
tokencount_df['answer_tokens'] = [token_counter.total_llm_token_count]
token_counter.reset_counts()

In [16]:
# Save the answers before evaluation starts; the final cell writes this workbook again.
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in (
        ("Responses", responses_df),
        ("Sources", sources_df),
        ("Token Counts", tokencount_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

Choose the LLM for Evaluations

In [17]:
# Select the LLM that judges the generated answers.
evaluation_llm_family = os.environ["EVALUATION_LLM_FAMILY"]
evaluation_llm_model = os.environ["EVALUATION_LLM_MODEL"]

# The judge is stored as an extra attribute on Settings and read back when the
# CorrectnessEvaluator is built.
if evaluation_llm_family == "OPENAI":
    Settings.eval_llm = OpenAI(temperature=0, model=evaluation_llm_model)
elif evaluation_llm_family == "COHERE":
    Settings.eval_llm = Cohere(
        api_key=os.environ["COHERE_API_KEY"],
        model=evaluation_llm_model,
        temperature=0,
    )
else:
    # Fail here with a clear message instead of leaving Settings.eval_llm unset.
    raise ValueError(
        f"Unsupported EVALUATION_LLM_FAMILY {evaluation_llm_family!r}; "
        "expected 'OPENAI' or 'COHERE'"
    )

Setup Evaluations

In [18]:
from llama_index.core.evaluation import QueryResponseDataset
from llama_index.core.evaluation.eval_utils import (
    get_responses,
)
from llama_index.core.evaluation import BatchEvalRunner

from llama_index.core.evaluation import (
    CorrectnessEvaluator,
)
from deepeval.integrations.llama_index import (
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalFaithfulnessEvaluator,
    DeepEvalContextualRelevancyEvaluator,
    DeepEvalBiasEvaluator,
    DeepEvalToxicityEvaluator,
)

# LlamaIndex correctness evaluator, judged by the evaluation LLM chosen above.
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

# Every DeepEval evaluator shares the same threshold, model name and reasoning flag.
# NOTE(review): DeepEval receives the model *name* only, not the Settings.eval_llm object;
# confirm this is intended when the evaluation family is not OPENAI.
deepeval_options = {
    "threshold": 0.5,
    "model": evaluation_llm_model,
    "include_reason": True,
}
eval_deval_f = DeepEvalFaithfulnessEvaluator(**deepeval_options)
eval_deval_ar = DeepEvalAnswerRelevancyEvaluator(**deepeval_options)
eval_deval_cr = DeepEvalContextualRelevancyEvaluator(**deepeval_options)
eval_deval_b = DeepEvalBiasEvaluator(**deepeval_options)
eval_deval_t = DeepEvalToxicityEvaluator(**deepeval_options)



In [19]:
# For large eval sets (30+ questions)
evaluator_dict_essential = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
}

# For troubleshooting
evaluator_dict_extended = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
    "Context_Relevancy": eval_deval_cr,
}

# For small sets (< 10 questions)
evaluator_dict_full = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
    "Answer_Relevancy": eval_deval_ar,
    "Context_Relevancy": eval_deval_cr,
    "Bias": eval_deval_b,
    "Toxicity": eval_deval_t,
}

# Pick the set of evaluators to run
evaluator_dict = evaluator_dict_essential

# Derive the metric names from the chosen dict, so the names used when reporting
# results can never drift from the evaluators that actually ran.
evaluators = list(evaluator_dict)

batch_runner = BatchEvalRunner(evaluator_dict, workers=8)

In [20]:
# Evaluate all generated responses with the batch runner, passing the expected answers
# as the reference. The result is indexed by evaluator name in later cells
# (e.g. eval_results["Correctness"]).
# Top-level `await` works here because this runs as a notebook cell.
eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs,
    responses=pred_responses,
    reference=ref_response_strs,
)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [21]:
from evaluation_utils import get_eval_results_df, get_summary_scores_df

In [22]:
# Summarise the scores for this single run; the helper takes parallel lists of
# result sets and strategy names, so each is wrapped in a one-element list.
mean_df, sum_df = get_summary_scores_df([eval_results], [rag_strategy], evaluators)

In [23]:
def _metric_results_df(metric_name):
    """Build the per-question results frame for one evaluator's results."""
    return get_eval_results_df(
        list(responses_df['query_num']),
        list(responses_df['expected_answer']),
        eval_results[metric_name],
    )


# Correctness and Faithfulness get a "_human" column seeded with the LLM score and an
# empty human-feedback column (presumably for later manual review — confirm).
if "Correctness" in evaluators:
    correctness_df = _metric_results_df("Correctness")
    correctness_df.rename(
        columns={'score': 'correctness_llm', 'feedback': 'feedback_llm'},
        inplace=True,
    )
    correctness_df['correctness_human'] = correctness_df['correctness_llm']
    correctness_df['feedback_human'] = ""

    responses_df['correctness_llm'] = correctness_df['correctness_llm']
    responses_df['correctness_human'] = correctness_df['correctness_human']

if "Faithfulness" in evaluators:
    faithfulness_df = _metric_results_df("Faithfulness")
    faithfulness_df.rename(
        columns={'score': 'faithfulness_llm', 'feedback': 'feedback_llm'},
        inplace=True,
    )
    faithfulness_df['faithfulness_human'] = faithfulness_df['faithfulness_llm']
    faithfulness_df['feedback_human'] = ""

    responses_df['faithfulness_llm'] = faithfulness_df['faithfulness_llm']
    responses_df['faithfulness_human'] = faithfulness_df['faithfulness_human']

# The remaining metrics only contribute their raw score to the responses sheet.
if "Answer_Relevancy" in evaluators:
    answer_relevancy_df = _metric_results_df("Answer_Relevancy")
    responses_df['answer_relevancy'] = answer_relevancy_df['score']

if "Context_Relevancy" in evaluators:
    context_relevancy_df = _metric_results_df("Context_Relevancy")
    responses_df['context_relevancy'] = context_relevancy_df['score']

if "Bias" in evaluators:
    bias_df = _metric_results_df("Bias")
    responses_df['bias'] = bias_df['score']

if "Toxicity" in evaluators:
    toxicity_df = _metric_results_df("Toxicity")
    responses_df['toxicity'] = toxicity_df['score']



In [24]:
# Stamp every response row with the run configuration. Columns are added in a fixed
# order because that order becomes the spreadsheet layout.
for column, value in (
    ('rag_strategy', rag_strategy),
    ('rag_strategy_desc', rag_strategy_desc),
    ('parameter_1', parent_chunk_size),
    ('parameter_2', similarity_top_k),
):
    responses_df[column] = value

# parameter_3 holds the rerank depth; it is blank for the basic strategy.
if rag_strategy == "S003_01":
    responses_df['parameter_3'] = rerank_top_n
elif rag_strategy == "S003_00":
    responses_df['parameter_3'] = ""

for column, value in (
    ('parameter_4', ""),
    ('parameter_5', ""),
    ('model', generation_llm_model),
    ('embed_model', embedding_llm_model),
    ('eval_model', evaluation_llm_model),
    ('embed_dimensions', embedding_dimensions),
):
    responses_df[column] = value

# The reranker name is blank for the basic strategy.
if rag_strategy == "S003_01":
    responses_df['reranker'] = reranker
elif rag_strategy == "S003_00":
    responses_df['reranker'] = ""

responses_df['run_date'] = datetime.today().strftime('%Y-%m-%d')
responses_df['eval_name'] = eval_name

In [25]:
# Record the LLM tokens counted since the previous reset as evaluation tokens.
# NOTE(review): only calls routed through Settings.callback_manager reach this counter;
# whether the DeepEval evaluators' calls are included is not visible here — confirm.
tokencount_df['eval_tokens' ] = [token_counter.total_llm_token_count]
token_counter.reset_counts()

In [26]:
# Per-metric sheets, in output order. Each frame is looked up lazily because it only
# exists when its metric was evaluated. The sheet name equals the metric name.
metric_sheets = (
    ("Correctness", lambda: correctness_df),
    ("Faithfulness", lambda: faithfulness_df),
    ("Context_Relevancy", lambda: context_relevancy_df),
    ("Answer_Relevancy", lambda: answer_relevancy_df),
    ("Bias", lambda: bias_df),
    ("Toxicity", lambda: toxicity_df),
)

# Write the workbook again, this time including the evaluation results.
with pd.ExcelWriter(output_file) as writer:
    responses_df.to_excel(writer, sheet_name="Responses", index=False)
    sources_df.to_excel(writer, sheet_name="Sources", index=False)

    # Both summary tables share one sheet: sums at the top, means from row 5.
    sum_df.to_excel(writer, sheet_name="Summary", index=False, startrow=0, startcol=0)
    mean_df.to_excel(writer, sheet_name="Summary", index=False, startrow=5, startcol=0)

    for metric, get_frame in metric_sheets:
        if metric in evaluators:
            get_frame().to_excel(writer, sheet_name=metric, index=False)

    tokencount_df.to_excel(writer, sheet_name="Token Counts", index=False)