### Advanced Parser ###
- Use LlamaParse to parse the PDF into markdown

In [1]:
# Fetch API keys from config.py
# set_environment() is a project-local helper. Later cells read
# os.environ["COHERE_API_KEY"], so it presumably exports the provider keys
# into the process environment -- TODO confirm against config.py.
import os
from config import set_environment 
set_environment()

import logging
import sys
# Uncomment the two lines below to stream INFO-level logs to the cell output.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook
# nest_asyncio.apply() patches asyncio so that event loops can be nested;
# the notebook kernel already runs a loop of its own.
import nest_asyncio
nest_asyncio.apply()

In [2]:
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownElementNodeParser
from datetime import datetime
import pandas as pd
import numpy as np

Choose the LLM

In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

#Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large",dimensions=512,)
#Settings.llm = OpenAI(temperature=0, model="gpt-4")
#Settings.model = "GPT-4"

In [4]:
from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding

# Use Cohere for both generation and embeddings; the key is read once from
# the environment.
cohere_api_key = os.environ["COHERE_API_KEY"]

# NOTE(review): input_type="search_query" is also in effect while the index is
# built. Cohere documents "search_document" for corpus text -- confirm which
# value the installed integration actually sends at indexing time.
embedding_options = {
    "cohere_api_key": cohere_api_key,
    "model_name": "embed-english-v3.0",
    "input_type": "search_query",
}

Settings.llm = Cohere(api_key=cohere_api_key, model="command-r")
Settings.embed_model = CohereEmbedding(**embedding_options)

# Display label for the generation model; it is embedded in run_id and stored
# in the 'model' column of the results.
Settings.model = "COMMAND-R"

Set the parameters for the run here

In [5]:

# Retriever Settings
# Number of nodes requested per query; passed to as_query_engine() and also
# embedded in run_id.
similarity_top_k = 4

# Context Post Processor Settings
# NOTE(review): not referenced by any other cell visible in this notebook --
# no post-processor is attached to the query engine.
similarity_cutoff = 0.2

# Short code and description of the RAG strategy being evaluated. The code is
# embedded in run_id and stored with every result row; the description is not
# referenced by the cells visible here.
rag_strategy = "S005" 
rag_strategy_desc = "Advanced PDF Parser"

# Name of the evaluation set, the PDF to parse and index, and the JSON file
# holding the questions with their expected answers.
eval_name = "ACME_UTD_SPD_ALL" 
eval_files = "./data/ACME_SPD.pdf" 
eval_questions = "./questions/ACME_UTD_SPD_Questions_All.json"

parsing_instruction="""
This document is a benefits coverage policy document.
When a benefits/coverage/exclusion is described in the document ammend to it add a text in the following benefits string format (where coverage could be an exclusion).

For {nameofcondition} and in this condition {whenDoesThecoverageApply} the coverage is {coverageDescription}. 
                                        
If the document contain a benefits TABLE that describe coverage amounts, do not ouput it as a table, but instead as a list of benefits string.
                                       
"""

# Label for this run: evaluation set, strategy code, model, top-k and today's
# date, joined with underscores. It is used as the results workbook file name.
run_date_stamp = datetime.today().strftime('%Y-%m-%d')
run_id = "_".join(
    [
        eval_name,
        rag_strategy,
        "M",
        str(Settings.model),
        "K",
        str(similarity_top_k),
        run_date_stamp,
    ]
)

Set up Token Counting

In [6]:
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

# Tokens are counted with the tiktoken encoder for "gpt-4".
# NOTE(review): the generation model configured above is Cohere, so these
# counts are presumably approximations -- confirm that is acceptable.
gpt4_encode = tiktoken.encoding_for_model("gpt-4").encode
token_counter = TokenCountingHandler(tokenizer=gpt4_encode)

# Install the counter on the global Settings callback manager.
Settings.callback_manager = CallbackManager([token_counter])

# Collects one token total per pipeline stage (filled in by later cells).
tokencount_df = pd.DataFrame()

Read the documents, create chunks, calculate embeddings, store in a vector database

In [7]:
# Parse the PDF into markdown with LlamaParse, guided by parsing_instruction.
pdf_parser = LlamaParse(
    result_type="markdown",
    parsing_instruction=parsing_instruction,
)
documents = pdf_parser.load_data(eval_files)

# Turn the markdown documents into nodes; the parser is given the configured
# LLM and 8 workers.
node_parser = MarkdownElementNodeParser(llm=Settings.llm, num_workers=8)
nodes = node_parser.get_nodes_from_documents(documents)

# Separate the parsed nodes into base nodes and objects, then index both.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
recursive_index = VectorStoreIndex(nodes=base_nodes + objects)

Started parsing the file under job_id 0b1e5c13-f3e0-470a-9236-1ff667fd948b
Embeddings have been explicitly disabled. Using MockEmbedding.


0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [8]:
# Record the embedding tokens counted so far (parsing + index build) as a
# one-row column.
# NOTE(review): only the embedding total is read; any LLM tokens counted up to
# this point are discarded by the reset below -- confirm this is intended.
tokencount_df['document_tokens'] = [token_counter.total_embedding_token_count]
# Zero the counters so the next stage is measured on its own.
token_counter.reset_counts()

Set up retrieval and response generation

In [9]:
# Query engine over the index, asking for `similarity_top_k` nodes per query.
# NOTE(review): similarity_cutoff from the parameters cell is not applied
# here -- confirm whether a similarity post-processor was intended.
query_engine = recursive_index.as_query_engine(similarity_top_k=similarity_top_k)

Quick test of response generation

In [10]:
# Smoke test: ask one question and show the answer plus the retrieved sources.
response = query_engine.query("Are bifocals covered")
print(f"Response ---\n{response.response}\n")

# One markdown-style record per retrieved source node, each closed by a
# "~~~~" separator line.
text_md = "".join(
    f"**Node ID:** {hit.node.node_id}\n"
    f"**Similarity:** {hit.score}\n"
    f"**Text:** {hit.node.get_content()}\n"
    f"**Metadata:** {hit.node.metadata}\n"
    "~~~~\n"
    for hit in response.source_nodes
)
print(text_md)


Response ---
Yes, bifocals are covered. Under the section 'Benefits', it states that 'For Two Pairs of Lenses In Lieu Of Bifocals, the coverage is VSP will pay or deny claims within 30 calendar days of the receipt of the claim from the individual or their authorized representative.' It is also mentioned that VSP pays in full for any necessary spectacle lenses, which includes bifocals.

**Node ID:** 826ec402-0e8c-46eb-9737-04e3bdb10af1
**Similarity:** 0.4531044297653135
**Text:** How Benefits Eligibility is Determined

Your benefits eligibility for vision is based on a Calendar Year accumulation, even when You switch vision plans. Please contact VSP at 1-800-877-7195, or access the VSP website at www.vsp.com to obtain information on the date You last received an examination and/or purchased glasses (frames & lenses) and/or contacts and the date You will next be eligible to receive services.

Benefits:

- For vision and in this condition when You switch vision plans the coverage is based

- Read the evaluation question set (along with the expected answers)
- This is structured in LlamaIndex's format for batch evaluations
- Also, load it into a DataFrame (which we will write back to an Excel file with responses, evaluations, etc.)

In [11]:
# Load the question file into DataFrames for reporting.
# The file is opened as UTF-8 explicitly: an `encoding` argument given to
# pd.read_json cannot re-decode an already-open text handle, so without this
# the platform's default encoding would be used.
with open(eval_questions, 'r', encoding="utf8") as file:
    data = pd.read_json(file)

# The JSON has top-level "queries" and "responses" objects; each is turned
# into a two-column frame of (id, text).
queries_df = pd.DataFrame(list(data['queries'].items()), columns=['query_num', 'query'])
responses_df = pd.DataFrame(list(data['responses'].items()), columns=['query_num', 'expected_answer'])

# One row per question: id, question text and expected answer.
responses_df = pd.merge(queries_df, responses_df, on='query_num')


In [12]:
from llama_index.core.evaluation.eval_utils import (
    get_responses,
)
from llama_index.core.evaluation import QueryResponseDataset

Send the questions to the query engine in bulk

In [13]:
# Load the same question file in LlamaIndex's batch-evaluation format.
eval_dataset = QueryResponseDataset.from_json(eval_questions)
eval_qs = eval_dataset.questions

# Reference answers are the second element of each (query, response) pair.
ref_response_strs = [pair[1] for pair in eval_dataset.qr_pairs]

# Send every question through the query engine, with a progress bar.
pred_responses = get_responses(eval_qs, query_engine, show_progress=True)
pred_response_strs = list(map(str, pred_responses))

  return cls(**data)
100%|██████████| 82/82 [00:55<00:00,  1.48it/s] 


In [14]:
from evaluation_utils import get_eval_results_df, get_summary_scores_df, get_answers_source_nodes

# Project helper: splits the predictions into answers and source-node text
# (presumably one entry per question -- verify against evaluation_utils).
answers, sources = get_answers_source_nodes(pred_responses)

responses_df['generated_answer'] = answers

# One row per question; the source text is split on the "~~~~" separator into
# one column per piece and attached next to the question id and text.
source_columns = pd.DataFrame(sources)[0].str.split("~~~~", expand=True)
sources_df = responses_df[['query_num', 'query']].copy().join(source_columns)

# Record the LLM tokens counted while answering, then zero the counters.
tokencount_df['answer_tokens'] = [token_counter.total_llm_token_count]
token_counter.reset_counts()

In [15]:
# Checkpoint: save answers, sources and token counts before the evaluation
# stage runs. The same path is written again by the final cell.
# Create the output folder first; ExcelWriter cannot open a file inside a
# directory that does not exist.
os.makedirs("./evaluation", exist_ok=True)
with pd.ExcelWriter(f"./evaluation/{run_id}.xlsx") as writer:
    responses_df.to_excel(writer, sheet_name="Responses", index=False)
    sources_df.to_excel(writer, sheet_name="Sources", index=False)
    tokencount_df.to_excel(writer, sheet_name="Token Counts", index=False)

Choose the LLM for evaluations

In [16]:
# Judge model for the evaluation stage. The name is stored once and reused so
# the label and the LLM instance cannot disagree.
Settings.eval_model = "gpt-4-0125-preview"
Settings.eval_llm = OpenAI(temperature=0, model=Settings.eval_model)

Setup Evaluations

In [17]:
from llama_index.core.evaluation.eval_utils import (
    get_responses,
)
from llama_index.core.evaluation import BatchEvalRunner

from llama_index.core.evaluation import (
    CorrectnessEvaluator,
)
from deepeval.integrations.llama_index import (
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalFaithfulnessEvaluator,
    DeepEvalContextualRelevancyEvaluator,
    DeepEvalBiasEvaluator,
    DeepEvalToxicityEvaluator,
)

# LlamaIndex correctness evaluator, judged by the evaluation LLM.
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

# Every DeepEval evaluator is built with the same threshold, judge model name
# and include_reason flag.
deepeval_options = dict(
    threshold=0.5,
    model=Settings.eval_model,
    include_reason=True,
)
eval_deval_f = DeepEvalFaithfulnessEvaluator(**deepeval_options)
eval_deval_ar = DeepEvalAnswerRelevancyEvaluator(**deepeval_options)
eval_deval_cr = DeepEvalContextualRelevancyEvaluator(**deepeval_options)
eval_deval_b = DeepEvalBiasEvaluator(**deepeval_options)
eval_deval_t = DeepEvalToxicityEvaluator(**deepeval_options)



In [18]:
# Three preset evaluator sets, keyed by the name under which each evaluator's
# results are reported.

# For large eval sets (30+ questions)
evaluator_dict_essential = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
}

# For troubleshooting
evaluator_dict_extended = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
    "Context_Relevancy": eval_deval_cr,
}

# For small sets (< 10 questions)
evaluator_dict_full = {
    "Correctness": eval_lidx_c,
    "Faithfulness": eval_deval_f,
    "Answer_Relevancy": eval_deval_ar,
    "Context_Relevancy": eval_deval_cr,
    "Bias": eval_deval_b,
    "Toxicity": eval_deval_t,
}

# Pick the set of evaluators to run
evaluator_dict = evaluator_dict_essential

# Names of the chosen evaluators, derived from the dict itself so this list
# cannot drift out of sync with the preset selected above.
evaluators = list(evaluator_dict)

batch_runner = BatchEvalRunner(evaluator_dict, workers=8)

Run evaluations

In [19]:
# Run the chosen evaluators over the questions, the generated responses and
# the reference answers. Later cells index the result by evaluator name.
# Top-level `await` relies on the notebook's running event loop.
eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs,
    responses=pred_responses,
    reference=ref_response_strs,
)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [20]:
from evaluation_utils import get_eval_results_df, get_summary_scores_df

In [21]:
# Summarise the scores of this run with the project helper. It is given
# one-element lists (results, strategy code) plus the evaluator names, and
# returns two frames that the final cell writes to the "Summary" sheet.
# NOTE(review): the exact contents of mean_df / sum_df are defined in
# evaluation_utils -- verify there.
mean_df, sum_df = get_summary_scores_df(
    [eval_results ],
    [rag_strategy],
    evaluators
)

In [22]:
def _evaluator_frame(evaluator_name):
    """Build the detailed results frame for one evaluator.

    Passes the question ids, the expected answers and that evaluator's entry
    in `eval_results` to the project helper `get_eval_results_df`.

    Raises:
        KeyError: if `evaluator_name` has no entry in `eval_results`.
    """
    return get_eval_results_df(
        list(responses_df['query_num']),
        list(responses_df['expected_answer']),
        eval_results[evaluator_name],
    )


# For each evaluator that was run, keep its detailed frame under a dedicated
# name (the final export cell writes these by name) and copy its 'score'
# column into responses_df.
if "Correctness" in evaluators:
    correctness_df = _evaluator_frame("Correctness")
    responses_df['correctness'] = correctness_df['score']

if "Faithfulness" in evaluators:
    faithfulness_df = _evaluator_frame("Faithfulness")
    responses_df['faithfulness'] = faithfulness_df['score']

if "Answer_Relevancy" in evaluators:
    answer_relevancy_df = _evaluator_frame("Answer_Relevancy")
    responses_df['answer_relevancy'] = answer_relevancy_df['score']

if "Context_Relevancy" in evaluators:
    context_relevancy_df = _evaluator_frame("Context_Relevancy")
    responses_df['context_relevancy'] = context_relevancy_df['score']

if "Bias" in evaluators:
    bias_df = _evaluator_frame("Bias")
    responses_df['bias'] = bias_df['score']

if "Toxicity" in evaluators:
    toxicity_df = _evaluator_frame("Toxicity")
    responses_df['toxicity'] = toxicity_df['score']



In [23]:
# Stamp every result row with the settings of this run. Each value is a
# constant applied to the whole column, added in the order listed.
run_metadata = {
    'rag_strategy': rag_strategy,
    'parameter_1': similarity_top_k,
    'parameter_2': "",
    'model': Settings.model,
    'eval_model': Settings.eval_model,
    'run_date': datetime.today().strftime('%Y-%m-%d'),
    'eval_name': eval_name,
}
for column_name, constant in run_metadata.items():
    responses_df[column_name] = constant

In [24]:
# Record the LLM tokens counted since the previous reset (i.e. after answer
# generation) as the evaluation-stage total.
tokencount_df['eval_tokens' ] = [token_counter.total_llm_token_count]
# Zero the counters again so a later stage would start from a clean slate.
token_counter.reset_counts()

In [25]:
# Final results workbook. It is written to the same path as the earlier
# checkpoint, so that file is replaced.
with pd.ExcelWriter(f"./evaluation/{run_id}.xlsx") as writer:

    def _write_sheet(frame, sheet, **placement):
        """Write `frame` to `sheet` of the open workbook, without its index."""
        frame.to_excel(writer, sheet_name=sheet, index=False, **placement)

    _write_sheet(responses_df, "Responses")
    _write_sheet(sources_df, "Sources")

    # Both summary frames share one sheet: sum_df at the top, mean_df
    # starting at row 5.
    _write_sheet(sum_df, "Summary", startrow=0, startcol=0)
    _write_sheet(mean_df, "Summary", startrow=5, startcol=0)

    # Detail sheets exist only for the evaluators that were actually run.
    if "Correctness" in evaluators:
        _write_sheet(correctness_df, "Correctness")

    if "Faithfulness" in evaluators:
        _write_sheet(faithfulness_df, "Faithfulness")

    if "Context_Relevancy" in evaluators:
        _write_sheet(context_relevancy_df, "Context_Relevancy")

    if "Answer_Relevancy" in evaluators:
        _write_sheet(answer_relevancy_df, "Answer_Relevancy")

    if "Bias" in evaluators:
        _write_sheet(bias_df, "Bias")

    if "Toxicity" in evaluators:
        _write_sheet(toxicity_df, "Toxicity")

    _write_sheet(tokencount_df, "Token Counts")
