### Basic Query Engine ###
- Naive RAG strategy
- Set the following 
    - Node chunk size
    - Number of nodes to retrieve
- Kneel down and pray to the RAG gods


In [None]:
# Fetch API keys from config.py (a project-local module, not shown in this notebook).
# Presumably set_environment() exports the provider keys into os.environ -- the
# Cohere cell further down reads os.environ["COHERE_API_KEY"]; verify against config.py.
import os
from config import set_environment 
set_environment()

import logging
import sys

# Route INFO-level log records to stdout so they show up in the cell output.
# basicConfig() attaches a stdout handler itself when the root logger has no
# handlers yet, and does nothing when the root logger is already configured.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Only add an explicit stdout handler when none is attached. Adding one
# unconditionally printed every record twice on a fresh kernel and stacked an
# extra handler (one more copy of every line) each time this cell was re-run.
_root_logger = logging.getLogger()
if not any(
    isinstance(handler, logging.StreamHandler)
    and getattr(handler, "stream", None) is sys.stdout
    for handler in _root_logger.handlers
):
    _root_logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook use: nest_asyncio.apply() patches asyncio so that async code
# can run inside an event loop that is already running (presumably needed
# because the notebook kernel runs its own loop -- confirm for your environment).
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import Settings

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.core import get_response_synthesizer

import pandas as pd

Set the parameters for the run here

In [None]:
# --- Chunking: handed to SentenceSplitter ---
chunk_size = 1024
chunk_overlap = 50

# --- Retrieval: nodes fetched per question; also the number of "node N"
# --- columns written to the Sources sheet
similarity_top_k = 3

# --- Context post-processing ---
# The two keyword lists are only consumed by the KeywordNodePostprocessor entry,
# which is currently commented out in node_postprocessors; today only the
# similarity cutoff is applied.
required_key_words = [""]
excluded_key_words = [""]
similarity_cutoff = 0.2

# --- Response synthesis: minimal ---
# "minimal" selects the query engine built without an explicit response
# synthesizer. It has worked best on our data sets so far.
response_mode_list = ["minimal"]

# --- Response synthesis: full sweep (extra processing of the LLM response) ---
# Has not improved the answers on our data sets. Un-comment to compare all modes;
# the optional query-engine cell must be run as well.
#response_mode_list = ["minimal", "refine", "compact", "tree_summarize", "simple_summarize", "accumulate", "compact_accumulate"]



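Pick the LLM and embedding model. Run only ONE of the next two cells (OpenAI or Cohere): both assign `Settings.llm` and `Settings.embed_model`, so if both run, the Cohere cell overrides the OpenAI one.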

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# OpenAI option: text-embedding-3-large reduced to 512 dimensions for the
# embeddings, GPT-4 at temperature 0 for answer generation.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=512,
)
Settings.llm = OpenAI(model="gpt-4", temperature=0)

In [None]:
from llama_index.llms.cohere import Cohere
from llama_index.core import ServiceContext
from llama_index.embeddings.cohere import CohereEmbedding

# Cohere option: embed-english-v3.0 for the embeddings, command-r for answer
# generation. Both clients read the key from the COHERE_API_KEY environment variable.
# NOTE(review): this same embed model (input_type="search_query") is passed to
# VectorStoreIndex to embed the document chunks. Cohere v3 models distinguish
# "search_document" from "search_query" inputs -- confirm the installed
# llama-index Cohere integration picks the right type per call, otherwise index
# with a separate "search_document" embedder. TODO confirm.
Settings.embed_model = CohereEmbedding(
    model_name="embed-english-v3.0",
    input_type="search_query",
    cohere_api_key=os.environ["COHERE_API_KEY"],
)
Settings.llm = Cohere(model="command-r", api_key=os.environ["COHERE_API_KEY"])

Set up token counting

In [None]:
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

# Tokens are counted with the tiktoken encoding for "gpt-4".
# NOTE(review): when the Cohere cell is the active LLM these counts are
# presumably only approximations of Cohere's own token usage -- confirm.
_gpt4_encoding = tiktoken.encoding_for_model("gpt-4")
token_counter = TokenCountingHandler(tokenizer=_gpt4_encoding.encode)

# Register the counter as the global callback manager.
Settings.callback_manager = CallbackManager([token_counter])

# Collects the token totals of each stage; written to the "Token Counts" sheet.
tokencount_df = pd.DataFrame()

Read the documents, split them into chunks (nodes), calculate embeddings, and store them in a vector index

In [None]:
# Load the source documents from the "data" directory.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()

# Split the documents into nodes using the chunking parameters set above.
node_parser = SentenceSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
nodes = node_parser.get_nodes_from_documents(documents)

# Overwrite the generated node ids with positional ones ("node-0", "node-1", ...)
# so the ids stay constant from run to run.
for position, chunk in enumerate(nodes):
    chunk.id_ = f"node-{position}"

# Embed the nodes with the configured embed model and build the index.
index = VectorStoreIndex(
    nodes,
    embed_model=Settings.embed_model,
    show_progress=True,
)

In [None]:
# Record the embedding tokens counted so far (i.e. while building the index)
# as the single value of the 'document_tokens' column.
tokencount_df['document_tokens'] = [token_counter.total_embedding_token_count]
# Reset AFTER reading, so the per-response-mode counts recorded later start from zero.
token_counter.reset_counts()

Set up retrieval and response generation

In [None]:
# Post-processors applied to the retrieved nodes inside the query engines.
node_postprocessors = [
    # Keyword filter -- currently disabled:
    #KeywordNodePostprocessor(
    #   required_keywords=required_key_words, exclude_keywords=excluded_key_words
    #),
    SimilarityPostprocessor(similarity_cutoff=similarity_cutoff),
]

# Fetches the similarity_top_k nodes for a query from the index.
retriever = VectorIndexRetriever(index=index, similarity_top_k=similarity_top_k)

# The most basic type of response generation: send the retrieved chunks to the
# LLM and display the received response. No explicit response synthesizer is set.
query_engine_minimal = RetrieverQueryEngine(
    node_postprocessors=node_postprocessors,
    retriever=retriever,
)

- There are several "advanced" forms of response synthesis.
- In practice they don't seem to make much difference (for our data sets), at least so far.
- So the next cell is optional: it is only needed when `response_mode_list` contains modes other than "minimal".

In [None]:
def _engine_for_mode(response_mode):
    """Build a RetrieverQueryEngine that synthesizes answers with ``response_mode``.

    Every engine shares the notebook's ``retriever`` and ``node_postprocessors``;
    only the response synthesizer differs.
    """
    return RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(response_mode=response_mode),
        node_postprocessors=node_postprocessors,
    )


# One engine per advanced synthesis mode. The variable names are looked up by
# generate_answer(), so they must stay as they are.
query_engine_refine = _engine_for_mode("refine")
query_engine_compact = _engine_for_mode("compact")
query_engine_tree_summarize = _engine_for_mode("tree_summarize")
query_engine_simple_summarize = _engine_for_mode("simple_summarize")
query_engine_accumulate = _engine_for_mode("accumulate")
query_engine_compact_accumulate = _engine_for_mode("compact_accumulate")


- Define a helper that sends a question to the query engine matching the chosen response mode

In [None]:
# Response mode -> zero-argument resolver returning the engine that serves it.
# The lambdas keep the lookup lazy: an engine variable is only dereferenced when
# its mode is requested, so the optional "advanced" engines do not have to exist
# for a "minimal"-only run.
_QUERY_ENGINE_RESOLVERS = {
    "minimal": lambda: query_engine_minimal,
    "refine": lambda: query_engine_refine,
    "compact": lambda: query_engine_compact,
    "tree_summarize": lambda: query_engine_tree_summarize,
    "simple_summarize": lambda: query_engine_simple_summarize,
    "accumulate": lambda: query_engine_accumulate,
    "compact_accumulate": lambda: query_engine_compact_accumulate,
}


def generate_answer(value, response_mode):
    """Answer ``value`` with the query engine registered for ``response_mode``.

    Args:
        value: The question passed to the engine's ``query()`` method.
        response_mode: One of "minimal", "refine", "compact", "tree_summarize",
            "simple_summarize", "accumulate" or "compact_accumulate".

    Returns:
        Whatever the selected engine's ``query(value)`` returns.

    Raises:
        ValueError: If ``response_mode`` is not a supported mode. Previously an
            unknown mode fell through every branch and returned None, which
            silently filled a whole answer column with None.
        NameError: If the engine for the requested mode has not been created
            (e.g. the optional query-engine cell was skipped).
    """
    resolver = (
        _QUERY_ENGINE_RESOLVERS.get(response_mode)
        if isinstance(response_mode, str)
        else None
    )
    if resolver is None:
        supported = ", ".join(_QUERY_ENGINE_RESOLVERS)
        raise ValueError(
            f"Unknown response_mode {response_mode!r}; expected one of: {supported}"
        )
    return resolver().query(value)

- Read a set of questions from an Excel file
- Generate a response (answer) to each question for every selected response mode

In [None]:
# Questions come from the 'final' sheet; the code below reads its 'question' column.
questions_path = "questions/ORCL_UTD_SPD_Questions.xlsx"
df = pd.read_excel(questions_path, sheet_name="final")

for response_mode in response_mode_list:
    answer_column = 'generated_answer_' + response_mode
    tokens_column = 'answer_' + response_mode + ' tokens'

    # Answer every question with the engine for this response mode.
    df[answer_column] = df["question"].apply(
        lambda question: generate_answer(question, response_mode=response_mode)
    )

    # Store the LLM tokens used by this mode, then reset so the next mode
    # is counted from zero.
    tokencount_df[tokens_column] = [token_counter.total_llm_token_count]
    token_counter.reset_counts()

- Fetch the list of source nodes (context) used to answer each question

In [None]:
from llama_index.core.schema import ImageNode, MetadataMode, NodeWithScore
from llama_index.core.utils import truncate_text

In [None]:
def fetch_node_source(query:str, n:int = 0) -> str:
    """Describe the n-th node the retriever returns for ``query`` as markdown text.

    Note that this calls ``retriever.retrieve`` directly, so the query engines'
    ``node_postprocessors`` (e.g. the similarity cutoff) are NOT applied here:
    a node listed by this function may have been filtered out before the LLM
    saw it. Each call also performs a fresh retrieval.

    Args:
        query: The question to retrieve nodes for.
        n: Position of the node in the retrieval result (0 = first).

    Returns:
        A four-line block with the node id, similarity score, text and metadata,
        or an empty string when the retriever returned no node at position ``n``
        (previously this raised IndexError, e.g. when fewer than
        ``similarity_top_k`` nodes were available).
    """
    retrievals = retriever.retrieve(query)

    try:
        hit = retrievals[n]
    except IndexError:
        # Fewer nodes came back than requested: report "no source" instead of crashing.
        return ""

    return (
        f"**Node ID:** {hit.node.node_id}\n"
        f"**Similarity:** {hit.score}\n"
        f"**Text:** {hit.node.get_content()}\n"
        f"**Metadata:** {hit.node.metadata}\n"
    )

In [None]:
# Start the Sources table from the question number and question text of each row in df.
source_df = df[['question_num', 'question']].copy()


In [None]:
# One column per retrieval rank ("node 0", "node 1", ...), holding the formatted
# source node at that rank for every question.
for rank in range(similarity_top_k):
    source_df[f"node {rank}"] = df["question"].apply(
        lambda question, rank=rank: fetch_node_source(question, n=rank)
    )

- Write the answers, sources, and token counts to an Excel file

In [None]:
# Make sure the output folder exists before writing. Without this, a missing
# "result" directory makes the export fail at the very end of the run, after
# all the embedding and LLM calls have already been made.
output_path = "result/output.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One workbook, three sheets: generated answers, retrieved sources, token usage.
with pd.ExcelWriter(output_path) as writer:
    df.to_excel(writer, sheet_name="Answers", index=False)
    source_df.to_excel(writer, sheet_name="Sources", index=False)
    tokencount_df.to_excel(writer, sheet_name="Token Counts", index=False)
    

- Use the cells below if you just want to examine the response to a single question, along with the sources retrieved for it

In [None]:
# Ad-hoc question used by the two cells below; edit and re-run to try another one.
query = "are bifocals covered"

In [None]:
# Answer the ad-hoc question with the "minimal" engine (no explicit response synthesizer).
print(generate_answer(query, response_mode="minimal"))

In [None]:
# Show each of the top-k retrieved nodes for the ad-hoc question, best match first.
for rank in range(similarity_top_k):
    print(fetch_node_source(query, n=rank))