### Auto-Merging Retriever ###

In [1]:
# Load API keys via the project-local config module.
# Presumably set_environment() exports them into os.environ (a later cell
# reads os.environ["COHERE_API_KEY"]) - confirm against config.py.
import os
from config import set_environment
set_environment()

import logging
import sys
# basicConfig attaches a single stdout handler to the root logger. Adding a
# second StreamHandler on top of it made every log record print twice (once
# formatted, once bare), so only basicConfig is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Only for notebook: the kernel already runs an event loop, and nest_asyncio
# patches it so async library code can be driven from inside cells.
import nest_asyncio
nest_asyncio.apply()

In [2]:
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.readers.file import PDFReader
from llama_index.readers.file import PyMuPDFReader

from pathlib import Path

from llama_index.core import Document
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
    get_leaf_nodes, 
    get_root_nodes
)
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

import pandas as pd

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


In [3]:
# Node parser: chunk sizes handed to HierarchicalNodeParser, listed from the
# largest level down to the smallest.
# (Alternative tried earlier: [2048, 512, 128].)
chunk_sizes = [4096, 1024, 256]

# Retriever: similarity_top_k value passed to the base vector retriever.
similarity_top_k = 6

In [None]:
# Optional OpenAI backend. This cell carries no execution count in the saved
# notebook, and the Cohere cell that follows reassigns both Settings.llm and
# Settings.embed_model, so running it only matters if that cell is skipped.
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=512,
)
Settings.llm = OpenAI(model="gpt-4", temperature=0)

In [4]:
# Cohere backend: sets the global LLM and embedding model used by every
# index / retriever / query engine created afterwards.
from llama_index.llms.cohere import Cohere
from llama_index.core import ServiceContext  # not referenced in the visible cells
from llama_index.embeddings.cohere import CohereEmbedding

Settings.llm = Cohere(
    api_key=os.environ["COHERE_API_KEY"],
    model="command-r",
)
# NOTE(review): input_type="search_query" is also in effect when the document
# chunks are embedded at index-build time. Cohere v3 embeddings distinguish
# "search_document" from "search_query"; whether the integration overrides
# this per call depends on the installed version - TODO confirm.
Settings.embed_model = CohereEmbedding(
    cohere_api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-english-v3.0",
    input_type="search_query",
)

In [5]:
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

# One-row frame that later cells fill with token totals, one column per stage.
tokencount_df = pd.DataFrame()

# Count tokens through a global callback handler.
# NOTE(review): the tokenizer is tiktoken's "gpt-4" encoding while the
# preceding cell selects Cohere models, so the totals are presumably
# approximations rather than billed Cohere tokens - confirm.
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4").encode,
)
Settings.callback_manager = CallbackManager([token_counter])

In [6]:
# Load the PDF and build the hierarchical index used by the auto-merging retriever.
loader = PyMuPDFReader()

# The reader returns a list of documents (presumably one per page - confirm);
# their text is joined with blank lines into one Document before parsing.
docs0 = loader.load("data/Oracle_United_SPD.pdf")
doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]

# Split the document into a hierarchy of nodes using the configured chunk_sizes.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=chunk_sizes
)
nodes = node_parser.get_nodes_from_documents(docs)
docstore = SimpleDocumentStore()

# Insert ALL nodes (every level of the hierarchy) into the docstore; the
# AutoMergingRetriever created later receives this storage context.
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

# Only the leaf nodes are embedded and placed in the vector index.
# root_nodes is computed here but not used by any other visible cell.
leaf_nodes = get_leaf_nodes(nodes)
root_nodes = get_root_nodes(nodes)

base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
    show_progress = True
)

Generating embeddings:   0%|          | 0/624 [00:00<?, ?it/s]

In [7]:
# Store the embedding-token total accumulated so far (index construction),
# then zero the counter so the next stage is measured on its own.
tokencount_df["document_tokens"] = [token_counter.total_embedding_token_count]
token_counter.reset_counts()

In [8]:
# Plain vector retriever over the leaf-node index.
base_retriever = base_index.as_retriever(similarity_top_k=similarity_top_k)

# Auto-merging retriever wrapping the base retriever; it is given the storage
# context whose docstore holds every node of the hierarchy. verbose=True
# makes it print the merge messages seen in the cell outputs.
retriever = AutoMergingRetriever(
    base_retriever,
    storage_context,
    verbose=True,
)

In [9]:
# Query engine that answers questions using the auto-merging retriever.
query_engine = RetrieverQueryEngine.from_args(retriever=retriever)

In [10]:
def generate_answer(value):
    """Run `value` through the notebook-level `query_engine`.

    Returns whatever `query_engine.query` returns, unmodified.
    """
    response = query_engine.query(value)
    return response

In [11]:
# Read the question sheet and answer every row, one query-engine call each.
questions_path = "questions/ORCL_UTD_SPD_Questions_Difficult.xlsx"
df = pd.read_excel(questions_path, sheet_name="Final")

df["generated_answer"] = df["question"].apply(generate_answer)

# Record the LLM token total for the answering stage, then reset the counter.
tokencount_df["answer_tokens"] = [token_counter.total_llm_token_count]
token_counter.reset_counts()

INFO:llama_index.core.retrievers.auto_merging_retriever:> Merging 2 nodes into parent node.
> Parent node id: 9f4d223d-816e-460c-bf96-7a3bd11a8d22.
> Parent node text: 2024 Oracle America, Inc. Flexible Benefit Plan Document and SPD                                 ...

> Merging 2 nodes into parent node.
> Parent node id: 9f4d223d-816e-460c-bf96-7a3bd11a8d22.
> Parent node text: 2024 Oracle America, Inc. Flexible Benefit Plan Document and SPD                                 ...

> Merging 2 nodes into parent node.
> Parent node id: 9f4d223d-816e-460c-bf96-7a3bd11a8d22.
> Parent node text: 2024 Oracle America, Inc. Flexible Benefit Plan Document and SPD                                 ...

INFO:llama_index.core.retrievers.auto_merging_retriever:> Merging 1 nodes into parent node.
> Parent node id: 2baa3a4e-07c2-4979-96cd-2b74be4e7869.
> Parent node text: By agreeing to provide this assignment in exchange for participating in and accepting benefits, Y...

> Merging 1 nodes into parent no

In [12]:
def fetch_node_source(query:str):
    """Retrieve nodes for `query` and render them as one markdown-style string.

    Uses the notebook-level `retriever`. Each retrieved node contributes four
    newline-terminated lines (Node ID, Similarity, Text, Metadata), in
    retrieval order. Returns an empty string when nothing is retrieved.
    """
    retrieved = retriever.retrieve(query)

    # chr(10) is a newline character.
    return "".join(
        f"**Node ID:** {hit.node_id}{chr(10)}"
        f"**Similarity:** {hit.score}{chr(10)}"
        f"**Text:** {hit.get_content()}{chr(10)}"
        f"**Metadata:** {hit.metadata}{chr(10)}"
        for hit in retrieved
    )

In [13]:
# Frame for the retrieved sources, keyed by the same question columns as df.
source_df = pd.DataFrame(
    {
        "question_num": df["question_num"],
        "question": df["question"],
    }
)


In [14]:
# Render the retrieved nodes for every question. fetch_node_source calls
# retriever.retrieve itself, so this is a retrieval pass separate from the
# one that happened while the answers were generated.
source_df["nodes"] = df["question"].apply(fetch_node_source)

INFO:llama_index.core.retrievers.auto_merging_retriever:> Merging 2 nodes into parent node.
> Parent node id: 9f4d223d-816e-460c-bf96-7a3bd11a8d22.
> Parent node text: 2024 Oracle America, Inc. Flexible Benefit Plan Document and SPD                                 ...

> Merging 2 nodes into parent node.
> Parent node id: 9f4d223d-816e-460c-bf96-7a3bd11a8d22.
> Parent node text: 2024 Oracle America, Inc. Flexible Benefit Plan Document and SPD                                 ...

> Merging 2 nodes into parent node.
> Parent node id: 9f4d223d-816e-460c-bf96-7a3bd11a8d22.
> Parent node text: 2024 Oracle America, Inc. Flexible Benefit Plan Document and SPD                                 ...

INFO:llama_index.core.retrievers.auto_merging_retriever:> Merging 1 nodes into parent node.
> Parent node id: 2baa3a4e-07c2-4979-96cd-2b74be4e7869.
> Parent node text: By agreeing to provide this assignment in exchange for participating in and accepting benefits, Y...

> Merging 1 nodes into parent no

In [15]:
# Export answers, retrieved sources and token counts to a single workbook,
# one sheet each.
output_path = "result/output.xlsx"

# ExcelWriter does not create missing folders; without this the export fails
# on a fresh checkout after all the API calls have already been made.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    df.to_excel(writer, sheet_name="Answers", index=False)
    source_df.to_excel(writer, sheet_name="Sources", index=False)
    tokencount_df.to_excel(writer, sheet_name="Token Counts", index=False)

In [None]:
from llama_index.core.response.notebook_utils import display_source_node

In [None]:
# Ad-hoc comparison for one question: auto-merging retriever vs. the plain
# vector retriever.
# NOTE: `nodes` is rebound here; earlier in the notebook the same name holds
# the full list of parsed hierarchical nodes.
query = "Is laser surgery for eyes covered by United"
nodes = retriever.retrieve(query)
base_nodes = base_retriever.retrieve(query)

In [None]:
# Print the rendered sources for the ad-hoc query (this calls the
# auto-merging retriever again inside fetch_node_source).
print(fetch_node_source(query))

In [None]:
# Show each node returned by the auto-merging retriever.
# source_length=256 presumably caps the displayed text length - confirm.
for node in nodes:
    display_source_node(node, source_length=256)

In [None]:
# Show each node returned by the plain vector retriever, for comparison.
# source_length=256 presumably caps the displayed text length - confirm.
for node in base_nodes:
    display_source_node(node, source_length=256)