### Advanced PDF Parser ###

In [None]:
# Project-local helper from config.py. Presumably exports the API credentials
# (e.g. OpenAI / LlamaCloud keys) into the process environment before any
# client is constructed -- TODO confirm against config.py.
from config import set_environment
set_environment()

In [None]:
import logging
import sys

# Send INFO-and-above log records to stdout so they appear in the cell output.
# basicConfig() already installs a StreamHandler on the root logger; attaching a
# second stdout handler made every record print twice (once formatted, once
# bare) and added yet another copy each time this cell was re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [None]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered while the notebook kernel's
# own loop is already running. Presumably required by the async calls made by
# LlamaParse / llama_index further down -- confirm.
nest_asyncio.apply()

In [None]:
from llama_index.core import Settings
from llama_index.core.response.notebook_utils import display_response, display_source_node

from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.core import get_response_synthesizer

from llama_parse import LlamaParse

In [None]:
# Sample inputs available to this notebook (paths are relative to the working directory).

# Benefits material: a single PDF, and a location that is later handed to
# SimpleDirectoryReader as its input directory.
company_policy = "data/benefits_qa_store/Company - SPD.pdf"
aetna_policy = "data/benefits_qa_store/Aetna"

# Plain-text essay.
paul_graham = "data/test/paul_graham_essay.txt"


In [None]:
# ---- Tunable pipeline settings ----

# Chunking: both values are passed to SentenceSplitter.
chunk_size, chunk_overlap = 1024, 20

# Retrieval: number of nodes requested from VectorIndexRetriever.
similarity_top_k = 2

# Post-processing.
# The keyword lists are only referenced by the (currently commented-out)
# KeywordNodePostprocessor; the cutoff feeds SimilarityPostprocessor.
required_key_words, excluded_key_words = [""], [""]
similarity_cutoff = 0.2

# Response synthesis mode; "tree_summarize" is the other option tried here.
response_mode = "refine"

# Character budget passed to display_source_node when showing source nodes.
source_length = 200

# Default question text.
# NOTE(review): `query` is assigned again in a later cell before the engine is
# called, so this value is not the one that gets asked.
query = "what is the overall deductible"

# Inputs selected for this run.
# NOTE(review): only the commented-out `input_files=document_list` loader reads
# this list; the active loaders use `aetna_policy` directly.
document_list = [aetna_policy]

In [None]:
# Global embedding model used by llama_index for indexing and querying.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=512,
)

# Alternatives kept for quick switching:
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
# Global LLM used for response synthesis; temperature is pinned to 0.
Settings.llm = OpenAI(
    model="gpt-4",
    temperature=0,
)

In [None]:
# Baseline load: read everything under `aetna_policy` with SimpleDirectoryReader's
# default file handling (no custom PDF extractor).
# NOTE(review): the next cell rebinds both `reader` and `documents` using
# LlamaParse, so on a top-to-bottom run this result is discarded.
#
# Alternative, explicit-file form:
# documents = SimpleDirectoryReader(input_files=document_list).load_data()
reader = SimpleDirectoryReader(input_dir=aetna_policy)
documents = reader.load_data()

In [None]:
# LlamaParse-backed load: PDFs are routed through LlamaParse and returned as
# markdown ("text" is the other available result type).
# The API key may be passed as api_key="..." or supplied via the
# LLAMA_CLOUD_API_KEY environment variable.
parser = LlamaParse(result_type="markdown")

# Map the .pdf extension to the LlamaParse parser.
file_extractor = {".pdf": parser}

reader = SimpleDirectoryReader(
    input_dir=aetna_policy,
    file_extractor=file_extractor,
)
documents = reader.load_data()

In [None]:
# Split the loaded documents into nodes using the configured chunk settings.
node_parser = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
nodes = node_parser.get_nodes_from_documents(documents)

# Overwrite each node's id with a stable, position-based one ("node-0", "node-1", ...).
for idx, chunk in enumerate(nodes):
    chunk.id_ = f"node-{idx}"

In [None]:
# Build the vector index over the prepared nodes, showing a progress bar.
index = VectorStoreIndex(
    nodes,
    embed_model=Settings.embed_model,
    show_progress=True,
)

In [None]:
# Retriever over the vector index; returns the configured number of top matches.
retriever = VectorIndexRetriever(similarity_top_k=similarity_top_k, index=index)

In [None]:
# Post-processors applied to retrieved nodes by the query engine.
# Only the similarity filter is active. To also filter on keywords, add
#   KeywordNodePostprocessor(
#       required_keywords=required_key_words, exclude_keywords=excluded_key_words
#   )
# to this list.
node_postprocessors = [SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)]

In [None]:
# Synthesizer that turns the surviving nodes into an answer using the configured mode.
response_synthesizer = get_response_synthesizer(
    response_mode=response_mode,
)

In [None]:
# Assemble the engine from the retriever, post-processors and synthesizer built above.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=node_postprocessors,
    response_synthesizer=response_synthesizer,
)

In [21]:
# Question actually sent to the engine; this replaces the value assigned to
# `query` in the settings cell.
query = "What is the cost of mental health coverage?"

In [22]:
# Run the query through the engine (retriever, node post-processors, response
# synthesizer) and render the resulting answer in the notebook.
response = query_engine.query(query)
display_response(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


**`Final Response:`** The cost of mental health coverage under this plan includes a 20% coinsurance for both outpatient and inpatient services when using an in-network provider. Office visits for mental health services have no charge. However, these services are not covered if you use an out-of-network provider. It's also important to note that prior authorization is required for outpatient and inpatient services.

In [None]:
# Show the raw retriever hits for the same question.
# NOTE(review): this calls the retriever directly, so the node post-processors
# attached to `query_engine` are not applied; the nodes shown here may differ
# from the ones the answer was synthesized from.
retrievals = retriever.retrieve(query)
for hit in retrievals:
    display_source_node(hit, source_length=source_length)