In [1]:
import os
from pathlib import Path

In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.readers.file import PDFReader, PyMuPDFReader
from llama_index.readers.pdf_marker import PDFMarkerReader

In [3]:
# One-line description of each PDF, keyed by the file's base name.
file_details = {
    "2Q24 Earnings Release_Final.pdf": "2nd quarter 2024 earnings release Final of PNC Bank with detailed financial information for first two quarters of year 2024.",
    "2Q24 Financial Supplement_Final.pdf": "Supplemental data for 2nd quarter 2024 earnings release Final of PNC Bank with financial tables.",
    "Board of Directors  PNC.pdf": "PNC Bank Board of Directors member information.",
    "PNC 2023 10-K.pdf": "PNC Bank Form 10-K report for year 2023.",
    "PNC 2023 Annual Report.pdf": "PNC Bank detailed Annual Report for year 2023.",
    "pnc_privacy_notice.pdf": "Information on what kind of customer personal information PNC Bank share and not.",
}


def get_meta(filename):
    """Build the metadata dict for a single file.

    Args:
        filename: Path to the file; only its final component is used for
            the description lookup.

    Returns:
        A dict with two keys: "file_path" (the argument, unchanged) and
        "file_details" (the description registered in ``file_details`` for
        the file's base name, or "" when there is none).
    """
    base_name = Path(filename).name
    description = file_details.get(base_name, "")
    return {"file_path": filename, "file_details": description}

In [4]:
# PDF reader to plug into `SimpleDirectoryReader`:
# every ".pdf" file is handed to a PDFMarkerReader instance.
parser = PDFMarkerReader()
file_extractor = {".pdf": parser}
# The extra load of "./data" that used to live here was dropped: its result was
# bound to `documents`, which the next cell rebinds from "./pnc" before anything
# reads it, so the pass only added run time.

In [5]:
# Read everything under ./pnc, using `file_extractor` for parsing and
# `get_meta` to supply each file's metadata.
pnc_reader = SimpleDirectoryReader(
    "./pnc",
    file_metadata=get_meta,
    file_extractor=file_extractor,
)
documents = pnc_reader.load_data()

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout3 on device cpu with dtype torch.float32
Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype


Detecting bboxes: 100%|██████████| 3/3 [01:07<00:00, 22.52s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:50<00:00, 25.23s/it]
Finding reading order:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Register the bge-base HuggingFace embedding model on llama_index's `Settings`.
bge_embedding = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.embed_model = bge_embedding

In [None]:
# Groq LLM registered on llama_index's `Settings`.
# The API key is read from the GROQ_API_KEY environment variable so that no
# credential is stored in the notebook; if the variable is unset this cell
# raises KeyError.
# NOTE(review): a literal key was previously committed in this cell. Treat it
# as leaked and revoke/rotate it.
Settings.llm = Groq(
    model="llama-3.1-8b-instant",
    api_key=os.environ["GROQ_API_KEY"],
)

In [None]:
# Build the vector index over the loaded documents.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Query engine over `index`, created with default options (no arguments passed).
query_engine = index.as_query_engine()

In [None]:
# Ask who chairs PNC and request supporting evidence.
question = "Who is PNC Chairman? Provide evidence from documents to support your answer."
response = query_engine.query(question)
print(response)

In [None]:
# Ask about one named person's work experience.
# The query previously misspelled "experience" as "experiance"; the query text
# is what gets embedded and sent to the LLM, so the spelling is corrected.
response = query_engine.query("Tell me about Bryan Salesky work experience.")
print(response)

In [None]:
# Ask how PNC handles customer personal information.
question = "What does PNC do with customer personal information?"
response = query_engine.query(question)
print(response)

In [None]:
# Ask for the 2023 diluted EPS figure and request supporting evidence.
question = "What was PNC diluted earnings per common share in 2023? Provide evidence from documents to support your answer."
response = query_engine.query(question)
print(response)

In [None]:
# Ask how 2023 revenue compares with earlier years.
question = "How good PNC revenue was in 2023 compared to previous years?"
response = query_engine.query(question)
print(response)

In [None]:
# Ask for a table of revenue, net income and non-interest expenses for the
# first two quarters of 2024, with supporting evidence.
# The query previously misspelled "Non-interest" as "Non-interst"; the query
# text is what gets embedded and sent to the LLM, so the spelling is corrected.
response = query_engine.query(
    "Can you print a table showing PNC Revenue, Net Income and Total "
    "Non-interest Expenses for first two quarters of 2024? "
    "Provide evidence from documents to support your answer."
)
print(response)