In [1]:
from pathlib import Path
import os

In [2]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.readers.file import PDFReader, PyMuPDFReader
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.index_store.redis import RedisIndexStore
from llama_index.vector_stores.redis import RedisVectorStore
from llama_index.core.node_parser import SentenceSplitter

from redis import Redis
from redisvl.schema import IndexSchema

In [4]:
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = os.getenv("REDIS_PORT", 6379)

In [5]:
endpoint = "http://phoenix:6006/v1/traces"  # Phoenix receiver address

tracer_provider = trace_sdk.TracerProvider()
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))

LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

In [6]:
file_details = {
    "2Q24 Earnings Release_Final.pdf": "2nd quarter 2024 earnings release Final of PNC Bank with detailed financial information for first two quarters of year 2024.",
    "2Q24 Financial Supplement_Final.pdf": "Supplemental data for 2nd quarter 2024 earnings release Final of PNC Bank with financial tables.",
    "Board of Directors  PNC.pdf": "PNC Bank Board of Directors member information.",
    "PNC 2023 10-K.pdf": "PNC Bank Form 10-K report for year 2023.",
    "PNC 2023 Annual Report.pdf": "PNC Bank detailed Annual Report for year 2023.",
    "pnc_privacy_notice.pdf": "Information on what kind of customer personal information PNC Bank share and not.",
}
def get_meta(filename):
    return {
        "file_path": filename,
        "file_details": file_details.get(
            Path(filename).name, ""
        ),
    }

In [7]:
custom_schema = IndexSchema.from_dict(
    {
        # customize basic index specs
        "index": {
            "name": "user-index",
            "prefix": "index",
            "key_separator": ":",
        },
        # customize fields that are indexed
        "fields": [
            # required fields for llamaindex
            {"type": "tag", "name": "id"},
            {"type": "tag", "name": "doc_id"},
            {"type": "text", "name": "text"},
            # custom metadata fields
            {"type": "numeric", "name": "updated_at"},
            {"type": "tag", "name": "file_name"},
            # custom vector field definition for cohere embeddings
            {
                "type": "vector",
                "name": "vector",
                "attrs": {
                    "dims": 768,
                    "algorithm": "hnsw",
                    "distance_metric": "cosine",
                },
            },
        ],
    }
)

In [8]:
storage_context = StorageContext.from_defaults(
    docstore=RedisDocumentStore.from_host_and_port(
        host=REDIS_HOST, port=REDIS_PORT, namespace="llama_index"
    ),
    index_store=RedisIndexStore.from_host_and_port(
        host=REDIS_HOST, port=REDIS_PORT, namespace="llama_index"
    ),
    vector_store=RedisVectorStore(redis_client=Redis.from_url(f"redis://{REDIS_HOST}:{REDIS_PORT}"), 
                                  overwrite=True,
                                  schema=custom_schema
                                  )
)

ConnectionError: Error 113 connecting to 172.30.0.4:6379. No route to host.

In [13]:
# PDF Reader with `SimpleDirectoryReader`
parser = PyMuPDFReader()
file_extractor = {".pdf": parser}

In [14]:
documents = SimpleDirectoryReader(
    "./pnc", file_metadata=get_meta, file_extractor=file_extractor
).load_data()

In [9]:
# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Groq
Settings.llm = Groq(
    model="llama-3.1-8b-instant"
)

In [15]:
index = VectorStoreIndex.from_documents(
    documents
)

In [16]:
query_engine = index.as_query_engine()

In [17]:
response = query_engine.query(
    "Who is PNC Chairman? Provide evidence from documents to support your answer."
)
print(response)

William S. Demchak is the Chairman & Chief Executive Officer of The PNC Financial Services Group. This is supported by the document, which states: "William S. Demchak is chairman and chief executive officer of The PNC Financial Services Group, one of the largest diversified financial services companies in the United States."


In [18]:
response = query_engine.query("Tell me about Bryan Salesky work experiance.")
print(response)

Bryan Salesky has a significant work experience in the technology industry. He co-founded and served as the CEO of Stack AV Co., a developer of autonomous trucking solutions. Prior to this role, he co-founded and served as the CEO of Argo AI, LLC, a self-driving technology platform company. Additionally, he spent over a decade in various roles at leading technology companies, including Google and Carnegie Mellon University's National Robotics Engineering Center (NREC).


In [19]:
response = query_engine.query("What does PNC do with customer personal information?")
print(response)

PNC collects and shares customer personal information for various purposes. The types of personal information collected and shared depend on the product or service the customer has with PNC. This information can include social security numbers, income, account balances, account transactions, credit scores, and payment history.


In [20]:
response = query_engine.query(
    "What was PNC diluted earnings per common share in 2023? Provide evidence from documents to support your answer."
)
print(response)

According to the provided information, PNC diluted earnings per common share in 2023 was $12.79. This can be found in the "Basic and Diluted Earnings Per Common Share" table in the 10-K report for 2023, under the section "NOTE 13 EARNINGS PER SHARE".


In [21]:
response = query_engine.query(
    "How good PNC revenue was in 2023 compared to previous years?"
)
print(response)

PNC's revenue in 2023 was a record high, reaching $21.5 billion, a significant increase from previous years.


In [22]:
response = query_engine.query("Can you print a table showing PNC Revenue, Net Income and Total Non-interst Expenses for first two quarters of 2024? Provide evidence from documents to support your answer.")
print(response)

I cannot provide financial information for the first two quarters of 2024 as it is not available in the provided context information. Is there anything else I can help you with?
