In [None]:
# Install the Neo4j property-graph store integration for llama-index
# (%pip installs into the environment of the running kernel).
%pip install llama-index-graph-stores-neo4j

In [None]:
import nest_asyncio

# Apply the nest_asyncio patch once, before any async work runs in this kernel.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the llama-index calls further down start their own — confirm.
nest_asyncio.apply()

In [None]:
# Load environment variables via python-dotenv (presumably from a local .env
# file). Later cells read the Neo4j connection settings NEO4J_USERNAME,
# NEO4J_PASSWORD and NEO4J_URI from os.environ.
from dotenv import load_dotenv
load_dotenv()

In [None]:
import os

# Input PDF and the directory that receives the extracted image/table crops.
# Both can be overridden through environment variables (PDF_FILENAME and
# IMAGE_OUTPUT_DIR) so the notebook is not tied to a single machine; when the
# variables are unset the original locations are used unchanged.
filename = os.environ.get(
    "PDF_FILENAME",
    "/Users/enigma6174/STARTUP/data/pdf/lost_in_the_middle.pdf",
)
output_dir = os.environ.get(
    "IMAGE_OUTPUT_DIR",
    "/Users/enigma6174/STARTUP/data/images",
)

## Partition The Document

In [None]:
# NOTE: disabled alternative. This commented-out cell partitions the same PDF
# through the Unstructured API client (UnstructuredClient) instead of the local
# partition_pdf call used in the next cell. Nothing in this cell executes.
# NOTE(review): if this is re-enabled, the handle from open(filename, "rb") is
# never closed — wrap the read in a `with` block.

# import os

# from unstructured_client import UnstructuredClient
# from unstructured_client.models import operations, shared
# from unstructured_client.models.errors import SDKError

# # unstructured io serverless client
# client = UnstructuredClient(
#     api_key_auth=os.environ.get("UNSTRUCTURED_IO_API_KEY"),
#     server_url=os.environ.get("UNSTRUCTURED_IO_API_URL")
# )

# filename = "/Users/enigma6174/STARTUP/data/pdf/lost_in_the_middle.pdf"
# file = open(filename, "rb")

# # prepare partition request
# request = shared.PartitionParameters(
#     files=shared.Files(
#         content=file.read(),
#         file_name=filename,
#     ),
#     strategy="hi_res",
#     unique_element_ids=True,
#     extract_image_block_types=["Image", "Table"],
# )

# # send partition request
# try:
#     response = client.general.partition(request)
# except SDKError as e:
#     print(e)

In [None]:
from time import perf_counter
from unstructured.partition.pdf import partition_pdf

# Options for the local "hi_res" partitioning run. Image and Table blocks are
# requested, and they are directed to output_dir rather than to the element
# payload (extract_image_block_to_payload=False).
partition_options = dict(
    filename=filename,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_to_output_dir=output_dir,
)

# Partition the PDF and measure how long the call takes.
started_at = perf_counter()
elements = partition_pdf(**partition_options)
finished_at = perf_counter()

runtime = finished_at - started_at
print(f"total runtime: {runtime}")

## Chunk The Partitions

In [None]:
from unstructured.chunking.title import chunk_by_title

# Chunking parameters, passed straight through to chunk_by_title.
chunking_options = {
    "max_characters": 2000,
    "new_after_n_chars": 1500,
    "overlap": 100,
}

# Combine the partitioned elements into chunks.
chunks = chunk_by_title(elements, **chunking_options)

In [None]:
# Report how many partitioned elements ended up in how many chunks.
element_count, chunk_count = len(elements), len(chunks)
print(f"{element_count} elements combined into {chunk_count} chunks")

In [None]:
from unstructured.staging.base import elements_from_base64_gzipped_json

# For every chunk, list the original elements recorded in its metadata.
for current_chunk in chunks:
    chunk_metadata = current_chunk.metadata.to_dict()
    print(f"Element ID: {current_chunk.id}")

    # "orig_elements" is a gzipped, base64-encoded JSON string in the metadata
    # dict; decode it back into element objects
    encoded_elements = chunk_metadata["orig_elements"]
    source_elements = elements_from_base64_gzipped_json(encoded_elements)

    print("    Uncompressed orig_elements:")
    for source_element in source_elements:
        print(f"        {source_element.category}: {source_element.text}")
        print("\n")

In [None]:
# Display the first chunk as a plain dict to inspect its fields.
chunks[0].to_dict()

## Create Documents From Chunks 

In [None]:
from llama_index.core import Document
from unstructured.staging.base import elements_from_base64_gzipped_json

def chunk_to_document(chunk):
    """Convert one unstructured chunk into a llama-index ``Document``.

    The document text is the chunk text followed by the text of every
    ``NarrativeText`` / ``ListItem`` element recorded in the chunk's
    ``orig_elements`` metadata, one per line. The document id is the chunk id.

    The document metadata holds ``file_name``, ``language`` (first entry of the
    chunk's ``languages``) and ``page_number``. A field that the chunk metadata
    does not carry is left out instead of raising ``KeyError``/``IndexError``;
    a chunk without ``orig_elements`` simply contributes its own text.

    Args:
        chunk: an element produced by the chunking step; it must expose
            ``id``, ``text`` and ``metadata.to_dict()``.

    Returns:
        Document: with ``file_name`` excluded from the metadata shown to the LLM.
    """
    chunk_metadata = chunk.metadata.to_dict()

    # deserialize base64 encoded gzipped json (absent when the chunker was told
    # not to keep the original elements)
    encoded_elements = chunk_metadata.get("orig_elements")
    orig_elements = (
        elements_from_base64_gzipped_json(encoded_elements) if encoded_elements else []
    )

    # NOTE(review): chunk.text presumably already contains the text of these
    # elements, so appending them may duplicate content — confirm it is intended.
    text_parts = [chunk.text]
    text_parts.extend(
        orig_element.text
        for orig_element in orig_elements
        if orig_element.category in ("NarrativeText", "ListItem")
    )

    # metadata for the llama-index document; keep only fields that are present
    languages = chunk_metadata.get("languages") or [None]
    candidate_metadata = {
        "file_name": chunk_metadata.get("filename"),
        "language": languages[0],
        "page_number": chunk_metadata.get("page_number"),
    }
    doc_metadata = {
        key: value for key, value in candidate_metadata.items() if value is not None
    }

    return Document(
        text="\n".join(text_parts),
        doc_id=chunk.id,
        metadata=doc_metadata,
        excluded_llm_metadata_keys=["file_name"],
    )


# one document per chunk, in chunk order
documents = [chunk_to_document(chunk) for chunk in chunks]

In [None]:
# Sanity check: number of documents built from the chunks (one per chunk).
len(documents)

## Initialize Graph Store And Index

In [None]:
import os
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Read the Neo4j connection settings from the environment and fail fast with a
# clear message when one is missing, instead of passing None on to
# Neo4jPropertyGraphStore and getting an obscure connection error later.
neo4j_settings = {
    name: os.environ.get(name)
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
}
missing_settings = [name for name, value in neo4j_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

graph_store = Neo4jPropertyGraphStore(
    username=neo4j_settings["NEO4J_USERNAME"],
    password=neo4j_settings["NEO4J_PASSWORD"],
    url=neo4j_settings["NEO4J_URI"],
)

In [None]:
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

# Build the individual components first, then assemble the index from them.
embedding_model = OpenAIEmbedding(model_name="text-embedding-3-small")
extraction_llm = OpenAI(model="gpt-4o", temperature=0.0)
path_extractor = SchemaLLMPathExtractor(llm=extraction_llm)

# Create the property graph index from the documents, backed by graph_store.
index = PropertyGraphIndex.from_documents(
    documents,
    property_graph_store=graph_store,
    kg_extractors=[path_extractor],
    embed_model=embedding_model,
    show_progress=True,
)

## Querying And Retrieval

In [None]:
# Graph retriever; include_text=False leaves the source text out of the
# returned nodes (demonstration setting).
retriever = index.as_retriever(include_text=False)

# Try the retriever on a sample question and print the text of each result.
retrieval_question = "How does changing the location of relevant information affect the performance of LLM?"
nodes = retriever.retrieve(retrieval_question)

for retrieved_node in nodes:
    print(retrieved_node.text)

In [None]:
# Query engine over the index, this time with source text included.
query_engine = index.as_query_engine(include_text=True)

# Run a sample question through the engine and print the answer.
engine_question = "How does changing the location of relevant information affect the performance of language models?"
response = query_engine.query(engine_question)
print(response)