In [None]:
%pip install llama-index-graph-stores-neo4j

In [1]:
import nest_asyncio

nest_asyncio.apply()

In [2]:
# load environment variables
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
filename = "/Users/enigma6174/STARTUP/data/pdf/lost_in_the_middle.pdf"
output_dir = "/Users/enigma6174/STARTUP/data/images"

## Partition The Document

In [None]:
# import os

# from unstructured_client import UnstructuredClient
# from unstructured_client.models import operations, shared
# from unstructured_client.models.errors import SDKError

# # unstructured io serverless client
# client = UnstructuredClient(
#     api_key_auth=os.environ.get("UNSTRUCTURED_IO_API_KEY"),
#     server_url=os.environ.get("UNSTRUCTURED_IO_API_URL")
# )

# filename = "/Users/enigma6174/STARTUP/data/pdf/lost_in_the_middle.pdf"
# file = open(filename, "rb")

# # prepare partition request
# request = shared.PartitionParameters(
#     files=shared.Files(
#         content=file.read(),
#         file_name=filename,
#     ),
#     strategy="hi_res",
#     unique_element_ids=True,
#     extract_image_block_types=["Image", "Table"],
# )

# # send partition request
# try:
#     response = client.general.partition(request)
# except SDKError as e:
#     print(e)

In [4]:
from time import perf_counter
from unstructured.partition.pdf import partition_pdf

t1 = perf_counter()

# partition pdf file
elements = partition_pdf(
    filename=filename,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_to_output_dir=output_dir
)

t2 = perf_counter()
runtime = t2 - t1
print(f"total runtime: {runtime}")

total runtime: 62.86170066700288


## Chunk The Partitions

In [20]:
from unstructured.chunking.title import chunk_by_title

# chunk previously partitioned elements
chunks = chunk_by_title(
    elements,
    max_characters=2000,
    new_after_n_chars=1500,
    overlap=100
)

In [23]:
print(f"{len(elements)} elements combined into {len(chunks)} chunks")

247 elements combined into 58 chunks


In [37]:
from unstructured.staging.base import elements_from_base64_gzipped_json

# deserialize gzipped base64 json
for chunk in chunks:
    metadata = chunk.metadata.to_dict()
    print(f"Element ID: {chunk.id}")

    # deserialization 
    orig_elements = elements_from_base64_gzipped_json(metadata["orig_elements"])
    print(f"    Uncompressed orig_elements:")
    for orig_element in orig_elements:
        print(f"        {orig_element.category}: {orig_element.text}")
        print("\n")

Element ID: 848c9697-1ede-46c0-80ed-29edca4cb418
    Uncompressed orig_elements:
        UncategorizedText: 3 2 0 2


        Header: v o N 0 2 ] L C . s c [


        UncategorizedText: 3 v 2 7 1 3 0 . 7 0 3 2 : v i X r a


        Title: Lost in the Middle: How Language Models Use Long Contexts


        NarrativeText: Nelson F. Liu1∗ Kevin Lin2 Michele Bevilacqua3 John Hewitt1 Fabio Petroni3 2University of California, Berkeley nfliu@cs.stanford.edu Ashwin Paranjape3 Percy Liang1 1Stanford University 3Samaya AI


Element ID: 0fb21167-21d4-4d9c-b3ae-832553be25b7
    Uncompressed orig_elements:
        Title: Abstract


        NarrativeText: 20 Total Retrieved Documents (~4K tokens)


        NarrativeText: While recent language models have the abil- ity to take long contexts as input, relatively little is known about how well they use longer context. We analyze the performance of language models on two tasks that require identifying relevant information in their in- put contexts: mul

In [43]:
chunks[0].to_dict()

{'type': 'CompositeElement',
 'element_id': '848c9697-1ede-46c0-80ed-29edca4cb418',
 'text': '3 2 0 2\n\nv o N 0 2 ] L C . s c [\n\n3 v 2 7 1 3 0 . 7 0 3 2 : v i X r a\n\nLost in the Middle: How Language Models Use Long Contexts\n\nNelson F. Liu1∗ Kevin Lin2 Michele Bevilacqua3 John Hewitt1 Fabio Petroni3 2University of California, Berkeley nfliu@cs.stanford.edu Ashwin Paranjape3 Percy Liang1 1Stanford University 3Samaya AI',
 'metadata': {'file_directory': '/Users/enigma6174/STARTUP/data/pdf',
  'filename': 'lost_in_the_middle.pdf',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'last_modified': '2024-07-17T18:13:14',
  'page_number': 1,
  'orig_elements': 'eJztlt1u4zYQhV9loOuszB9JlHzVNMAibb1B0DhAgTQwKHJos5VJV6KddRd9945sB3CboMUukJtt7zxHc0jO8NPID58y7HCNIS28zaaQKaV1VbesZa6qK20ZGluU3FVFIVlbttkFZGtM2uqkKf9TZmLsrQ864XCIO72P27RYoV+uEilCyoY8J/nJ27QilVdlQeom+pBG38NDUeakKF7nzeMFnMJaspyNIWcsb17Gx3QSsmE/JFyPFdz6j9jdbbTB7A964HyHC+t7NCn2+zFhcj9gP0ww+OVaV1wVk7v55Y/z+9vJWNRkY1128gW9xtHRx

## Create Documents From Chunks 

In [48]:
from llama_index.core import Document
from unstructured.staging.base import elements_from_base64_gzipped_json

documents = []
for chunk in chunks:
    metadata = chunk.metadata.to_dict()

    # deserialize base64 encoded gzipped json
    orig_elements = elements_from_base64_gzipped_json(metadata["orig_elements"])
    doc_text = chunk.text # text content for document
    doc_id = chunk.id

    # iterate over orig_elements; append text components to doc_text
    for orig_element in orig_elements:
        if orig_element.category == "NarrativeText" or orig_element.category == "ListItem":
            doc_text  = f"{doc_text}\n{orig_element.text}"

    # metadata for each llama-index document class
    metadata = {
        "file_name": metadata["filename"],
        "language": metadata["languages"][0],
        "page_number": metadata["page_number"]
    }

    # document
    document = Document(
        text=doc_text,
        doc_id=doc_id,
        metadata=metadata,
        excluded_llm_metadata_keys=["file_name"]
    )
    documents.append(document)

In [53]:
len(documents)

58

## Initialize Graph Store And Index

In [52]:
import os
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

graph_store = Neo4jPropertyGraphStore(
    username=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
    url=os.environ.get("NEO4J_URI")
)



In [56]:
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

# create index from property graph store
index = PropertyGraphIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model_name="text-embedding-3-small"),
    kg_extractors=[
        SchemaLLMPathExtractor(
            llm=OpenAI(model="gpt-4o", temperature=0.0)
        )
    ],
    property_graph_store=graph_store,
    show_progress=True
)

Parsing nodes:   0%|          | 0/58 [00:00<?, ?it/s]

Extracting paths from text with schema: 100%|███████████████████████████████████████████████████████████████████████████████████| 60/60 [02:01<00:00,  2.02s/it]
Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.86s/it]
Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.12s/it]


## Querying And Retrieval

In [58]:
# simple graph retriever
retriever = index.as_retriever(
    include_text=False # leave out source text from returned nodes; for demonstration 
)

# test retriever
nodes = retriever.retrieve("How does changing the location of relevant information affect the performance of LLM?")

for node in nodes:
    print(node.text)

Ying Sheng -> WORKED_ON -> How long can open-source LLMs truly promise on context length
Anze Xie -> WORKED_ON -> How long can open-source LLMs truly promise on context length
Rulin Shao -> WORKED_ON -> How long can open-source LLMs truly promise on context length
Dacheng Li -> WORKED_ON -> How long can open-source LLMs truly promise on context length
O’Connor and Andreas -> WORKED_ON -> Transformer LMs
Naman Goyal -> WORKED_ON -> LLaMA
Baptiste Rozière -> WORKED_ON -> LLaMA
Timothée Lacroix -> WORKED_ON -> LLaMA
Marie-Anne Lachaux -> WORKED_ON -> LLaMA
Xavier Martinet -> WORKED_ON -> LLaMA
Gautier Izac-ard -> WORKED_ON -> LLaMA
Thibaut Lavril -> WORKED_ON -> LLaMA
Hugo Touvron -> WORKED_ON -> LLaMA
Ivgi et al. -> WORKED_ON -> question answering performance


In [59]:
# build query engine
query_engine = index.as_query_engine(include_text=True)

# test system
response = query_engine.query("How does changing the location of relevant information affect the performance of language models?")
print(str(response))

Changing the location of relevant information in the input context can impact the performance of language models, as observed in the provided text. Specifically, when relevant information occurs in the middle of the input context, the performance of language models, such as GPT-4, can degrade. This suggests that the position of relevant information within the input context can influence how well a language model processes and utilizes that information.
