In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access the variables
host = os.getenv("OPENSEARCH_HOST", "localhost")
port = int(os.getenv("OPENSEARCH_PORT", 9201))
url = f"http://{host}:{port}"

auth = (
    os.getenv("OPENSEARCH_USER", "admin"),
    os.getenv("OPENSEARCH_PASSWORD", "admin"),
)
index_name = os.getenv("OPENSEARCH_INDEX_NAME", "langchain")

print(f"Host: {host}, Port: {port}, Auth: {auth}, Index: {index_name}")


Host: localhost, Port: 9201, Auth: ('admin', 'Supe5Senh@'), Index: opensearch_graphvector_notebook


In [2]:
import sys
sys.path.append("/Users/pedropacheco/Projects/dev/langchain/libs/community")


In [3]:
from langchain_community.graph_vectorstores.content_graph import ContentGraph


In [4]:
import pathlib
g = ContentGraph("Graph Rag")
g.fromPDFDocument(pathlib.Path("sample10.pdf"),output_image_path="./images", reset_graph=True, infer_hierarchy=True)
g.graph


INFO:langchain_community.graph_vectorstores.content_graph:Synchronously processing PDF document from 'sample10.pdf'...
INFO:pikepdf._core:pikepdf C++ to Python logger bridge initialized
INFO:unstructured_inference:Reading PDF for file: sample10.pdf ...
INFO:unstructured_inference:Loading the Table agent ...
INFO:unstructured_inference:Loading the table structure model ...
INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/resnet18.a1_in1k)
INFO:timm.models._hub:[timm/resnet18.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
INFO:timm.models._builder:Missing keys (fc.weight, fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted.


[Document(id='root', metadata={'file_date': datetime.datetime(2024, 11, 8, 16, 28, 51, 597270), 'links': [Link(kind='Title', direction='out', tag='a587eb4bb2db07115a3890531c896281'), Link(kind='Title', direction='out', tag='7cfe0d89c10c4f0aed70ae75fdeecbb2'), Link(kind='Title', direction='out', tag='09f5af3237057d3c83c4ac9432a8bd65'), Link(kind='Title', direction='out', tag='af55be232f6114fa7840d14941c92600'), Link(kind='Title', direction='out', tag='1c9c79a725133bd8fd1c9345acf2a5ce'), Link(kind='Title', direction='out', tag='ab95516f1227f26842c62074ff1a5171'), Link(kind='Title', direction='out', tag='dd50a2eeaad19e27a3b09c7c2f85b197'), Link(kind='Title', direction='out', tag='43a339c7e27d89a8db8800226eca0d93'), Link(kind='Title', direction='out', tag='d0ee92a476a42f097c3fd1a2d87660fc')]}, page_content='sample10.pdf'),
 Document(id='a587eb4bb2db07115a3890531c896281', metadata={'type': 'Title', 'links': [Link(kind='root', direction='in', tag='root')], 'detection_class_prob': 0.410677462

In [5]:
from langchain_community.graph_vectorstores.opensearch import OpenSearchGraphVectorStore
from langchain_openai import OpenAIEmbeddings
embedding_model = OpenAIEmbeddings()

os_client = OpenSearchGraphVectorStore(
    opensearch_url=url,
    http_auth=auth,
    index_name=index_name,
    embedding=embedding_model,
    reset_index=True,
)


  os_client = OpenSearchGraphVectorStore(
INFO:opensearch:HEAD https://localhost:9201/opensearch_graphvector_notebook [status:200 request:0.435s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_delete_by_query [status:200 request:0.310s]


In [6]:
os_client.add_content_graph(g)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:GET https://localhost:9201/opensearch_graphvector_notebook [status:200 request:0.008s]
INFO:opensearch:POST https://localhost:9201/_bulk [status:200 request:0.203s]
INFO:opensearch:POST https://localhost:9201/_bulk [status:200 request:0.066s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_refresh [status:200 request:0.146s]


In [7]:
os_client.get_documents()


INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=10 [status:200 request:0.044s]


[Document(id='root', metadata={'file_date': '2024-11-08T16:28:51.597270', 'links': {Link(kind='Title', direction='out', tag='1c9c79a725133bd8fd1c9345acf2a5ce'), Link(kind='Title', direction='out', tag='af55be232f6114fa7840d14941c92600'), Link(kind='Title', direction='out', tag='a587eb4bb2db07115a3890531c896281'), Link(kind='Title', direction='out', tag='43a339c7e27d89a8db8800226eca0d93'), Link(kind='Title', direction='out', tag='dd50a2eeaad19e27a3b09c7c2f85b197'), Link(kind='Title', direction='out', tag='d0ee92a476a42f097c3fd1a2d87660fc'), Link(kind='Title', direction='out', tag='ab95516f1227f26842c62074ff1a5171'), Link(kind='Title', direction='out', tag='7cfe0d89c10c4f0aed70ae75fdeecbb2'), Link(kind='Title', direction='out', tag='09f5af3237057d3c83c4ac9432a8bd65')}}, page_content='sample10.pdf'),
 Document(id='a587eb4bb2db07115a3890531c896281', metadata={'type': 'Title', 'links': {Link(kind='root', direction='in', tag='root')}, 'detection_class_prob': 0.4106774628162384, 'coordinates'

In [13]:
os_client.similarity_search(
    "Can you explain the historical background of placeholder text?"
)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=4 [status:200 request:0.167s]


[Document(id='09f5af3237057d3c83c4ac9432a8bd65', metadata={'type': 'Title', 'links': {Link(kind='NarrativeText', direction='out', tag='d2524ec0a9c23881dd42d90d099ef6e4'), Link(kind='ListItem', direction='out', tag='f686f97fe0738594ef89fe3d75b89e5a'), Link(kind='NarrativeText', direction='out', tag='1d3f374f8e8037c23d63c171bb53858b'), Link(kind='root', direction='in', tag='root')}, 'detection_class_prob': 0.7727823853492737, 'coordinates': {'points': [[165.8494415283203, 1085.2807716369907], [165.8494415283203, 1155.9474311925464], [1486.842529296875, 1155.9474311925464], [1486.842529296875, 1085.2807716369907]], 'system': 'PixelSpace', 'layout_width': 1653, 'layout_height': 2339}, 'last_modified': '2024-11-08T16:28:51', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'sample10.pdf'}, page_content='Historical Background of Placeholder Text'),
 Document(id='7cfe0d89c10c4f0aed70ae75fdeecbb2', metadata={'type': 'Title', 'links': {Link(kind='root', directi

In [14]:
import asyncio

async def runner():
    async for document in os_client.asimilarity_search(
        "Can you explain the historical background of placeholder text?"
    ):
        print(document.id)


# Run the function directly if in an interactive environment like Jupyter Notebook
await runner()


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search [status:200 request:0.071s]


09f5af3237057d3c83c4ac9432a8bd65
7cfe0d89c10c4f0aed70ae75fdeecbb2
1d3f374f8e8037c23d63c171bb53858b
a587eb4bb2db07115a3890531c896281


In [19]:
docs = os_client.get_documents(k=10)
print(docs)


INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=10 [status:200 request:0.061s]


[Document(id='root', metadata={'file_date': '2024-11-08T16:28:51.597270', 'links': {Link(kind='Title', direction='out', tag='1c9c79a725133bd8fd1c9345acf2a5ce'), Link(kind='Title', direction='out', tag='af55be232f6114fa7840d14941c92600'), Link(kind='Title', direction='out', tag='a587eb4bb2db07115a3890531c896281'), Link(kind='Title', direction='out', tag='43a339c7e27d89a8db8800226eca0d93'), Link(kind='Title', direction='out', tag='dd50a2eeaad19e27a3b09c7c2f85b197'), Link(kind='Title', direction='out', tag='d0ee92a476a42f097c3fd1a2d87660fc'), Link(kind='Title', direction='out', tag='ab95516f1227f26842c62074ff1a5171'), Link(kind='Title', direction='out', tag='7cfe0d89c10c4f0aed70ae75fdeecbb2'), Link(kind='Title', direction='out', tag='09f5af3237057d3c83c4ac9432a8bd65')}}, page_content='sample10.pdf'), Document(id='a587eb4bb2db07115a3890531c896281', metadata={'type': 'Title', 'links': {Link(kind='root', direction='in', tag='root')}, 'detection_class_prob': 0.4106774628162384, 'coordinates':

In [20]:
doc = os_client.search_by_id(docs[0].id)
doc.page_content


INFO:opensearch:GET https://localhost:9201/opensearch_graphvector_notebook/_doc/root [status:200 request:0.013s]


'sample10.pdf'

In [None]:
import asyncio

async def runner():
    doc = await os_client.asearch_by_id(docs[0].id)
    print(doc.page_content)


# Run the function directly if in an interactive environment like Jupyter Notebook
await runner()


INFO:opensearch:GET https://localhost:9201/opensearch_graphvector_notebook/_doc/root [status:200 request:0.063s]


sample10.pdf


{
  "took": 12,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 100,
      "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
      {
        "_index": "your_index_name",
        "_type": "_doc",
        "_id": "1",
        "_score": 1.0,
        "_source": {
          "field1": "value1",
          "field2": "value2",
          "metadata": { ... }
        }
      },
      {
        "_index": "your_index_name",
        "_type": "_doc",
        "_id": "2",
        "_score": 0.9,
        "_source": {
          "field1": "value3",
          "field2": "value4",
          "metadata": { ... }
        }
      }
      // Additional hits...
    ]
  }
}

In [24]:
search_criteria = {"type": "NarrativeText"}
results = os_client.search_by_metadata(search_criteria, k=20)
for doc in results:
    print(doc.id)


INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=20 [status:200 request:0.149s]


7be4d084ed1d8d2f0f0136130d96657d
0cf4c4a72b187a3e8b9441d670668220
37bd4b148b92576880338619f547f56d
9bdca855f20835cd8a463e2645f269e9
5c8aa95465fc1db19445d8d1e93b269e
1d3f374f8e8037c23d63c171bb53858b
d2524ec0a9c23881dd42d90d099ef6e4
5ae5ed1dfcafbd54c90191a3a2bcaa76
1ebda6ab90b083768bbaa6e253a6b8d3
57fdd79928c738894790c3a902fbd25f
8bb08bf5438d4aa17e047133ad2a4d4d
075c0b9e6466940817c3620a2aa51c18
e352c457d9ffbce0c5eff0907a4f07f7
e2f8361e9188b2db54eee5c16c27f1ac
d7ae8a934c4792142b39737daedaf0f6
b59d0a8414ba0ade38baa4e50aa9f30b
f83f83705f648b1785c42410487bfbef
83eab554b8f92b031e15f802a7d85f1c
d6641b68a0633565c9b2d4b9b32fffec
4c004820d67a5aa32ba2b60de47048f3


In [25]:
import asyncio

async def runner():
    metadata = {"type": "NarrativeText"}
    async for document in os_client.asearch_by_metadata(metadata=metadata, k=5):
        print(document.id)

# Run the function directly if in an interactive environment like Jupyter Notebook
await runner()


INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=5 [status:200 request:0.080s]


7be4d084ed1d8d2f0f0136130d96657d
0cf4c4a72b187a3e8b9441d670668220
37bd4b148b92576880338619f547f56d
9bdca855f20835cd8a463e2645f269e9
5c8aa95465fc1db19445d8d1e93b269e


In [29]:
result = os_client.similarity_search_by_vector_and_metadata("Can you explain the historical background of placeholder text?", metadata = {"type": "Title"}, k=10)
for doc in results:
    print(doc.id, doc.metadata["type"])


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=10 [status:200 request:0.062s]


7be4d084ed1d8d2f0f0136130d96657d NarrativeText
0cf4c4a72b187a3e8b9441d670668220 NarrativeText
37bd4b148b92576880338619f547f56d NarrativeText
9bdca855f20835cd8a463e2645f269e9 NarrativeText
5c8aa95465fc1db19445d8d1e93b269e NarrativeText
1d3f374f8e8037c23d63c171bb53858b NarrativeText
d2524ec0a9c23881dd42d90d099ef6e4 NarrativeText
5ae5ed1dfcafbd54c90191a3a2bcaa76 NarrativeText
1ebda6ab90b083768bbaa6e253a6b8d3 NarrativeText
57fdd79928c738894790c3a902fbd25f NarrativeText
8bb08bf5438d4aa17e047133ad2a4d4d NarrativeText
075c0b9e6466940817c3620a2aa51c18 NarrativeText
e352c457d9ffbce0c5eff0907a4f07f7 NarrativeText
e2f8361e9188b2db54eee5c16c27f1ac NarrativeText
d7ae8a934c4792142b39737daedaf0f6 NarrativeText
b59d0a8414ba0ade38baa4e50aa9f30b NarrativeText
f83f83705f648b1785c42410487bfbef NarrativeText
83eab554b8f92b031e15f802a7d85f1c NarrativeText
d6641b68a0633565c9b2d4b9b32fffec NarrativeText
4c004820d67a5aa32ba2b60de47048f3 NarrativeText


In [30]:
result = os_client.traversal_search(
    "Can you explain the historical background of placeholder text?"
)

for doc in result:
    print(doc.id,doc.metadata["type"])


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=4 [status:200 request:0.082s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.019s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.003s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.002s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.006s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.017s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.003s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvect

09f5af3237057d3c83c4ac9432a8bd65 Title
7cfe0d89c10c4f0aed70ae75fdeecbb2 Title
1d3f374f8e8037c23d63c171bb53858b NarrativeText
a587eb4bb2db07115a3890531c896281 Title


In [None]:
retriever = os_client.as_retriever(search_type="traversal", search_kwargs={"k": 10, "depth": 2})

for doc in retriever.get_relevant_documents(query="Can you explain the historical background of placeholder text?"
):
    print(doc.id, doc.metadata["type"])


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=10 [status:200 request:0.074s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.009s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.018s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.007s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.003s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.002s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvector_notebook/_search?size=1000 [status:200 request:0.003s]
INFO:opensearch:POST https://localhost:9201/opensearch_graphvec

09f5af3237057d3c83c4ac9432a8bd65 Title
7cfe0d89c10c4f0aed70ae75fdeecbb2 Title
1d3f374f8e8037c23d63c171bb53858b NarrativeText
a587eb4bb2db07115a3890531c896281 Title
af55be232f6114fa7840d14941c92600 Title
1c9c79a725133bd8fd1c9345acf2a5ce Title
d2524ec0a9c23881dd42d90d099ef6e4 NarrativeText
dd50a2eeaad19e27a3b09c7c2f85b197 Title
43a339c7e27d89a8db8800226eca0d93 Title
779ff8635a73d15280938bdfb6224ddf NarrativeText
