In [1]:
# Install the packages listed in requirements.txt (-q keeps pip's output short).
%pip install -q -r requirements.txt


Note: you may need to restart the kernel to use updated packages.


In [2]:
import logging
# Configure logging so that only messages at ERROR level or above are emitted.
logging.basicConfig(level=logging.ERROR)
# Logger named after the current module (`__name__`).
logger = logging.getLogger(__name__)


In [3]:
from dotenv import load_dotenv
import os

# Populate the process environment via python-dotenv before reading settings.
load_dotenv()

# Each setting is None when the corresponding variable is not defined.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_DATABASE_ID = os.environ.get("ASTRA_DB_DATABASE_ID")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE")
 

In [4]:
from langchain_openai import OpenAIEmbeddings
# Embedding model passed to both vector stores created in this notebook.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [5]:
from langchain_astradb import AstraDBVectorStore

# Collection that receives one document per extracted PDF element.
# Endpoint, token and keyspace come from the environment settings read earlier.
vector_store = AstraDBVectorStore(
    collection_name="unstructure_elements",
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)


In [6]:
from langchain_core.documents import Document
from unstructured.documents.elements import Element
def element_to_document(element: Element) -> Document:
    """Convert an ``unstructured`` element into a LangChain ``Document``.

    Args:
        element: The element to convert.

    Returns:
        A ``Document`` that reuses the element's id and text. Its metadata is
        the element's own metadata dictionary plus two notebook-managed keys:
        ``type`` (the element's class name) and ``links`` (an empty list).
    """
    return Document(
        id=element.id,
        page_content=element.text,
        metadata={
            **element.metadata.to_dict(),
            # These two keys are set last so that an entry with the same name
            # in the element's own metadata cannot overwrite them; `links`
            # must start as an empty list for the later graph-building step.
            # NOTE(review): a `links` value supplied by the element itself is
            # therefore discarded — confirm that is acceptable.
            "type": type(element).__name__,
            "links": [],
        },
    )


In [7]:
from unstructured.partition.pdf import partition_pdf

# Partition the sample PDF into elements, requesting image extraction and
# table-structure inference.
elements = partition_pdf(
    filename="./data/sample4.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    # NOTE(review): these two look like chunking limits, but no
    # `chunking_strategy` is passed here — confirm they have any effect.
    max_characters=2000,
    new_after_n_chars=1700,
    # Extracted image files are written under this directory.
    extract_image_block_output_dir="images/",
)


In [8]:
# Convert every partitioned element into a LangChain document.
docs = []
for element in elements:
    try:
        doc = element_to_document(element)
        docs.append(doc)
    except Exception as e:
        print(
            f"An error occurred while processing element {element.id}: {e}"
        )
        # Stop at the first failure; `docs` then holds only the elements
        # that were converted before the error.
        break



In [9]:
# Write the converted documents to the collection; the cell displays the returned ids.
vector_store.add_documents(docs)


['8a137031615c0f20b771cfe3956d57f8',
 '3579e20ad32d793a8dd28ff5d8023da7',
 '80e8c329a3a621b91a31bcf97ff3334f',
 '35ccf72b73f67342fb97284be07e0de1',
 '39324654c968eb3357caf516e5f7f294',
 'e6cd9de5667b301da81d1373f90e734c',
 '935fdcf8ba2e601348691f23784106a0',
 '002f87c6560b4f412ee064375b00319d',
 '2c3e32aafc4e37c57c9299bb1abe8cf3',
 'c350ce92d1a85844e3195a7f97174afb',
 'c20e14a2e6cc3abd9e3794c19db89c4e',
 'bc5e09f692d6cf4ac05d9e6d84ecf9ca',
 '73c2a9d151deb9c8f6b1113cb8faad74',
 'b58ded2c0d09fcd6b1810df1f7cd9423',
 '0c6140c07f5c2e92399007e8c762c4cb',
 'b5906a2b13ee1bbfb62f054bd10f61f2',
 'ae823f07134a0ee9e668bef86d2caa08',
 '4836a5c73aaa4ee24183d49ebc83390f',
 'ec4e566681062b3415e6e140502beb8b',
 '441d6021d20dac79ddd32d9e17568dc8',
 'b7cdb0075e34409ff9418ff94b025d74']

In [10]:
# Print the stored metadata of every record in the collection
# (an empty filter `{}` is passed to `find`).
for doc in vector_store.astra_env.collection.find({}):
    print(doc["metadata"])


{'type': 'Image', 'links': [], 'coordinates': {'points': [[-2.083333333333333, 1966.6666666666665], [-2.083333333333333, 2200.0], [1702.0833333333333, 2200.0], [1702.0833333333333, 1966.6666666666665]], 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-11-04T19:11:07', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 2, 'image_path': 'images/figure-2-5.jpg', 'file_directory': './data', 'filename': 'sample4.pdf'}
{'type': 'Title', 'links': [], 'detection_class_prob': 0.6584488749504089, 'coordinates': {'points': [[195.39285278320312, 411.7275390625], [195.39285278320312, 513.0299504884981], [937.9750366210938, 513.0299504884981], [937.9750366210938, 411.7275390625]], 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-11-04T19:11:07', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': './data', 'filename': 'sample4.pdf'}
{'type': 'Image', 'links': 

In [None]:
# Accumulates every document that upgrade_documents() processes.
original_documents = []
def upgrade_documents(
    vector_store: AstraDBVectorStore,
    batch_size = 10,
) -> int:
    """Flag one batch of not-yet-upgraded documents and remember them.

    Asks ``vector_store.metadata_search`` for up to ``batch_size`` documents
    whose metadata has no ``upgraded`` key, prints how many came back, appends
    each one to the module-level ``original_documents`` list, sets
    ``upgraded = True`` in its metadata and sends the updated metadata back
    through ``vector_store.update_metadata``.

    Args:
        vector_store: Store to search and update.
        batch_size: Value passed as ``n`` to ``metadata_search``.

    Returns:
        The number of documents handled in this call; 0 when the search
        returned nothing.
    """
    pending = vector_store.metadata_search(
        filter={"upgraded": {"$exists": False}},
        n=batch_size,
    )
    pending_count = len(pending)
    print(pending_count)
    if not pending_count:
        return 0

    metadata_by_id: dict[str, dict] = {}
    for document in pending:
        original_documents.append(document)
        # The same metadata dict is mutated and then sent back to the store.
        document.metadata["upgraded"] = True
        metadata_by_id[document.id] = document.metadata

    vector_store.update_metadata(metadata_by_id)
    return pending_count

# Keep processing batches of 10 until a call reports that nothing was left to upgrade.
while upgrade_documents(vector_store, batch_size=10) > 0:
    pass


0


In [12]:
import os
import sys

# Directory containing the local `content_graph` module. Defaults to the
# original checkout location; set the CONTENT_GRAPH_DIR environment variable
# to point at a different copy.
CONTENT_GRAPH_DIR = os.getenv(
    "CONTENT_GRAPH_DIR",
    "/Users/pedropacheco/Projects/demos/unstructured-platform",
)
# Guarded so that re-running the cell does not add duplicate entries.
if CONTENT_GRAPH_DIR not in sys.path:
    sys.path.append(CONTENT_GRAPH_DIR)

from content_graph import ContentGraph  # Import after modifying the path

# Build the graph from the documents collected by upgrade_documents().
g = ContentGraph("Lorem Ipsum")
g.fromLangChainDocuments(
    documents=original_documents,
    output_image_path="./images",
    reset_graph=True,
    infer_hierarchy=True,
)
g.graph



  add_links(self.infered_parent, Link.outgoing(kind=element_type, tag=doc.id))


[Document(id='root', metadata={'file_date': '2024-11-05 17:40:26', 'links': [Link(kind='Image', direction='out', tag='b7cdb0075e34409ff9418ff94b025d74'), Link(kind='Title', direction='out', tag='35ccf72b73f67342fb97284be07e0de1'), Link(kind='Title', direction='out', tag='c20e14a2e6cc3abd9e3794c19db89c4e'), Link(kind='Title', direction='out', tag='e6cd9de5667b301da81d1373f90e734c'), Link(kind='Title', direction='out', tag='73c2a9d151deb9c8f6b1113cb8faad74'), Link(kind='Title', direction='out', tag='3579e20ad32d793a8dd28ff5d8023da7'), Link(kind='Title', direction='out', tag='4836a5c73aaa4ee24183d49ebc83390f'), Link(kind='Title', direction='out', tag='002f87c6560b4f412ee064375b00319d')]}, page_content='Lorem Ipsum'),
 Document(id='b7cdb0075e34409ff9418ff94b025d74', metadata={'type': 'Image', 'links': [Link(kind='Image', direction='in', tag='root')], 'coordinates': {'points': [[-2.083333333333333, 1966.6666666666665], [-2.083333333333333, 2200.0], [1702.0833333333333, 2200.0], [1702.083333

In [13]:
from langchain_astradb.graph_vectorstores import AstraDBGraphVectorStore


In [14]:
from langchain_astradb import AstraDBVectorStore

# Separate collection that receives the documents of the content graph.
# It uses the same embeddings and connection settings as `vector_store`.
graph_vector_store = AstraDBGraphVectorStore(
    collection_name="unstructure_elements_graph",
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)


  graph_vector_store = AstraDBGraphVectorStore(


In [15]:
# Store the graph documents; the cell displays the returned ids.
graph_vector_store.add_documents(g.graph)


['root',
 'b7cdb0075e34409ff9418ff94b025d74',
 '35ccf72b73f67342fb97284be07e0de1',
 '0c6140c07f5c2e92399007e8c762c4cb',
 '2c3e32aafc4e37c57c9299bb1abe8cf3',
 'b58ded2c0d09fcd6b1810df1f7cd9423',
 'c20e14a2e6cc3abd9e3794c19db89c4e',
 '8a137031615c0f20b771cfe3956d57f8',
 'b5906a2b13ee1bbfb62f054bd10f61f2',
 'ae823f07134a0ee9e668bef86d2caa08',
 '441d6021d20dac79ddd32d9e17568dc8',
 '935fdcf8ba2e601348691f23784106a0',
 'e6cd9de5667b301da81d1373f90e734c',
 'bc5e09f692d6cf4ac05d9e6d84ecf9ca',
 '73c2a9d151deb9c8f6b1113cb8faad74',
 'c350ce92d1a85844e3195a7f97174afb',
 '3579e20ad32d793a8dd28ff5d8023da7',
 '39324654c968eb3357caf516e5f7f294',
 '4836a5c73aaa4ee24183d49ebc83390f',
 'ec4e566681062b3415e6e140502beb8b',
 '002f87c6560b4f412ee064375b00319d',
 '80e8c329a3a621b91a31bcf97ff3334f']

In [None]:
# Inspect the links attached to the second document in the graph.
g.graph[1].metadata["links"]
