In [7]:
%pip install -q -r requirements.txt


Note: you may need to restart the kernel to use updated packages.


In [8]:
import logging
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)


In [None]:
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_DATABASE_ID = os.getenv("ASTRA_DB_DATABASE_ID")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_KEYSPACE = os.getenv("ASTRA_DB_KEYSPACE")

 

In [None]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [None]:
from langchain_astradb import AstraDBVectorStore

vector_store = AstraDBVectorStore(
    collection_name="unstructure_elements",
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)


In [None]:
from langchain_core.documents import Document
from unstructured.documents.elements import Element
def element_to_document(element: Element) -> Document:
    return Document(
        id=element.id,
        page_content=element.text,
        metadata={
            "type": type(element).__name__,
            "links": [],
            **element.metadata.to_dict(),
        },
    )


In [None]:
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="./data/sample4.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    max_characters=2000,
    new_after_n_chars=1700,
    extract_image_block_output_dir="images/",
)


In [None]:
docs = []
for element in elements:
        try:
            element_type = type(element).__name__
            doc = element_to_document(element)
            docs.append(doc)
        except Exception as e:
            print(
                f"An error occurred while processing element {element.id}: {e}"
            )
            break



In [None]:
vector_store.add_documents(docs)


In [None]:
for doc in vector_store.astra_env.collection.find({}):
    print(doc["metadata"])


In [9]:
original_documents = []
def upgrade_documents(
    vector_store: AstraDBVectorStore,
    batch_size = 10,
) -> int:
        filter = {"upgraded": {"$exists": False}}
        chunks = vector_store.metadata_search(filter=filter, n=batch_size)
        print(len(chunks))
        if len(chunks) == 0:
            return 0

        id_to_md_map: dict[str, dict] = {}

        for chunk in chunks:
            original_documents.append(chunk)
            chunk.metadata["upgraded"] = True
            id_to_md_map[chunk.id] = chunk.metadata
        
        vector_store.update_metadata(id_to_md_map)
        
        return len(chunks)

while upgrade_documents(vector_store, batch_size=10) > 0:
    pass


NameError: name 'AstraDBVectorStore' is not defined

In [None]:
import sys
sys.path.append("/Users/pedropacheco/Projects/demos/unstructured-platform")

from content_graph import ContentGraph  # Import after modifying the path
g = ContentGraph("Lorem Ipsum")
g.fromLangChainDocuments(
    documents=original_documents,
    output_image_path="./images",
    reset_graph=True,
    infer_hierarchy=True,
)
g.graph



In [None]:
from langchain_astradb.graph_vectorstores import AstraDBGraphVectorStore


In [None]:
from langchain_astradb import AstraDBVectorStore

graph_vector_store = AstraDBGraphVectorStore(
    collection_name="unstructure_elements_graph",
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)


In [None]:
graph_vector_store.add_documents(g.graph)
