In [19]:
%pip install -q -r requirements.txt


Note: you may need to restart the kernel to use updated packages.


In [20]:
import logging

# Remove any existing handlers
for logger_name in logging.root.manager.loggerDict:
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.ERROR)
    logger.propagate = False
    logger.handlers = []

# Configure the root logger
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


In [21]:
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_DATABASE_ID = os.getenv("ASTRA_DB_DATABASE_ID")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_KEYSPACE = os.getenv("ASTRA_DB_KEYSPACE")
ASTRA_DB_COLLECTION = os.getenv("ASTRA_DB_COLLECTION")
ASTRA_DB_EMBEDDING_DIMENSIONS = os.getenv("ASTRA_DB_EMBEDDING_DIMENSIONS")
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL")


In [22]:
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize HuggingFaceEmbeddings with a model of your choice
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Test embedding dimensions
texts = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"]
text_embeddings = hf_embeddings.embed_documents(texts)

print("Embeddings:", len(text_embeddings[0]))


Embeddings: 384


In [23]:
from langchain_openai import OpenAIEmbeddings
oi_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

texts = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"]
text_embeddings = oi_embeddings.embed_documents(texts)

print("Embeddings:", len(text_embeddings[0]))


Embeddings: 3072


In [24]:
from langchain_astradb import AstraDBVectorStore
def astradb(emdeggings, collection_name, clear=True):
    print(f"Creating vector store {collection_name} in AstraDB")
    vector_store = AstraDBVectorStore(
        collection_name=collection_name,
        embedding=hf_embeddings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_KEYSPACE,
    )
    
    vector_store.clear() 
    print(f"Vector store {vector_store.astra_env.collection} created in AstraDB")
    return vector_store


In [25]:
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.astradb import (
    AstraDBConnectionConfig,
    AstraDBAccessConfig,
    AstraDBUploadStagerConfig,
    AstraDBUploaderConfig
)
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.interfaces import ChunkingConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

def unstructured(strategy, collection_name, special_case=False):
    print(f"Parsing documents for stratygy {strategy}")
    print(f"Uploading to collection {collection_name}")
    
    if strategy != "by_element":        
        local_chunker_config = ChunkerConfig(chunking_strategy=strategy)
    else:
        local_chunker_config = ChunkerConfig(chunk_elements=True)
        
    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path="./ingest"),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=UNSTRUCTURED_API_KEY,
            partition_endpoint="", #os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
            additional_partition_args={
                "split_pdf_page": True,
                "split_pdf_allow_failed": True,
                "split_pdf_concurrency_level": 15
            }
        ),
        # Setting chunk_elements=True now has the same effect as setting chunking_strategy='by_title'
        chunker_config=local_chunker_config,
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        destination_connection_config=AstraDBConnectionConfig(
            access_config=AstraDBAccessConfig(
                api_endpoint=ASTRA_DB_API_ENDPOINT,
                token=ASTRA_DB_APPLICATION_TOKEN
            )
        ),
        stager_config=AstraDBUploadStagerConfig(),
        uploader_config=AstraDBUploaderConfig(
            keyspace=ASTRA_DB_KEYSPACE,
            collection_name=collection_name,
            embedding_dimension=ASTRA_DB_EMBEDDING_DIMENSIONS
        )
    ).run()


In [26]:
vs_dict = {}
for strategy in ["by_title", "by_element"]: #["basic", "by_title", "by_similarity", "by_page"]:
    collection_name = f"unstructured_{strategy}"
    vs_dict[strategy] = astradb(hf_embeddings, collection_name=collection_name)
    unstructured(strategy, collection_name=collection_name)


INFO:root:Detecting API environment 'prod' from supplied endpoint


Creating vector store unstructured_by_title in AstraDB


2024-11-20 18:29:35,937 MainProcess INFO     created index with configs: {"input_path": "ingest", "recursive": false}, connection configs: {"access_config": "**********"}
2024-11-20 18:29:35,938 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-11-20 18:29:35,939 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-11-20 18:29:35,939 MainProcess INFO     created chunk with configs: {"chunking_strategy": "by_title", 

Vector store Collection(name="unstructured_by_title", keyspace="graphrag", database=Database(api_endpoint="https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com", token="AstraCS:JWxL...", keyspace="graphrag"), api_options=CollectionAPIOptions(max_time_ms=None, embedding_api_key=EmbeddingAPIKeyHeaderProvider(empty))) created in AstraDB
Parsing documents for stratygy by_title
Uploading to collection unstructured_by_title


2024-11-20 18:29:36,028 MainProcess INFO     index finished in 4.5e-05s
2024-11-20 18:29:36,034 MainProcess INFO     calling DownloadStep with 1 docs
2024-11-20 18:29:36,034 MainProcess INFO     processing content async
2024-11-20 18:29:36,037 MainProcess INFO     download finished in 0.001835s, attributes: file_id=5e67eafb35b0
2024-11-20 18:29:36,039 MainProcess INFO     download step finished in 0.005107s
2024-11-20 18:29:36,039 MainProcess INFO     calling PartitionStep with 1 docs
2024-11-20 18:29:36,039 MainProcess INFO     processing content async
2024-11-20 18:29:36,041 MainProcess INFO     partition finished in 0.001259s, attributes: file_id=5e67eafb35b0
2024-11-20 18:29:36,042 MainProcess INFO     partition step finished in 0.002734s
2024-11-20 18:29:36,042 MainProcess INFO     calling ChunkStep with 1 docs
2024-11-20 18:29:36,042 MainProcess INFO     processing content across processes
2024-11-20 18:29:36,043 MainProcess INFO     processing content serially
2024-11-20 18:29:3

Creating vector store unstructured_by_element in AstraDB


2024-11-20 18:29:40,777 MainProcess INFO     created index with configs: {"input_path": "ingest", "recursive": false}, connection configs: {"access_config": "**********"}
2024-11-20 18:29:40,778 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-11-20 18:29:40,779 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-11-20 18:29:40,779 MainProcess INFO     created chunk with configs: {"chunking_strategy": null, "chunk

Vector store Collection(name="unstructured_by_element", keyspace="graphrag", database=Database(api_endpoint="https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com", token="AstraCS:JWxL...", keyspace="graphrag"), api_options=CollectionAPIOptions(max_time_ms=None, embedding_api_key=EmbeddingAPIKeyHeaderProvider(empty))) created in AstraDB
Parsing documents for stratygy by_element
Uploading to collection unstructured_by_element


2024-11-20 18:29:40,873 MainProcess INFO     index finished in 4.7e-05s
2024-11-20 18:29:40,875 MainProcess INFO     calling DownloadStep with 1 docs
2024-11-20 18:29:40,875 MainProcess INFO     processing content async
2024-11-20 18:29:40,878 MainProcess INFO     download finished in 0.001249s, attributes: file_id=5e67eafb35b0
2024-11-20 18:29:40,879 MainProcess INFO     download step finished in 0.004159s
2024-11-20 18:29:40,879 MainProcess INFO     calling PartitionStep with 1 docs
2024-11-20 18:29:40,879 MainProcess INFO     processing content async
2024-11-20 18:29:40,881 MainProcess INFO     partition finished in 0.001071s, attributes: file_id=5e67eafb35b0
2024-11-20 18:29:40,882 MainProcess INFO     partition step finished in 0.002543s
2024-11-20 18:29:40,882 MainProcess INFO     calling ChunkStep with 1 docs
2024-11-20 18:29:40,882 MainProcess INFO     processing content across processes
2024-11-20 18:29:40,882 MainProcess INFO     processing content serially
2024-11-20 18:29:4

In [27]:
import sys
sys.path.append("/Users/pedropacheco/Projects/demos/unstructured-platform")
from content_graph import ContentGraph, _encode_astradb_documents


### Retrieve documents

In [None]:
from langchain_astradb.utils.vector_store_codecs import (
    _AstraDBVectorStoreDocumentCodec,
    _DefaultVectorizeVSDocumentCodec,
    _DefaultVSDocumentCodec,
)
docs = {}
for key in ["by_title","by_element"]:
    print(key)
    hits = list(vs_dict[key].astra_env.collection.find({}))
    document_codec = _DefaultVSDocumentCodec(content_field="content", ignore_invalid_documents=True)
    docs[key] = [document_codec.decode(hit) for hit in hits]


by_title
unstructured_by_title
{'_id': '1cbd995a-2982-4b7e-bd99-5a29828b7ebf', 'content': 'To evaluate the effectiveness of RAG systems for more global sensemaking tasks, we need questions that convey only a high-level understanding of dataset contents, and not the details of specific texts. We used an activity-centered approach to automate the generation of such questions: given a short description of a dataset, we asked the LLM to identify N potential users and N tasks per user, then for each (user, task) combination, we asked the LLM to generate N questions that require', 'metadata': {'type': 'CompositeElement', 'element_id': 'ba1ebb065550f21e10dfcbb7a58bf4ab', 'metadata': {'data_source': {'record_locator': {'path': '/Users/pedropacheco/Projects/demos/unstructured-platform/ingest/sample9.pdf'}, 'date_created': '1731958679.1744926', 'date_modified': '1731958679.1820238', 'date_processed': '1731960277.157229', 'permissions_data': [{'mode': 33188}]}, 'filename': 'sample9.pdf', 'filetyp

### Builds Vector Graph!

In [29]:
graphs={}
for strategy in ["by_title", "by_element"]:
    g = ContentGraph(strategy)
    g.fromLangChainDocuments(
        documents=docs[strategy],
        reset_graph=True,
        infer_hierarchy=False,
        strategy=strategy,
    )
    graphs[strategy] = g


An error occurred while processing element e02fa537-104f-4ede-afa5-37104faedee5: 'parent_id'
'parent_id'
Traceback (most recent call last):
  File "/Users/pedropacheco/Projects/demos/unstructured-platform/content_graph.py", line 258, in fromLangChainDocuments
    self.graph.append(doc)
          ^^^^^^^^^^^^
  File "/Users/pedropacheco/Projects/demos/unstructured-platform/content_graph.py", line 355, in _element_strategy_handler
    raise Exception(f'Parent document not found for {doc.id}')
                                        ^^^^^^^^^^^^^^^^^^^^^^^
KeyError: 'parent_id'
An error occurred while processing element 4b6c0efb-ec14-4274-ac0e-fbec14d2748c: 'parent_id'
'parent_id'
Traceback (most recent call last):
  File "/Users/pedropacheco/Projects/demos/unstructured-platform/content_graph.py", line 258, in fromLangChainDocuments
    self.graph.append(doc)
          ^^^^^^^^^^^^
  File "/Users/pedropacheco/Projects/demos/unstructured-platform/content_graph.py", line 355, in _element_st

In [None]:
graphs["by_element"].plot_graph("by_element")
graphs["by_title"].plot_graph("by_title")
