In [None]:
%pip install -q -r requirements.txt


In [None]:
# import logging

# # Remove any existing handlers
# for logger_name in logging.root.manager.loggerDict:
#     logger = logging.getLogger(logger_name)
#     logger.setLevel(logging.ERROR)
#     logger.propagate = False
#     logger.handlers = []

# # Configure the root logger
# logging.basicConfig(level=logging.ERROR)
# logger = logging.getLogger(__name__)
# logger.setLevel(logging.ERROR)


In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_DATABASE_ID = os.getenv("ASTRA_DB_DATABASE_ID")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_KEYSPACE = os.getenv("ASTRA_DB_KEYSPACE")
ASTRA_DB_COLLECTION = os.getenv("ASTRA_DB_COLLECTION")
ASTRA_DB_EMBEDDING_DIMENSIONS = os.getenv("ASTRA_DB_EMBEDDING_DIMENSIONS")
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL")


In [2]:
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize HuggingFaceEmbeddings with a model of your choice
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Test embedding dimensions
texts = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"]
text_embeddings = hf_embeddings.embed_documents(texts)

print("Embeddings:", len(text_embeddings[0]))


  hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Embeddings: 384


In [None]:
from langchain_openai import OpenAIEmbeddings
oi_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

texts = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"]
text_embeddings = oi_embeddings.embed_documents(texts)

print("Embeddings:", len(text_embeddings[0]))


In [3]:
from langchain_astradb import AstraDBVectorStore
def astradb(emdeggings, collection_name, clear=True):
    print(f"Creating vector store {collection_name} in AstraDB")
    vector_store = AstraDBVectorStore(
        collection_name=collection_name,
        embedding=hf_embeddings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_KEYSPACE,
    )
    
    vector_store.clear() 
    print(f"Vector store {vector_store.astra_env.collection} created in AstraDB")
    return vector_store


In [4]:
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.astradb import (
    AstraDBConnectionConfig,
    AstraDBAccessConfig,
    AstraDBUploadStagerConfig,
    AstraDBUploaderConfig
)
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.interfaces import ChunkingConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

def unstructured(strategy, collection_name, special_case=False):
    print(f"Parsing documents for stratygy {strategy}")
    print(f"Uploading to collection {collection_name}")
    
    if strategy != "by_element":        
        local_chunker_config = ChunkerConfig(chunking_strategy=strategy)
    else:
        local_chunker_config = ChunkerConfig(chunk_elements=True)
        
    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path="./ingest"),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=UNSTRUCTURED_API_KEY,
            partition_endpoint="", #os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
            additional_partition_args={
                "split_pdf_page": True,
                "split_pdf_allow_failed": True,
                "split_pdf_concurrency_level": 15
            }
        ),
        # Setting chunk_elements=True now has the same effect as setting chunking_strategy='by_title'
        chunker_config=local_chunker_config,
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        destination_connection_config=AstraDBConnectionConfig(
            access_config=AstraDBAccessConfig(
                api_endpoint=ASTRA_DB_API_ENDPOINT,
                token=ASTRA_DB_APPLICATION_TOKEN
            )
        ),
        stager_config=AstraDBUploadStagerConfig(),
        uploader_config=AstraDBUploaderConfig(
            keyspace=ASTRA_DB_KEYSPACE,
            collection_name=collection_name,
            embedding_dimension=ASTRA_DB_EMBEDDING_DIMENSIONS
        )
    ).run()


In [5]:
vs_dict = {}
for strategy in ["by_title", "by_element"]: #["basic", "by_title", "by_similarity", "by_page"]:
    collection_name = f"unstructured_{strategy}"
    vs_dict[strategy] = astradb(hf_embeddings, collection_name=collection_name)
    unstructured(strategy, collection_name=collection_name)


Creating vector store unstructured_by_title in AstraDB


2024-11-21 07:11:51,367 MainProcess INFO     created index with configs: {"input_path": "ingest", "recursive": false}, connection configs: {"access_config": "**********"}
2024-11-21 07:11:51,368 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-11-21 07:11:51,369 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-11-21 07:11:51,370 MainProcess INFO     created chunk with configs: {"chunking_strategy": "by_title", 

Vector store Collection(name="unstructured_by_title", keyspace="graphrag", database=Database(api_endpoint="https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com", token="AstraCS:JWxL...", keyspace="graphrag"), api_options=CollectionAPIOptions(max_time_ms=None, embedding_api_key=EmbeddingAPIKeyHeaderProvider(empty))) created in AstraDB
Parsing documents for stratygy by_title
Uploading to collection unstructured_by_title


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-11-21 07:11:51,677 MainProcess INFO     index finished in 0.000102s
2024-11-21 07:11:51,692 MainProcess INFO     calling DownloadStep with 1 docs
2024-11-21 07:11:51,693 MainProcess INFO     processing content async
2024-11-21 07:11:51,706 MainProcess INFO     download finished in 0.007822s, attributes: file_id=5e67eafb35b0
2024-11-21 07:11:51,711 MainProcess INFO     download step finished in 0.019174s
2024-11-21 07:11:51,713 MainProcess INFO     calling PartitionStep with 1 docs
2024-11-21 07:11:51,714 MainProcess INFO     processing content async
2024-11-21 07:11:51,723 MainProcess INFO     partition finished in 0.004756s, attributes: file_id=5e67eafb35b0
2024-11-21 07:11:51,726 MainProcess INFO     

Creating vector store unstructured_by_element in AstraDB


Overriding of current TracerProvider is not allowed
2024-11-21 07:11:58,023 MainProcess INFO     created index with configs: {"input_path": "ingest", "recursive": false}, connection configs: {"access_config": "**********"}
2024-11-21 07:11:58,024 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-11-21 07:11:58,026 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-11-21 07:11:58,027 MainProcess INFO     created ch

Vector store Collection(name="unstructured_by_element", keyspace="graphrag", database=Database(api_endpoint="https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com", token="AstraCS:JWxL...", keyspace="graphrag"), api_options=CollectionAPIOptions(max_time_ms=None, embedding_api_key=EmbeddingAPIKeyHeaderProvider(empty))) created in AstraDB
Parsing documents for stratygy by_element
Uploading to collection unstructured_by_element


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-11-21 07:11:58,280 MainProcess INFO     index finished in 9e-05s
2024-11-21 07:11:58,286 MainProcess INFO     calling DownloadStep with 1 docs
2024-11-21 07:11:58,287 MainProcess INFO     processing content async
2024-11-21 07:11:58,298 MainProcess INFO     download finished in 0.004939s, attributes: file_id=5e67eafb35b0
2024-11-21 07:11:58,300 MainProcess INFO     download step finished in 0.013801s
2024-11-21 07:11:58,301 MainProcess INFO     calling PartitionStep with 1 docs
2024-11-21 07:11:58,302 MainProcess INFO     processing content async
2024-11-21 07:11:58,308 MainProcess INFO     partition finished in 0.003654s, attributes: file_id=5e67eafb35b0
2024-11-21 07:11:58,310 MainProcess INFO     par

In [6]:
import sys
sys.path.append("/Users/pedropacheco/Projects/demos/unstructured-platform")
from content_graph import ContentGraph, _encode_astradb_documents


### Retrieve documents

In [7]:
from langchain_astradb.utils.vector_store_codecs import (
    _AstraDBVectorStoreDocumentCodec,
    _DefaultVectorizeVSDocumentCodec,
    _DefaultVSDocumentCodec,
)
docs = {}
for key in ["by_title","by_element"]:
    print(key)
    hits = list(vs_dict[key].astra_env.collection.find({}))
    document_codec = _DefaultVSDocumentCodec(content_field="content", ignore_invalid_documents=True)
    docs[key] = [document_codec.decode(hit) for hit in hits]


INFO:astrapy.cursors:creating iterator on 'unstructured_by_title'
INFO:astrapy.cursors:finished creating iterator on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'


by_title


INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-e

by_element


INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0

### Builds Vector Graph!

In [8]:
graphs={}
for strategy in ["by_title", "by_element"]:
    g = ContentGraph(strategy)
    g.fromLangChainDocuments(
        documents=docs[strategy],
        reset_graph=True,
        infer_hierarchy=True,
        strategy=strategy,
    )
    graphs[strategy] = g


INFO:content_graph:Creating content graph from existing langchain documents ...
  existing_links = get_links(doc)
  add_links(doc, link)
INFO:content_graph:Creating content graph from existing langchain documents ...


In [9]:
graphs["by_element"].plot_graph("by_element")
graphs["by_title"].plot_graph("by_title")


by_element.html
by_title.html


In [14]:
graphs["by_element"].find_document_by_element_id("544b004c7103572d5c70f38b4b8f2a90")


Document(id='cdd2d84b-dee3-4e67-92d8-4bdee39e6798', metadata={'type': 'Title', 'element_id': '544b004c7103572d5c70f38b4b8f2a90', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 12, 'filename': 'sample9.pdf', 'data_source': {'record_locator': {'path': '/Users/pedropacheco/Projects/demos/unstructured-platform/ingest/sample9.pdf'}, 'date_created': '1731958679.1744926', 'date_modified': '1731958679.1820238', 'date_processed': '1731960277.157229', 'permissions_data': [{'mode': 33188}]}}, 'links': [Link(kind='root', direction='in', tag='0'), Link(kind='NarrativeText', direction='out', tag='544b004c7103572d5c70f38b4b8f2a90'), Link(kind='Table', direction='out', tag='544b004c7103572d5c70f38b4b8f2a90'), Link(kind='Image', direction='out', tag='544b004c7103572d5c70f38b4b8f2a90'), Link(kind='Footer', direction='out', tag='544b004c7103572d5c70f38b4b8f2a90'), Link(kind='PageNumber', direction='out', tag='544b004c7103572d5c70f38b4b8f2a90')]}, page_content='Acknowledg

In [15]:
from langchain_astradb import AstraDBGraphVectorStore
from langchain_astradb.utils.astradb import SetupMode
collection_name = "graph_by_element"
graph_vector_store = AstraDBGraphVectorStore(
        collection_name=collection_name,
        embedding=hf_embeddings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_KEYSPACE,
        setup_mode=SetupMode.SYNC)
    

  graph_vector_store = AstraDBGraphVectorStore(
INFO:langchain_astradb.vectorstores:vector store default init, collection 'graph_by_element'
INFO:root:Detecting API environment 'prod' from supplied endpoint
INFO:astrapy.database:createCollection('graph_by_element')
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag "HTTP/1.1 200 OK"
INFO:astrapy.database:finished createCollection('graph_by_element')
INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection:command=find on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_element'


In [27]:
graph_vector_store.add_documents(graphs["by_title"].graph)


INFO:astrapy.collection:inserting 370 documents in 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished insertMany(chunk) on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.co

['c3025d5818bf08e5cd0715b8cb0deaaa',
 '131884dac11ed20a03569aa789c742b9',
 '3f58730b818ae25e3407a048f1a8d4ee',
 'bdff3fa0-0489-41dd-bf3f-a00489f1dd53',
 'affd88b35780276886a95aff58b181fb',
 '3552c8248ec0f2817a2806e0f470dca6',
 '5d7d632f-8fc1-412a-bd63-2f8fc1a12a6a',
 'd00891a5-68b5-4854-8891-a568b5385441',
 '02d7aec08fd7aa877dcbdcb740747160',
 '0741dc3e6a8c19a3018aef698417ce5d',
 '6594b3f9456d5d3755eb2988e99df29a',
 'abd1a541-c059-4573-91a5-41c05925731f',
 '68528bf6e884ac35b2f3a90ca5a0fdeb',
 '86cd866d3584272abbfe072060c29d55',
 'ff5fc392d5a89452dc7c900a720fd9cf',
 'c830fb5f26b35ec0b0d10a34f6f00613',
 '17ab3cba61d12e5fbfa7cd33a201b1b4',
 '57bc21cd-4173-4a06-bc21-cd41737a06e2',
 'd2f3c012-eca7-4ed5-b3c0-12eca78ed547',
 'ae501640-edf6-428f-9016-40edf6228ff2',
 'ed87691006ed398d104e502408a97938',
 '2c15b7ec172ccd767a0d828a6f24f8d4',
 '3a9cf01a-9711-4aac-9cf0-1a9711aaac67',
 '9605b65628592e95899ad4f0b2ad2b54',
 'aea73ec5-fd4d-4214-a73e-c5fd4d22147c',
 '0bb0914ada2ba5761ab8674eea92de9b',
 '

In [28]:
docs = list(graph_vector_store.mmr_traversal_search("global approach and naive rag"))


INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection:command=find on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_element'
INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection:command=find on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_element'
INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection

In [29]:
docs


[Document(id='674224fba7d004d7eb1e4c25994705d9', metadata={'type': 'NarrativeText', 'metadata': {'data_source': {'date_created': '1731958679.1744926', 'date_modified': '1731958679.1820238', 'date_processed': '1731960277.157229', 'permissions_data': [{'mode': 33188}], 'record_locator': {'path': '/Users/pedropacheco/Projects/demos/unstructured-platform/ingest/sample9.pdf'}}, 'filename': 'sample9.pdf', 'filetype': 'application/pdf', 'languages': ['eng'], 'links': [{'start_index': 53, 'text': '4', 'url': 'figure.4'}], 'page_number': 9, 'parent_id': '184b1c67c5b83248b137b9d6964259ab'}, 'links': {Link(kind='CompositeElement', direction='in', tag='9624b6ff352fa478765b3fc244c507ef')}, 'similarity_score': 0.6254081193039069, 'mmr_score': 0.31270405965195347}, page_content='Global approaches vs. na¨ıve RAG. As shown in Figure 4, global approaches consistently out- performed the na¨ıve RAG (SS) approach in both comprehensiveness and diversity metrics across datasets. Specifically, global approach