In [1]:
%pip install -q -r requirements.txt


Note: you may need to restart the kernel to use updated packages.


In [None]:
import logging

# Quiet every logger that has already been registered: raise its threshold to
# ERROR, detach its handlers and stop it from propagating to the root logger.
# list() snapshots the registry so the loop cannot be disturbed if getLogger()
# touches the manager's dictionary while we iterate.
for logger_name in list(logging.root.manager.loggerDict):
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.ERROR)
    logger.propagate = False
    logger.handlers = []

# Configure the root logger. force=True removes handlers already attached to
# the root logger first; without it basicConfig() does nothing when the root
# logger is already configured (e.g. by a library or an earlier run of this cell).
logging.basicConfig(level=logging.ERROR, force=True)
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


In [3]:
from dotenv import load_dotenv
import os

# Load settings via python-dotenv, then read each one from the process
# environment. A variable that is not set is stored as None.
load_dotenv()

# OpenAI
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Astra DB connection, keyspace and collection settings
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_DATABASE_ID = os.environ.get("ASTRA_DB_DATABASE_ID")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE")
ASTRA_DB_COLLECTION = os.environ.get("ASTRA_DB_COLLECTION")
# Note: environment values are strings, so this is a str (or None), not an int.
ASTRA_DB_EMBEDDING_DIMENSIONS = os.environ.get("ASTRA_DB_EMBEDDING_DIMENSIONS")

# Unstructured API
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY")
UNSTRUCTURED_API_URL = os.environ.get("UNSTRUCTURED_API_URL")


In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedder backed by the all-MiniLM-L6-v2 sentence-transformers model.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Smoke test: embed a single sentence and print the length of its vector.
texts = [
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua",
]
text_embeddings = hf_embeddings.embed_documents(texts)
print("Embeddings:", len(text_embeddings[0]))


  hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Embeddings: 384


In [5]:
from langchain_openai import OpenAIEmbeddings
# OpenAI embedder, created only to compare vector sizes with the local model.
oi_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Smoke test: embed a single sentence and print the length of its vector.
texts = [
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua",
]
text_embeddings = oi_embeddings.embed_documents(texts)
print("Embeddings:", len(text_embeddings[0]))


Embeddings: 3072


In [6]:
from langchain_astradb import AstraDBVectorStore
def astradb(emdeggings, collection_name, clear=True):
    """Instantiate an ``AstraDBVectorStore`` for ``collection_name``.

    Connection details come from the module-level ``ASTRA_DB_API_ENDPOINT``,
    ``ASTRA_DB_APPLICATION_TOKEN`` and ``ASTRA_DB_KEYSPACE`` settings.

    Args:
        emdeggings: Embedding model handed to the vector store. (The
            misspelled parameter name is kept so keyword callers keep working.)
        collection_name: Name of the Astra DB collection to bind to.
        clear: When true (the default), call ``clear()`` on the store right
            after creating it so a re-run starts from an empty collection.

    Returns:
        The ``AstraDBVectorStore`` instance.
    """
    print(f"Creating vector store {collection_name} in AstraDB")
    vector_store = AstraDBVectorStore(
        collection_name=collection_name,
        # Use the caller's embedder. This used to be hard-wired to the global
        # ``hf_embeddings``, which silently ignored the argument.
        embedding=emdeggings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_KEYSPACE,
    )

    # Honour the flag; previously the collection was wiped unconditionally.
    if clear:
        vector_store.clear()
    # Report the collection by name only: printing the collection object also
    # writes the database endpoint and part of the token into the cell output.
    print(f"Vector store {collection_name} created in AstraDB")
    return vector_store


In [7]:
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.astradb import (
    AstraDBConnectionConfig,
    AstraDBAccessConfig,
    AstraDBUploadStagerConfig,
    AstraDBUploaderConfig
)
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.interfaces import ChunkingConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

def unstructured(strategy, collection_name, special_case=False):
    """Configure and run an unstructured-ingest pipeline from ``./ingest`` to Astra DB.

    The pipeline indexes local files under ``./ingest``, partitions them via
    the Unstructured API (``hi_res`` strategy), applies the chunker config
    selected by ``strategy``, embeds with the ``huggingface`` provider and
    uploads the result to ``collection_name`` in the configured keyspace.

    Args:
        strategy: Value passed as ``chunking_strategy`` to ``ChunkerConfig``.
            The special value ``"by_element"`` builds
            ``ChunkerConfig(chunk_elements=True)`` instead.
        collection_name: Destination Astra DB collection.
        special_case: Currently unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``ASTRA_DB_EMBEDDING_DIMENSIONS`` is unset or is not an
            integer.
    """
    print(f"Parsing documents for stratygy {strategy}")
    print(f"Uploading to collection {collection_name}")

    # The setting is read from the environment, so it is a string (or None when
    # unset). Convert it once here and fail with a clear message rather than
    # handing a non-numeric value to the uploader config.
    try:
        embedding_dimension = int(ASTRA_DB_EMBEDDING_DIMENSIONS)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            "ASTRA_DB_EMBEDDING_DIMENSIONS must be set to an integer, "
            f"got {ASTRA_DB_EMBEDDING_DIMENSIONS!r}"
        ) from exc

    if strategy != "by_element":
        local_chunker_config = ChunkerConfig(chunking_strategy=strategy)
    else:
        # NOTE(review): an earlier comment here said chunk_elements=True behaves
        # like chunking_strategy='by_title', but the elements retrieved for
        # "by_element" in this notebook are unchunked types (Title,
        # NarrativeText, Image). Confirm what this flag does in the installed
        # unstructured-ingest version.
        local_chunker_config = ChunkerConfig(chunk_elements=True)

    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path="./ingest"),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=UNSTRUCTURED_API_KEY,
            # NOTE(review): the endpoint is deliberately left empty and
            # UNSTRUCTURED_API_URL is not used; confirm this is intended.
            partition_endpoint="", #os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
            additional_partition_args={
                "split_pdf_page": True,
                "split_pdf_allow_failed": True,
                "split_pdf_concurrency_level": 15
            }
        ),
        chunker_config=local_chunker_config,
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        destination_connection_config=AstraDBConnectionConfig(
            access_config=AstraDBAccessConfig(
                api_endpoint=ASTRA_DB_API_ENDPOINT,
                token=ASTRA_DB_APPLICATION_TOKEN
            )
        ),
        stager_config=AstraDBUploadStagerConfig(),
        uploader_config=AstraDBUploaderConfig(
            keyspace=ASTRA_DB_KEYSPACE,
            collection_name=collection_name,
            embedding_dimension=embedding_dimension
        )
    ).run()


In [8]:
# For each chunking strategy: create its vector store (astradb() clears the
# collection by default), then run the ingest pipeline into that collection.
# Other strategies tried earlier: "basic", "by_title", "by_similarity", "by_page".
vs_dict = {}
for strategy in ("by_title", "by_element"):
    collection_name = "unstructured_" + strategy
    vs_dict[strategy] = astradb(
        hf_embeddings,
        collection_name=collection_name,
    )
    unstructured(
        strategy,
        collection_name=collection_name,
    )


Creating vector store unstructured_by_title in AstraDB


2024-11-21 08:58:42,303 MainProcess INFO     created index with configs: {"input_path": "ingest", "recursive": false}, connection configs: {"access_config": "**********"}
2024-11-21 08:58:42,304 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-11-21 08:58:42,304 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-11-21 08:58:42,305 MainProcess INFO     created chunk with configs: {"chunking_strategy": "by_title", 

Vector store Collection(name="unstructured_by_title", keyspace="graphrag", database=Database(api_endpoint="https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com", token="AstraCS:JWxL...", keyspace="graphrag"), api_options=CollectionAPIOptions(max_time_ms=None, embedding_api_key=EmbeddingAPIKeyHeaderProvider(empty))) created in AstraDB
Parsing documents for stratygy by_title
Uploading to collection unstructured_by_title


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-11-21 08:58:42,407 MainProcess INFO     index finished in 4.9e-05s
2024-11-21 08:58:42,411 MainProcess INFO     calling DownloadStep with 1 docs
2024-11-21 08:58:42,412 MainProcess INFO     processing content async
2024-11-21 08:58:42,414 MainProcess INFO     download finished in 0.001552s, attributes: file_id=5e67eafb35b0
2024-11-21 08:58:42,415 MainProcess INFO     download step finished in 0.003699s
2024-11-21 08:58:42,415 MainProcess INFO     calling PartitionStep with 1 docs
2024-11-21 08:58:42,415 MainProcess INFO     processing content async
2024-11-21 08:58:42,418 MainProcess INFO     partition finished in 0.001476s, attributes: file_id=5e67eafb35b0
2024-11-21 08:58:42,418 MainProcess INFO     p

Creating vector store unstructured_by_element in AstraDB


2024-11-21 08:58:46,999 MainProcess INFO     created index with configs: {"input_path": "ingest", "recursive": false}, connection configs: {"access_config": "**********"}
2024-11-21 08:58:46,999 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-11-21 08:58:47,000 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-11-21 08:58:47,001 MainProcess INFO     created chunk with configs: {"chunking_strategy": null, "chunk

Vector store Collection(name="unstructured_by_element", keyspace="graphrag", database=Database(api_endpoint="https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com", token="AstraCS:JWxL...", keyspace="graphrag"), api_options=CollectionAPIOptions(max_time_ms=None, embedding_api_key=EmbeddingAPIKeyHeaderProvider(empty))) created in AstraDB
Parsing documents for stratygy by_element
Uploading to collection unstructured_by_element


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-11-21 08:58:47,089 MainProcess INFO     index finished in 2.4e-05s
2024-11-21 08:58:47,091 MainProcess INFO     calling DownloadStep with 1 docs
2024-11-21 08:58:47,091 MainProcess INFO     processing content async
2024-11-21 08:58:47,094 MainProcess INFO     download finished in 0.001622s, attributes: file_id=5e67eafb35b0
2024-11-21 08:58:47,094 MainProcess INFO     download step finished in 0.003461s
2024-11-21 08:58:47,094 MainProcess INFO     calling PartitionStep with 1 docs
2024-11-21 08:58:47,095 MainProcess INFO     processing content async
2024-11-21 08:58:47,096 MainProcess INFO     partition finished in 0.000995s, attributes: file_id=5e67eafb35b0
2024-11-21 08:58:47,097 MainProcess INFO     p

In [9]:
import os
import sys

# Directory that contains content_graph.py. Override it with the
# CONTENT_GRAPH_PATH environment variable; the default is the original
# author's checkout, which only exists on that machine.
CONTENT_GRAPH_PATH = os.getenv(
    "CONTENT_GRAPH_PATH",
    "/Users/pedropacheco/Projects/demos/unstructured-platform",
)
# Re-running the cell must not keep appending the same entry.
if CONTENT_GRAPH_PATH not in sys.path:
    sys.path.append(CONTENT_GRAPH_PATH)
from content_graph import ContentGraph, _encode_astradb_documents


### Retrieve documents

In [None]:
from langchain_astradb.utils.vector_store_codecs import (
    _AstraDBVectorStoreDocumentCodec,
    _DefaultVectorizeVSDocumentCodec,
    _DefaultVSDocumentCodec,
)
# The codec does not depend on the collection, so build it once, not per loop pass.
document_codec = _DefaultVSDocumentCodec(content_field="content", ignore_invalid_documents=True)

original_documents = {}
for key in ["by_title","by_element"]:
    print(key)
    # Fetch every stored row of the collection (empty filter).
    hits = list(vs_dict[key].astra_env.collection.find({}))
    decoded = (document_codec.decode(hit) for hit in hits)
    # NOTE(review): with ignore_invalid_documents=True the codec presumably
    # returns None for rows it cannot decode; drop those so that only real
    # documents are handed to the graph builder.
    original_documents[key] = [doc for doc in decoded if doc is not None]


INFO:astrapy.cursors:creating iterator on 'unstructured_by_title'
INFO:astrapy.cursors:finished creating iterator on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'


by_title


INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-e

by_element


INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0

### Build the vector graph

In [None]:
# Build one ContentGraph per chunking strategy from the documents read back
# from Astra DB, keyed by strategy name.
graphs = {}
for strategy in ("by_title", "by_element"):
    content_graph = ContentGraph(strategy)
    content_graph.fromLangChainDocuments(
        documents=original_documents[strategy],
        reset_graph=True,
        infer_hierarchy=True,
        strategy=strategy,
    )
    graphs[strategy] = content_graph


INFO:content_graph:Creating content graph from existing langchain documents ...
  existing_links = get_links(doc)
  add_links(doc, link)
INFO:content_graph:Creating content graph from existing langchain documents ...


In [12]:
# Plot both graphs, passing the strategy name to plot_graph. The recorded
# output of this cell lists by_element.html and by_title.html.
# Kept as two plain statements: the last expression of a notebook cell is also
# what the cell displays, so the order and form matter here.
graphs["by_element"].plot_graph("by_element")
graphs["by_title"].plot_graph("by_title")


by_element.html
by_title.html


### Saves graphs to the graph vector store

In [14]:
from langchain_astradb import AstraDBGraphVectorStore
from langchain_astradb.utils.astradb import SetupMode
# One graph vector store per chunking strategy, stored in a "graph_<strategy>"
# collection and filled with that strategy's graph documents.
graph_vector_stores = {}
for key in ("by_title", "by_element"):
    collection_name = f"graph_{key}"
    store = AstraDBGraphVectorStore(
        collection_name=collection_name,
        embedding=hf_embeddings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_KEYSPACE,
        setup_mode=SetupMode.SYNC,
    )
    store.add_documents(graphs[key].graph)
    graph_vector_stores[key] = store
    

INFO:langchain_astradb.vectorstores:vector store default init, collection 'graph_by_title'
INFO:root:Detecting API environment 'prod' from supplied endpoint
INFO:astrapy.database:createCollection('graph_by_title')
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag "HTTP/1.1 200 OK"
INFO:astrapy.database:finished createCollection('graph_by_title')
INFO:astrapy.cursors:creating iterator on 'graph_by_title'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_title'
INFO:astrapy.collection:command=find on 'graph_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_title'
INFO:astrapy.collection:inserting 370 documents in 'graph_by_title'
INFO:astrapy.collection:insertMany(chunk) on 'graph_by_title'
INFO:astrapy.collection:inser

In [55]:
# Retrieve from each graph vector store with an MMR graph traversal and list
# the element id and type of every returned document.
retrieved_docs = {}
for key in ("by_title", "by_element"):
    traversal = graph_vector_stores[key].mmr_traversal_search(
        "What are the differences between Graph RAG and naive RAG",
        depth=4,
        k=10,
    )
    retrieved_docs[key] = list(traversal)
    print(f"Retrival {key} has {len(retrieved_docs[key])} documents")
    for doc in retrieved_docs[key]:
        metadata = doc.metadata
        print(metadata.get("element_id"), metadata.get("type"))


INFO:astrapy.cursors:creating iterator on 'graph_by_title'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_title'
INFO:astrapy.collection:command=find on 'graph_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_title'
INFO:astrapy.cursors:creating iterator on 'graph_by_title'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_title'
INFO:astrapy.collection:command=find on 'graph_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_title'
INFO:astrapy.cursors:creating iterator on 'graph_by_title'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_title'
INFO:astrapy.collection:command=find on 'graph_

Retrival by_title has 10 documents
eae4895a4f7bedf9d6810d3dcbc71774 Table
0f3416e50150ae1fcc799a10a8a3dc64 CompositeElement
99b4700ad701e926b4225b518b38d54c CompositeElement
7d2eb48cc884a68ec6a73a0312b508e9 CompositeElement
None NarrativeText
01ffa7aba16150d19dd4b920c3597b4c CompositeElement
None NarrativeText
9c4086b88a27a7760049d15518e89b13 CompositeElement
41a03643b142b7e0ccd4bd4445a08ac8 CompositeElement
a70afad2f731797e0fdbb86f72689d1a CompositeElement


INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection:command=find on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_element'
INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection:command=find on 'graph_by_element'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/graph_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'graph_by_element'
INFO:astrapy.cursors:creating iterator on 'graph_by_element'
INFO:astrapy.cursors:finished creating iterator on 'graph_by_element'
INFO:astrapy.collection

Retrival by_element has 10 documents
f38b4b84f83fb8aba07c5535e896b4e6 NarrativeText
78fa702495f1462d716f6f9f5a5ed308 NarrativeText
184b1c67c5b83248b137b9d6964259ab Title
7f41e979979cb8b2acc7bf5731dd6c34 NarrativeText
e667ea4a13109d603aa3bb575511692b NarrativeText
98020eaabe84fc0744a00ac4ae2498fb Title
91aba52b96193022fc9db4c0b2a00a80 Image
1e2692dec572327c213ca27277e63413 NarrativeText
ddaca41a0c623dd70332adb3fbc6bd59 Title
6f6f09dc0fe2f929eb06759763803801 Table


In [None]:
from langchain.prompts import ChatPromptTemplate
# ChatOpenAI now lives in the langchain_openai package (already used by this
# notebook for OpenAIEmbeddings); the langchain.chat_models path is deprecated.
from langchain_openai import ChatOpenAI

# Prompt with two placeholders: the retrieved context and the user question.
prompt_template = ChatPromptTemplate.from_template("""
You are an specialist in artificial intelligence. Your role is to answer questions accurately based on the provided context. If the context does not contain enough information, respond by stating that the context doesn't allow you to answer the question.
Here is the context: {context}
Based on the context above, answer this question: {question}""")

question = "What are the differences between Graph RAG and naive RAG?"
# temperature=0 requests the least random sampling from the model.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="\nYou are an specialist in artificial intelligence. Your role is to answer questions accurately based on the provided context. If the context does not contain enough information, respond by stating that the context doesn't allow you to answer the question.\nHere is the context: {context}\nBased on the context above, answer this question: {question}"), additional_kwargs={})]


In [None]:
# Answer the question once per strategy, using that strategy's retrieved
# documents as the context.
for strategy in ("by_element", "by_title"):
    # Every document's text, each preceded by a newline.
    context = "".join(f"\n{doc.page_content}" for doc in retrieved_docs[strategy])

    formatted_prompt = prompt_template.format(
        context=context,
        question=question,
    )
    # Ask the chat model and print the text of its reply.
    response = llm.invoke(formatted_prompt)
    print(response.content)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The context does not allow me to answer the question about the differences between Graph RAG and naive RAG.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The context indicates several differences between Graph RAG and naive RAG (SS):

1. **Performance on Global Queries**: Graph RAG provides superior performance on global queries over a dataset compared to naive RAG, particularly in summarizing root-level communities in the entity-based graph index.

2. **Comprehensiveness and Diversity**: Graph RAG outperforms naive RAG in terms of comprehensiveness (with a 72% win rate) and diversity (with a 62% win rate).

3. **Token Cost**: Graph RAG achieves competitive performance to other global methods at a fraction of the token cost compared to naive RAG.

4. **Handling of Entity References**: Graph RAG is designed to manage variations in entity references more effectively, reducing the risk of duplicate nodes in the entity graph due to its ability to detect and summarize closely-related communities of entities.

5. **Empowerment Comparisons**: The empowerment comparisons showed mixed results for both global approaches versus naive RAG, indicati

In [None]:
# Re-declare the question and the chat model, then answer once per strategy
# from the documents currently held in retrieved_docs.
question = "What are the differences between Graph RAG and naive RAG?"
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

for strategy in ("by_element", "by_title"):
    # Every document's text, each preceded by a newline.
    pieces = [f"\n{doc.page_content}" for doc in retrieved_docs[strategy]]
    context = "".join(pieces)

    formatted_prompt = prompt_template.format(
        context=context,
        question=question,
    )
    # Ask the chat model and print the text of its reply.
    response = llm.invoke(formatted_prompt)
    print(response.content)


### As a point of comparison, retrieve documents from the plain vector store and run the chain

In [59]:
# Baseline: MMR search on the plain vector stores (no graph traversal).
# Note that this replaces the retrieved_docs filled by the graph retrieval cell.
retrieved_docs = {}
for key in ("by_title", "by_element"):
    mmr_hits = vs_dict[key].max_marginal_relevance_search(
        "What are the differences between Graph RAG and naive RAG",
        k=10,
        lambda_mult=0.5,
    )
    retrieved_docs[key] = list(mmr_hits)
    print(f"Retrival {key} has {len(retrieved_docs[key])} documents")
    for doc in retrieved_docs[key]:
        metadata = doc.metadata
        print(metadata.get("element_id"), metadata.get("type"))


INFO:astrapy.cursors:creating iterator on 'unstructured_by_title'
INFO:astrapy.cursors:finished creating iterator on 'unstructured_by_title'
INFO:astrapy.collection:command=find on 'unstructured_by_title'
INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_title "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_title'
INFO:astrapy.cursors:creating iterator on 'unstructured_by_element'
INFO:astrapy.cursors:finished creating iterator on 'unstructured_by_element'
INFO:astrapy.collection:command=find on 'unstructured_by_element'


Retrival by_title has 10 documents
eae4895a4f7bedf9d6810d3dcbc71774 Table
e2c0d65f135931524dd0ecff1ba314a7 CompositeElement
cc9c3cc000c956c82d96ce2fdd86f597 CompositeElement
7099bddda898af673be4c4fd66499692 CompositeElement
d27013e58416392d6e81bb5f169b162c CompositeElement
4a65264c97297c5e62125f1545f336fd CompositeElement
7d2eb48cc884a68ec6a73a0312b508e9 CompositeElement
3344a28cb21121a548387d1f6d0532f7 CompositeElement
556c695c19e5a20fd4b86505bd2c11f6 CompositeElement
a70afad2f731797e0fdbb86f72689d1a CompositeElement


INFO:httpx:HTTP Request: POST https://e9b06722-b7b7-4d7d-9ba0-801344f200de-us-east-2.apps.astra.datastax.com/api/json/v1/graphrag/unstructured_by_element "HTTP/1.1 200 OK"
INFO:astrapy.collection:finished command=find on 'unstructured_by_element'


Retrival by_element has 10 documents
f38b4b84f83fb8aba07c5535e896b4e6 NarrativeText
8acfdb793c30dbcf758fcbd7b1b5dc79 Title
7df23ff5f6144fb4119be010c204db6a NarrativeText
ddaca41a0c623dd70332adb3fbc6bd59 Title
e410d7d4f215b88d55c5c1516732168f NarrativeText
1e2692dec572327c213ca27277e63413 NarrativeText
e56fc729feae4251c7a9dcfc56b1015e NarrativeText
674224fba7d004d7eb1e4c25994705d9 NarrativeText
033d0bf59323c91bc00ba692f9d8f747 NarrativeText
eeefa0d04f9d7ea044a4a2909464b49b NarrativeText


In [60]:
# Answer the question once per strategy, using whatever retrieved_docs holds
# at this point as the context.
for strategy in ("by_element", "by_title"):
    # Every document's text, each preceded by a newline.
    context = "".join("\n" + str(doc.page_content) for doc in retrieved_docs[strategy])

    formatted_prompt = prompt_template.format(
        context=context,
        question=question,
    )
    # Ask the chat model and print the text of its reply.
    response = llm.invoke(formatted_prompt)
    print(response.content)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The differences between Graph RAG and naive RAG include:

1. **Comprehensiveness and Diversity**: Graph RAG approaches consistently outperform naive RAG in both comprehensiveness and diversity metrics across various datasets. Specifically, global approaches achieved higher win rates in comprehensiveness (72-83% for Podcast transcripts and 72-80% for News articles) and diversity (75-82% for Podcast transcripts and 62-71% for News articles).

2. **Use of Graphs**: Graph RAG utilizes the natural modularity of graphs to partition data for global summarization, whereas naive RAG does not leverage this structure.

3. **Information Retrieval Method**: In naive RAG, documents are converted to text, split into chunks, and embedded into a vector space for similarity-based retrieval. In contrast, Graph RAG approaches incorporate graph structures, which may allow for more sophisticated retrieval and summarization techniques.

4. **Directness of Responses**: Naive RAG produces the most direct respo

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The context indicates that Graph RAG outperforms naive RAG in terms of comprehensiveness and diversity. Additionally, advanced RAG systems, including Graph RAG, incorporate pre-retrieval, retrieval, and post-retrieval strategies designed to overcome the drawbacks of naive RAG. However, the specific differences between Graph RAG and naive RAG are not detailed in the provided context. Therefore, the context does not allow for a comprehensive answer regarding all the differences between the two approaches.
