In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [None]:
import os
from pathlib import Path

import pandas as pd
import tiktoken

from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.drift_search.drift_context import (
    DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

# Directory holding the indexer's parquet outputs; tables are read from
# "<INPUT_DIR>/<table name>.parquet".
INPUT_DIR = "./inputs/operation dulce"
# Location handed to the vector store's connect(db_uri=...) call.
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Parquet file stems (without the ".parquet" suffix) of the indexer tables.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
# Community level passed to read_indexer_entities and read_indexer_reports.
COMMUNITY_LEVEL = 2


# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
# read the companion entities table that is passed alongside the nodes table
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

# build `entities` from both tables at the configured COMMUNITY_LEVEL
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings into a LanceDB vector store.
# NOTE(review): connect() receives LANCEDB_URI, a path under INPUT_DIR, so the
# store does not look in-memory as this comment used to say - confirm.
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
# the value returned here is later given to the context builder as
# `entity_text_embeddings`
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
# NOTE: this is not the last expression of the cell, so the preview is
# evaluated but not rendered in the cell output.
entity_df.head()

# read the relationships table and convert it for use by the context builder
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
# NOTE: this is not the last expression of the cell, so the preview is
# evaluated but not rendered in the cell output.
relationship_df.head()

# read the text units table and convert it for use by the context builder
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
# last expression of the cell: this is the table shown in the cell output
text_unit_df.head()

Entity count: 434
Relationship count: 276
Text unit records: 12


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,2cf7a230c367a2dfaf0fc3c903eb8948,# Operation: Dulce\n\n## Chapter 1\n\nThe thru...,2500,[958fdd043f17ade63cb13570b59df295],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...","[ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe..."
1,6d1255303acb7c9dc951cb0f5fc3042c,be the same.\n\n\*\n\nThe sense of foreboding...,2500,[958fdd043f17ade63cb13570b59df295],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...","[5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42..."
2,e841f178310356740b2ee9101d12c97f,". ""Your take on these signal inconsistencies?""...",2500,[958fdd043f17ade63cb13570b59df295],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...","[0b22a34b-32e9-46a4-a0e8-d3d5466eba15, 7e14972..."
3,f36d96862b9366d7240b5c7ceb04f12b,", absorbed in the bewilderment of contact, whi...",2500,[958fdd043f17ade63cb13570b59df295],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b35c3d1a7daa4924b6bdb58bc69c354d, 09f18f81442...","[9cd6d645-ab97-4b39-b02e-647cea9b5545, 50dc124..."
4,f7d43808d2fb452cd953bf50c6de6bd4,"were at once coherent and enigmatic: ""*Voyage...",2500,[958fdd043f17ade63cb13570b59df295],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[e02be3e37ca0454883a4c1fd859c24bb, 1dbc51475cb...",[87cf5900-6211-4e04-9115-50f3617c88b4]


In [None]:
# Credentials and model names are read from the environment; indexing
# os.environ raises KeyError immediately if a variable is not set.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model shared by the DRIFT context builder and the DRIFT search itself.
chat_llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# Tokenizer passed to DRIFTSearch as `token_encoder`.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding client used for community report embeddings and by the context
# builder; the deployment name reuses the embedding model name.
text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

In [None]:
def embed_community_reports(
    input_dir: str,
    embedder: OpenAIEmbedding,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
):
    """Return the community reports together with embeddings of their full content.

    The result is cached next to the source table as
    ``<community_report_table>_with_embeddings.parquet``. When that file already
    exists it is read back and returned without calling the embedder.

    Args:
        input_dir: Directory that contains ``<community_report_table>.parquet``.
        embedder: Object whose ``embed`` method is called once per report with
            the value of its ``full_content`` column.
        community_report_table: File stem of the community report table.

    Returns:
        A DataFrame with a ``full_content_embeddings`` column: either the freshly
        computed frame or the one loaded from the cached parquet file.

    Raises:
        ValueError: If the source table has no ``full_content`` column.
    """
    base_dir = Path(input_dir)
    source_file = base_dir / f"{community_report_table}.parquet"
    cache_file = base_dir / f"{community_report_table}_with_embeddings.parquet"

    # Fast path: an earlier run already wrote the embeddings to disk.
    if cache_file.exists():
        print(f"Embeddings file already exists at {cache_file}")
        return pd.read_parquet(cache_file)

    print("Embedding file not found. Computing community report embeddings...")
    reports_frame = pd.read_parquet(source_file)

    if "full_content" not in reports_frame.columns:
        missing_column_msg = f"'full_content' column not found in {source_file}"
        raise ValueError(missing_column_msg)

    # One embedder call per report row.
    reports_frame["full_content_embeddings"] = reports_frame["full_content"].apply(
        embedder.embed
    )

    # Persist the result so later runs take the fast path above.
    reports_frame.to_parquet(cache_file)
    print(f"Embeddings saved to {cache_file}")
    return reports_frame


# Load the community reports with embeddings (computed on first run, read from
# the cached parquet file afterwards).
report_df = embed_community_reports(INPUT_DIR, text_embedder)
# `content_embedding_col` must match the column name written by
# embed_community_reports ("full_content_embeddings").
reports = read_indexer_reports(
    report_df,
    entity_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)

Embeddings file already exists at ./inputs/operation dulce/create_final_community_reports_with_embeddings.parquet


In [4]:
# Assemble the DRIFT context from everything loaded in the cells above:
# the chat model, the embedder, and the entity / relationship / report /
# text unit collections plus the entity description embedding store.
context_builder = DRIFTSearchContextBuilder(
    chat_llm=chat_llm,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=entity_description_embeddings,
    text_units=text_units,
)

# Search engine that answers queries using the context builder above.
search = DRIFTSearch(
    llm=chat_llm, context_builder=context_builder, token_encoder=token_encoder
)

In [5]:
# Run the search asynchronously; the top-level `await` works because this is
# executed as a notebook cell.
resp = await search.asearch("Who is agent Mercer?")

  return bound(*args, **kwds)
100%|██████████| 5/5 [00:24<00:00,  4.91s/it]
100%|██████████| 20/20 [01:14<00:00,  3.71s/it]
 70%|███████   | 14/20 [01:54<00:26,  4.46s/it]Exception in _asearch
Traceback (most recent call last):
  File "/home/alonsog/.cache/pypoetry/virtualenvs/graphrag-ta_-cxM1-py3.10/lib/python3.10/site-packages/httpx/_transports/default.py", line 72, in map_httpcore_exceptions
    yield
  File "/home/alonsog/.cache/pypoetry/virtualenvs/graphrag-ta_-cxM1-py3.10/lib/python3.10/site-packages/httpx/_transports/default.py", line 257, in __aiter__
    async for part in self._httpcore_stream:
  File "/home/alonsog/.cache/pypoetry/virtualenvs/graphrag-ta_-cxM1-py3.10/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 367, in __aiter__
    raise exc from None
  File "/home/alonsog/.cache/pypoetry/virtualenvs/graphrag-ta_-cxM1-py3.10/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 363, in __aiter__
    async for part in self._stream:


In [6]:
# Full result payload; in the saved output it is a dict whose "nodes" list
# holds query/answer entries.
resp.response

{'nodes': [{'query': 'Who is agent Mercer?',
   'answer': "# Agent Mercer: Context and Role in Operation: Dulce\n\nAgent Mercer, also known as Alex Mercer, is a key figure within the Paranormal Military Squad, a specialized unit tasked with engaging in activities related to the paranormal. In the context of Operation: Dulce, a highly secretive mission aimed at establishing contact with extraterrestrial intelligence at the Dulce Military Base in New Mexico, Alex Mercer's role is of paramount importance.\n\n## Leadership in Operation: Dulce\n\nAlex Mercer serves as the leader of the team involved in Operation: Dulce. The mission's high level of confidentiality and the critical nature of its objectives signify that Mercer's leadership skills are essential to navigating the complexities and challenges posed by the operation. His leadership is crucial in coordinating the diverse skills of team members such as Sam Rivera, Taylor Cruz, and Jordan Hayes, each of whom brings unique expertise to

In [9]:
# Answer text of the first node in the response.
resp.response["nodes"][0]["answer"]

"# Agent Mercer: Context and Role in Operation: Dulce\n\nAgent Mercer, also known as Alex Mercer, is a key figure within the Paranormal Military Squad, a specialized unit tasked with engaging in activities related to the paranormal. In the context of Operation: Dulce, a highly secretive mission aimed at establishing contact with extraterrestrial intelligence at the Dulce Military Base in New Mexico, Alex Mercer's role is of paramount importance.\n\n## Leadership in Operation: Dulce\n\nAlex Mercer serves as the leader of the team involved in Operation: Dulce. The mission's high level of confidentiality and the critical nature of its objectives signify that Mercer's leadership skills are essential to navigating the complexities and challenges posed by the operation. His leadership is crucial in coordinating the diverse skills of team members such as Sam Rivera, Taylor Cruz, and Jordan Hayes, each of whom brings unique expertise to the mission.\n\n## Contributions to the Paranormal Milita