In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os
from pathlib import Path

import pandas as pd
import tiktoken

from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_report_embeddings,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.drift_search.drift_context import (
    DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

# --- Configuration ---------------------------------------------------------
# Directory holding the indexer's parquet outputs; the LanceDB database lives
# in a sub-directory of it.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Parquet file stems (without the ".parquet" suffix) located under INPUT_DIR.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
# Community level handed to the indexer adapters.
COMMUNITY_LEVEL = 2


# --- Entities --------------------------------------------------------------
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

print(f"Entity df columns: {entity_df.columns}")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# --- Vector stores ---------------------------------------------------------
# Open the LanceDB collections stored in the local database at LANCEDB_URI
# (this is an on-disk path under INPUT_DIR, not an in-memory store).
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")

# --- Relationships ---------------------------------------------------------
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")

# --- Text units ------------------------------------------------------------
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")

# A notebook cell renders only its final expression, so the earlier bare
# `entity_df.head()` / `relationship_df.head()` statements were silent no-ops
# and have been dropped. Evaluate those frames in their own cell to preview them.
text_unit_df.head()

Entity df columns: Index(['id', 'human_readable_id', 'title', 'community', 'level', 'degree', 'x',
       'y'],
      dtype='object')
Entity count: 888
Relationship count: 812
Text unit records: 38


Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,aa55265004ced76e9050ed4b7a45c0496e10faa0eddb8a...,1,"../\nJACOB COLLIER: Honestly, I think mastery...",1200,[1e0886ae010728d10b2972f66b88608dc82b8645d3085...,"[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...","[9af066c8-031b-4c52-b93b-b37763f6f0f7, 5b15580...","[f91209d1-0939-452e-b51b-be1763e2a27d, f2274c3..."
1,7f0fb1d3bf517dc76dffa984eec7a25e851e44ead0df82...,2,OMBERG: I grew up and started getting into al...,1200,[1e0886ae010728d10b2972f66b88608dc82b8645d3085...,"[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...","[c2ac3612-3aaf-440c-babd-e21f474e0366, 9aab0b4...","[13c74c18-439b-4419-8427-4ba826503055, 7120179..."
2,27b739ceeddfa100f7be3cf002fd3a27aea2228f1a02c4...,3,", you know, and it’s a very linear pathway too...",1200,[1e0886ae010728d10b2972f66b88608dc82b8645d3085...,"[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...","[9af066c8-031b-4c52-b93b-b37763f6f0f7, 5456bcb...",[a67f1c21-32ab-4eaa-b063-c815e7f3ea9d]
3,d97017305e234cc51554d653447d73b58441e1ff0f99e4...,4,"you know, we started taking a lot of parts an...",1200,[1e0886ae010728d10b2972f66b88608dc82b8645d3085...,"[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...","[bac3dd9b-f5c8-4966-9dc3-87f1f5976e36, 4f41be8...",[695ac017-7c10-44ad-a681-3b4c1ae86a87]
4,2b6d29f8a74b16ea9a70423bce803a08a5b9ed4e6a946b...,5,only so much processing that my mind can do i...,1200,[1e0886ae010728d10b2972f66b88608dc82b8645d3085...,"[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...","[d225ff7b-ca47-4fab-8d9a-4f86111526f8, 3fddb40...",[d8a7ad5c-a170-430e-9f9d-902047371ee2]


In [3]:
# Credentials and model names are read from the environment so that no secret
# is stored in the notebook. A missing variable raises KeyError right here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model client used by the DRIFT search cells below.
chat_llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# tiktoken raises KeyError when it cannot map a model name to an encoding
# (e.g. a custom deployment name or a model newer than the installed tiktoken).
# Fall back to a general-purpose encoding instead of aborting the notebook;
# token counts are then approximate for that model.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding client; the same environment value is used for both the model and
# the deployment name.
text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

In [4]:
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community reports table from a parquet file.

    This function only reads from disk: it does not compute embeddings and it
    does not write anything.

    Args:
        input_dir: Directory that contains the parquet file.
        community_report_table: File name of the table without the
            ``.parquet`` extension.

    Returns:
        The contents of ``<input_dir>/<community_report_table>.parquet`` as a
        DataFrame.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)


# Load the community reports parquet table found under INPUT_DIR.
report_df = read_community_reports(INPUT_DIR)
# Build the report objects from the raw table, the entity table and the
# community level. The embedding column name is passed explicitly.
# NOTE(review): presumably this filters reports by COMMUNITY_LEVEL — confirm
# against read_indexer_reports.
reports = read_indexer_reports(
    report_df,
    entity_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)
# Called for its side effect only (the return value is discarded).
# NOTE(review): presumably fills each report's full-content embedding from the
# vector store — confirm against read_indexer_report_embeddings.
read_indexer_report_embeddings(reports, full_content_embedding_store)

In [5]:
# Search configuration for this demo run.
# NOTE(review): the exact meaning of each field is defined by DRIFTSearchConfig
# and is not visible in this notebook — confirm before tuning.
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

# Bundle everything loaded in the earlier cells (LLM clients, entities,
# relationships, reports, text units, entity description vector store and the
# token encoder) together with the configuration above.
context_builder = DRIFTSearchContextBuilder(
    chat_llm=chat_llm,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=description_embedding_store,
    text_units=text_units,
    token_encoder=token_encoder,
    config=drift_params,
)

# Search engine used by the query cell below.
search = DRIFTSearch(
    llm=chat_llm, context_builder=context_builder, token_encoder=token_encoder
)

In [6]:
# Run the query. Top-level `await` is not valid in a plain Python script; this
# cell relies on the notebook kernel executing it inside an event loop.
resp = await search.asearch("Who is agent Mercer?")

  return bound(*args, **kwds)
 40%|████      | 8/20 [01:12<02:55, 14.65s/it] Failed to parse search response: {
  "response": "# Mercer's Background and Mentorship Style\n\n## Background and Experience\n\nAgent Alex Mercer is depicted as a seasoned and determined member of the Paranormal Military Squad, tasked with the critical mission of Operation: Dulce. His background is characterized by a blend of military discipline and a deep-seated curiosity for the unknown. This duality is evident in his interactions and decision-making processes throughout the mission.\n\nMercer's experience in the field has honed his ability to balance protocol with intuition. He is described as having a " 
    , "unfailing determination"  
                                                                                                                                                                                                                                                                                 

In [7]:
# Show the full response payload; in the recorded run it is a dict with a
# 'nodes' list whose items carry 'query' and 'answer' keys.
resp.response

{'nodes': [{'query': 'Who is agent Mercer?',
   'answer': "## Introduction to Agent Mercer in Context of Dulce Base\n\nBased on the provided community summaries related to Alex Mercer and the Dulce Base team, Agent Mercer refers to Alex Mercer. He is identified as a central figure in Dulce Base, specifically within the Paranormal Military Squad. Alex Mercer plays a crucial role in leading efforts to decode and respond to alien communication. His responsibilities at Dulce Base involve providing guidance, making strategic decisions, and maintaining the focus and groundedness of his team. The following paragraphs delve deeper into his leadership role, interactions with team members, and approach towards the team’s mission and alien communication.\n\n### Leadership and Strategic Role\nAlex Mercer is acknowledged as a pivotal leader at Dulce Base, heading a specialized team dedicated to interstellar communications. He has a background as a former military member, which enriches his strategi

In [9]:
# Show only the answer text of the first node in the response.
resp.response["nodes"][0]["answer"]

"## Introduction to Agent Mercer in Context of Dulce Base\n\nBased on the provided community summaries related to Alex Mercer and the Dulce Base team, Agent Mercer refers to Alex Mercer. He is identified as a central figure in Dulce Base, specifically within the Paranormal Military Squad. Alex Mercer plays a crucial role in leading efforts to decode and respond to alien communication. His responsibilities at Dulce Base involve providing guidance, making strategic decisions, and maintaining the focus and groundedness of his team. The following paragraphs delve deeper into his leadership role, interactions with team members, and approach towards the team’s mission and alien communication.\n\n### Leadership and Strategic Role\nAlex Mercer is acknowledged as a pivotal leader at Dulce Base, heading a specialized team dedicated to interstellar communications. He has a background as a former military member, which enriches his strategic mind and decision-making abilities. Mercer's role requir