In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory containing the parquet artifacts that the following cells read.
# Set the GRAPHRAG_INPUT_DIR environment variable to point at your own output
# location; the literal below is only the fallback used when it is unset.
INPUT_DIR = os.environ.get(
    "GRAPHRAG_INPUT_DIR",
    "/root/autodl-tmp/HL/code/graphhl/test_graphrag_dimensions/faqtest/output/20240828-160021/artifacts",
)
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" suffix) of the tables under INPUT_DIR.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
TEXT_UNIT_TABLE = "create_final_text_units"

# Community level handed to the entity and report adapters.
COMMUNITY_LEVEL = 2

In [3]:
# The nodes table provides the community and degree data; it is combined with
# the entities table by the indexer adapter at the configured community level.
nodes_parquet = f"{INPUT_DIR}/{ENTITY_TABLE}.parquet"
entities_parquet = f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet"

entity_df = pd.read_parquet(nodes_parquet)
entity_embedding_df = pd.read_parquet(entities_parquet)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Load the description embeddings into a LanceDB vector store connected at
# LANCEDB_URI. To connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings"
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    vectorstore=description_embedding_store,
    entities=entities,
)

In [4]:
# Read the three remaining tables from INPUT_DIR first ...
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")

# ... then run each frame through its indexer adapter. The report adapter also
# takes the nodes frame and the community level.
relationships = read_indexer_relationships(relationship_df)
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
text_units = read_indexer_text_units(text_unit_df)

In [5]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before reading
# the Azure OpenAI settings below.
load_dotenv()

llm_model = "gpt-4o"
embedding_model = "text-embedding-3-large"

# Chat-completion settings. These are kept under their own names: the chat and
# embedding services use different keys/endpoints, and sharing one set of
# variables made the chat client silently pick up the embedding credentials.
azure_deployment = "gpt-4o"
llm_api_key = os.environ.get("API_KEY_LLM_GPT4O")
llm_azure_endpoint = os.environ.get("AZURE_ENDPOINT_LLM_GPT4O")
llm_api_version = os.environ.get("API_VERSION_LLM_GPT4O")

# Embedding settings.
deployment_name = "text-embedding-3-large"
embedding_api_key = os.environ.get("API_KEY_EMB")
embedding_azure_endpoint = os.environ.get("AZURE_ENDPOINT_EMB")
embedding_api_version = os.environ.get("API_VERSION_EMB")

llm = ChatOpenAI(
    api_key=llm_api_key,
    api_base=llm_azure_endpoint,
    api_version=llm_api_version,
    model=llm_model,
    deployment_name=azure_deployment,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")
text_embedder = OpenAIEmbedding(
    api_key=embedding_api_key,
    api_base=embedding_azure_endpoint,
    api_version=embedding_api_version,
    api_type=OpenaiApiType.AzureOpenAI,
    model=embedding_model,
    deployment_name=deployment_name,
    max_retries=20,
    # NOTE(review): presumably has to match the dimensionality of the
    # embeddings already stored in the vector store - confirm.
    dimensions=1024,
)

Initialized OpenAIEmbedding with dimensions: 1024


In [6]:
# Assemble the local-search context builder from the objects loaded above.
context_builder = LocalSearchMixedContext(
    # graph data
    entities=entities,
    relationships=relationships,
    community_reports=reports,
    text_units=text_units,
    # entity description vector store and how its ids are keyed
    entity_text_embeddings=description_embedding_store,
    # use EntityVectorStoreKey.TITLE instead if the vectorstore uses entity
    # titles as ids
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # query embedding and token counting
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [7]:
# Parameters forwarded to the context builder when a query is answered.
#
# Context-window budget
#   text_unit_prop   - share of the context window given to related text units.
#   community_prop   - share of the context window given to community reports.
#                      Whatever is left goes to entities and relationships, so
#                      text_unit_prop + community_prop should be <= 1.
#   max_tokens       - maximum number of tokens to use for the context window.
#
# Conversation history
#   conversation_history_max_turns       - maximum number of turns to include.
#   conversation_history_user_turns_only - if True, only user queries are included.
#
# Retrieval
#   top_k_mapped_entities - number of related entities to retrieve from the
#                           entity description embedding store.
#   top_k_relationships   - controls the number of out-of-network relationships
#                           pulled into the context window.
#
# Table columns
#   include_entity_rank         - if True, include the entity rank in the entity
#                                 table (default entity rank = node degree).
#   include_relationship_weight - if True, include the relationship weight.
#   include_community_rank      - if True, include the community rank.
#
# Diagnostics
#   return_candidate_context - if True, return a set of dataframes with all
#                              candidate entity/relationship/covariate records
#                              that could be relevant. Not all of them end up in
#                              the context window; the "in_context" column says
#                              which ones did.
local_context_params = dict(
    text_unit_prop=0.5,
    community_prop=0.1,
    conversation_history_max_turns=5,
    conversation_history_user_turns_only=True,
    top_k_mapped_entities=10,
    top_k_relationships=10,
    include_entity_rank=True,
    include_relationship_weight=True,
    include_community_rank=False,
    return_candidate_context=False,
    # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # change this based on the token limit you have on your model (if you are
    # using a model with 8k limit, a good setting could be 5000)
    max_tokens=12_000,
)

In [8]:
query = "introduce the first pilot activity"

# build_context returns a pair; only the first item (the context text) is used
# in this notebook. The second item gets a real name instead of `_`, because
# assigning to `_` shadows IPython's "last output" variable for the session.
# NOTE(review): `context_records` presumably holds the per-table context
# records - confirm against the context builder.
context_text, context_records = context_builder.build_context(
    query=query, **local_context_params
)

In [9]:
# Show the context text that build_context produced for the query.
print(context_text)

id|title|content
2|Testing Organization and Associated Entities|"# Testing Organization and Associated Entities

The community revolves around a comprehensive testing organization that operates multiple self-operated laboratories and offices. Key entities include the laboratory, the research group from an academic institution, and highly qualified engineers. The organization provides extensive testing services, including biochemical tests and nanomaterials, and maintains long-term relationships with academic research groups.

## Comprehensive Testing Capabilities

The testing organization, referred to as '测试机构', showcases extensive testing capabilities through its 16 self-operated laboratories. These laboratories conduct a wide range of tests, including high-end biochemical tests, data analysis, and the provision of nanomaterials and reagents. The organization's ability to perform such diverse and advanced testing makes it a significant player in the scientific and industrial sectors [

In [10]:
from graphrag.query.structured_search.local_search.system_prompt import (
    LOCAL_SEARCH_SYSTEM_PROMPT,
)

# Requested answer format, substituted into the prompt template.
response_type = "multiple paragraphs"
system_prompt = LOCAL_SEARCH_SYSTEM_PROMPT

# Fill the template with the retrieved context, then pair it with the user query.
search_prompt = system_prompt.format(
    response_type=response_type,
    context_data=context_text,
)
search_messages = [
    dict(role="system", content=search_prompt),
    dict(role="user", content=query),
]

# Display the assembled messages as the cell output.
search_messages

[{'role': 'system',
  'content': '\n---Role---\n\nYou are a helpful assistant responding to questions about data in the tables provided.\n\n\n---Goal---\n\nGenerate a response of the target length and format that responds to the user\'s question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.\n\nIf you don\'t know the answer, just say so. Do not make anything up.\n\nPoints supported by data should list their data references as follows:\n\n"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."\n\nDo not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.\n\nFor example:\n\n"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7); Rela

In [11]:
from openai import AzureOpenAI

# Rebuild the chat messages from the current prompt template, context and query.
search_prompt = system_prompt.format(
    context_data=context_text, response_type=response_type
)
search_messages = [
    {"role": "system", "content": search_prompt},
    {"role": "user", "content": query},
]

# Credentials and endpoint are read from the environment (populated by
# load_dotenv() in the LLM setup cell) instead of being written into the
# notebook. A missing key or endpoint raises KeyError here rather than failing
# later inside the client.
# SECURITY: an API key was previously committed in this cell; treat it as
# compromised and rotate it.
client = AzureOpenAI(
    azure_deployment=azure_deployment,
    azure_endpoint=os.environ["AZURE_ENDPOINT_LLM_GPT4O"],
    api_key=os.environ["API_KEY_LLM_GPT4O"],
    api_version=os.environ.get("API_VERSION_LLM_GPT4O", "2024-04-01-preview"),
)
result = client.chat.completions.create(
    model=llm_model,
    messages=search_messages,
)
print(result.choices[0].message)


ChatCompletionMessage(content='# Introduction to 首单试点活动 (First Order Pilot Event)\n\n## Overview\n\nThe 首单试点活动, translated as the "First Order Pilot Event," is a promotional initiative designed to engage potential users by offering incentives for placing their first order. This event acts as a central figure within the community, strategically aiming to boost user participation and retain new customers. The 紧密配合 of different entities in supporting and executing this event underscores its significance and structured approach [Data: Entities (2); Relationships (1, 2, 3, 4)].\n\n## Purpose and Structure\n\nThe primary goal of the 首单试点活动 is to attract new users by providing two types of benefits for first-time orders. Participants in this event are encouraged to place their first order to receive limited-time offers. The structure involves follow-up communication with potential users to ensure they understand the benefits and are guided through the process, enhancing the likelihood of orde

In [12]:
# Print only the text content of the first choice's message.
print(result.choices[0].message.content)

# Introduction to 首单试点活动 (First Order Pilot Event)

## Overview

The 首单试点活动, translated as the "First Order Pilot Event," is a promotional initiative designed to engage potential users by offering incentives for placing their first order. This event acts as a central figure within the community, strategically aiming to boost user participation and retain new customers. The 紧密配合 of different entities in supporting and executing this event underscores its significance and structured approach [Data: Entities (2); Relationships (1, 2, 3, 4)].

## Purpose and Structure

The primary goal of the 首单试点活动 is to attract new users by providing two types of benefits for first-time orders. Participants in this event are encouraged to place their first order to receive limited-time offers. The structure involves follow-up communication with potential users to ensure they understand the benefits and are guided through the process, enhancing the likelihood of order completion and user engagement [Data: