In [44]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [6]:
import os

import pandas as pd
import tiktoken
import ollama
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
# from graphrag.query.llm.oai.chat_openai import ChatOpenAI
# from graphrag.query.llm.oai.embedding import OpenAIEmbedding
# from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
import ollama

  from .autonotebook import tqdm as notebook_tqdm


## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [7]:
INPUT_DIR = "/Users/luwi/Documents/Code/microsoft_graphrag_local/ragdirs/ragdir_6/output"#"./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [13]:
entity_df = pd.read_parquet(f"/Users/luwi/Documents/Code/microsoft_graphrag_local/ragdirs/ragdir_2/output/create_base_extracted_entities.parquet")
entity_df.values

array([['<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n  <key id="d6" for="edge" attr.name="source_id" attr.type="string" />\n  <key id="d5" for="edge" attr.name="description" attr.type="string" />\n  <key id="d4" for="edge" attr.name="weight" attr.type="double" />\n  <key id="d3" for="node" attr.name="entity_type" attr.type="string" />\n  <key id="d2" for="node" attr.name="source_id" attr.type="string" />\n  <key id="d1" for="node" attr.name="description" attr.type="string" />\n  <key id="d0" for="node" attr.name="type" attr.type="string" />\n  <graph edgedefault="undirected">\n    <node id="QUESTION ANSWERING">\n      <data key="d0">EVENT</data>\n      <data key="d1">A method of using a graph to answer questions based on text</data>\n      <data key="d2">e1e4f0675d6bb822863a64b663629c0f</data>\n      <data ke

#### Read entities

In [8]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/luwi/Documents/Code/microsoft_graphrag_local/ragdirs/ragdir_6/output/create_final_nodes.parquet'

#### Read relationships

In [48]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 28


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,QUESTION ANSWERING,GRAPH RAG,16.0,The Graph RAG approach uses question answering...,[e1e4f0675d6bb822863a64b663629c0f],3db243c5a3b9469687d7b3e6beebfdd4,0,5,9,14
1,QUESTION ANSWERING,PRIVATE TEXT CORPORA,4.0,Question answering is used to index private te...,[e1e4f0675d6bb822863a64b663629c0f],79dcf8d8b7ce4d17a80db0f448cc97d0,1,5,3,8
2,QUESTION ANSWERING,TEXT DATA,3.0,Question answering models are trained on a lar...,[e1e4f0675d6bb822863a64b663629c0f],32f3ebbf64c74a7baa1cfd1eb4ccd95a,2,5,2,7
3,QUESTION ANSWERING,MODEL TRAINING,2.0,Question answering models are trained using th...,[e1e4f0675d6bb822863a64b663629c0f],949ca6e4e6e94bf3980b26832735b233,3,5,2,7
4,QUESTION ANSWERING,EVALUATION METRICS,2.0,Question answering models are evaluated using ...,[e1e4f0675d6bb822863a64b663629c0f],06a15aca87bf4691a0540036abef4ec0,4,5,2,7


In [49]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings

# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

#### Read community reports

In [50]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 2


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,0,# Private Text Corpora Community\n\nThis commu...,0,6.0,Private Text Corpora Community,The impact severity rating is moderate due to ...,This community revolves around a collection of...,[{'explanation': 'The Graph RAG approach is be...,"{\n ""title"": ""Private Text Corpora Communit...",b054475a-a026-4260-9d93-6abf954a123d
1,1,# Graph RAG Community\n\nThe Graph RAG communi...,0,8.0,Graph RAG Community,The Graph RAG community has a moderate impact ...,The Graph RAG community revolves around a grap...,[{'explanation': 'The Graph RAG community cent...,"{\n ""title"": ""Graph RAG Community"",\n ""s...",cc10104e-5ecc-4e6a-a027-7acd2448a9d9


#### Read text units

In [51]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 8


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,e1e4f0675d6bb822863a64b663629c0f,To combine the strengths of these contrasting ...,44,[1730bed2b2faeda8ee1e88c01ac584b3],"[40740b36141645ae854cc406f7ff129f, d1645731adb...","[3db243c5a3b9469687d7b3e6beebfdd4, 79dcf8d8b7c..."
1,04becef81fb7d21fbe198dfca4e4d159,For a class of global sensemaking questions ov...,46,[889e75d03fdb3c2c486219d8495678ee],,
2,bfe1a0b28685e4194ffc64e6bef2501b,"An open-source, Python-based implementation of...",26,[94463dbd2b03a19805bf94f0c8552c47],"[46f5bfc902b54297928abdd085d020ff, 9c7fba7797a...","[f026b8337e8544cf93dacebf9247d574, c6ac85dbd97..."
3,7bf2b315168e3ec08a740d7638621161,"Given a question, each community summary is us...",30,[bec823c0460d762ebedd21cd46ec9d68],,
4,e5e6856d12d6ed343185093279e7d9a0,"Prior QFS methods, meanwhile, fail to scale to...",22,[ca922d8b1c0c462898eb677966f148fd],"[cc6c98e369bc420a9a31101e387f8927, 34e6a3da1ad...","[33f2b5da86154f098dbfb024ead2be02, 9eaa628eb25..."


In [2]:
!pip install langchain_ollama

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting langchain_ollama
  Downloading langchain_ollama-0.2.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-core<0.4.0,>=0.3.0->langchain_ollama)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Downloading langchain_ollama-0.2.0-py3-none-any.whl (14 kB)
Downloading tenacity-8.5.0-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, langchain_ollama
  Attempting uninstall: tenacity
    Found existing installation: tenacity 9.0.0
    Uninstalling tenacity-9.0.0:
      Successfully uninstalled tenacity-9.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
graphrag 0.0.1.dev185 requires tenacity<10.0.0,>=9.0.0, but you have tenacity 8.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed langchain_olla

In [None]:
text = '\n-Goal-\nGiven a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.\n \n-Steps-\n1. Identify all entities. For each identified entity, extract the following information:\n- entity_name: Name of the entity, capitalized\n- entity_type: One of the following types: [organization,person,geo,event]\n- entity_description: Comprehensive description of the entity\'s attributes and activities\nFormat each entity as ("entity"<|><entity_name><|><entity_type><|><entity_description>)\n \n2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\nFor each pair of related entities, extract the following information:\n- source_entity: name of the source entity, as identified in step 1\n- target_entity: name of the target entity, as identified in step 1\n- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity\n Format each relationship as ("relationship"<|><source_entity><|><target_entity><|><relationship_description><|><relationship_strength>)\n \n3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **##** as the list delimiter.\n \n4. When finished, output <|COMPLETE|>\n \n######################\n-Examples-\n######################\nExample 1:\nEntity_types: ORGANIZATION,PERSON\nText:\nThe Verdantis\'s Central Institution is scheduled to meet on Monday and Thursday, with the institution planning to release its latest policy decision on Thursday at 1:30 p.m. PDT, followed by a press conference where Central Institution Chair Martin Smith will take questions. Investors expect the Market Strategy Committee to hold its benchmark interest rate steady in a range of 3.5%-3.75%.\n######################\nOutput:\n("entity"<|>CENTRAL INSTITUTION<|>ORGANIZATION<|>The Central Institution is the Federal Reserve of Verdantis, which is setting interest rates on Monday and Thursday)\n##\n("entity"<|>MARTIN SMITH<|>PERSON<|>Martin Smith is the chair of the Central Institution)\n##\n("entity"<|>MARKET STRATEGY COMMITTEE<|>ORGANIZATION<|>The Central Institution committee makes key decisions about interest rates and the growth of Verdantis\'s money supply)\n##\n("relationship"<|>MARTIN SMITH<|>CENTRAL INSTITUTION<|>Martin Smith is the Chair of the Central Institution and will answer questions at a press conference<|>9)\n<|COMPLETE|>\n\n######################\nExample 2:\nEntity_types: ORGANIZATION\nText:\nTechGlobal\'s (TG) stock skyrocketed in its opening day on the Global Exchange Thursday. But IPO experts warn that the semiconductor corporation\'s debut on the public markets isn\'t indicative of how other newly listed companies may perform.\n\nTechGlobal, a formerly public company, was taken private by Vision Holdings in 2014. The well-established chip designer says it powers 85% of premium smartphones.\n######################\nOutput:\n("entity"<|>TECHGLOBAL<|>ORGANIZATION<|>TechGlobal is a stock now listed on the Global Exchange which powers 85% of premium smartphones)\n##\n("entity"<|>VISION HOLDINGS<|>ORGANIZATION<|>Vision Holdings is a firm that previously owned TechGlobal)\n##\n("relationship"<|>TECHGLOBAL<|>VISION HOLDINGS<|>Vision Holdings formerly owned TechGlobal from 2014 until present<|>5)\n<|COMPLETE|>\n\n######################\nExample 3:\nEntity_types: ORGANIZATION,GEO,PERSON\nText:\nFive Aurelians jailed for 8 years in Firuzabad and widely regarded as hostages are on their way home to Aurelia.\n\nThe swap orchestrated by Quintara was finalized when $8bn of Firuzi funds were transferred to financial institutions in Krohaara, the capital of Quintara.\n\nThe exchange initiated in Firuzabad\'s capital, Tiruzia, led to the four men and one woman, who are also Firuzi nationals, boarding a chartered flight to Krohaara.\n\nThey were welcomed by senior Aurelian officials and are now on their way to Aurelia\'s capital, Cashion.\n\nThe Aurelians include 39-year-old businessman Samuel Namara, who has been held in Tiruzia\'s Alhamia Prison, as well as journalist Durke Bataglani, 59, and environmentalist Meggie Tazbah, 53, who also holds Bratinas nationality.\n######################\nOutput:\n("entity"<|>FIRUZABAD<|>GEO<|>Firuzabad held Aurelians as hostages)\n##\n("entity"<|>AURELIA<|>GEO<|>Country seeking to release hostages)\n##\n("entity"<|>QUINTARA<|>GEO<|>Country that negotiated a swap of money in exchange for hostages)\n##\n##\n("entity"<|>TIRUZIA<|>GEO<|>Capital of Firuzabad where the Aurelians were being held)\n##\n("entity"<|>KROHAARA<|>GEO<|>Capital city in Quintara)\n##\n("entity"<|>CASHION<|>GEO<|>Capital city in Aurelia)\n##\n("entity"<|>SAMUEL NAMARA<|>PERSON<|>Aurelian who spent time in Tiruzia\'s Alhamia Prison)\n##\n("entity"<|>ALHAMIA PRISON<|>GEO<|>Prison in Tiruzia)\n##\n("entity"<|>DURKE BATAGLANI<|>PERSON<|>Aurelian journalist who was held hostage)\n##\n("entity"<|>MEGGIE TAZBAH<|>PERSON<|>Bratinas national and environmentalist who was held hostage)\n##\n("relationship"<|>FIRUZABAD<|>AURELIA<|>Firuzabad negotiated a hostage exchange with Aurelia<|>2)\n##\n("relationship"<|>QUINTARA<|>AURELIA<|>Quintara brokered the hostage exchange between Firuzabad and Aurelia<|>2)\n##\n("relationship"<|>QUINTARA<|>FIRUZABAD<|>Quintara brokered the hostage exchange between Firuzabad and Aurelia<|>2)\n##\n("relationship"<|>SAMUEL NAMARA<|>ALHAMIA PRISON<|>Samuel Namara was a prisoner at Alhamia prison<|>8)\n##\n("relationship"<|>SAMUEL NAMARA<|>MEGGIE TAZBAH<|>Samuel Namara and Meggie Tazbah were exchanged in the same hostage release<|>2)\n##\n("relationship"<|>SAMUEL NAMARA<|>DURKE BATAGLANI<|>Samuel Namara and Durke Bataglani were exchanged in the same hostage release<|>2)\n##\n("relationship"<|>MEGGIE TAZBAH<|>DURKE BATAGLANI<|>Meggie Tazbah and Durke Bataglani were exchanged in the same hostage release<|>2)\n##\n("relationship"<|>SAMUEL NAMARA<|>FIRUZABAD<|>Samuel Namara was a hostage in Firuzabad<|>2)\n##\n("relationship"<|>MEGGIE TAZBAH<|>FIRUZABAD<|>Meggie Tazbah was a hostage in Firuzabad<|>2)\n##\n("relationship"<|>DURKE BATAGLANI<|>FIRUZABAD<|>Durke Bataglani was a hostage in Firuzabad<|>2)\n<|COMPLETE|>\n\n######################\n-Real Data-\n######################\nEntity_types: organization,person,geo,event\nText: Page 1:\nFrom Local to Global: A Graph RAG Approach to Query-Focused Summarization: The use\nof retrieval-augmented generation (RAG) to retrieve relevant informa- tion from an external\nknowledge source enables large language models (LLMs) to answer questions over private\nand/or previously unseen document collections. However, RAG fails on global questions\ndirected at an entire text corpus, such as “What are the main themes in the dataset?”, since\nthis is inherently a query- focused summarization (QFS) task, rather than an explicit retrieval\ntask. Prior QFS methods, meanwhile, fail to scale to the quantities of text indexed by typical\nRAG systems. To combine the strengths of these contrasting methods, we propose a Graph\nRAG approach to question answering over private text corpora that scales with both the\ngenerality of user questions and the quantity of source text to be in- dexed. Our approach\nuses an LLM to build a graph-based text index in two stages: first to derive an entity\nknowledge graph from the source documents, then to pre- generate community summaries\nfor all groups of closely-related entities. Given a question, each community summary is used\nto generate a partial response, before all partial responses are again summarized in a final\nresponse to the user. For a class of global sensemaking questions over datasets in the 1\nmillion token range, we show that Graph RAG leads to substantial improvements over a na\n̈ıve RAG baseline for both the comprehensiveness and diversity of generated answers. An\nopen-source, Python-based implementation of both global and local Graph RAG approaches\nis forthcoming at https://aka.ms/graphrag.\n######################\nOutput:'

In [5]:

from langchain_openai import ChatOpenAI
llm = ChatOpenAI(
    api_key="ollama",
    base_url="http://ollama:11434/v1",
    model="llama3.1",
    # callbacks=[callback_handler],
)
llm.invoke("Hello, world!")

from langchain_ollama import ChatOllama
llm = ChatOllama(
    api_key="ollama",
    base_url="http://localhost:11434",
    model="llama3.1",
    # callbacks=[callback_handler],
)
llm.invoke("Hello, world!")

input Hello, world!
input [HumanMessage(content='Hello, world!', additional_kwargs={}, response_metadata={})]
input Hello, world!


AIMessage(content="Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?", additional_kwargs={}, response_metadata={'model': 'llama3.1', 'created_at': '2024-09-24T11:39:53.454897Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 2133115250, 'load_duration': 32338792, 'prompt_eval_count': 14, 'prompt_eval_duration': 82287000, 'eval_count': 25, 'eval_duration': 2017446000}, id='run-e431b8ff-fe92-4405-a5ec-d472eec12106-0', usage_metadata={'input_tokens': 14, 'output_tokens': 25, 'total_tokens': 39})

In [52]:
# llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
# embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    api_key="ollama",
    base_url="http://localhost:11434/v1",
    model="llama3.1",
    # callbacks=[callback_handler],
)

token_encoder = tiktoken.get_encoding("cl100k_base")

class OpenAICompatibleOllamaEmbedding:
    def __init__(self, model: str):
        self.model = model
        

    def __call__(self, prompt: str):
        return ollama.embeddings(model=self.model, prompt=prompt)["embedding"]

    def embed(self, prompt: str):
        return self(prompt=prompt)
    
    def embed_documents(self, texts: list[str]):
        return [self(text) for text in texts]

text_embedder = OpenAICompatibleOllamaEmbedding(model="nomic-embed-text")

# text_embedder = OpenAIEmbedding(
#     api_key=api_key,
#     api_base=None,
#     api_type=OpenaiApiType.OpenAI,
#     model=embedding_model,
#     deployment_name=embedding_model,
#     max_retries=20,
# )

### Create local search context builder

In [53]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [54]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [55]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [56]:
result = await search_engine.asearch("Tell me about Ishita Mann")
print(result.response)

messages {'role': 'system', 'content': '\n---Role---\n\nYou are a helpful assistant responding to questions about data in the tables provided.\n\n\n---Goal---\n\nGenerate a response of the target length and format that responds to the user\'s question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.\n\nIf you don\'t know the answer, just say so. Do not make anything up.\n\nPoints supported by data should list their data references as follows:\n\n"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."\n\nDo not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.\n\nFor example:\n\n"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7)



In [65]:
question = "what is the usecase of Graph RAG"
result = await search_engine.asearch(question)
print(result.response)

messages {'role': 'system', 'content': '\n---Role---\n\nYou are a helpful assistant responding to questions about data in the tables provided.\n\n\n---Goal---\n\nGenerate a response of the target length and format that responds to the user\'s question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.\n\nIf you don\'t know the answer, just say so. Do not make anything up.\n\nPoints supported by data should list their data references as follows:\n\n"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."\n\nDo not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.\n\nFor example:\n\n"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7)



#### Inspecting the context data used to generate the response

In [58]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,4,TEXT DATA,A dataset of text used for training and testin...,2,True
1,18,SOURCE DOCUMENTS,The original texts that are being processed to...,3,True
2,2,USER QUESTIONS,Questions asked by users to be answered by the...,2,True
3,8,MS,Microsoft is the company behind the Graph RAG ...,1,True
4,17,GRAPH-BASED TEXT INDEX,A graph-based index of entities and relationsh...,2,True


In [59]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,1,QUESTION ANSWERING,PRIVATE TEXT CORPORA,Question answering is used to index private te...,4.0,8,2,True
1,2,QUESTION ANSWERING,TEXT DATA,Question answering models are trained on a lar...,3.0,7,1,True
2,3,QUESTION ANSWERING,MODEL TRAINING,Question answering models are trained using th...,2.0,7,1,True
3,4,QUESTION ANSWERING,EVALUATION METRICS,Question answering models are evaluated using ...,2.0,7,1,True
4,6,PRIVATE TEXT CORPORA,USER QUESTIONS,User questions are answered by indexing privat...,3.0,5,2,True


In [60]:
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,1,Graph RAG Community,# Graph RAG Community\n\nThe Graph RAG communi...


In [61]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,0,To combine the strengths of these contrasting ...
1,6,Our approach uses an LLM to build a graph-base...
2,2,"An open-source, Python-based implementation of..."
3,4,"Prior QFS methods, meanwhile, fail to scale to..."


In [62]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [63]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [64]:

question_history = [
    "Tell me about Agent Mercer",
    "What happens in Dulce military base?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

messages {'role': 'system', 'content': '\n---Role---\n\nYou are a helpful assistant generating a bulleted list of 5 questions about data in the tables provided.\n\n\n---Data tables---\n\n-----Conversation History-----\nturn|content\nuser|Tell me about Agent Mercer\n\n\n-----Reports-----\nid|title|content\n1|Graph RAG Community|"# Graph RAG Community\n\nThe Graph RAG community revolves around a graph-based approach to question answering, model training, and evaluation metrics. It involves various entities such as GRAPH RAG, QUESTION ANSWERING, MODEL TRAINING, EVALUATION METRICS, TEXT DATA, and more.\n\n## Graph RAG is a graph-based approach to question answering\n\nThe Graph RAG community centers around a graph-based approach to question answering [Data: Entities (3, 6); Relationships (0)]. This approach uses a set of evaluation metrics to evaluate the performance of question answering models. The Graph RAG approach is trained on a large corpus of text data and can be accessed at a spec

