In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

# Multi Index Search
This notebook demonstrates multi-index search using the GraphRAG API.

The examples use indexes created from the Wikipedia state articles for Alaska, California, DC, Maryland, NY, and Washington.

In [2]:
import asyncio

import pandas as pd

from graphrag.api.query import (
    multi_index_basic_search,
    multi_index_drift_search,
    multi_index_global_search,
    multi_index_local_search,
)
from graphrag.config.create_graphrag_config import create_graphrag_config

def _lancedb_config(name):
    """Return the LanceDB vector-store settings for the index called ``name``."""
    return {
        "type": "lancedb",
        "db_uri": f"inputs/{name}/lancedb",
        "container_name": "default",
        "overwrite": True,
        "index_name": name,
    }


# Names of the indexes queried together, kept in alphabetical order.
indexes = sorted(["alaska", "california", "dc", "maryland", "ny", "washington"])

print(indexes)

# One vector-store configuration per index, keyed by index name.
vector_store_configs = {name: _lancedb_config(name) for name in indexes}

['alaska', 'california', 'dc', 'maryland', 'ny', 'washington']


In [None]:
def _azure_model_settings(model_type, model_name, **leading_settings):
    """Build the settings dict for one Azure OpenAI model entry.

    ``leading_settings`` are emitted first; ``model_name`` is used for both the
    "model" and the "deployment_name" keys.
    """
    return {
        **leading_settings,
        "parallelization_num_threads": 50,
        "parallelization_stagger": 0.3,
        "async_mode": "threaded",
        "type": model_type,
        "model": model_name,
        "auth_type": "azure_managed_identity",
        # Placeholder value: replace with your own endpoint before running.
        "api_base": "<API_BASE_URL>",
        "api_version": "2024-02-15-preview",
        "deployment_name": model_name,
    }


# Chat and embedding models share everything except type, name and JSON support.
model_settings = {
    "default_chat_model": _azure_model_settings(
        "azure_openai_chat", "gpt-4o", model_supports_json=True
    ),
    "default_embedding_model": _azure_model_settings(
        "azure_openai_embedding", "text-embedding-3-large"
    ),
}

# Prompt files (and token limit) for each of the four search methods.
search_settings = {
    "local_search": {
        "prompt": "prompts/local_search_system_prompt.txt",
        "llm_max_tokens": 12000,
    },
    "global_search": {
        "map_prompt": "prompts/global_search_map_system_prompt.txt",
        "reduce_prompt": "prompts/global_search_reduce_system_prompt.txt",
        "knowledge_prompt": "prompts/global_search_knowledge_system_prompt.txt",
    },
    "drift_search": {
        "prompt": "prompts/drift_search_system_prompt.txt",
        "reduce_prompt": "prompts/drift_search_reduce_prompt.txt",
    },
    "basic_search": {"prompt": "prompts/basic_search_system_prompt.txt"},
}

config_data = {
    "models": model_settings,
    "vector_store": vector_store_configs,
    **search_settings,
}

# NOTE(review): the second argument "." is presumably the project root directory — confirm.
parameters = create_graphrag_config(config_data, ".")

# Event loop on which the search cells below schedule their query coroutines.
loop = asyncio.get_event_loop()

### Multi-index Global Search

In [None]:
entities = [pd.read_parquet(f"inputs/{index}/entities.parquet") for index in indexes]
communities = [
    pd.read_parquet(f"inputs/{index}/communities.parquet") for index in indexes
]
community_reports = [
    pd.read_parquet(f"inputs/{index}/community_reports.parquet") for index in indexes
]

task = loop.create_task(
    multi_index_global_search(
        parameters,
        entities,
        communities,
        community_reports,
        indexes,
        1,
        False,
        "Multiple Paragraphs",
        False,
        "Describe this dataset.",
    )
)
results = await task

#### Print report

In [19]:
# results[0] is the first element returned by the global search (the generated answer text).
print(results[0])

## Overview of the Dataset

The dataset is a comprehensive collection of reports that cover a wide array of topics, including historical events, cultural dynamics, economic influences, geographical regions, and environmental issues across various regions in the United States. Each report is uniquely identified by an ID and includes a title, occurrence weight, content, and rank. These elements help to organize the dataset and provide insights into the significance and relevance of each report.

## Content and Structure

The reports provide detailed information about specific entities and their relationships, highlighting their importance and impact in different contexts. Topics range from the historical significance of regions like Maryland and Washington D.C., to the cultural and economic landscapes of areas such as Washington State and Los Angeles. The dataset also delves into significant events and figures, such as the Good Friday Earthquake, the Trans-Alaska Pipeline, and the role o

#### Show context links back to original index

In [None]:
# Trace reports from the merged multi-index context back to their source index.
# NOTE(review): these ids are hard-coded, presumably taken from one particular
# answer; a fresh run may return different ids. Ids that are not in the returned
# context are reported and skipped instead of raising IndexError.
# NOTE(review): the global-search cell loads `community_reports.parquet`, while this
# lookup reads `create_final_community_reports.parquet` — confirm the file naming.
for report_id in [120, 129, 40, 16, 204, 143, 85, 122, 83]:
    # Look the report up once rather than re-scanning the context for every field.
    report = next(
        (r for r in results[1]["reports"] if r["id"] == str(report_id)), None
    )
    if report is None:
        print(f"report {report_id} not found in the returned context; skipping")
        continue

    index_name = report["index_name"]
    index_id = report["index_id"]
    print(report_id, index_name, index_id)

    index_reports = pd.read_parquet(
        f"inputs/{index_name}/create_final_community_reports.parquet"
    )
    # Title as seen in the merged context, then the title stored in the source
    # index for the same community; the two lines are expected to match.
    print(report["title"])
    print(
        index_reports.loc[index_reports["community"] == int(index_id), "title"].iloc[0]
    )

120 dc 26
Washington D.C. Founders and Influences
Washington D.C. Founders and Influences
129 dc 35
Smithsonian Institution and Its Museums
Smithsonian Institution and Its Museums
40 alaska 40
Good Friday Earthquake and Its Global Impact
Good Friday Earthquake and Its Global Impact
16 alaska 16
Trans-Alaska Pipeline and Prudhoe Bay
Trans-Alaska Pipeline and Prudhoe Bay
204 ny 36
Long Island and its Educational and Cultural Landscape
Long Island and its Educational and Cultural Landscape
143 maryland 5
Western Maryland and Appalachian Region
Western Maryland and Appalachian Region
85 california 38
California and Its Historical and Geopolitical Context
California and Its Historical and Geopolitical Context
122 dc 28
District of Columbia and Legal Framework
District of Columbia and Legal Framework
83 california 36
Southern California and Key Geographical Entities
Southern California and Key Geographical Entities


### Multi-index Local Search

In [None]:
nodes = [
    pd.read_parquet(f"inputs/{index}/create_final_nodes.parquet") for index in indexes
]
entities = [
    pd.read_parquet(f"inputs/{index}/create_final_entities.parquet")
    for index in indexes
]
community_reports = [
    pd.read_parquet(f"inputs/{index}/create_final_community_reports.parquet")
    for index in indexes
]
covariates = [
    pd.read_parquet(f"inputs/{index}/create_final_covariates.parquet")
    for index in indexes
]
text_units = [
    pd.read_parquet(f"inputs/{index}/create_final_text_units.parquet")
    for index in indexes
]
relationships = [
    pd.read_parquet(f"inputs/{index}/create_final_relationships.parquet")
    for index in indexes
]

task = loop.create_task(
    multi_index_local_search(
        parameters,
        nodes,
        entities,
        community_reports,
        text_units,
        relationships,
        covariates,
        indexes,
        1,
        "Multiple Paragraphs",
        False,
        "weather",
    )
)
results = await task

#### Print report

In [15]:
# results[0] is the first element returned by the local search (the generated answer text).
print(results[0])

### Weather Patterns in California and Washington

#### California's Climate

California exhibits a wide range of climates due to its diverse geography, which includes coastal areas, mountains, and deserts. The state experiences a Mediterranean climate in the Central Valley and coastal regions, characterized by wet winters and dry summers. The Sierra Nevada mountains have an alpine climate with snow in winter and mild summers, while the eastern side of the mountains creates rain shadows, leading to desert conditions in areas like Death Valley, which is one of the hottest places on Earth [Data: Reports (47); Entities (500, 502, 506)].

The state's climate diversity results in varying weather patterns, with northern regions receiving more rainfall than the south. The demand for water is high due to these climatic variations, and droughts have become more frequent, exacerbated by climate change and overextraction of water resources [Data: Reports (47); Claims (100)].

#### Washington's Cl

#### Show context links back to original index

In [None]:
def _find_context_record(records, record_id):
    """Return the first record in ``records`` whose "id" equals ``str(record_id)``.

    Returns ``None`` when no record matches.
    """
    wanted = str(record_id)
    return next((record for record in records if record["id"] == wanted), None)


# Trace reports, entities, relationships and claims from the merged multi-index
# context back to the index they came from. For each id the value from the merged
# context is printed first, then the value stored in the source index; the two
# are expected to match.
# NOTE(review): the ids are hard-coded, presumably taken from the citations in the
# answer printed above; a fresh run may cite different ids. Ids that are not in
# the returned context are reported and skipped instead of raising IndexError.

for report_id in [47, 213]:
    report = _find_context_record(results[1]["reports"], report_id)
    if report is None:
        print(f"report {report_id} not found in the returned context; skipping")
        continue
    index_name = report["index_name"]
    index_id = report["index_id"]
    print(report_id, index_name, index_id)
    index_reports = pd.read_parquet(
        f"inputs/{index_name}/create_final_community_reports.parquet"
    )
    print(report["title"])
    print(
        index_reports.loc[index_reports["community"] == int(index_id), "title"].iloc[0]
    )

for entity_id in [500, 502, 506, 1960, 1961, 1962]:
    entity = _find_context_record(results[1]["entities"], entity_id)
    if entity is None:
        print(f"entity {entity_id} not found in the returned context; skipping")
        continue
    index_name = entity["index_name"]
    index_id = entity["index_id"]
    print(entity_id, index_name, index_id)
    index_entities = pd.read_parquet(
        f"inputs/{index_name}/create_final_entities.parquet"
    )
    # Descriptions can be long; compare only the first 100 characters.
    print(entity["description"][:100])
    print(
        index_entities.loc[
            index_entities["human_readable_id"] == int(index_id), "description"
        ].iloc[0][:100]
    )

for relationship_id in [1805, 1806]:
    relationship = _find_context_record(results[1]["relationships"], relationship_id)
    if relationship is None:
        print(
            f"relationship {relationship_id} not found in the returned context; skipping"
        )
        continue
    index_name = relationship["index_name"]
    index_id = relationship["index_id"]
    print(relationship_id, index_name, index_id)
    index_relationships = pd.read_parquet(
        f"inputs/{index_name}/create_final_relationships.parquet"
    )
    print(relationship["description"])
    print(
        index_relationships.loc[
            index_relationships["human_readable_id"] == int(index_id), "description"
        ].iloc[0]
    )

for claim_id in [100]:
    claim = _find_context_record(results[1]["claims"], claim_id)
    if claim is None:
        print(f"claim {claim_id} not found in the returned context; skipping")
        continue
    index_name = claim["index_name"]
    index_id = claim["index_id"]
    # Print the claim id here (the original printed the stale `relationship_id`
    # left over from the previous loop).
    print(claim_id, index_name, index_id)
    index_claims = pd.read_parquet(
        f"inputs/{index_name}/create_final_covariates.parquet"
    )
    print(claim["description"])
    print(
        index_claims.loc[
            index_claims["human_readable_id"] == int(index_id), "description"
        ].iloc[0]
    )

47 california 0
California: A Hub of Cultural, Economic, and Environmental Significance
California: A Hub of Cultural, Economic, and Environmental Significance
213 washington 0
Washington State: Economic and Cultural Hub
Washington State: Economic and Cultural Hub
500 california 161
Boca is a location in California where the lowest temperature in the state, −45 °F, was recorded on 
Boca is a location in California where the lowest temperature in the state, −45 °F, was recorded on 
502 california 163
Mammoth is a location in the Sierra Nevada, California, known for its mountain climate
Mammoth is a location in the Sierra Nevada, California, known for its mountain climate
506 california 167
Eureka is a city in California known for its cool summers in the Humboldt Bay region
Eureka is a city in California known for its cool summers in the Humboldt Bay region
1960 washington 104
The Southern Oscillation is a climate pattern that influences weather during the cold season, affect
The Souther

### Multi-index Drift Search

In [None]:
nodes = [
    pd.read_parquet(f"inputs/{index}/create_final_nodes.parquet") for index in indexes
]
entities = [
    pd.read_parquet(f"inputs/{index}/create_final_entities.parquet")
    for index in indexes
]
community_reports = [
    pd.read_parquet(f"inputs/{index}/create_final_community_reports.parquet")
    for index in indexes
]
text_units = [
    pd.read_parquet(f"inputs/{index}/create_final_text_units.parquet")
    for index in indexes
]
relationships = [
    pd.read_parquet(f"inputs/{index}/create_final_relationships.parquet")
    for index in indexes
]

task = loop.create_task(
    multi_index_drift_search(
        parameters,
        nodes,
        entities,
        community_reports,
        text_units,
        relationships,
        indexes,
        1,
        "Multiple Paragraphs",
        False,
        "agriculture",
    )
)
results = await task

#### Print report

In [24]:
# results[0] is the first element returned by the DRIFT search (the generated answer text).
print(results[0])

### Overview of Agriculture in Key U.S. Regions

Agriculture in the United States is a diverse and regionally varied industry, with different areas specializing in specific crops and facing unique challenges. This overview highlights the agricultural dynamics in several key regions, including California, Washington, and Alaska, as well as the role of agriculture in the broader economic and environmental context.

#### California's Agricultural Landscape

California is a powerhouse in U.S. agriculture, with the Central Valley being a critical area for crop production. The region is known for producing a wide variety of crops, including almonds, grapes, and dairy products, supported by fertile soil and a favorable climate [Data: Sources (16, 29)]. However, water management is a significant challenge due to the state's dry climate and frequent droughts. The Sacramento and San Joaquin Rivers are vital for irrigation, but water scarcity remains a persistent issue, impacting crop yields and 

#### Show context links back to original index

In [None]:
# For DRIFT search, results[1] is keyed by question and each entry carries its own
# "reports" and "sources". For every id, find the first question whose context
# contains it, print the value from the merged context, then the value stored in
# the source index. Ids that appear under no question are silently skipped.

for report_id in [47, 236]:
    wanted_report = str(report_id)
    for question in results[1]:
        resq = results[1][question]
        question_reports = resq["reports"]
        if len(question_reports) == 0:
            continue
        report_hits = [r for r in question_reports if r["id"] == wanted_report]
        if len(report_hits) == 0:
            continue
        report = report_hits[0]
        index_name = report["index_name"]
        index_id = report["index_id"]
        print(question, report_id, index_name, index_id)
        index_reports = pd.read_parquet(
            f"inputs/{index_name}/create_final_community_reports.parquet"
        )
        print(report["title"])
        community_mask = index_reports["community"] == int(index_id)
        print(index_reports[community_mask]["title"].to_numpy()[0])
        # Only the first question that contains this report is shown.
        break

for source_id in [10, 16, 19, 20, 21, 22, 24, 29, 93, 95]:
    wanted_source = str(source_id)
    for question in results[1]:
        resq = results[1][question]
        question_sources = resq["sources"]
        if len(question_sources) == 0:
            continue
        source_hits = [s for s in question_sources if s["id"] == wanted_source]
        if len(source_hits) == 0:
            continue
        source = source_hits[0]
        index_name = source["index_name"]
        index_id = source["index_id"]
        print(question, source_id, index_name, index_id)
        index_sources = pd.read_parquet(
            f"inputs/{index_name}/create_final_text_units.parquet"
        )
        # Text units can be long; compare only the first 250 characters.
        print(source["text"][:250])
        # NOTE(review): the text unit is looked up by DataFrame index label here,
        # not by a `human_readable_id` column as the local-search cell does — confirm.
        print(index_sources.loc[int(index_id)]["text"][:250])
        # Only the first question that contains this source is shown.
        break

What strategies is the USDA implementing in California to combat drought effects on agriculture? 47 california 0
California: A Hub of Cultural, Economic, and Environmental Significance
California: A Hub of Cultural, Economic, and Environmental Significance
What environmental challenges affect agriculture around the Columbia River? 236 washington 23
Columbia River and Its Regional Impact
Columbia River and Its Regional Impact
How does agriculture in the Tanana Valley impact the local economy? 10 alaska 10
 Fort Greely. This area was largely set aside and developed under a state program spearheaded by Hammond during his second term as governor. Delta-area crops consist predominantly of barley and hay. West of Fairbanks lies another concentration of sma
 Fort Greely. This area was largely set aside and developed under a state program spearheaded by Hammond during his second term as governor. Delta-area crops consist predominantly of barley and hay. West of Fairbanks lies another concentra

### Multi-index Basic Search

In [None]:
text_units = [
    pd.read_parquet(f"inputs/{index}/create_final_text_units.parquet")
    for index in indexes
]

task = loop.create_task(
    multi_index_basic_search(
        parameters, text_units, indexes, False, "industry in maryland"
    )
)
results = await task

#### Print report

In [25]:
# results[0] is the first element returned by the basic search (the generated answer text).
print(results[0])

# Industry in Maryland

Maryland's economy is diverse and robust, with significant contributions from various sectors, including manufacturing, biotechnology, transportation, and agriculture. The state's strategic location near Washington, D.C., and its access to major transportation hubs like the Port of Baltimore, play a crucial role in its industrial landscape.

## Manufacturing

Manufacturing in Maryland is highly diversified, with no single sub-sector contributing more than 20% of the total. Key manufacturing industries include electronics, computer equipment, and chemicals. Historically, the primary metals sub-sector was significant, with the Sparrows Point steel factory once being the largest in the world. However, this sector has faced challenges from foreign competition, bankruptcies, and mergers [Data: Sources (0, 1)].

## Biotechnology

Maryland is a major center for life sciences research and development, hosting more than 400 biotechnology companies, making it the fourth l

#### Show context links back to original text

Note that the original index name is not saved in the context data for basic search.

In [26]:
# Show the first 250 characters of entries 0 and 1 of the context sources, the
# ones cited in the answer above as "Sources (0, 1)".
basic_sources = results[1]["sources"]
for position in range(2):
    snippet = basic_sources[position]["text"][:250]
    print(snippet)

 highly diversified with no sub-sector contributing over 20 percent of the total. Typical forms of manufacturing include electronics, computer equipment, and chemicals. The once-mighty primary metals sub-sector, which once included what was then the 
20%. Demographically, both Protestants and those identifying with no religion are more numerous than Catholics.
According to the Pew Research Center in 2014, 69 percent of Maryland's population identifies themselves as Christian. Nearly 52% of the ad
