# Hierarchical Queries


In [1]:
# Notebook setup: environment variables, event-loop patching and logging.
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load its entries into the environment.
# NOTE(review): this runs before the remaining imports; keep that order in case
# any of them read environment variables at import time -- confirm.
load_dotenv(find_dotenv())
import nest_asyncio
# Patch asyncio so nested event loops are allowed; later cells request
# use_async=True, which presumably needs this inside a notebook kernel.
nest_asyncio.apply()
# `logger` is not referenced by any cell visible in this notebook.
from loguru import logger

In [2]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

## Loading Wikipedia Data


In [3]:
from pathlib import Path

import requests

# Cities whose Wikipedia articles form the corpus; later cells iterate over
# this list to load the saved files.
wiki_titles = ["San Francisco", "Seattle", "Chicago", "Boston", "Dallas"]

# Create the output directory once, up front; exist_ok makes re-runs safe.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # Ask the MediaWiki API for the plain-text extract of the article.
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
        # Without a timeout a stalled connection would hang the cell forever.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    http_response.raise_for_status()
    response = http_response.json()

    # A single title was requested, so the first page entry is the one we want.
    page = next(iter(response["query"]["pages"].values()))
    if "extract" not in page:
        # Same exception type as the plain lookup would raise, but with a
        # message that says which article is affected.
        raise KeyError(f"No text extract returned for Wikipedia page {title!r}")
    wiki_text = page["extract"]

    # Write as UTF-8 explicitly: the article text contains non-ASCII characters
    # and the platform default encoding is not UTF-8 everywhere.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [4]:

# Read every saved article back from disk, one reader per file.
city_docs = []
for wiki_title in wiki_titles:
    reader = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"])
    docs = reader.load_data()
    # Use the city name as the id of the first loaded document so its summary
    # can later be looked up by title.
    docs[0].doc_id = wiki_title
    city_docs += docs

## Building Indices


In [5]:
# Shared building blocks for the index: the chat model (temperature 0) and the
# sentence splitter used to chunk documents.
llm = OpenAI(model="gpt-4o-mini", temperature=0)
chunker = SentenceSplitter(chunk_size=1024)

In [6]:
from llama_index.core.response_synthesizers import ResponseMode

# Synthesizer used while building the index: tree-summarize mode, async.
response_synthesizer = get_response_synthesizer(
    use_async=True,
    response_mode=ResponseMode.TREE_SUMMARIZE,
)

# Options for the index build, collected in one place for readability.
index_options = dict(
    llm=llm,
    transformations=[chunker],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)
doc_summary_index = DocumentSummaryIndex.from_documents(city_docs, **index_options)


  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 5/5 [00:00<00:00, 52.69it/s]
Summarizing documents:   0%|          | 0/5 [00:00<?, ?it/s]

current doc id: San Francisco


Summarizing documents:  20%|██        | 1/5 [00:06<00:24,  6.09s/it]

current doc id: Seattle


Summarizing documents:  40%|████      | 2/5 [00:11<00:16,  5.53s/it]

current doc id: Chicago


Summarizing documents:  60%|██████    | 3/5 [00:15<00:09,  4.91s/it]

current doc id: Boston


Summarizing documents:  80%|████████  | 4/5 [00:19<00:04,  4.62s/it]

current doc id: Dallas


Summarizing documents: 100%|██████████| 5/5 [00:22<00:00,  4.57s/it]
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00,  8.75it/s]


In [8]:
from pprint import pprint

# Look up the stored summary by document id and pretty-print it.
sf_summary = doc_summary_index.get_document_summary("San Francisco")
pprint(sf_summary)

('The text provides a detailed overview of San Francisco, covering a wide '
 'range of topics such as its history, demographics, economy, arts and '
 'culture, sports, environment, education, infrastructure, public safety, and '
 "more. It explores the city's evolution over time, its significant "
 'achievements, cultural landmarks, major industries, environmental '
 'initiatives, government structure, transportation systems, and educational '
 "institutions. The text also highlights San Francisco's diverse population, "
 'economic significance, impact on various sectors like technology and the '
 'performing arts, and its contributions to areas such as LGBT rights and '
 'public safety.\n'
 '\n'
 'Some questions that this text can answer include:\n'
 '- What is the historical background of San Francisco, and how has it evolved '
 'over time?\n'
 "- What are the key industries driving San Francisco's economy, and how has "
 'it diversified over the years?\n'
 '- What are some of the ma

In [9]:
from llama_index.core import StorageContext, load_index_from_storage

# Save the index's storage context to the "index" directory ...
doc_summary_index.storage_context.persist("index")

# ... then rebuild the index from that directory, replacing the in-memory one.
storage_context = StorageContext.from_defaults(persist_dir="index")
doc_summary_index = load_index_from_storage(storage_context)

## Retrieval from Document Summary Index


### High-level Querying


In [23]:

# Query the index through its default query engine, using tree-summarize
# synthesis with async enabled.
query_engine = doc_summary_index.as_query_engine(
    use_async=True,
    response_mode=ResponseMode.TREE_SUMMARIZE,
)
sports_question = "What are the sports teams in San Francisco?"
response = query_engine.query(sports_question)

In [24]:
# Show only the answer text of the query result.
answer_text = response.response
print(answer_text)

The sports teams in San Francisco include Major League Baseball's San Francisco Giants, the National Football League's San Francisco 49ers, the NBA's Golden State Warriors, the collegiate teams San Francisco Dons and San Francisco State Gators, lower-league soccer clubs, and Esports teams like the Overwatch League's San Francisco Shock.


### LLM-based Retrieval


In [16]:

from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexLLMRetriever,
)

# Retriever that is handed the LLM along with the summary index.
llm_retriever = DocumentSummaryIndexLLMRetriever(doc_summary_index, llm=llm)

# Same question as the other retrieval examples in this notebook ("teams",
# plural) so the results of the different retrievers are comparable.
retrieved_nodes = llm_retriever.retrieve(
    "What are the sports teams in San Francisco?"
)
len(retrieved_nodes)


27

In [17]:
# Inspect the first retrieved node: its score, then its text.
top_hit = retrieved_nodes[0]
print(top_hit.score)
print(top_hit.node.get_text())

10.0
San Francisco, officially the City and County of San Francisco, is a commercial, financial, and cultural center within Northern California, United States. With a population of 827,526 residents as of 2024, San Francisco is the fourth-most populous city in California and the 17th-most populous in the U.S.; with a land area of 46.9 square miles (121 square kilometers) at the upper end of the San Francisco Peninsula, it is the fifth-most densely populated U.S. county. Among U.S. cities proper with over 250,000 residents, San Francisco is ranked first by per capita income and sixth by aggregate income as of 2023. San Francisco anchors the 13th-most populous metropolitan statistical area in the U.S., with almost 4.6 million residents in 2023. The larger San Jose–San Francisco–Oakland combined statistical area, the fifth-largest urban region in the U.S., had a 2023 estimated population of over nine million.
Prior to European settlement, the modern city proper was inhabited by the Yelamu

In [18]:
# Plug the LLM-based retriever into a query engine and ask the question.
from llama_index.core.query_engine import RetrieverQueryEngine

# Answers are synthesized in tree-summarize mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Retriever + synthesizer make up the engine.
query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=llm_retriever,
)

# Run the query and print the result.
response = query_engine.query("What are the sports teams in San Francisco?")
print(response)

The sports teams in San Francisco include Major League Baseball's San Francisco Giants, the National Football League's San Francisco 49ers, the National Basketball Association's Golden State Warriors, the San Francisco Dons in NCAA Division I, and the San Francisco State Gators in NCAA Division II.


### Embedding-based Retrieval


In [19]:
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

# Embedding-based retrieval over the document summaries. No `llm` argument is
# passed: this retriever does not take one (an extra keyword would only be
# swallowed by **kwargs), so passing it suggested an LLM was involved.
embedding_retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    similarity_top_k=3,
)

retrieved_nodes = embedding_retriever.retrieve(
    "What are the sports teams in San Francisco?"
)
len(retrieved_nodes)

70

In [20]:
# Show the text of the first node returned by the embedding retriever.
first_node = retrieved_nodes[0].node
print(first_node.get_text())

San Francisco, officially the City and County of San Francisco, is a commercial, financial, and cultural center within Northern California, United States. With a population of 827,526 residents as of 2024, San Francisco is the fourth-most populous city in California and the 17th-most populous in the U.S.; with a land area of 46.9 square miles (121 square kilometers) at the upper end of the San Francisco Peninsula, it is the fifth-most densely populated U.S. county. Among U.S. cities proper with over 250,000 residents, San Francisco is ranked first by per capita income and sixth by aggregate income as of 2023. San Francisco anchors the 13th-most populous metropolitan statistical area in the U.S., with almost 4.6 million residents in 2023. The larger San Jose–San Francisco–Oakland combined statistical area, the fifth-largest urban region in the U.S., had a 2023 estimated population of over nine million.
Prior to European settlement, the modern city proper was inhabited by the Yelamu. On 

In [21]:
# Plug the embedding-based retriever into a query engine and ask the question.
from llama_index.core.query_engine import RetrieverQueryEngine

# Answers are synthesized in tree-summarize mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Retriever + synthesizer make up the engine.
query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=embedding_retriever,
)

# Run the query and print the result.
response = query_engine.query("What are the sports teams in San Francisco?")
print(response)

The sports teams in San Francisco include Major League Baseball's San Francisco Giants, the National Football League's San Francisco 49ers, and the National Basketball Association's Golden State Warriors.
