# Test Complex Queries over Multiple Documents (with and without Query Decomposition)

Query Decomposition: The ability to decompose a complex query into a simpler query given the content of the index.

This notebook uses ChatGPT (`gpt-3.5-turbo`) as the LLM.

In [1]:
import logging
import sys

# Enable the two lines below to stream INFO-level log output to stdout.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Logging is currently switched OFF: the root logger is fetched and disabled here.
# Comment these two lines out if you want log output back.
logger = logging.getLogger()
logger.disabled = True

In [2]:
from gpt_index import (
    GPTSimpleVectorIndex, 
    GPTSimpleKeywordTableIndex, 
    GPTListIndex, 
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext
)
from langchain.llms.openai import OpenAIChat, OpenAI
import requests

#### Load Datasets

Download the Wikipedia pages for the cities listed below and load them as documents.

In [3]:
# Wikipedia page titles (one per city) used throughout the notebook.
wiki_titles = [
    "Toronto",
    "Seattle",
    "San Francisco",
    "Chicago",
    "Boston",
    "Washington, D.C.",
    "Cambridge, Massachusetts",
    "Houston",
]

In [4]:
from pathlib import Path

import requests
# Download the plain-text extract of every page in wiki_titles to data/<title>.txt.

# Create the output directory once, up front; exist_ok keeps re-running this cell safe.
data_path = Path('data')
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,
            'explaintext': True,
        },
        # Do not let a stalled connection hang the notebook indefinitely.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
    http_response.raise_for_status()
    response = http_response.json()

    # Only one title is requested per call, so take the first entry of the 'pages' mapping.
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']

    # Write UTF-8 explicitly: the platform default encoding may be unable to encode
    # non-ASCII characters in the article text.
    # NOTE(review): confirm the reader that later loads these files also decodes UTF-8.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)


In [5]:
# Load the saved text file of every city, keyed by its wiki title.
city_docs = {
    title: SimpleDirectoryReader(input_files=[f"data/{title}.txt"]).load_data()
    for title in wiki_titles
}


### Building the document indices
Build one vector index per city Wikipedia page and save each index to disk.

In [4]:
# LLM predictor backed by ChatGPT (gpt-3.5-turbo) at temperature 0.
llm_predictor_chatgpt = LLMPredictor(
    llm=OpenAIChat(temperature=0, model_name="gpt-3.5-turbo"),
)
# Service context built from defaults, with the ChatGPT predictor plugged in;
# it is passed to every graph.query call below.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_chatgpt,
)

In [None]:
# Build one vector index per city, record a short summary for it, and persist it.
city_indices, index_summaries = {}, {}
for wiki_title in wiki_titles:
    city_index = GPTSimpleVectorIndex.from_documents(city_docs[wiki_title], chunk_size_limit=512)
    city_indices[wiki_title] = city_index
    # Summary text for this city's index (consumed when the graph is composed).
    index_summaries[wiki_title] = f"Wikipedia articles about {wiki_title}"
    # Saved under index_<title>.json so the loading cell can pick it up later.
    city_index.save_to_disk(f'index_{wiki_title}.json')

### Loading the indices
Load the previously saved per-city vector indices from disk (use this instead of rebuilding them).

In [7]:
# If indices already saved, try loading.
# Both lookup tables are rebuilt from wiki_titles so this cell also works on a fresh
# kernel where the (optional) index-building cell was skipped: the graph-construction
# cell reads index_summaries and would otherwise fail with a NameError.
city_indices = {}
index_summaries = {}
for wiki_title in wiki_titles:
    city_indices[wiki_title] = GPTSimpleVectorIndex.load_from_disk(
        f'index_{wiki_title}.json'
    )
    # Same summary text the index-building cell assigns for this city.
    index_summaries[wiki_title] = f"Wikipedia articles about {wiki_title}"

### Build Graph: Keyword Table Index on top of vector indices! 

We compose a keyword table index on top of all the vector indices.

In [5]:
from gpt_index.indices.composability import ComposableGraph

In [None]:
# Compose a keyword-table index over the per-city vector indices.
# Both dicts are filled in wiki_titles order, so indices and summaries line up pairwise.
child_indices = list(city_indices.values())
child_summaries = list(index_summaries.values())
graph = ComposableGraph.from_indices(
    GPTSimpleKeywordTableIndex,
    child_indices,
    child_summaries,
    max_keywords_per_chunk=50,
)

In [11]:
# [optional] save to disk
# Persists the composed graph so later sessions can load it instead of rebuilding it.
graph.save_to_disk("index_multi_doc_graph.json")

In [6]:
# [optional] load from disk
# Reads back the file written by the save cell; replaces any graph built in this session.
graph = ComposableGraph.load_from_disk("index_multi_doc_graph.json")

### Define Query Configs

**Query Transform**

In [7]:
from gpt_index.indices.query.query_transform.base import DecomposeQueryTransform
# Query transform driven by the ChatGPT predictor; verbose=True makes it print the
# "Current query" / "New query" lines seen in the outputs below.
decompose_transform = DecomposeQueryTransform(llm_predictor_chatgpt, verbose=True)

In [8]:
# Query settings, one dict per index struct type.
# The "simple_dict" entry must stay first: later cells toggle decomposition through
# query_configs[0]["query_transform"].
simple_dict_config = {
    "index_struct_type": "simple_dict",
    "query_mode": "default",
    "query_kwargs": {"similarity_top_k": 1},
    # NOTE: set query transform for subindices
    "query_transform": decompose_transform,
}
keyword_table_config = {
    "index_struct_type": "keyword_table",
    "query_mode": "simple",
    "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
}
query_configs = [simple_dict_config, keyword_table_config]

**Complex Query 1**

In [9]:
# Airports comparison WITH query decomposition enabled on the first config entry.
query_str = "Compare and contrast the airports in Seattle, Houston, and Toronto. "
query_configs[0].update(query_transform=decompose_transform)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jerryliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. 
[0m[38;5;200m[1;3m> New query: What are the major airports in Houston?
[0m[36;1m[1;3m> Got response: The major airports in Houston are Bush Intercontinental Airport and William P. Hobby Airport. There is also a third municipal airport called Ellington Airport....
[0m[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. 
[0m[38;5;200m[1;3m> New query: What is the name of the airport in Seattle?
[0m[36;1m[1;3m> Got response: The name of the airport in Seattle is Seattle-Tacoma International Airport, locally known as Sea-Tac Airport....
[0m[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. 
[0m[38;5;200m[1;3m> New query: What are some notable features of the Toronto Pearson International Airport?
[0m[36;1m[1;3m> Got response: The Union Pearson Express train service provides a direct

In [11]:
# Show the synthesized answer (print() already converts its argument with str()).
print(response_chatgpt)



Seattle has one major airport, while Houston has two major airports and a third municipal airport. Toronto has one major airport, but also has a direct train service linking it to Union Station.


In [12]:
# Airports comparison WITHOUT query decomposition (transform cleared on the first config entry).
query_str = "Compare and contrast the airports in Seattle, Houston, and Toronto. "
query_configs[0].update(query_transform=None)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[36;1m[1;3m> Got response: The context information provided does not contain any information about the airports in Seattle or Toronto, so a comparison and contrast cannot be made....
[0m[36;1m[1;3m> Got response: The context information only provides details about the airports in Seattle, and does not mention anything about the airports in Houston or Toronto. Therefore, a comparison and contrast of the airp...
[0m[36;1m[1;3m> Got response: The context information provided does not include any information about airports in Seattle or Houston, so a comparison and contrast of airports in those cities cannot be made. The context informat...
[0m

In [13]:
# Display the answer obtained without query decomposition as a string.
str(response_chatgpt)

'It is not possible to compare and contrast the airports in Seattle, Houston, and Toronto based on the given context information.'

**Complex Query 2**

In [14]:
# Sports comparison WITH query decomposition enabled on the first config entry.
query_str = "Compare and contrast the sports environment of Houston and Boston. "
query_configs[0].update(query_transform=decompose_transform)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[33;1m[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What sports teams are based in Houston?
[0m[36;1m[1;3m> Got response: The sports teams based in Houston include the Houston Texans (NFL), Houston Dynamo (MLS), Houston Dash (National Women's Soccer League), and Houston SaberCats (Major League Rugby)....
[0m[33;1m[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What are some notable sports teams based in Boston?
[0m[36;1m[1;3m> Got response: Some notable sports teams based in Boston include the New England Patriots, Boston Breakers, Boston Storm, Boston College, Boston University, Harvard University, Northeastern University, Boston Can...
[0m

In [15]:
# Display the answer obtained with query decomposition as a string.
str(response_chatgpt)

'Houston and Boston both have diverse sports environments with a strong tradition of hosting major sporting events. However, the specific sports and teams represented in each city differ. Houston has a stronger focus on football and soccer, while Boston has a wider range of sports including football, soccer, lacrosse, and basketball. Houston is home to the Houston Texans, Houston Dynamo, Houston Dash, and Houston SaberCats, while Boston has the New England Patriots, Boston Breakers, Boston Storm, Boston College, Boston University, Harvard University, Northeastern University, Boston Cannons, Boston Uprising, and Boston Breach. Boston is known for the Boston Marathon and the Head of the Charles Regatta, while Houston will host the 2026 FIFA World Cup.'

In [16]:
# Sports comparison WITHOUT query decomposition (transform cleared on the first config entry).
query_str = "Compare and contrast the sports environment of Houston and Boston. "
query_configs[0].update(query_transform=None)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[36;1m[1;3m> Got response: 

Sorry, I cannot answer this question as there is no information provided about the sports environment of Boston in the given context....
[0m[36;1m[1;3m> Got response: The context information provided does not contain any information about the sports environment of Houston, therefore a comparison and contrast cannot be made....
[0m

In [17]:
# Display the answer obtained without query decomposition as a string.
str(response_chatgpt)

'Sorry, I cannot answer this question as there is no information provided about the sports environment of Houston and Boston in the given context.'

In [18]:
# Repeat of the sports comparison WITH query decomposition (same query as the earlier cell).
query_str = "Compare and contrast the sports environment of Houston and Boston. "
query_configs[0].update(query_transform=decompose_transform)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[33;1m[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What sports teams are based in Houston?
[0m[36;1m[1;3m> Got response: The sports teams based in Houston include the Houston Texans (NFL), Houston Dynamo (MLS), Houston Dash (National Women's Soccer League), and Houston SaberCats (Major League Rugby)....
[0m[33;1m[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What are some notable sports teams based in Boston?
[0m[36;1m[1;3m> Got response: Some notable sports teams based in Boston include the New England Patriots, Boston Breakers, Boston Storm, Boston College, Boston University, Harvard University, Northeastern University, Boston Can...
[0m

In [19]:
# Print the answer obtained with query decomposition.
print(response_chatgpt)

Houston and Boston both have diverse sports environments with a strong tradition of hosting major sporting events. However, the specific sports and teams represented in each city differ. Houston has a stronger focus on football and soccer, while Boston has a wider range of sports including football, soccer, lacrosse, and basketball. Houston is home to the Houston Texans, Houston Dynamo, Houston Dash, and Houston SaberCats, while Boston has the New England Patriots, Boston Breakers, Boston Storm, Boston College, Boston University, Harvard University, Northeastern University, Boston Cannons, Boston Uprising, and Boston Breach. Boston is known for the Boston Marathon and the Head of the Charles Regatta, while Houston will host the 2026 FIFA World Cup.


In [20]:
# Repeat of the sports comparison WITHOUT query decomposition (same query as the earlier cell).
query_str = "Compare and contrast the sports environment of Houston and Boston. "
query_configs[0].update(query_transform=None)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[36;1m[1;3m> Got response: 

Sorry, I cannot answer this question as there is no information provided about the sports environment of Boston in the given context....
[0m[36;1m[1;3m> Got response: The context information provided does not contain any information about the sports environment of Houston, therefore a comparison and contrast cannot be made....
[0m

In [21]:
# Print the answer obtained without query decomposition.
print(response_chatgpt)

Sorry, I cannot answer this question as there is no information provided about the sports environment of Houston and Boston in the given context.


**Complex Query 3**

In [22]:
# Arts-and-culture comparison WITH query decomposition enabled on the first config entry.
query_str = "Compare and contrast the arts and culture of Houston and Boston. "
query_configs[0].update(query_transform=decompose_transform)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[33;1m[1;3m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What are some notable cultural institutions in Houston?
[0m[36;1m[1;3m> Got response: Some notable cultural institutions in Houston include The Museum of Fine Arts, the Houston Museum of Natural Science, the Contemporary Arts Museum Houston, the Station Museum of Contemporary Art, t...
[0m[33;1m[1;3m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What are some notable cultural institutions in Boston?
[0m[36;1m[1;3m> Got response: Some notable cultural institutions in Boston include the Museum of Fine Arts, the Isabella Stewart Gardner Museum, the Institute of Contemporary Art, the Boston Athenæum, the Boston Children's Muse...
[0m

In [23]:
# Print the answer obtained with query decomposition.
print(response_chatgpt)

Houston and Boston both have a rich arts and culture scene with notable institutions. Both cities have a Museum of Fine Arts and a Children's Museum. However, Houston has more museums and cultural institutions such as the Houston Museum of Natural Science, the Contemporary Arts Museum Houston, the Station Museum of Contemporary Art, the Holocaust Museum Houston, the Houston Zoo, The Menil Collection, Rothko Chapel, the Moody Center for the Arts, the Byzantine Fresco Chapel Museum, and Bayou Bend. Boston, on the other hand, has the Isabella Stewart Gardner Museum, the Institute of Contemporary Art, the Boston Athenæum, the Museum of Science, and the New England Aquarium. Overall, both cities offer a diverse range of cultural experiences for visitors and residents alike.


In [24]:
# Arts-and-culture comparison WITHOUT query decomposition (transform cleared on the first config entry).
query_str = "Compare and contrast the arts and culture of Houston and Boston. "
query_configs[0].update(query_transform=None)
response_chatgpt = graph.query(query_str, query_configs=query_configs, service_context=service_context)

[36;1m[1;3m> Got response: Sorry, I cannot answer this question as there is no information provided about the arts and culture of Boston in the given context....
[0m[36;1m[1;3m> Got response: The context information provided does not contain any information about the arts and culture of Houston, therefore a comparison and contrast cannot be made....
[0m

In [25]:
# Print the answer obtained without query decomposition.
print(response_chatgpt)

Sorry, I cannot answer this question as there is no information provided about the arts and culture of Houston and Boston in the given context.
