# An Introduction to LlamaIndex Query Pipelines

In [132]:
import phoenix as px
session = px.launch_app()
import llama_index.core
llama_index.core.set_global_handler("arize_phoenix")
# px.close_app()
# session = px.active_session()

Existing running Phoenix instance detected! Shutting it down and starting a new instance...


üåç To view the Phoenix app in your browser, visit http://localhost:6006/
üì∫ To view the Phoenix app in a notebook, run `px.active_session().view()`
üìñ For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [139]:
from dotenv import load_dotenv
import os
load_dotenv()
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")

True

Setup
Here we setup some data + indexes (from PG's essay) that we'll be using in the rest of the cookbook.

In [16]:
from llama_index.core.query_pipeline import QueryPipeline
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core import(
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage
)


In [83]:
reader = SimpleDirectoryReader(r"D:\Gen_AI_Tutorials\langchain\llamaindexrag\pdfs")

In [84]:
docs = reader.load_data()
print(docs)
# print(docs[0].get_content())

[Document(id_='fd6a6df4-3d66-4379-8610-2aeaa71a2606', embedding=None, metadata={'page_label': '1', 'file_name': 'attention_is_all_you_need.pdf', 'file_path': 'D:\\Gen_AI_Tutorials\\langchain\\llamaindexrag\\pdfs\\attention_is_all_you_need.pdf', 'file_type': 'application/pdf', 'file_size': 569417, 'creation_date': '2024-04-06', 'last_modified_date': '2024-03-21'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Attention Is All You Need\nAshish Vaswani‚àó\nGoogle Brain\navaswani@google.comNoam Shazeer‚àó\nGoogle Brain\nnoam@google.comNiki Parmar‚àó\nGoogle Research\nnikip@google.comJakob Uszkoreit‚àó\nGoogle Research\nusz@google.com\nLlion Jones‚àó\nGoogle Research\nllion@google.comAidan N. Gomez‚àó‚Ä†\nUniversity of Toronto\naidan@cs.toronto.edu≈Åukasz K

In [85]:
import os

In [86]:
from llama_index.embeddings.ollama import OllamaEmbedding
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = OllamaEmbedding(model_name="nomic-embed-text")
# embed_model = HuggingFaceEmbedding(model_name="nomic-embed-text",token="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")

In [88]:
if not os.path.exists("storage"):
    index = VectorStoreIndex.from_documents(docs,embed_model=embed_model)
    # save index to disk
    index.set_index_id("vector_index")
    index.storage_context.persist("./storage")
else:
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir="storage")
    # load index
    index = load_index_from_storage(storage_context, index_id="vector_index")

1. Chain Together Prompt and LLM
In this section we show a super simple workflow of chaining together a prompt with LLM.

We simply define chain on initialization. This is a special case of a query pipeline where the components are purely sequential, and we automatically convert outputs into the right format for the next inputs.

In [89]:
llm = Ollama(model="phi")

In [90]:
from llama_index.core.prompts import PromptTemplate

In [91]:
# try chaining basic prompts
prompt_str= "Please generate related movies to {movie_name}"
prompt_tmpl = PromptTemplate(prompt_str)

p = QueryPipeline(chain=[prompt_tmpl, llm], verbose=True)

In [None]:
response = p.run(movie_name="The Departed")

In [None]:
print(str(response))

In [92]:
# if you implemented this imperatively - you can still do so!
# just make sure you format the prompt and call the right method on the LLM
from llama_index.core.llms import ChatMessage, MessageRole

In [93]:
# try chaining basic prompts
prompt_str= "Please generate related movies to {movie_name}"
prompt_tmpl = PromptTemplate(prompt_str)

In [39]:
# format prompt, pass to LLM
movie_name = "The Departed"
full_prompt_tmpl = prompt_tmpl.format(movie_name= movie_name)
response = llm.chat([ChatMessage(content=full_prompt_tmpl, role=MessageRole.USER)])

In [40]:
print(str(response))

assistant:  1. Goodfellas (1990)
2. American Gangster (2007)
3. Heat (1995)
4. Casino (1997)
5. Scarface (1983)
6. Donnie Brasco (2006)
7. The Godfather Part II (1974)
8. Taxi Driver (1976)
9. The Untouchables (1987)
10. The Bigger Picture (1998)
11. A Bronx Tale (1993)
12. The Firm (1987)
13. Ocean's Eleven (2001)
14. The Departed: Original Motion Picture Soundtrack (2007)
15. The Color of Money (1981)
16. The Italian Job (2003)
17. Casino Royale (2006)
18. Heat (1995)
19. American Gangster (2007)
20. The Godfather Part III (1990)




### Try Output Parsing

Let's parse the outputs into a structured Pydantic object.

In [94]:
from typing import List # type: ignore
from pydantic import BaseModel, Field
from llama_index.core.output_parsers import PydanticOutputParser

In [95]:
class Movie(BaseModel):
    """Object representing a single movie."""

    name: str = Field(..., description="Name of the movie.")
    year: int = Field(..., description="Year of the movie.")

class Movies(BaseModel):
    """Object representing a list of movies."""

    movies: List[Movie] = Field(..., description="List of movies.")

In [96]:
output_parser = PydanticOutputParser(Movies)
json_prompt_str = """\
Please generate related movies to {movie_name}.
"""
json_prompt_str = output_parser.format(json_prompt_str)

In [97]:
json_prompt_str

'Please generate related movies to {movie_name}.\n\n\n\nHere\'s a JSON schema to follow:\n{{"$defs": {{"Movie": {{"description": "Object representing a single movie.", "properties": {{"name": {{"description": "Name of the movie.", "title": "Name", "type": "string"}}, "year": {{"description": "Year of the movie.", "title": "Year", "type": "integer"}}}}, "required": ["name", "year"], "title": "Movie", "type": "object"}}}}, "description": "Object representing a list of movies.", "properties": {{"movies": {{"description": "List of movies.", "items": {{"$ref": "#/$defs/Movie"}}, "title": "Movies", "type": "array"}}}}, "required": ["movies"], "title": "Movies", "type": "object"}}\n\nOutput a valid JSON object but do not repeat the schema.\n'

In [98]:
print(json_prompt_str)

Please generate related movies to {movie_name}.



Here's a JSON schema to follow:
{{"$defs": {{"Movie": {{"description": "Object representing a single movie.", "properties": {{"name": {{"description": "Name of the movie.", "title": "Name", "type": "string"}}, "year": {{"description": "Year of the movie.", "title": "Year", "type": "integer"}}}}, "required": ["name", "year"], "title": "Movie", "type": "object"}}}}, "description": "Object representing a list of movies.", "properties": {{"movies": {{"description": "List of movies.", "items": {{"$ref": "#/$defs/Movie"}}, "title": "Movies", "type": "array"}}}}, "required": ["movies"], "title": "Movies", "type": "object"}}

Output a valid JSON object but do not repeat the schema.



In [99]:
# add json spec to prompt template
json_prompt_tmpl = PromptTemplate(json_prompt_str)

In [None]:

# define QueryPipeline
p = QueryPipeline(chain=[json_prompt_tmpl, llm, output_parser], verbose=True)
output = p.run(movie_name="Toy Story")

In [None]:
print(output)

In [None]:
output

### Streaming Support

The query pipelines have LLM streaming support (simply do `as_query_component(streaming=True)`). Intermediate outputs will get autoconverted, and the final output can be a streaming output. Here's some examples.

**1. Chain multiple Prompts with Streaming**

In [100]:
prompt_str = "Please generate movies related to {movie_name}"
prompt_tmpl = PromptTemplate(prompt_str)

# let's add some subsequent prompts
prompt_str2 = """\
Here's some text:

{text}

Can you rewrite this with a summary of each movie?
"""
prompt_tmpl2 = PromptTemplate(prompt_str2)

llm_streaming = llm.as_query_component(streaming=True)

p_streaming = QueryPipeline(
    chain= [prompt_tmpl, llm_streaming, prompt_tmpl2, llm_streaming],
    verbose=True
)

In [49]:
llm_streaming


LLMChatComponent(partial_dict={}, llm=Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000002C23F818670>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x000002C231EF4280>, completion_to_prompt=<function default_completion_to_prompt at 0x000002C231F168C0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='phi', temperature=0.75, context_window=3900, request_timeout=30.0, prompt_key='prompt', additional_kwargs={}), streaming=True)

In [101]:
llm= Ollama(model="phi",request_timeout=100)

In [52]:
llm

Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000002C23F142EF0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x000002C231EF4280>, completion_to_prompt=<function default_completion_to_prompt at 0x000002C231F168C0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='phi', temperature=0.75, context_window=3900, request_timeout=100.0, prompt_key='prompt', additional_kwargs={})

In [None]:
output = p_streaming.run(movie_name="The Dark Knight")
for o in output:
    print(o.delta, end="")

**2. Feed streaming output to output parser**

In [102]:
p = QueryPipeline(chain=[json_prompt_tmpl,
                         llm.as_query_component(streaming=True),
                         output_parser
                         ],
                         verbose = True
                  )

In [None]:
output = p.run(movie_name="Toy Story")
print(output)

## Chain Together Query Rewriting Workflow (prompts + LLM) with Retrieval

Here we try a slightly more complex workflow where we send the input through two prompts before initiating retrieval.

1. Generate question about given topic.
2. Hallucinate answer given question, for better retrieval.

Since each prompt only takes in one input, note that the `QueryPipeline` will automatically chain LLM outputs into the prompt and then into the LLM.

You'll see how to define links more explicitly in the next section.

In [103]:
# generate question regarding topic
prompt_str1 = "Please generate a concise question about Paul Graham's life regarding the following topic {topic}"
prompt_tmpl1 = PromptTemplate(prompt_str1)

In [104]:
# use HyDE technique to hallucinate answer.
# HyDE: given a question try to hallucinate the answer and use that hallucinated answer as input to a
#       retriver to try to fatch relevant results
prompt_str2 = (
    "Please write a passage to answer the question\n"
    "Try to include as many key details as possible.\n"
    "\n"
    "\n"
    "{query_str}\n"
    "\n"
    "\n"
    'Passage:"""\n'
)
prompt_tmpl2 = PromptTemplate(prompt_str2)

In [105]:
retriever = index.as_retriever(similarity_top_k=5,embed_model=embed_model)

In [106]:
p = QueryPipeline(
    chain=[prompt_tmpl1, llm, prompt_tmpl2, llm, retriever], verbose=True
)

In [61]:
nodes = p.run(topic="college")
len(nodes)

[1;3;38;2;155;135;227m> Running module efaf3b43-0ae5-49c0-9a56-58dbc9266ae9 with input: 
topic: college

[0m[1;3;38;2;155;135;227m> Running module e67aff9e-6445-47f1-835e-beb174e7f930 with input: 
messages: Please generate a concise question about Paul Graham's life regarding the following topic college

[0m[1;3;38;2;155;135;227m> Running module a208701c-524e-4bcc-8ef9-b4b1a1c87f1b with input: 
query_str: assistant:  What is Paul Graham known for in relation to his college experiences?


[0m[1;3;38;2;155;135;227m> Running module 317069b8-599c-463d-b03b-85e7031ebbf6 with input: 
messages: Please write a passage to answer the question
Try to include as many key details as possible.


 What is Paul Graham known for in relation to his college experiences?



Passage:"""


[0m[1;3;38;2;155;135;227m> Running module 7a8262ad-9d12-4cd2-b745-f8adedc31dd4 with input: 
input: assistant:  Paul Graham, a highly influential figure in the world of entrepreneurship and technology, has shared 

5

In [62]:
for node in nodes:
    print(node)

Node ID: d8ccdae9-b239-4f1e-a74d-36fa8992c6bd
Text: On John‚Äôs footer days she never once forgot his sweater, and she
usually carried an umbrella in her mouth in case of rain. There is a
room in the basement of Miss Fulsom‚Äôs school where the nurses wait.
They sat on forms, while Nana lay on the floor, but that was the only
difference. They affected to ignore her as of an inferior social
status to ...
Score:  0.485

Node ID: 824d56ae-e31e-45d3-907d-da842d9cb614
Text: No. 27 was only a few yards distant, but there had been a slight
fall of snow, and Father and Mother Darling picked their way over it
deftly not to soil their shoes. They were already the only persons in
the street, and all the stars were watching them. Stars are beautiful,
but they may not take an active part in anything, they must just look
on...
Score:  0.472

Node ID: f88ae9e3-e7a6-4c9b-9076-ea674ef73a9c
Text: by J. M. Barrie [James Matthew Barrie]  A Millennium Fulcrum
Edition produced in 1991 by Duncan Research. No

## Create a Full RAG Pipeline as a DAG

Here we chain together a full RAG pipeline consisting of query rewriting, retrieval, reranking, and response synthesis.

Here we can't use `chain` syntax because certain modules depend on multiple inputs (for instance, response synthesis expects both the retrieved nodes and the original question). Instead we'll construct a DAG explicitly, through `add_modules` and then `add_link`.

### 1. RAG Pipeline with Query Rewriting

We use an LLM to rewrite the query first before passing it to our downstream modules - retrieval/reranking/synthesis.

In [None]:
# install post processor cohere rerank
# pip install llama-index-postprocessor-cohere-rerank

In [107]:
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core import Settings

In [114]:
# define modules
prompt_str = "Please generate a question about regarding the following topic {topic}"
prompt_tmpl = PromptTemplate(prompt_str)
retriever = index.as_retriever(similarity_to_k=3,embed_model=embed_model)
reranker = CohereRerank(api_key='x6JCRmE5AymCDcV5jyQjG6g54AXE2GsHX1jUH3T5')
summarizer = TreeSummarize(llm=llm)

In [115]:
# define query pipeline
p = QueryPipeline(verbose=True)
p.add_modules(
    {
        "llm": llm,
        "prompt_tmpl": prompt_tmpl,
        "retriever": retriever,
        "summarizer": summarizer,
        "reranker": reranker,
    }
)

Next we draw links between modules with add_link. add_link takes in the source/destination module ids, and optionally the source_key and dest_key. Specify the source_key or dest_key if there are multiple outputs/inputs respectively.

You can view the set of input/output keys for each module through module.as_query_component().input_keys and module.as_query_component().output_keys.

Here we explicitly specify dest_key for the reranker and summarizer modules because they take in two inputs (query_str and nodes).

In [116]:

p.add_link("prompt_tmpl", "llm")
p.add_link("llm", "retriever")
p.add_link("retriever", "reranker", dest_key="nodes")
p.add_link("llm", "reranker", dest_key="query_str")
p.add_link("reranker", "summarizer", dest_key="nodes")
p.add_link("llm", "summarizer", dest_key="query_str")


In [117]:
# look at summarizer input keys
print(summarizer.as_query_component().input_keys)

required_keys={'query_str', 'nodes'} optional_keys=set()



We use networkx to store the graph representation. This gives us an easy way to view the DAG!

In [None]:
# ## create graph
# from pyvis.network import Network

# net = Network(notebook=True, cdn_resources="in_line", directed=True)
# net.from_nx(p.dag)
# net.show("rag_dag.html")

# ## another option using `pygraphviz`
# # from networkx.drawing.nx_agraph import to_agraph
# # from IPython.display import Image
# # agraph = to_agraph(p.dag)
# # agraph.layout(prog="dot")
# # agraph.draw('rag_dag.png')
# # display(Image('rag_dag.png'))

In [118]:
response = p.run(topic="YOLO")

[1;3;38;2;155;135;227m> Running module prompt_tmpl with input: 
topic: YOLO

[0m[1;3;38;2;155;135;227m> Running module llm with input: 
messages: Please generate a question about regarding the following topic YOLO

[0m[1;3;38;2;155;135;227m> Running module retriever with input: 
input: assistant:  What is the purpose of using the You Only Look Once (YOLO) algorithm in object detection systems?


[0m[1;3;38;2;155;135;227m> Running module reranker with input: 
query_str: assistant:  What is the purpose of using the You Only Look Once (YOLO) algorithm in object detection systems?

nodes: [NodeWithScore(node=TextNode(id_='12ca46ae-7814-4fd4-9da5-340261a1d6cc', embedding=None, metadata={'page_label': '8', 'file_name': 'yolo.pdf', 'file_path': 'D:\\Gen_AI_Tutorials\\langchain\\llamaindex...

[0m[1;3;38;2;155;135;227m> Running module summarizer with input: 
query_str: assistant:  What is the purpose of using the You Only Look Once (YOLO) algorithm in object detection systems?

nodes:

In [119]:
print(str(response))

 The YOLO algorithm is used to provide a uniÔ¨Åed approach to object detection, where the neural network directly predicts bounding boxes and class probabilities from full images in real-time. This eliminates the need for complex pipelines with multiple components that must be trained separately, making it fast and efficient.



2. RAG Pipeline without Query Rewriting¬∂
Here we setup a RAG pipeline without the query rewriting step.

Here we need a way to link the input query to both the retriever, reranker, and summarizer. We can do this by defining a special InputComponent, allowing us to link the inputs to multiple downstream modules.

In [126]:
from llama_index.core.query_pipeline import InputComponent
retriever = index.as_retriever(similarity_top_k=5)
summarizer = TreeSummarize(llm=llm)
reranker = CohereRerank(api_key='x6JCRmE5AymCDcV5jyQjG6g54AXE2GsHX1jUH3T5')

In [127]:
p = QueryPipeline(verbose=True)
p.add_modules(
    {
        "input": InputComponent(),
        "retriever": retriever,
        "summarizer": summarizer,
    }
)
p.add_link("input", "retriever")
p.add_link("input", "summarizer", dest_key="query_str")
p.add_link("retriever", "summarizer", dest_key="nodes")

In [130]:
output = p.run(input="what is Detection?")

[1;3;38;2;155;135;227m> Running module input with input: 
input: what is Detection?

[0m[1;3;38;2;155;135;227m> Running module retriever with input: 
input: what is Detection?

[0m[1;3;38;2;155;135;227m> Running module summarizer with input: 
query_str: what is Detection?
nodes: [NodeWithScore(node=TextNode(id_='4f02ebde-c30d-4e40-b33f-cdb114b41f73', embedding=None, metadata={'page_label': '1', 'file_name': 'yolo.pdf', 'file_path': 'D:\\Gen_AI_Tutorials\\langchain\\llamaindex...

[0m

In [131]:
print(str(output))

 Detection is a computer vision task of finding objects in an image or video by analyzing visual data to identify them based on their size, shape, and location. It involves detecting, localizing, and classifying these objects using computer vision algorithms. One popular approach for object detection is YOLO (You Only Look Once), which uses a neural network architecture that predicts the location and scale of bounding boxes around each detected object in an image or video frame. The accuracy of object detection can be improved by using techniques such as data augmentation, non-maximal suppression, and adjusting the learning rate during training.

