LLM to infer the right query parameters for a vector database - including both the query string and metadata filter.

## AutoRetrieval

In [1]:
!pip install llama-index pinecone-client chromadb

Collecting llama-index
  Downloading llama_index-0.9.24-py3-none-any.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinecone-client
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.21-py3-none-any.whl (508 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.6/508.6 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting beautifulsoup4<5.0.0,>=4.12.2 (from llama-index)
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from llama-index)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting de

In [13]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import pinecone
import openai
import os
import getpass
from google.colab import userdata
from tqdm.autonotebook import tqdm

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
os.environ["PINECONE_API_KEY"] = userdata.get('PINECONE_API_KEY')

openai_api_key = os.environ["OPENAI_API_KEY"]
pinecone_api_key = os.environ["PINECONE_API_KEY"]

pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")

  from tqdm.autonotebook import tqdm


In [3]:
# try to create index
# dimensions are for text-embedding-ada-002
try:
    pinecone.create_index(
        "quickstart-index", dimension=1536, metric="cosine"
    )
except Exception:
    # most likely index already exists
    pass

In [4]:
pinecone_index = pinecone.Index("quickstart-index")

In [5]:
from llama_index import VectorStoreIndex, StorageContext
from llama_index.vector_stores import PineconeVectorStore

In [6]:
from llama_index.schema import TextNode

nodes = [
    TextNode(
        text=(
            "Michael Jordan is a retired professional basketball player,"
            " widely regarded as one of the greatest basketball players of all"
            " time."
        ),
        metadata={
            "category": "Sports",
            "country": "United States",
            "gender": "male",
            "born": 1963,
        },
    ),
    TextNode(
        text=(
            "Angelina Jolie is an American actress, filmmaker, and"
            " humanitarian. She has received numerous awards for her acting"
            " and is known for her philanthropic work."
        ),
        metadata={
            "category": "Entertainment",
            "country": "United States",
            "gender": "female",
            "born": 1975,
        },
    ),
    TextNode(
        text=(
            "Elon Musk is a business magnate, industrial designer, and"
            " engineer. He is the founder, CEO, and lead designer of SpaceX,"
            " Tesla, Inc., Neuralink, and The Boring Company."
        ),
        metadata={
            "category": "Business",
            "country": "United States",
            "gender": "male",
            "born": 1971,
        },
    ),
    TextNode(
        text=(
            "Rihanna is a Barbadian singer, actress, and businesswoman. She"
            " has achieved significant success in the music industry and is"
            " known for her versatile musical style."
        ),
        metadata={
            "category": "Music",
            "country": "Barbados",
            "gender": "female",
            "born": 1988,
        },
    ),
    TextNode(
        text=(
            "Cristiano Ronaldo is a Portuguese professional footballer who is"
            " considered one of the greatest football players of all time. He"
            " has won numerous awards and set multiple records during his"
            " career."
        ),
        metadata={
            "category": "Sports",
            "country": "Portugal",
            "gender": "male",
            "born": 1985,
        },
    ),
]

In [None]:
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index, namespace="test"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, storage_context=storage_context)

Upserted vectors:   0%|          | 0/5 [00:00<?, ?it/s]

## Funtion Tool for AutoRetrieval

In [None]:
# define function tool
from llama_index.tools import FunctionTool
from llama_index.vector_stores.types import (
    VectorStoreInfo,
    MetadataInfo,
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
    FilterOperator,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine

from typing import List, Tuple, Any
from pydantic import BaseModel, Field

# hardcode top k for now
top_k = 3

# define vector store info describing schema of vector store
vector_store_info = VectorStoreInfo(
    content_info="brief biography of celebrities",
    metadata_info=[
        MetadataInfo(
            name="category",
            type="str",
            description=(
                "Category of the celebrity, one of [Sports, Entertainment,"
                " Business, Music]"
            ),
        ),
        MetadataInfo(
            name="country",
            type="str",
            description=(
                "Country of the celebrity, one of [United States, Barbados,"
                " Portugal]"
            ),
        ),
        MetadataInfo(
            name="gender",
            type="str",
            description=("Gender of the celebrity, one of [male, female]"),
        ),
        MetadataInfo(
            name="born",
            type="int",
            description=("Born year of the celebrity, could be any integer"),
        ),
    ],
)

In [None]:
vector_store_info.json()

'{"metadata_info": [{"name": "category", "type": "str", "description": "Category of the celebrity, one of [Sports, Entertainment, Business, Music]"}, {"name": "country", "type": "str", "description": "Country of the celebrity, one of [United States, Barbados, Portugal]"}, {"name": "gender", "type": "str", "description": "Gender of the celebrity, one of [male, female]"}, {"name": "born", "type": "int", "description": "Born year of the celebrity, could be any integer"}], "content_info": "brief biography of celebrities"}'

In [None]:
# define pydantic model for auto-retrieval function
class AutoRetrieveModel(BaseModel):
    query: str = Field(..., description="natural language query string")
    filter_key_list: List[str] = Field(
        ..., description="List of metadata filter field names"
    )
    filter_value_list: List[Any] = Field(
        ...,
        description=(
            "List of metadata filter field values (corresponding to names"
            " specified in filter_key_list)"
        ),
    )
    filter_operator_list: List[str] = Field(
        ...,
        description=(
            "Metadata filters conditions (could be one of <, <=, >, >=, ==, !=)"
        ),
    )
    filter_condition: str = Field(
        ...,
        description=("Metadata filters condition values (could be AND or OR)"),
    )


description = f"""\
Use this tool to look up biographical information about celebrities.
The vector database schema is given below:
{vector_store_info.json()}
"""

def auto_retrieve_fn(
    query: str,
    filter_key_list: List[str],
    filter_value_list: List[any],
    filter_operator_list: List[str],
    filter_condition: str,
):
    """Auto retrieval function.

    Performs auto-retrieval from a vector database, and then applies a set of filters.

    """
    query = query or "Query"

    metadata_filters = [
        MetadataFilter(key=k, value=v, operator=op)
        for k, v, op in zip(
            filter_key_list, filter_value_list, filter_operator_list
        )
    ]
    retriever = VectorIndexRetriever(
        index,
        filters=MetadataFilters(
            filters=metadata_filters, condition=filter_condition
        ),
        top_k=top_k,
    )
    query_engine = RetrieverQueryEngine.from_args(retriever)

    response = query_engine.query(query)
    return str(response)


auto_retrieve_tool = FunctionTool.from_defaults(
    fn=auto_retrieve_fn,
    name="celebrity_bios",
    description=description,
    fn_schema=AutoRetrieveModel,
)

## Initialize Agent

In [None]:
from llama_index.agent import OpenAIAgent
from llama_index.llms import OpenAI

agent = OpenAIAgent.from_tools(
    [auto_retrieve_tool],
    llm=OpenAI(api_key=openai_api_key , temperature=0, model="gpt-4-0613"),
    verbose=True,
)

In [None]:
response = agent.chat(
    "Tell me about few celebrities under category business and born after 1950. "
)
print(str(response))

Added user message to memory: Tell me about few celebrities under category business and born after 1950. 
=== Calling Function ===
Calling function: celebrity_bios with args: {
  "query": "celebrities",
  "filter_key_list": ["category", "born"],
  "filter_value_list": ["Business", 1950],
  "filter_operator_list": ["==", ">"],
  "filter_condition": "and"
}
Got output: Elon Musk is a well-known figure in the business world. He has gained fame for his roles as the founder, CEO, and lead designer of several successful companies, including SpaceX, Tesla, Inc., Neuralink, and The Boring Company.

Elon Musk is a prominent business celebrity who was born after 1950.

Elon Musk, born in 1971, is a business magnate and investor. He is the founder, CEO, CTO, and chief designer of SpaceX; early investor, CEO and product architect of Tesla, Inc.; founder of The


In [None]:
response = agent.chat("Tell me about two celebrities born after 1980. ")
print(str(response))

Added user message to memory: Tell me about two celebrities born after 1980. 
=== Calling Function ===
Calling function: celebrity_bios with args: {
  "query": "celebrities",
  "filter_key_list": ["born"],
  "filter_value_list": [1980],
  "filter_operator_list": [">"],
  "filter_condition": "and"
}
Got output: Rihanna and Cristiano Ronaldo are both well-known celebrities in their respective fields. Rihanna is a successful singer, actress, and businesswoman from Barbados, while Cristiano Ronaldo is a highly accomplished professional footballer from Portugal.

Rihanna and Cristiano Ronaldo are two celebrities who were born after 1980.

Rihanna, born in 1988, is a Barbadian singer, actress, and businesswoman. Known for her distinctive and versatile voice and for her fashionable appearance, she has achieved 14 number-ones and 31 top-ten singles in the US and UK respectively and has won nine Grammy Awards. Her successful business ventures include the luxury fashion house Fenty and the cosme

## Using VectorIndexAutoRetriever in llama-index

VectorStore using Chromadb

In [14]:
from llama_index.indices.vector_store.retrievers import (
    VectorIndexAutoRetriever,
)
from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo

In [15]:
import chromadb
from llama_index import VectorStoreIndex, StorageContext
from llama_index.vector_stores import ChromaVectorStore

# define vector store info describing schema of vector store
vector_store_info = VectorStoreInfo(
    content_info="brief biography of celebrities",
    metadata_info=[
        MetadataInfo(
            name="category",
            type="str",
            description=(
                "Category of the celebrity, one of [Sports, Entertainment,"
                " Business, Music]"
            ),
        ),
        MetadataInfo(
            name="country",
            type="str",
            description=(
                "Country of the celebrity, one of [United States, Barbados,"
                " Portugal]"
            ),
        ),
        MetadataInfo(
            name="gender",
            type="str",
            description=("Gender of the celebrity, one of [male, female]"),
        ),
        MetadataInfo(
            name="born",
            type="int",
            description=("Born year of the celebrity, could be any integer"),
        ),
    ],
)

chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.create_collection("quickstart")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, storage_context=storage_context)

UniqueConstraintError: ignored

In [16]:
retriever = VectorIndexAutoRetriever(
    index, vector_store_info=vector_store_info
)

In [17]:
retriever.retrieve("Tell me about Sports celebrities from United States")

[NodeWithScore(node=TextNode(id_='076790eb-0f48-4a43-bbea-55df0aca3254', embedding=None, metadata={'category': 'Sports', 'country': 'United States', 'gender': 'male', 'born': 1963}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='85f79916d68fb985978decd2e6650904696b36f01e58c9c635702be23f14abae', text='Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.6560070899153513)]