In [27]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SimilarityPostprocessor
import chromadb

# Load chroma db index

In [4]:
# Embedding model passed as embed_model= to both indexes built below.
# NOTE(review): presumably this must be the same model that produced the vectors
# stored in ../chroma_db — confirm against the notebook that built the collections.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

In [20]:
# Open the on-disk Chroma database, wrap its "tables" collection in a
# LlamaIndex vector store, and build `tables_index` from that store.
# NOTE(review): get_or_create_collection presumably creates an empty collection
# when "tables" does not exist, so a wrong path would give an index that returns
# nothing instead of raising — confirm against the Chroma docs.
db2 = chromadb.PersistentClient(path="../chroma_db")
chroma_collection = db2.get_or_create_collection(name="tables")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
tables_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)

In [21]:
# Same steps for the "columns" collection, reusing the `db2` client.
# This rebinds `chroma_collection` and `vector_store`, so from here on both
# names refer to the "columns" collection rather than "tables".
chroma_collection = db2.get_or_create_collection(name="columns")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
columns_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)

# Vector Store Retriever from index

In [22]:
# Build a retriever over the columns index that asks for 10 results per query.
# NOTE(review): the recorded outputs in this notebook still show
# similarity_top_k == 2 and two retrieved nodes, so this cell appears to have
# been edited after its last run — re-run the notebook to refresh the outputs.
columns_retriever = columns_index.as_retriever(similarity_top_k=10)

In [15]:
# Check which concrete retriever class as_retriever() returned.
type(columns_retriever)

llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever

In [23]:
# Show the retriever's configured top-k.
# NOTE(review): the saved output is 2 although the retriever is created with
# similarity_top_k=10 — the output is stale; re-run to confirm the real value.
columns_retriever.similarity_top_k

2

In [24]:
# Retrieve nodes from the columns index for the query text "order".
# (The variable name is misspelled — "retreive" — but later cells reference it as-is.)
retreive_result = columns_retriever.retrieve("order")

In [34]:
# Number of nodes the retriever returned for the query.
len(retreive_result)

2

In [26]:
# Inspect the metadata dict attached to the first retrieved node.
retreive_result[0].metadata

{'schema_name': 'main',
 'table_name': 'orders',
 'column_name': 'status',
 'type': 'VARCHAR',
 'description': 'Orders can be one of the following statuses:\n\n| status         | description                                                                                                            |\n|----------------|------------------------------------------------------------------------------------------------------------------------|\n| placed         | The order has been placed but has not yet left the warehouse                                                           |\n| shipped        | The order has ben shipped to the customer and is currently in transit                                                  |\n| completed      | The order has been received by the customer                                                                            |\n| return_pending | The customer has indicated that they would like to return the order, but it has not yet been received at the warehou

# SimilarityPostprocessor

https://docs.llamaindex.ai/en/stable/api_reference/postprocessor/similarity/

In [32]:
# Postprocessor configured with a similarity cutoff of 0.5; it is applied to
# retrieved nodes via postprocess_nodes().
sp = SimilarityPostprocessor(similarity_cutoff=0.5)

In [35]:
# Count how many of the retrieved nodes remain after the 0.5 cutoff is applied.
len(sp.postprocess_nodes(retreive_result))

1

# Query Pipeline

https://docs.llamaindex.ai/en/stable/module_guides/querying/pipeline/

In [36]:
from llama_index.core.query_pipeline import QueryPipeline

In [37]:
# Chain the columns retriever and the similarity postprocessor into one pipeline.
# verbose=True makes each run print the modules it executes together with their input.
p = QueryPipeline(chain=[columns_retriever, sp], verbose=True)

In [39]:
# Run the pipeline end to end with the query text "customer".
qp_result = p.run("customer")

[1;3;38;2;155;135;227m> Running module 2c5b2cbb-f52f-495b-adc7-2e824bffa273 with input: 
input: customer

[0m[1;3;38;2;155;135;227m> Running module 02a6ff90-0887-4e3d-9660-fcf32ecdad47 with input: 
nodes: [NodeWithScore(node=TextNode(id_='fac2dfe2-7452-4723-83f1-7376f968f65a', embedding=None, metadata={'schema_name': 'main', 'table_name': 'customers', 'column_name': 'customer_id', 'type': 'INTEGER', 'd...

[0m

[NodeWithScore(node=TextNode(id_='fac2dfe2-7452-4723-83f1-7376f968f65a', embedding=None, metadata={'schema_name': 'main', 'table_name': 'customers', 'column_name': 'customer_id', 'type': 'INTEGER', 'description': 'This is a unique identifier for a customer', 'accepted_values': ''}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='24534956-699f-4589-81f0-e1faf03e5843', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'schema_name': 'main', 'table_name': 'customers', 'column_name': 'customer_id', 'type': 'INTEGER', 'description': 'This is a unique identifier for a customer', 'accepted_values': ''}, hash='2d7933472a2fdf928ddaeabf00612681859b6fb43793f1a2b75959b255cb730b'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='6349d21d-60ea-4b68-9e2b-13c17997db74', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0645676091599a715cf0a9a655258151f251ba07f7af3c57f74c984c6f3bc2ec')}, text='customer_id Th

In [43]:
# Display the pipeline's repr to see the modules it contains.
p

QueryPipeline(partial_dict={}, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x0000022CAC12DC90>, module_dict={'2c5b2cbb-f52f-495b-adc7-2e824bffa273': RetrieverComponent(partial_dict={}, retriever=<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever object at 0x0000022CAD084CA0>), '02a6ff90-0887-4e3d-9660-fcf32ecdad47': PostprocessorComponent(partial_dict={}, postprocessor=SimilarityPostprocessor(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x0000022CAC12DC90>, similarity_cutoff=0.5))}, dag=<networkx.classes.multidigraph.MultiDiGraph object at 0x0000022CAC12E500>, verbose=True, show_progress=False, num_workers=4)

# Generic find similar items and map metadata

In [59]:
from llama_index.core.retrievers import VectorIndexRetriever
# Confirm the retriever's class; VectorIndexRetriever is the type hint used by
# find_similar_items for its `retriever` parameter.
type(columns_retriever)

llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever

In [47]:
def find_similar_items(
    query: str,
    retriever: VectorIndexRetriever,
    similarity_cutoff: float,
    node_metadata_mapper
):
    """Retrieve nodes for ``query`` and map the metadata of those that pass a cutoff.

    Args:
        query: Text passed to ``retriever.retrieve``.
        retriever: Retriever that supplies the candidate nodes.
        similarity_cutoff: Threshold handed to ``SimilarityPostprocessor``.
        node_metadata_mapper: Callable invoked as
            ``node_metadata_mapper(**node.metadata)`` for each surviving node,
            so it must accept every metadata key as a keyword argument.

    Returns:
        A list with one mapped object per node kept by the postprocessor,
        in the order the postprocessor returned them.
    """
    candidates = retriever.retrieve(query)
    cutoff_filter = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)

    mapped = []
    for scored_node in cutoff_filter.postprocess_nodes(candidates):
        mapped.append(node_metadata_mapper(**scored_node.metadata))
    return mapped

In [60]:
from dataclasses import dataclass

@dataclass
class QueryColumnResult:
    """Column metadata record with one field per key of a retrieved node's metadata.

    The field names match the metadata keys shown in the retrieval outputs, so an
    instance can be built with ``QueryColumnResult(**node.metadata)``.
    """
    schema_name: str
    table_name: str
    column_name: str
    # Column data type as a string, e.g. 'INTEGER' or 'VARCHAR' in the recorded outputs.
    type: str
    # May be an empty string (seen for stg_customers.customer_id in the outputs).
    description: str
    # Empty string in every recorded output; format of non-empty values not shown here.
    accepted_values: str

In [61]:
# Query the columns index for "customer" with a 0.2 similarity cutoff and
# turn each surviving node's metadata into a QueryColumnResult.
r = find_similar_items(
    query="customer",
    retriever=columns_retriever,
    similarity_cutoff=0.2,
    node_metadata_mapper=QueryColumnResult,
)
r

[QueryColumnResult(schema_name='main', table_name='customers', column_name='customer_id', type='INTEGER', description='This is a unique identifier for a customer', accepted_values=''),
 QueryColumnResult(schema_name='main', table_name='stg_customers', column_name='customer_id', type='INTEGER', description='', accepted_values='')]

In [66]:
# Show the first mapped result on its own.
r[0]

QueryColumnResult(schema_name='main', table_name='customers', column_name='customer_id', type='INTEGER', description='This is a unique identifier for a customer', accepted_values='')

In [67]:
from pydantic import BaseModel

class ColumnVectorMetadata(BaseModel):
    """Pydantic model with the same fields as the QueryColumnResult dataclass.

    The field names match the metadata keys shown in the retrieval outputs, so an
    instance can be built with ``ColumnVectorMetadata(**node.metadata)``.
    """
    schema_name: str
    table_name: str
    column_name: str
    # Column data type as a string, e.g. 'INTEGER' or 'VARCHAR' in the recorded outputs.
    type: str
    # May be an empty string (seen for stg_customers.customer_id in the outputs).
    description: str
    # Empty string in every recorded output; format of non-empty values not shown here.
    accepted_values: str 

In [68]:
# Same query as before, but map each surviving node's metadata into the
# pydantic ColumnVectorMetadata model instead of the dataclass.
r = find_similar_items(
    query="customer",
    retriever=columns_retriever,
    similarity_cutoff=0.2,
    node_metadata_mapper=ColumnVectorMetadata,
)
r

[ColumnVectorMetadata(schema_name='main', table_name='customers', column_name='customer_id', type='INTEGER', description='This is a unique identifier for a customer', accepted_values=''),
 ColumnVectorMetadata(schema_name='main', table_name='stg_customers', column_name='customer_id', type='INTEGER', description='', accepted_values='')]

In [70]:
# Convert the first pydantic result to a plain dict.
# NOTE(review): on pydantic v2, .dict() is deprecated in favour of .model_dump();
# confirm the installed pydantic version before switching.
r[0].dict()

{'schema_name': 'main',
 'table_name': 'customers',
 'column_name': 'customer_id',
 'type': 'INTEGER',
 'description': 'This is a unique identifier for a customer',
 'accepted_values': ''}