In [1]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


# Index metadata

https://docs.llamaindex.ai/en/stable/understanding/loading/loading/

https://docs.llamaindex.ai/en/stable/examples/vector_stores/ChromaIndexDemo/

https://docs.llamaindex.ai/en/stable/understanding/querying/querying/

https://docs.llamaindex.ai/en/stable/module_guides/querying/retriever/

In [2]:
# In-memory Chroma client used for the first, throwaway experiment.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises if a collection named "quickstart" already exists. It also matches
# how the persistent collection is opened later in the notebook.
chroma_collection = chroma_client.get_or_create_collection("quickstart")

In [3]:
# Hugging Face model used to embed both the documents and the query text.
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [22]:
# One entry per table: (table name, description, comma-separated column names).
_TABLE_SPECS = [
    (
        "main.customers",
        "This table has basic information about a customer, as well as some derived facts based on a customer's orders",
        'customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount',
    ),
    (
        "main.orders",
        'This table has basic information about orders, as well as some derived facts based on payments',
        'order_id,customer_id,order_date,status,amount,credit_card_amount,coupon_amount,bank_transfer_amount,gift_card_amount',
    ),
    ("main.stg_customers", '', 'customer_id'),
    ("main.stg_orders", '', 'order_id,status'),
    ("main.stg_payments", '', 'payment_id,payment_method'),
]

# The table name is the document text; description and columns travel as metadata.
documents = [
    Document(text=table_name, metadata={'description': description, 'columns': columns})
    for table_name, description, columns in _TABLE_SPECS
]

In [5]:
# Wrap the Chroma collection as a LlamaIndex vector store, then build an index
# over `documents` using the Hugging Face embedding model.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    embed_model=embed_model,
)

In [6]:
from llama_index.core.vector_stores.types import VectorStoreQuery

# `aquery` is a coroutine function: calling it without `await` only creates a
# coroutine object and never runs the search. Use the synchronous `query`
# instead, and pass the query embedding explicitly, because the vector store
# searches by `query_embedding` and has no embedding model of its own to turn
# `query_str` into a vector.
q = index.vector_store.query(
    VectorStoreQuery(
        query_embedding=embed_model.get_query_embedding("customer"),
        query_str="customer",
    )
)

In [7]:
# Embed the query text directly to inspect the raw vector (a list of floats)
# that the embedding model produces for "customer".
embed_model.get_query_embedding("customer")

[0.003964375238865614,
 0.006858022417873144,
 0.0014693266712129116,
 0.0014276608126237988,
 0.07514078915119171,
 0.09294776618480682,
 0.037378840148448944,
 0.04471326991915703,
 -0.006537733133882284,
 0.017908476293087006,
 0.0015602126950398088,
 0.011060351505875587,
 -0.03747740387916565,
 0.006761063821613789,
 -0.013400522992014885,
 0.026210986077785492,
 0.02687446027994156,
 0.01818738505244255,
 0.03304744139313698,
 -0.014799610711634159,
 -0.013311916962265968,
 -0.014155020006000996,
 -0.005243597086519003,
 -0.010518299415707588,
 0.011649603955447674,
 -0.016100367531180382,
 -0.046130768954753876,
 0.05982939898967743,
 -0.06462674587965012,
 -0.00597340427339077,
 0.04159492626786232,
 0.004728162195533514,
 0.013675469905138016,
 -0.002000713488087058,
 -0.003742068773135543,
 -0.028672872111201286,
 0.03857677802443504,
 0.03329417482018471,
 -0.03155684098601341,
 -0.02720634825527668,
 -0.011513296514749527,
 0.021038124337792397,
 0.03513500839471817,
 0.016

# Save to disk

In [23]:
# Open (or create) an on-disk Chroma database under ./chroma_db and expose its
# "quickstart" collection to LlamaIndex as a vector store.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection(name="quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [24]:
# Build the index through the current `vector_store`, so the embedded documents
# are written into the collection that backs it.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    embed_model=embed_model,
)

# Load from disk

In [25]:
# Reopen the on-disk database with a fresh client and rebuild the index object
# from the stored vectors; no documents are re-embedded here.
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection(name="quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)

# Query

In [26]:
# Number of records currently stored in the collection.
chroma_collection.count()

5

In [38]:
# Query Chroma directly (bypassing LlamaIndex) with the embedded query text.
# Reference: https://docs.trychroma.com/getting-started
customer_query_embedding = embed_model.get_query_embedding("customer")
# n_results is larger than the collection here; Chroma logs a notice and
# lowers it to the number of stored elements.
query_response = chroma_collection.query(
    query_embeddings=[customer_query_embedding],
    n_results=10,
)

Number of requested results 10 is greater than number of elements in index 5, updating n_results = 5


In [39]:
# Display the raw Chroma query result (ids, distances, metadatas, ...).
query_response

{'ids': [['79a02866-88ad-43e8-9323-390c0a3c6cfd',
   'd35b0ab0-9689-4bf1-8196-a0046d9461e3',
   '374752db-e000-44fe-bd9b-97ba4287ae1c',
   '17d0f358-9a38-4f9e-9153-530e50929d28',
   '8092991d-be78-4864-9122-c4790da5b107']],
 'distances': [[0.5736563406154249,
   0.6355035911011129,
   0.8053954064551604,
   0.835269822567233,
   0.8848529236901392]],
 'metadatas': [[{'_node_content': '{"id_": "79a02866-88ad-43e8-9323-390c0a3c6cfd", "embedding": null, "metadata": {"description": "", "columns": "customer_id"}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "e5fb96e7-11b4-4e86-8494-5f19357dfd81", "node_type": "4", "metadata": {"description": "", "columns": "customer_id"}, "hash": "0c08895b73233902cae936b21232cb03eea08d9145f2a379adaa7e090d205ace", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "374752db-e000-44fe-bd9b-97ba4287ae1c", "node_type": "1", "metadata": {"description": "This table has basic information about orders, as 

In [40]:
# List the top-level fields of the Chroma query result.
query_response.keys()

dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents', 'uris', 'data'])

In [41]:
# Distances of the returned hits; a nested list, here with a single inner list
# for the single query embedding that was sent.
query_response['distances']

[[0.5736563406154249,
  0.6355035911011129,
  0.8053954064551604,
  0.835269822567233,
  0.8848529236901392]]

In [30]:
# Second hit of the first (only) query: `_node_content` holds the stored node
# serialized as a JSON string.
query_response['metadatas'][0][1]['_node_content']

'{"id_": "d35b0ab0-9689-4bf1-8196-a0046d9461e3", "embedding": null, "metadata": {"description": "This table has basic information about a customer, as well as some derived facts based on a customer\'s orders", "columns": "customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount"}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "bb24437f-c80c-4dc6-9e72-8c9986a60d11", "node_type": "4", "metadata": {"description": "This table has basic information about a customer, as well as some derived facts based on a customer\'s orders", "columns": "customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount"}, "hash": "8fab84d152d6deede018715ffdbe8949e776d9513f9e2a341f2c5d1b625d57d5", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "374752db-e000-44fe-bd9b-97ba4287ae1c", "node_type": "1", "metadata": {}, "hash": "01823a8bc297dd9b12d343b856d79dba085ef7f4b5e7eea

In [31]:
import json

In [33]:
# Decode the JSON-serialized node stored with the second hit of the first query.
second_hit_metadata = query_response['metadatas'][0][1]
nc = json.loads(second_hit_metadata['_node_content'])
nc

{'id_': 'd35b0ab0-9689-4bf1-8196-a0046d9461e3',
 'embedding': None,
 'metadata': {'description': "This table has basic information about a customer, as well as some derived facts based on a customer's orders",
  'columns': 'customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {'1': {'node_id': 'bb24437f-c80c-4dc6-9e72-8c9986a60d11',
   'node_type': '4',
   'metadata': {'description': "This table has basic information about a customer, as well as some derived facts based on a customer's orders",
    'columns': 'customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount'},
   'hash': '8fab84d152d6deede018715ffdbe8949e776d9513f9e2a341f2c5d1b625d57d5',
   'class_name': 'RelatedNodeInfo'},
  '3': {'node_id': '374752db-e000-44fe-bd9b-97ba4287ae1c',
   'node_type': '1',
   'metadata': {},
   'hash': '01823a8bc297dd9b

In [34]:
# Metadata stored on the decoded node: the table description and its columns.
nc['metadata']

{'description': "This table has basic information about a customer, as well as some derived facts based on a customer's orders",
 'columns': 'customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount'}

In [37]:
# Collect the `columns` metadata string of every hit returned for the first
# (only) query, in the order the hits were returned.
# A comprehension is used instead of a loop so this cell does not rebind the
# notebook-level `nc` (or leak loop variables); re-running the cells that
# inspect `nc` afterwards then still shows the same node.
result = [
    json.loads(hit_metadata['_node_content'])['metadata']['columns']
    for hit_metadata in query_response['metadatas'][0]
]

result

['customer_id',
 'customer_id,first_name,last_name,first_order,most_recent_order,number_of_orders,total_order_amount']

In [42]:
# Scratch check of a standard-library constant; no other cell shown here uses it.
import string
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'