In [3]:
import os
from tidb_vector.integrations import TiDBVectorClient
from dotenv import load_dotenv
load_dotenv()
# Read the connection string from the environment (populated from the .env
# file by the load_dotenv() call above).
DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # os.getenv returns None when the variable is missing; fail here with an
    # actionable message instead of handing None to the client.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in the .env file."
    )

vector_store = TiDBVectorClient(
    # The 'embedded_documents' table will store the vector data.
    table_name='embedded_documents',
    # The connection string to the TiDB cluster.
    connection_string=DATABASE_URL,
    # The dimension of the vector generated by the embedding model.
    # 512 matches the "Embedding shape: (512,)" reported for clip-ViT-B-32 in
    # the model cell; update this value if a different model is used.
    vector_dimension=512,
    # Recreate the table if it already exists.
    # NOTE: re-running this cell therefore starts from an empty table.
    drop_existing_table=True,
)



In [4]:
from sentence_transformers import SentenceTransformer


# Alternative model: good at similarity-based retrieval, e.g. retrieving
# "fish" for the query "a swimming animal".
# model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5")


# CLIP model: same expected output as the model above, but it can also
# encode images.
MODEL_NAME = "sentence-transformers/clip-ViT-B-32"
model = SentenceTransformer(MODEL_NAME)

# Smoke test: encode one sentence and show the shape of the resulting vector.
emb = model.encode("test sentence")
print("Embedding shape:", emb.shape)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Embedding shape: (512,)


In [5]:
# embed_model_dims = model.get_sentence_embedding_dimension()

In [6]:
# print(embed_model_dims)

In [7]:
def text_to_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return it as a list.

    Args:
        text: The input passed unchanged to ``model.encode``.

    Returns:
        The result of ``model.encode(text)`` converted with ``.tolist()``,
        i.e. plain Python values rather than the array object ``encode``
        returns.
    """
    return model.encode(text).tolist()

In [8]:
# Seed data as (id, text, category) rows; the embedding of each row is
# computed from its text when the record is built below.
seed_rows = [
    ("f8e7dee2-63b6-42f1-8b60-2d46710c1971", "dog", "animal"),
    ("8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6", "fish", "animal"),
    ("e4991349-d00b-485c-a481-f61695f2b5ae", "tree", "plant"),
]

# One record per row, with the same keys the insert call reads back out.
documents = [
    {
        "id": row_id,
        "text": row_text,
        "embedding": text_to_embedding(row_text),
        "metadata": {"category": row_category},
    }
    for row_id, row_text, row_category in seed_rows
]

# Column-wise insert: each argument is a list aligned by document position.
# Left as the cell's last expression so the notebook displays its return value.
vector_store.insert(
    ids=[record["id"] for record in documents],
    texts=[record["text"] for record in documents],
    embeddings=[record["embedding"] for record in documents],
    metadatas=[record["metadata"] for record in documents],
)

['f8e7dee2-63b6-42f1-8b60-2d46710c1971',
 '8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6',
 'e4991349-d00b-485c-a481-f61695f2b5ae']

In [9]:
def print_result(query, result):
    """Print a header naming the query, then one line per search hit.

    Args:
        query: The query, interpolated into the header line.
        result: An iterable of hits; each hit must expose ``document`` and
            ``distance`` attributes, which are printed as-is.
    """
    print(f'Search result ("{query}"):')
    for hit in result:
        print(f'- text: "{hit.document}", distance: {hit.distance}')

# Number of results requested from the vector store.
TOP_K = 3

# Embed the query text with the same helper used for the stored documents,
# query the store with that vector, and print what comes back.
query = "a swimming animal"
query_embedding = text_to_embedding(query)
search_result = vector_store.query(query_embedding, k=TOP_K)
print_result(query, search_result)

Search result ("a swimming animal"):
- text: "fish", distance: 0.15278224362572568
- text: "dog", distance: 0.18056237493378835
- text: "tree", distance: 0.24404429157662788
