In [1]:
import os
from dotenv import load_dotenv
    
load_dotenv('api.env')
HUGGINGFACE_API_KEY = os.environ['HUGGINGFACE_API_KEY']


In [2]:
from indox.embeddings import HuggingFaceEmbedding
embedding_function = HuggingFaceEmbedding(api_key=HUGGINGFACE_API_KEY,model="multi-qa-mpnet-base-cos-v1")

[32mINFO[0m: [1mInitialized HuggingFaceEmbedding with model: multi-qa-mpnet-base-cos-v1[0m


In [3]:
from indox.vector_stores import DuckDB
vector_store = DuckDB(
    embedding_function=embedding_function,
    vector_key="embedding",   
    id_key="id",              
    text_key="text",          
    table_name="embeddings"   
)


In [8]:
from indox.vector_stores import DuckDB

# Initialize the DuckDB vector store
vector_store = DuckDB(
    embedding_function=embedding_function,
    vector_key="embedding",   
    id_key="id",              
    text_key="text",          
    table_name="embeddings"   
)

# Example texts and metadata
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A stitch in time saves nine.",
    "To be or not to be, that is the question.",
    "All that glitters is not gold."
]

metadatas = [
    {"author": "John", "category": "proverb"},
    {"author": "Jane", "category": "proverb"},
    {"author": "Shakespeare", "category": "literature"},
    {"author": "Anonymous", "category": "proverb"}
]

# Add texts to the vector store
ids = vector_store.add(texts=texts, metadatas=metadatas)
print(f"Added {len(ids)} documents to the vector store.")

# Perform a similarity search
query = "A quick brown fox"
results = vector_store.similarity_search(query=query, k=2)

# Print the search results
print("\nSearch Results:")
for idx, doc in enumerate(results):
    print(f"Result {idx + 1}:")
    print(f"Text: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print()

# Demonstrate deletion
print("Deleting the first two documents...")
vector_store.delete(ids=ids[:2])

# Verify deletion by searching again
print("\nSearch Results after deletion:")
results_after_deletion = vector_store.similarity_search(query=query, k=2)
for idx, doc in enumerate(results_after_deletion):
    print(f"Result {idx + 1}:")
    print(f"Text: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print()

final_results = vector_store.similarity_search(query=query, k=2)
print(f"\nNumber of documents after deleting: {len(final_results)}")


[32mINFO[0m: [1mEmbedding documents[0m
[32mINFO[0m: [1mStarting to fetch embeddings for texts using model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 4 documents to the vector store.
[32mINFO[0m: [1mEmbedding documents[0m
[32mINFO[0m: [1mStarting to fetch embeddings for texts using model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-09-01 10:25:01,853 - indox.vector_stores.duckdb - INFO - Successfully deleted 2 records.



Search Results:
Result 1:
Text: The quick brown fox jumps over the lazy dog.
Metadata: {'metadata': {'author': 'John', 'category': 'proverb', 'Similarity Score': 0.7203798381933839}}

Result 2:
Text: A stitch in time saves nine.
Metadata: {'metadata': {'author': 'Jane', 'category': 'proverb', 'Similarity Score': 0.11260949593859887}}

Deleting the first two documents...

Search Results after deletion:
[32mINFO[0m: [1mEmbedding documents[0m
[32mINFO[0m: [1mStarting to fetch embeddings for texts using model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Result 1:
Text: All that glitters is not gold.
Metadata: {'metadata': {'author': 'Anonymous', 'category': 'proverb', 'Similarity Score': 0.053264947663911086}}

Result 2:
Text: To be or not to be, that is the question.
Metadata: {'metadata': {'author': 'Shakespeare', 'category': 'literature', 'Similarity Score': 0.023188454825013395}}

[32mINFO[0m: [1mEmbedding documents[0m
[32mINFO[0m: [1mStarting to fetch embeddings for texts using model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Number of documents after deleting: 2


In [9]:
vector_store.get_all_data()

[('7d359ad5-3f35-4cfb-b79e-d2d2c941798c',
  'To be or not to be, that is the question.',
  [0.049712978303432465,
   0.07388775795698166,
   -0.01546455267816782,
   0.03842025622725487,
   -0.007879802025854588,
   0.0019595599733293056,
   0.016548695042729378,
   -0.020007299259305,
   0.038717061281204224,
   0.004557646811008453,
   -0.031136803328990936,
   -0.04899581894278526,
   -0.06646940112113953,
   -0.034008875489234924,
   -0.03839413821697235,
   0.08508915454149246,
   0.040570128709077835,
   0.03892993927001953,
   0.04421188309788704,
   0.00559952249750495,
   -0.01097079273313284,
   0.03620843216776848,
   0.04578835144639015,
   -0.006336912512779236,
   0.017226465046405792,
   -0.04318435862660408,
   0.053065136075019836,
   -0.0018490358488634229,
   0.022808149456977844,
   0.07683956623077393,
   -0.030120033770799637,
   0.0537024587392807,
   -0.008463378995656967,
   -0.028037572279572487,
   0.007827416993677616,
   0.0020683833863586187,
   -0.0016405