In [None]:
! pip install milvus pymilvus

In [None]:
# Import necessary libraries
from pymilvus import connections, utility, Collection, FieldSchema, CollectionSchema, DataType
from milvus import default_server
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
# Specify where the data will be stored. These setters are called before
# start(); the same two paths are set again later when the server is restarted.
default_server.set_base_dir('milvus_data')
# presumably the write-ahead-log directory — confirm against the milvus package docs
default_server.set_wal_dir('milvus_wal')

# Start the embedded server, then register a pymilvus connection under the
# alias "default" using the port the server object reports via listen_port.
default_server.start()
connections.connect("default", host="127.0.0.1", port=default_server.listen_port)

In [None]:
# Collection parameters
collection_name = "document_collection"
dimension = 384  # Width of the embedding vectors; must match the embedder model's output size

# Define fields (columns)
fields = [
    # Primary key is generated by Milvus (auto_id=True); the insert cell supplies only the other three columns
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # Identifier of the owning user; used as a scalar filter during search
    FieldSchema(name="user_id", dtype=DataType.INT64),
    # Raw document text, capped at 500 by max_length
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=500),
    # Dense embedding of `content`
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dimension)
]

# Create schema
schema = CollectionSchema(fields, description="Collection for document search")

# The collection can outlive the kernel (this notebook later restarts the
# server and finds it again), so record whether it is already there before
# calling Collection(...); otherwise the message below would claim a creation
# that did not happen.
collection_existed = utility.has_collection(collection_name)

# Create the collection, or obtain a handle to the existing one
collection = Collection(name=collection_name, schema=schema)

if collection_existed:
    print(f"Collection '{collection_name}' already exists; reusing it.")
else:
    print(f"Collection '{collection_name}' created successfully.")

In [None]:
# Create index for efficient search
# Index parameters
index_params = {
    # Inverted-file index over uncompressed vectors
    "index_type": "IVF_FLAT",
    # Same metric the search cells pass in their search_params
    "metric_type": "L2",
    # nlist = number of clusters the vectors are partitioned into
    "params": {"nlist": 128}
}

# Create index for embedding field
collection.create_index(field_name="embedding", index_params=index_params)

print("Index created successfully.")

----------------------------------------------------------------
|index_type/metric_type | L2 | IP  | COSINE | HAMMING | JACCARD | TANIMOTO |
------------|----------|--------|---------|---------|----------|-
|IVF_FLAT	  | ✅ | ✅ | ✅    | ✅      | ✅      | ✅      |
|IVF_SQ8	  | ✅ | ✅ | ✅    | ❌      | ❌      | ❌      |
|IVF_PQ	    | ✅ | ✅ | ⚠️*   | ❌      | ❌      | ❌      |
|HNSW	      | ✅ | ✅ | ⚠️*   | ❌      | ❌      | ❌      |
|DISKANN	  | ✅ | ✅ | ⚠️**  | ❌      | ❌      | ❌      |
|FLAT	      | ✅ | ✅ | ✅    | ✅      | ✅      | ✅      |
|SCANN	    | ✅ | ✅ | ⚠️*   | ❌      | ❌      | ❌      |

The following sections summarize the index types and distance metrics available in Milvus.

---

### **Types of `index_type` in Milvus:**

1. **IVF_FLAT** (Inverted File with Flat)  
   - Suitable for Approximate Nearest Neighbor (ANN) search.  
   - Parameters: `nlist` (number of clusters).  
   - Requires setting `nprobe` during search.  

2. **IVF_SQ8** (Inverted File with Scalar Quantization)  
   - Similar to IVF_FLAT but with 8-bit compression for memory efficiency.  
   - Parameters: `nlist`.  

3. **IVF_PQ** (Inverted File with Product Quantization)  
   - Uses product quantization (PQ) for compression.  
   - Parameters: `nlist`, `m` (number of subspaces), `nbits` (number of bits used to encode each sub-vector).  

4. **HNSW** (Hierarchical Navigable Small World)  
   - A graph-based method for efficient search using small-world hierarchies.  
   - Parameters:  
     - `M` (number of connections per node in layers).  
     - `efConstruction` (number of candidates considered during construction).  

5. **DISKANN** (Graph-based, optimized for disk storage)  
   - Designed for very large datasets (e.g., billions of vectors).  
   - Parameters:  
     - `max_degree` (maximum degree of each node in the graph).  
     - `search_list_size` (search list size).  

6. **FLAT** (Exhaustive search without compression)  
   - Suitable for small datasets.  
   - No parameters (most accurate but slowest method).  

7. **SCANN** (Quantization-based)  
   - Parameters:  
     - `quantization_bit` (e.g., 4, 6, 8 bits).  
     - `nlist`.  

8. **AUTOINDEX** (Automatically optimized by Milvus).  

---

### **Types of `metric_type` (Distance Metrics):**
1. **`L2`** (Euclidean Distance)  
   - Measures straight-line distance between vectors.  

2. **`IP`** (Inner Product)  
   - Higher values indicate greater similarity.  

3. **`COSINE`** (Cosine Similarity)  
   - Compares the angle between vectors (normalized).  

4. **`HAMMING`** (Hamming Distance)  
   - For binary vectors (counts differing bits).  

5. **`JACCARD`** (Jaccard Index)  
   - Compares sets (useful for binary data).  

6. **`TANIMOTO`** (Tanimoto Coefficient)  
   - Similar to Jaccard but for specific use cases.  

---

### **Key Notes:**
- **IVF_FLAT** and **IVF_SQ8** are ideal for medium-dimensional data (e.g., 100–1000 dimensions).  
- **HNSW** offers high accuracy but higher memory usage.  
- **DISKANN** is optimized for massive-scale data (e.g., billions of vectors).  
- **COSINE** and **IP** often outperform **L2** for semantic search (e.g., NLP).  

In [14]:
# Sentence embedder used for both documents and queries (384-dimensional output)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample documents
documents = [
    {"user_id": 10, "content": "My name is mahdi"},
    {"user_id": 10, "content": "I like neural networks"},
    {"user_id": 10, "content": "I don't like math"},
    {"user_id": 15, "content": "My name is steve"},
    {"user_id": 15, "content": "I love machine learning"}
]

# Split the records into one list per column
user_ids = [doc["user_id"] for doc in documents]  # refers to the user id in the relational DB
contents = [doc["content"] for doc in documents]

# Encode every document text in a single batch
embeddings = model.encode(contents)

# Column-oriented payload in schema order; the auto-generated "id" column is omitted
data = [user_ids, contents, embeddings.tolist()]

# Insert data
insert_result = collection.insert(data)

print(f"Inserted {len(documents)} documents.")

Inserted 5 documents.


In [16]:
# `collection` is reused from the cells above. If it is not defined in the
# current session, re-acquire it first:
# collection_name = "document_collection"
# collection = Collection(collection_name)

# Load collection into memory
collection.load()

# Fetch rows without any filter; `limit` caps the result at 10 rows
results = collection.query(
    expr="",  # Empty expression: no filtering condition
    output_fields=["id", "user_id", "content", "embedding"],  # Fields returned per row, including the stored vector
    limit=10
)

# Show results
for i, res in enumerate(results):
  print(f"{i+1}. ID: {res['id']} || User ID: {res['user_id']} || Content: {res['content']} || Embedding: {np.array(res['embedding']).shape}...")  # Prints the embedding's shape, not its values


1. ID: 459948152356873207 || User ID: 10 || Content: My name is mahdi || Embedding: (384,)...
2. ID: 459948152356873208 || User ID: 10 || Content: I like neural networks || Embedding: (384,)...
3. ID: 459948152356873209 || User ID: 10 || Content: I don't like math || Embedding: (384,)...
4. ID: 459948152356873210 || User ID: 15 || Content: My name is steve || Embedding: (384,)...
5. ID: 459948152356873211 || User ID: 15 || Content: I love machine learning || Embedding: (384,)...


In [17]:
# Text to search for
search_text = "what do you like?"

# Embed the query with the same model that encoded the documents
search_embedding = model.encode([search_text])[0].tolist()

# Search parameters: same metric as the index, nprobe set to 10
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}

# Run the vector search for the single query embedding
results = collection.search(
    data=[search_embedding],
    anns_field="embedding",
    param=search_params,
    limit=3,  # number of results
    output_fields=["user_id", "content"],  # fields we want returned with each hit
)

# Show results: one Hits group per query vector, one hit per match
for hits in results:
    for hit in hits:
        print(
            f"ID: {hit.id}, Score: {hit.score}",
            f"user_id: {hit.entity.get('user_id')}",
            f"Content: {hit.entity.get('content')}\n",
            sep="\n",
        )

ID: 459948152356873208, Score: 1.2803442478179932
user_id: 10
Content: I like neural networks

ID: 459948152356873209, Score: 1.612396001815796
user_id: 10
Content: I don't like math

ID: 459948152356873211, Score: 1.6257460117340088
user_id: 15
Content: I love machine learning



In [20]:
# Text to search for
search_text = "what do you like?"

# Embed the query with the same model that encoded the documents
search_embedding = model.encode([search_text])[0].tolist()

# Search parameters: same metric as the index, nprobe set to 10
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}

# Run the vector search restricted to rows whose user_id is 15
results = collection.search(
    data=[search_embedding],
    anns_field="embedding",
    param=search_params,
    limit=1,  # number of results
    output_fields=["user_id", "content"],  # fields we want returned with each hit
    expr="user_id == 15",  # scalar filter expression
)

# Show results: one Hits group per query vector, one hit per match
for hits in results:
    for hit in hits:
        print(
            f"ID: {hit.id}, Score: {hit.score}",
            f"user_id: {hit.entity.get('user_id')}",
            f"Content: {hit.entity.get('content')}\n",
            sep="\n",
        )

ID: 459948152356873211, Score: 1.6257460117340088
user_id: 15
Content: I love machine learning



In [22]:
# Inspect the top hit of the most recent search. Index into `results`
# explicitly instead of relying on the `hits` loop variable left over from the
# previous cell: results[0] holds the hits for the single query vector.
print(results[0][0])

{'id': 459948152356873211, 'distance': 1.6257460117340088, 'entity': {'user_id': 15, 'content': 'I love machine learning'}}


In [27]:
# Disconnect and stop DB server.
# Drop the client connection registered under the "default" alias first,
# then shut down the embedded server it was talking to.
connections.disconnect("default")
default_server.stop()

In [28]:
# Set the previous locations: the same two directories used for the first
# start, so the restarted server works on the data written earlier.
default_server.set_base_dir('milvus_data')
default_server.set_wal_dir('milvus_wal')

# Run the server
default_server.start()

# Connect to the server under the "default" alias, on the port the server
# object reports via listen_port
connections.connect("default", host="127.0.0.1", port=default_server.listen_port)

In [29]:
from pymilvus import utility

# Name of the collection created before the server was restarted
collection_name = "document_collection"

# Report whether the restarted server still knows the collection
if not utility.has_collection(collection_name):
    print(f"Collection '{collection_name}' does not exist!")
else:
    print(f"Collection '{collection_name}' exists and will be loaded.")

Collection 'document_collection' exists and will be loaded.


In [30]:
from pymilvus import Collection

# Get a handle to the existing collection by name. No schema is passed here,
# unlike the cell that first created it.
collection = Collection(collection_name)

# Load collection into memory
collection.load()

print(f"Collection '{collection_name}' loaded successfully.")

Collection 'document_collection' loaded successfully.


In [31]:
# Fetch rows without any filter; `limit` caps the result at 10 rows.
# This repeats the earlier query to show what the restarted server returns.
results = collection.query(
    expr="",  # Empty expression: no filtering condition
    output_fields=["id", "user_id", "content", "embedding"],  # Fields returned per row, including the stored vector
    limit=10
)

# Show results
for i, res in enumerate(results):
  print(f"{i+1}. ID: {res['id']} || User ID: {res['user_id']} || Content: {res['content']} || Embedding: {np.array(res['embedding']).shape}...")  # Prints the embedding's shape, not its values


1. ID: 459948152356873207 || User ID: 10 || Content: My name is mahdi || Embedding: (384,)...
2. ID: 459948152356873208 || User ID: 10 || Content: I like neural networks || Embedding: (384,)...
3. ID: 459948152356873209 || User ID: 10 || Content: I don't like math || Embedding: (384,)...
4. ID: 459948152356873210 || User ID: 15 || Content: My name is steve || Embedding: (384,)...
5. ID: 459948152356873211 || User ID: 15 || Content: I love machine learning || Embedding: (384,)...


In [None]:
# Delete all data in the collection (keeps the collection structure)
# The predicate targets the primary key "id"; it matches every row as long as
# the auto-generated ids are non-negative.
# NOTE(review): some older Milvus releases reportedly accept only `id in [...]`
# expressions for delete — confirm the installed server supports range predicates.
delete_expr = "id >= 0"  # Or any condition that matches all entities
collection.delete(expr=delete_expr)

# The message is printed unconditionally; the result of delete() is not inspected
print("All data has been deleted from the collection.")