# Import Library

In [17]:
from pymilvus import MilvusClient
from pymilvus import model
import numpy as np

# Set Up Vector Database

In [2]:
client = MilvusClient("milvus_demo.db")

# Create a Collection For Text Data

In [3]:
# Create a collection
if client.has_collection(collection_name="text_collection"):
    client.drop_collection(collection_name="text_collection")
    
client.create_collection(
    collection_name="text_collection",
    dimension=768,  # The vectors we will use in this demo has 768 dimensions
)

- The primary key and vector fields use their default names (“id” and “vector”).
- The metric type (vector distance definition) is set to its default value (COSINE).

In [4]:
# Text strings to search from.
docs = [
    "The printing press, invented by Johannes Gutenberg around 1440, revolutionized knowledge sharing.",
    "Isaac Newton formulated the laws of motion and universal gravitation in the 17th century.",
    "The Great Wall of China stretches over 13,000 miles across northern China.",
    "Marie Curie was the first woman to win a Nobel Prize, awarded for her research on radioactivity.",
    "The internet began as ARPANET, a project funded by the U.S. Department of Defense in the late 1960s.",
    "Mount Everest, located in the Himalayas, is the highest peak on Earth at 8,849 meters.",
    "The theory of evolution by natural selection was introduced by Charles Darwin in 1859.",
    "Ada Lovelace is often regarded as the world’s first computer programmer.",
    "The Renaissance period marked a cultural rebirth in Europe between the 14th and 17th centuries.",
    "Python, created by Guido van Rossum in 1991, is now one of the most popular programming languages.",
]

In [5]:
## Prepare Data

# This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()

vectors = embedding_fn.encode_documents(docs)
# The output vector has 768 dimensions, matching the collection that we just created.
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

Dim: 768 (768,)


In [6]:
# Each entity has id, vector representation, raw text, and a subject label that we use
# to demo metadata filtering later.
data = [
    {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Data has 10 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [8]:
## Insert Data
res = client.insert(collection_name="text_collection", data=data)

print(res)

{'insert_count': 10, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}


## Test Vector search

In [9]:
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])

res = client.search(
    collection_name="text_collection",  # target collection
    data=query_vectors,  # query vectors
    limit=2,  # number of returned entities
    output_fields=["text", "subject"],  # specifies fields to be returned
)


In [13]:
for item in res[0]:
    print(item)

{'id': 7, 'distance': 0.21363919973373413, 'entity': {'text': 'Ada Lovelace is often regarded as the world’s first computer programmer.', 'subject': 'history'}}
{'id': 9, 'distance': 0.18234293162822723, 'entity': {'text': 'Python, created by Guido van Rossum in 1991, is now one of the most popular programming languages.', 'subject': 'history'}}


# Create a Collection For Random Vector

In [None]:
# config
DIM = 512
TOTAL_VECS = 100
TOPK = 3

In [19]:
# Create a collection
if client.has_collection(collection_name="random_num_collection"):
    client.drop_collection(collection_name="random_num_collection")
    
client.create_collection(
    collection_name="random_num_collection",
    dimension=DIM, 
)

In [21]:
# create random vector
xb = np.random.random((TOTAL_VECS, DIM))

data = [
    {"id": i, "vector": xb[i]}
    for i in range(TOTAL_VECS)
]

## Insert Data
res = client.insert(collection_name="random_num_collection", data=data)

print(res)

{'insert_count': 100, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]}


## Test Vector search

In [None]:
query_vectors = np.random.random((1, DIM))

res = client.search(
    collection_name="random_num_collection",  # target collection
    data=query_vectors,  # query vectors
    limit=TOPK,  # number of returned entities
    output_fields=["vector"]
)

for item in res[0]:
    print(item)

{'id': 97, 'distance': 0.8012682795524597, 'entity': {'vector': [0.46251943707466125, 0.3257023096084595, 0.11331310868263245, 0.284621000289917, 0.8703111410140991, 0.31545624136924744, 0.013413814827799797, 0.28054627776145935, 0.6641616225242615, 0.6051294803619385, 0.8120085000991821, 0.28454655408859253, 0.9972850680351257, 0.8659741878509521, 0.5861436128616333, 0.8700335621833801, 0.49931544065475464, 0.1093398928642273, 0.3795395791530609, 0.6892043352127075, 0.5062400698661804, 0.6446367502212524, 0.5016446113586426, 0.8870490193367004, 0.6711984276771545, 0.6599869132041931, 0.23918119072914124, 0.42805808782577515, 0.20708368718624115, 0.52486652135849, 0.20125654339790344, 0.5833305716514587, 0.7232836484909058, 0.19374099373817444, 0.1620689183473587, 0.5527801513671875, 0.7947288155555725, 0.5724526643753052, 0.7602733373641968, 0.504323422908783, 0.935041606426239, 0.44292423129081726, 0.6594735383987427, 0.08631635457277298, 0.9195895195007324, 0.4874807298183441, 0.331