Skip to content

Commit

Permalink
Change examples (#50)
Browse files Browse the repository at this point in the history
Signed-off-by: godchen <qingxiang.chen@zilliz.com>
  • Loading branch information
godchen0212 committed Apr 22, 2021
1 parent 1c58ce5 commit 99ebc76
Showing 1 changed file with 114 additions and 226 deletions.
340 changes: 114 additions & 226 deletions examples/example.py
Original file line number Diff line number Diff line change
@@ -1,232 +1,120 @@
#This program demos how to connect to Milvus vector database,
# create a vector collection,
# insert 10 vectors,
# and execute a vector similarity search.

import random
from pprint import pprint
import numpy as np

from milvus import Milvus, DataType

# ------
# Setup:
# First of all, you need a running Milvus(0.11.x). By default, Milvus runs on localhost on port 19530.
# Then, you can use pymilvus(0.3.x) to connect to the server. You can change the _HOST and _PORT accordingly.
# ------
# Milvus server IP address and port.
# You may need to change _HOST and _PORT accordingly.
_HOST = '127.0.0.1'
_PORT = '19530'
# Client used by every call in the rest of this script.
client = Milvus(_HOST, _PORT)

# ------
# Basic create collection:
#     With a Milvus instance running and pymilvus connected to it, the first step is to
#     create a collection named `demo_films`. In case a collection with that name already
#     exists, it is dropped first so that the demo starts from a clean state.
# ------
collection_name = 'demo_films'
if collection_name in client.list_collections():
    client.drop_collection(collection_name)

# ------
# Basic create collection:
# `auto_id` in the parameter is set to false so that we can provide our own unique ids.
# `embedding` in the `fields` is float vector with dimension of 8.
# For all fields, you can provide custom infos in "params" like {"unit": "minute"}
# For FLOAT_VECTOR and BINARY_VECTOR, "dim" is a must in "params".
# For more information you can refer to the pymilvus
# documentation (https://milvus-io.github.io/milvus-sdk-python/pythondoc/v0.3.0/index.html).
# ------
collection_param = {
    "fields": [
        # No string field (e.g. a `film_name`) is declared: Milvus doesn't support the
        # string type yet.
        {
            "name": "duration",
            "type": DataType.INT32,
            "params": {"unit": "minute"},
        },
        {"name": "release_year", "type": DataType.INT32},
        {
            "name": "embedding",
            "type": DataType.FLOAT_VECTOR,
            "params": {"dim": 8},
        },
    ],
    "segment_row_limit": 4096,
    "auto_id": False,
}

# ------
# Basic create collection:
#     Create the collection `demo_films`, then create a partition tagged "American" in it;
#     the films inserted later go into that partition.
# ------
client.create_collection(collection_name, collection_param)
client.create_partition(collection_name, "American")

# ------
# Basic create collection:
#     Print the collection description and the partition list to check what was created.
# ------
collection = client.describe_collection(collection_name)
pprint(collection)
partitions = client.list_partitions(collection_name)
pprint(partitions)

# ------
# Basic insert entities:
# We have three films of The_Lord_of_the_Rings series here with their id, duration, release_year
# and fake embeddings to be inserted. They are listed below to give you an overview of the structure.
# ------
The_Lord_of_the_Rings = [
    {
        "title": title,
        "id": film_id,
        "duration": minutes,
        "release_year": year,
        # Fake 8-dimensional embedding, regenerated randomly on every run.
        "embedding": [random.random() for _ in range(8)],
    }
    # Ids are assigned 1, 2, 3 in listing order.
    for film_id, (title, minutes, year) in enumerate(
        [
            ("The_Fellowship_of_the_Ring", 208, 2001),
            ("The_Two_Towers", 226, 2002),
            ("The_Return_of_the_King", 252, 2003),
        ],
        start=1,
    )
]

# ------
# Basic insert entities:
#     Milvus takes the data field by field, so the values of each field are collected
#     into their own list. These lists are then used to build `hybrid_entities`.
# ------
ids = [film["id"] for film in The_Lord_of_the_Rings]
durations = [film["duration"] for film in The_Lord_of_the_Rings]
release_years = [film["release_year"] for film in The_Lord_of_the_Rings]
embeddings = [film["embedding"] for film in The_Lord_of_the_Rings]

hybrid_entities = [
    # "title" is not inserted: Milvus doesn't support the string type yet.
    {
        "name": "duration",
        "values": durations,
        "type": DataType.INT32,
    },
    {
        "name": "release_year",
        "values": release_years,
        "type": DataType.INT32,
    },
    {
        "name": "embedding",
        "values": embeddings,
        "type": DataType.FLOAT_VECTOR,
    },
]

# ------
# Basic insert entities:
#     Insert `hybrid_entities` into the collection, into partition `American`, using the ids
#     we provide. On success the ids are returned.
# ------
ids = client.insert(collection_name, hybrid_entities, ids, partition_tag="American")
print("Films are inserted and the ids are: {}".format(ids))


# ------
# Basic insert entities:
#     After inserting, flush the collection so the data is on disk and can be retrieved.
#     The entity count is taken before and after the flush to show the difference.
# ------
before_flush_counts = client.count_entities(collection_name)
client.flush([collection_name])
after_flush_counts = client.count_entities(collection_name)
print("There are {} films in collection `{}` before flush".format(before_flush_counts, collection_name))
print("There are {} films in collection `{}` after flush".format(after_flush_counts, collection_name))

# ------
# Basic insert entities:
#     Detailed collection statistics are available through `get_collection_stats`.
# ------
info = client.get_collection_stats(collection_name)
pprint(info)

# ------
# Basic search entities:
# Now that we have 3 films inserted into our collection, it's time to obtain them.
# We can get films by ids, if milvus can't find entity for a given id, `None` will be returned.
# In the case we provide below, we will only get 1 film with id=1 and the other is `None`
# ------
films = client.get_entity_by_id(collection_name, ids=[1, 200])
for film in films:
    # Ids that Milvus cannot find come back as `None`; nothing to print for those.
    if film is None:
        continue
    print(" > id: {},\n > duration: {}m,\n > release_years: {},\n > embedding: {}"
          .format(film.id, film.duration, film.release_year, film.embedding))

# ------
# Basic hybrid search entities:
# Getting films by id is not enough, we are going to get films based on vector similarities.
# Let's say we have a film with its `embedding` and we want to find `top3` films that are most similar
# with it by L2 distance. And there are some conditions for the results. We want to obtain films that are:
# `released in year` 2002 or 2003,
# `duration` of the films larger than 250 minutes.
#
# "range" includes "GT"(>), "LT"(<), "LTE"(<=), "GTE"(>=).
# There are more options other than "must", for instance "should", for more information, you can refer to
# pymilvus documentation (https://milvus-io.github.io/milvus-sdk-python/pythondoc/v0.3.0/index.html).
# ------
query_embedding = [random.random() for _ in range(8)]
query_hybrid = {
"bool": {
"must": [
{
"term": {"release_year": [2002, 2003]}
},
{
# "GT" for greater than
"range": {"duration": {"GT": 250}}
},
{
"vector": {
"embedding": {"topk": 3, "query": [query_embedding], "metric_type": "L2"}
}
}
]
# Port of the Milvus server; passed together with _HOST to Milvus(...) in main().
_PORT = '19530' # default value
# _PORT = '19121' # default http value

# Vector parameters
# _DIM is used both as the "dim" of the vector field and as the length of each
# randomly generated vector in main().
_DIM = 8 # dimension of vector

# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
_INDEX_FILE_SIZE = 32 # max file size of stored index


def main():
# Specify the server address when creating the Milvus client instance.
# The client instance maintains a connection pool; its `pool_size` parameter
# specifies the maximum number of connections (not set here).
milvus = Milvus(_HOST, _PORT)

collection_name = 'example_collection'
field_name = 'example_field'

# Always start from a freshly created collection. Previously an existing collection
# was dropped without being re-created, so every later call (stats, insert, search)
# ran against a collection that no longer existed.
if milvus.has_collection(collection_name):
    milvus.drop_collection(collection_name=collection_name)

# A single float-vector field of dimension _DIM, compared with the L2 metric.
fields = {
    "fields": [
        {
            "name": field_name,
            "type": DataType.FLOAT_VECTOR,
            "metric_type": "L2",
            "params": {"dim": _DIM},
            "indexes": [{"metric_type": "L2"}],
        }
    ]
}
milvus.create_collection(collection_name=collection_name, fields=fields)

# Show the collections known to the Milvus server.
print(milvus.list_collections())

# Show the statistics of the example collection before anything is inserted.
print(milvus.get_collection_stats(collection_name))

# 10 random vectors with _DIM dimensions each, laid out as a 2-D list
# (one inner list of floats per vector).
vectors = [[random.random() for _col in range(_DIM)] for _row in range(10)]
print(vectors)
# numpy can generate the same kind of data:
#     vectors = np.random.rand(10, _DIM).astype(np.float32)

# Insert the vectors into the example collection; the result is printed as the ids.
entities = [
    {
        "name": field_name,
        "type": DataType.FLOAT_VECTOR,
        "values": vectors,
    }
]
res_ids = milvus.insert(collection_name=collection_name, entities=entities)
print("ids:", res_ids)

# Flush the inserted data to disk.
milvus.flush([collection_name])

# Show the collection statistics again, now that the data has been flushed.
stats = milvus.get_collection_stats(collection_name)
print(stats)

# Parameters of the index created on the vector field to speed up searching:
# an IVF_FLAT index using the L2 metric.
index_param = {"metric_type": "L2", "index_type": "IVF_FLAT"}
index_param["params"] = {"nlist": 1024}
}

# ------
# Basic hybrid search entities:
# And we want to get all the fields back in results, so fields = ["duration", "release_year", "embedding"].
# If the search succeeds, results will be returned.
# `results` has `nq` (number of queries) separate results; since we only query for 1 film, the length of
# `results` is 1.
# We ask for top 3 in-return, but our condition is too strict while the database is too small, so we can
# only get 1 film, which means length of `entities` in below is also 1.
#
# Now we've gotten the results, and known it's a 1 x 1 structure, how can we get ids, distances and fields?
# It's very simple, for every `topk_film`, it has three properties: `id, distance and entity`.
# All fields are stored in `entity`, so you can finally obtain these data as below:
# And the result should be film with id = 3.
# ------
results = client.search(
    collection_name,
    query_hybrid,
    fields=["duration", "release_year", "embedding"],
)
# Outer loop: one entry per query. Inner loop: the top-k hits of that query.
for entities in results:
    for topk_film in entities:
        # The requested fields of a hit are read from its `entity` attribute.
        current_entity = topk_film.entity
        print("==")
        print("- id: {}".format(topk_film.id))
        print("- distance: {}".format(topk_film.distance))

        print("- release_year: {}".format(current_entity.release_year))
        print("- duration: {}".format(current_entity.duration))
        print("- embedding: {}".format(current_entity.embedding))

# ------
# Basic delete:
# Now let's see how to delete things in Milvus.
# You can simply delete entities by their ids.
# ------
client.delete_entity_by_id(collection_name, ids=[1, 2])
client.flush()  # flush is important
result = client.get_entity_by_id(collection_name, ids=[1, 2])

# Count how many of the two deleted ids can still be fetched.
counts_delete = sum(1 for entity in result if entity is not None)
counts_in_collection = client.count_entities(collection_name)
print("Get {} entities by id 1, 2".format(counts_delete))
print("There are {} entities after delete films with 1, 2".format(counts_in_collection))

# ------
# Basic delete:
#     Finally drop the partition we created, and the collection itself if it is still listed.
# ------
client.drop_partition(collection_name, partition_tag='American')
if collection_name in client.list_collections():
    client.drop_collection(collection_name)

# ------
# Summary:
# Now we've gone through all basic communications pymilvus can do with Milvus server, hope it's helpful!
# ------

# Create an IVF_FLAT index on the vector field of the example collection.
# Searching works without an index; the index is there to make the search faster.
print("Creating index: {}".format(index_param))
milvus.create_index(collection_name, field_name, index_param)

# Execute a vector similarity search, using the inserted vectors themselves as queries.
print("Searching ... ")

dsl = {
    "bool": {
        "must": [
            {
                "vector": {
                    field_name: {
                        "metric_type": "L2",
                        "query": vectors,
                        "topk": 10,
                        "params": {"nprobe": 16},
                    }
                }
            }
        ]
    }
}

milvus.load_collection(collection_name)
results = milvus.search(collection_name, dsl)

# The first query vector is also the first inserted vector, so its best hit is expected
# to be that same entity: either the distance is 0.0 or the id equals the first id
# returned by insert(). The ids live in `res_ids`; the previous code compared against
# an undefined name `ids`, which raised NameError whenever the distance was not exactly 0.0.
# (Per the original example, the same check can be written with
# `results.distance_array[0][0]` and `results.id_array[0][0]`; not exercised here.)
top_hit = results[0][0]
if top_hit.distance == 0.0 or top_hit.id == res_ids[0]:
    print('Query result is correct')
else:
    print('Query result isn\'t correct')

# Clean up: remove the index, release the loaded collection, then delete it.
milvus.drop_index(collection_name, field_name)
milvus.release_collection(collection_name)
milvus.drop_collection(collection_name)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 comments on commit 99ebc76

Please sign in to comment.