Change examples (#50)

Signed-off-by: godchen <qingxiang.chen@zilliz.com>
milvus-io · Apr 22, 2021 · 99ebc76 · 99ebc76
1 parent 1c58ce5
commit 99ebc76
Showing 1 changed file with 114 additions and 226 deletions.
diff --git a/examples/example.py b/examples/example.py
@@ -1,232 +1,120 @@
+#This program demos how to connect to Milvus vector database, 
+# create a vector collection,
+# insert 10 vectors, 
+# and execute a vector similarity search.
+
 import random
-from pprint import pprint
+import numpy as np
 
 from milvus import Milvus, DataType
 
-# ------
-# Setup:
-#    First of all, you need a runing Milvus(0.11.x). By default, Milvus runs on localhost in port 19530.
-#    Then, you can use pymilvus(0.3.x) to connect to the server, You can change the _HOST and _PORT accordingly.
-# ------
+# Milvus server IP address and port.
+# You may need to change _HOST and _PORT accordingly.
 _HOST = '127.0.0.1'
-_PORT = '19530'
-client = Milvus(_HOST, _PORT)
-
-# ------
-# Basic create collection:
-#     You already have a Milvus instance running, and pymilvus connecting to Milvus.
-#     The first thing we will do is to create a collection `demo_films`. Incase we've already had a collection
-#     named `demo_films`, we drop it before we create.
-# ------
-collection_name = 'demo_films'
-if collection_name in client.list_collections():
-    client.drop_collection(collection_name)
-
-# ------
-# Basic create collection:
-#     `auto_id` in the parameter is set to false so that we can provide our own unique ids.
-#     `embedding` in the `fields` is float vector with dimension of 8.
-#     For all fields, you can provide custom infos in "params" like {"unit": "minute"}
-#     For FLOAT_VECTOR and BINARY_VECTOR, "dim" is a must in "params". 
-#     For more information you can refer to the pymilvus 
-#     documentation (https://milvus-io.github.io/milvus-sdk-python/pythondoc/v0.3.0/index.html).
-# ------
-collection_param = {
-    "fields": [
-        #  Milvus doesn't support string type now, but we are considering supporting it soon.
-        #  {"name": "film_name", "type": DataType.STRING},
-        {"name": "duration", "type": DataType.INT32, "params": {"unit": "minute"}},
-        {"name": "release_year", "type": DataType.INT32},
-        {"name": "embedding", "type": DataType.FLOAT_VECTOR, "params": {"dim": 8}},
-    ],
-    "segment_row_limit": 4096,
-    "auto_id": False
-}
-
-# ------
-# Basic create collection:
-#     After create collection `demo_films`, we create a partition tagged "American", it means the films we
-#     will be inserted are from American.
-# ------
-client.create_collection(collection_name, collection_param)
-client.create_partition(collection_name, "American")
-
-# ------
-# Basic create collection:
-#     You can check the collection info and partitions we've created by `describe_collection` and
-#     `list_partitions`
-# ------
-collection = client.describe_collection(collection_name)
-pprint(collection)
-partitions = client.list_partitions(collection_name)
-pprint(partitions)
-
-# ------
-# Basic insert entities:
-#     We have three films of The_Lord_of_the_Rings serises here with their id, duration release_year
-#     and fake embeddings to be inserted. They are listed below to give you a overview of the structure.
-# ------
-The_Lord_of_the_Rings = [
-    {
-        "title": "The_Fellowship_of_the_Ring",
-        "id": 1,
-        "duration": 208,
-        "release_year": 2001,
-        "embedding": [random.random() for _ in range(8)]
-    },
-    {
-        "title": "The_Two_Towers",
-        "id": 2,
-        "duration": 226,
-        "release_year": 2002,
-        "embedding": [random.random() for _ in range(8)]
-    },
-    {
-        "title": "The_Return_of_the_King",
-        "id": 3,
-        "duration": 252,
-        "release_year": 2003,
-        "embedding": [random.random() for _ in range(8)]
-    }
-]
-
-# ------
-# Basic insert entities:
-#     To insert these films into Milvus, we have to group values from the same field together like below.
-#     Then these grouped data are used to create `hybrid_entities`.
-# ------
-ids = [k.get("id") for k in The_Lord_of_the_Rings]
-durations = [k.get("duration") for k in The_Lord_of_the_Rings]
-release_years = [k.get("release_year") for k in The_Lord_of_the_Rings]
-embeddings = [k.get("embedding") for k in The_Lord_of_the_Rings]
-
-hybrid_entities = [
-    # Milvus doesn't support string type yet, so we cannot insert "title".
-    {"name": "duration", "values": durations, "type": DataType.INT32},
-    {"name": "release_year", "values": release_years, "type": DataType.INT32},
-    {"name": "embedding", "values": embeddings, "type": DataType.FLOAT_VECTOR},
-]
-
-# ------
-# Basic insert entities:
-#     We insert the `hybrid_entities` into our collection, into partition `American`, with ids we provide.
-#     If succeed, ids we provide will be returned.
-# ------
-ids = client.insert(collection_name, hybrid_entities, ids, partition_tag="American")
-print("Films are inserted and the ids are: {}".format(ids))
-
-
-# ------
-# Basic insert entities:
-#     After insert entities into collection, we need to flush collection to make sure its on disk,
-#     so that we are able to retrive it.
-# ------
-before_flush_counts = client.count_entities(collection_name)
-client.flush([collection_name])
-after_flush_counts = client.count_entities(collection_name)
-print("There are {} films in collection `{}` before flush".format(before_flush_counts, collection_name))
-print("There are {} films in collection `{}` after flush".format(after_flush_counts, collection_name))
-
-# ------
-# Basic insert entities:
-#     We can get the detail of collection statistics info by `get_collection_stats`
-# ------
-info = client.get_collection_stats(collection_name)
-pprint(info)
-
-# ------
-# Basic search entities:
-#     Now that we have 3 films inserted into our collection, it's time to obtain them.
-#     We can get films by ids, if milvus can't find entity for a given id, `None` will be returned.
-#     In the case we provide below, we will only get 1 film with id=1 and the other is `None`
-# ------
-films = client.get_entity_by_id(collection_name, ids=[1, 200])
-for film in films:
-    if film is not None:
-        print(" > id: {},\n > duration: {}m,\n > release_years: {},\n > embedding: {}"
-              .format(film.id, film.duration, film.release_year, film.embedding))
-
-# ------
-# Basic hybrid search entities:
-#      Getting films by id is not enough, we are going to get films based on vector similarities.
-#      Let's say we have a film with its `embedding` and we want to find `top3` films that are most similar
-#      with it by L2 distance. And there are some conditions for the results. We want to obtain films that are:
-#      `released in year` 2002 or 2003,
-#      `duration` of the films larger than 250 minutes.
-#
-#      "range" includes "GT"(>), "LT"(<), "LTE"(<=), "GTE"(>=).
-#      There are more options other than "must", for instance "should", for more information, you can refer to
-#      pymilvus documentation (https://milvus-io.github.io/milvus-sdk-python/pythondoc/v0.3.0/index.html).
-# ------
-query_embedding = [random.random() for _ in range(8)]
-query_hybrid = {
-    "bool": {
-        "must": [
-            {
-                "term": {"release_year": [2002, 2003]}
-            },
-            {
-                # "GT" for greater than
-                "range": {"duration": {"GT": 250}}
-            },
-            {
-                "vector": {
-                    "embedding": {"topk": 3, "query": [query_embedding], "metric_type": "L2"}
-                }
-            }
-        ]
+_PORT = '19530'  # default value
+# _PORT = '19121'  # default http value
+
+# Vector parameters
+_DIM = 8  # dimension of vector
+
+_INDEX_FILE_SIZE = 32  # max file size of stored index
+
+
+def main():
+    # Specify server addr when create milvus client instance
+    # milvus client instance maintain a connection pool, param
+    # `pool_size` specify the max connection num.
+    milvus = Milvus(_HOST, _PORT)
+
+    # Create collection demo_collection if it dosen't exist.
+    collection_name = 'example_collection'
+
+    ok = milvus.has_collection(collection_name)
+    field_name = 'example_field'
+    if not ok:
+        fields = {"fields":[{
+            "name": field_name,
+            "type": DataType.FLOAT_VECTOR,
+            "metric_type": "L2",
+            "params": {"dim": _DIM},
+            "indexes": [{"metric_type": "L2"}]
+        }]}
+
+        milvus.create_collection(collection_name=collection_name, fields=fields)
+    else:
+        milvus.drop_collection(collection_name=collection_name)
+
+    # Show collections in Milvus server
+    collections = milvus.list_collections()
+    print(collections)
+
+    # Describe demo_collection
+    stats = milvus.get_collection_stats(collection_name)
+    print(stats)
+
+    # 10000 vectors with 128 dimension
+    # element per dimension is float32 type
+    # vectors should be a 2-D array
+    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
+    print(vectors)
+    # You can also use numpy to generate random vectors:
+    #   vectors = np.random.rand(10000, _DIM).astype(np.float32)
+
+    # Insert vectors into demo_collection, return status and vectors id list
+    entities = [{"name": field_name, "type": DataType.FLOAT_VECTOR, "values": vectors}]
+
+    res_ids = milvus.insert(collection_name=collection_name, entities=entities)
+    print("ids:",res_ids)
+
+    # Flush collection  inserted data to disk.
+    milvus.flush([collection_name])
+
+    # present collection statistics info
+    stats = milvus.get_collection_stats(collection_name)
+    print(stats)
+
+    # create index of vectors, search more rapidly
+    index_param = {
+        "metric_type": "L2",
+        "index_type": "IVF_FLAT",
+        "params": {"nlist": 1024}
     }
-}
-
-# ------
-# Basic hybrid search entities:
-#     And we want to get all the fields back in reasults, so fields = ["duration", "release_year", "embedding"].
-#     If searching successfully, results will be returned.
-#     `results` have `nq`(number of queries) seperate results, since we only query for 1 film, The length of
-#     `results` is 1.
-#     We ask for top 3 in-return, but our condition is too strict while the database is too small, so we can
-#     only get 1 film, which means length of `entities` in below is also 1.
-#
-#     Now we've gotten the results, and known it's a 1 x 1 structure, how can we get ids, distances and fields?
-#     It's very simple, for every `topk_film`, it has three properties: `id, distance and entity`.
-#     All fields are stored in `entity`, so you can finally obtain these data as below:
-#     And the result should be film with id = 3.
-# ------
-results = client.search(collection_name, query_hybrid, fields=["duration", "release_year", "embedding"])
-for entities in results:
-    for topk_film in entities:
-        current_entity = topk_film.entity
-        print("==")
-        print("- id: {}".format(topk_film.id))
-        print("- distance: {}".format(topk_film.distance))
-
-        print("- release_year: {}".format(current_entity.release_year))
-        print("- duration: {}".format(current_entity.duration))
-        print("- embedding: {}".format(current_entity.embedding))
-
-# ------
-# Basic delete:
-#     Now let's see how to delete things in Milvus.
-#     You can simply delete entities by their ids.
-# ------
-client.delete_entity_by_id(collection_name, ids=[1, 2])
-client.flush()  # flush is important
-result = client.get_entity_by_id(collection_name, ids=[1, 2])
-
-counts_delete = sum([1 for entity in result if entity is not None])
-counts_in_collection = client.count_entities(collection_name)
-print("Get {} entities by id 1, 2".format(counts_delete))
-print("There are {} entities after delete films with 1, 2".format(counts_in_collection))
-
-# ------
-# Basic delete:
-#     You can drop partitions we create, and drop the collection we create.
-# ------
-client.drop_partition(collection_name, partition_tag='American')
-if collection_name in client.list_collections():
-    client.drop_collection(collection_name)
-
-# ------
-# Summary:
-#     Now we've went through all basic communications pymilvus can do with Milvus server, hope it's helpful!
-# ------
+
+    # Create ivflat index in demo_collection
+    # You can search vectors without creating index. however, Creating index help to
+    # search faster
+    print("Creating index: {}".format(index_param))
+    status = milvus.create_index(collection_name, field_name, index_param)
+
+    # execute vector similarity search
+
+    print("Searching ... ")
+
+    dsl = {"bool": {"must": [{"vector": {
+                        field_name: {
+                            "metric_type": "L2",
+                            "query": vectors,
+                            "topk": 10,
+                            "params": {"nprobe": 16}
+                        }
+    }}]}}
+
+    milvus.load_collection(collection_name)
+    results = milvus.search(collection_name, dsl)
+    # indicate search result
+    # also use by:
+    #   `results.distance_array[0][0] == 0.0 or results.id_array[0][0] == ids[0]`
+    if results[0][0].distance == 0.0 or results[0][0].id == ids[0]:
+        print('Query result is correct')
+    else:
+        print('Query result isn\'t correct')
+
+    milvus.drop_index(collection_name,field_name)
+    milvus.release_collection(collection_name)
+
+    # Delete demo_collection
+    status = milvus.drop_collection(collection_name)
+
+
+if __name__ == '__main__':
+    main()