In [1]:
from pymilvus import connections
connections.connect(
  alias="default", 
  host='localhost', 
  port='19530'
)

In [2]:
from pymilvus import CollectionSchema, FieldSchema, DataType
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from tqdm import tqdm
import numpy as np

utility.list_collections()

['test']

In [4]:
# utility.drop_collection("test")

In [3]:

def create_milvus_collection(collection_name, dim):
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)
    
    fields = [
        FieldSchema(name='name', dtype=DataType.VARCHAR, descrition='image name', max_length=500, 
                    is_primary=True, auto_id=False),
        FieldSchema(name='id', dtype=DataType.INT64, descrition='image id'),
        FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, descrition='image embedding vectors', dim=dim)
    ]
    schema = CollectionSchema(fields=fields, description='reverse image search')
    collection = Collection(name=collection_name, schema=schema)

    index_params = {
        'metric_type':'L2',
        'index_type':"IVF_FLAT",
        'params':{"nlist":2048}
    }
    collection.create_index(field_name="embedding", index_params=index_params)
    return collection

collection = create_milvus_collection('test', 768)

## Indexing

### Method1

In [6]:
n = 2508110
features = np.ones((n, 768))
features = features * np.arange(n).reshape(-1, 1)

ids = [x for x in range(n)]

image_names = [f"image{i}" for i in range(n)]

: 

: 

In [None]:
n = len(image_names)
bs = n // 256 if n > 256 else n

pbar=tqdm(range(0, n, bs))
for i in pbar:
    data = [
        image_names[i:i+bs], ids[i:i+bs], features[i:i+bs]
    ]
    info = collection.insert(data)
    pbar.set_description(str(info))

(insert count: 78, delete count: 0, upsert count: 0, timestamp: 439186964259536897, success count: 78, err count: 0): 100%|██████████| 257/257 [11:00<00:00,  2.57s/it]    


### Method 2: 

In [11]:
n = 2508110
bs = n // 256 if n > 256 else n

pbar=tqdm(range(0, n, bs))
for i in pbar:

    features = np.ones((bs, 768), dtype=np.int32)
    features = features * np.arange(i,i+bs).reshape(-1, 1)
    ids = [x for x in range(i, i+bs)]
    image_names = [f"image{i}" for i in range(i, i+bs)]

    data = [
        image_names, ids, features
    ]
    info = collection.insert(data)
    pbar.set_description(str(info))

(insert count: 9797, delete count: 0, upsert count: 0, timestamp: 439187416099848193, success count: 9797, err count: 0): 100%|██████████| 257/257 [03:55<00:00,  1.09it/s]


## Query

In [12]:
print('Total number of inserted data is {}.'.format(collection.num_entities))

Total number of inserted data is 2517829.


In [13]:
collection = Collection(name='test')
collection.load()

In [14]:
# random 10000 numbers in range [0, 2508110)
import random

random.seed(0)
random_ids = random.sample(range(2508110), 100000)

### Filter by ids

In [15]:
expr = 'id in [' + ','.join([str(x) for x in random_ids]) + ']'
print(min(random_ids), expr)

51 id in [1615835,1764005,169803,1085975,2144441,2038128,1698416,1272185,1998992,1501767,2446881,916215,2116810,584156,1182113,586136,397749,1050699,2233733,616400,1300853,414243,309298,1384947,1980310,2348028,422371,1483908,1821048,1326225,857641,2317452,2000724,1856790,2186715,1092583,261219,2301411,58895,391210,1672787,4793,2070215,1397270,1023036,1364005,264175,801393,2380312,929893,1000827,597664,2277464,1878920,382587,337414,1342406,2130457,2052217,457421,1264359,2312182,1220922,523494,2296133,1395656,2266113,852290,2295249,2464644,1206522,1866419,384332,2501008,1614392,1329790,2414455,1015471,1217730,771201,794364,783200,138296,1090754,1998715,289764,376750,546200,627259,162074,336585,2267440,1641214,2200222,1156093,2188546,987765,902619,2473805,1759188,2431417,1154317,1889798,2066346,1498908,345497,1360316,483811,2040294,2462369,1406226,798506,1019365,67985,1136815,491296,924678,1560533,715052,1394757,1787323,260854,421978,613871,917602,189725,2406968,2240588,310365,111974,5219

In [16]:
search_params = {"metric_type": "L2", "params": {"nprobe": 128}}
query = np.ones((1, 768)) * 7

results = collection.search(
	data=query, 
	anns_field="embedding", 
	param=search_params, 
	limit=100, 
	expr= expr,
	consistency_level="Strong"
)
results[0].ids

['image51', 'image59', 'image61', 'image97', 'image159', 'image166', 'image177', 'image200', 'image211', 'image272', 'image290', 'image463', 'image471', 'image525', 'image563', 'image570', 'image589', 'image593', 'image611', 'image621', 'image684', 'image686', 'image688', 'image689', 'image729', 'image791', 'image856', 'image898', 'image914', 'image917', 'image958', 'image1009', 'image1053', 'image1062', 'image1088', 'image1139', 'image1148', 'image1214', 'image1242', 'image1283', 'image1284', 'image1387', 'image1392', 'image1417', 'image1435', 'image1444', 'image1448', 'image1486', 'image1493', 'image1501', 'image1503', 'image1551', 'image1592', 'image1628', 'image1659', 'image1695', 'image1702', 'image1732', 'image1752', 'image1811', 'image1827', 'image1855', 'image1901', 'image1914', 'image1928', 'image1934', 'image1978', 'image2011', 'image2022', 'image2075', 'image2140', 'image2157', 'image2191', 'image2273', 'image2322', 'image2387', 'image2413', 'image2437', 'image2470', 'image2

### Filter by names

In [17]:
random_ids = random.sample(range(2508110), 10000)
expr = 'name in [' + ','.join([f'\"image{x}\"' for x in random_ids]) + ']'
print(min(random_ids), expr)

500 name in ["image428000","image2354388","image449619","image416509","image413744","image267487","image1391471","image245482","image2045119","image398848","image481885","image2196207","image2104373","image866721","image1370196","image602491","image850581","image875306","image1086680","image460438","image555096","image1345071","image114435","image156930","image222419","image1683406","image1752940","image642424","image101065","image2268859","image189639","image499453","image1832639","image846307","image634694","image1152133","image1055010","image2150951","image805469","image2455018","image307486","image549183","image1680253","image1208980","image2064879","image369447","image1987284","image1889783","image9541","image132230","image1437924","image630493","image2438066","image1128844","image2331009","image1811776","image1836474","image691315","image86797","image1029303","image1111068","image1748177","image552089","image539062","image2106126","image2095270","image379051","image2043986","imag

In [18]:
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
query = np.ones((1, 768)) * 7

results = collection.search(
	data=query, 
	anns_field="embedding", 
	param=search_params, 
	limit=10, 
	expr= expr,
	consistency_level="Strong"
)

In [19]:
results[0].ids

['image500', 'image989', 'image176526', 'image177022', 'image441250', 'image441349', 'image529213', 'image529690', 'image529906', 'image617709']