In [4]:
import os
from pathlib import Path

import numpy as np
import requests
from dotenv import load_dotenv

# Read the local .env file, then pick the two service ports out of the
# process environment. Both are required by the cells below.
dotenv_path = Path(".env")
load_dotenv(dotenv_path=dotenv_path)

CLIP_PORT, MILVUS_PORT = (
    os.environ.get(port_var) for port_var in ("CLIP_PORT", "MILVUS_PORT")
)

# Stop right away, with a readable message, if either port is missing.
assert CLIP_PORT is not None, "CLIP_PORT is not set"
assert MILVUS_PORT is not None, "MILVUS_PORT is not set"

from pysearch.milvus import Milvus2Processor as MilvusProcessor

In [5]:
# Settings handed to the Milvus processor in the next cell.
config = dict(
    # Global config
    HOST="0.0.0.0",
    PORT=MILVUS_PORT,  # read from the environment above, so this is a string
    INDEX="lsc23_db",
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    # Milvus config
    DIMENSION=768,  # width of each feature vector stored in the collection
)

In [13]:
# Create the processor from `config` with collection auto-loading switched
# off, then display its summary (collection name, entity count, and the
# collections currently present on the server).
proc = MilvusProcessor(config, autoload_collection=False)
proc.info()

{'name': 'lsc23_db',
 'description': 'Pysearch collection',
 'num_entities': 0,
 'collections': ['test', 'test_index', 'lsc23_db']}

In [15]:
# Clean up the leftover 'test' collection (it no longer appears in the
# listing printed by the following cell).
# NOTE(review): 'test_index' is also gone in that listing, and execution
# count 14 is missing from this notebook -- presumably a deleted cell dropped
# it. Confirm, since a fresh Restart & Run All would not reproduce that step.
proc.kill('test')

In [16]:
# Display the summary again to verify which collections remain.
proc.info()

{'name': 'lsc23_db',
 'description': 'Pysearch collection',
 'num_entities': 0,
 'collections': ['lsc23_db']}

In [17]:
import pickle

# Load the precomputed features: a mapping whose keys and values are split
# into ids and vectors by the next cell.
# NOTE(review): this is an absolute, machine-specific path; consider moving
# it into the config or the .env file.
# WARNING: unpickling can execute arbitrary code -- only load trusted files.
# The context manager closes the file handle, which a bare open() leaves
# dangling until garbage collection.
with open("/home/vbs/lsc23/dragon2/L14_336_features_128.pkl", "rb") as features_file:
    data = pickle.load(features_file)

In [18]:
# Split the mapping into two parallel arrays. Keys and values are iterated in
# the same order, so row i of `features` belongs to ids[i].
features = np.array(list(data.values()))
# Strip only the trailing extension (000000.jpg -> 000000). Splitting on the
# first '.' would also cut off everything after an earlier dot in the name.
ids = np.array([os.path.splitext(name)[0] for name in data.keys()])

In [19]:
# Peek at the first record: its id on one line, then the shape of its
# feature vector on the next.
print(ids[0], features[0].shape, sep="\n")

20190205_044043_000
(768,)


In [20]:
# Pre-flight checks before the bulk insert: one id per vector, and every
# vector as wide as the dimension configured for the collection.
assert features.shape[0] == len(ids), "vectors and ids must have the same length"
assert features.ndim == 2 and features.shape[1] == config["DIMENSION"], (
    f"expected vectors of dimension {config['DIMENSION']}, got array of shape {features.shape}"
)

In [21]:
# Show the array shape: (number of vectors, values per vector).
features.shape

(725226, 768)

In [22]:
# Insert all vectors, paired with their ids, into the collection.
# In the recorded run this went through 257 progress steps in about a minute;
# the timing log reports the function name as `update_list_document`.
proc.index_list_document(features, ids)

(insert count: 234, delete count: 0, upsert count: 0, timestamp: 440107208522858497, success count: 234, err count: 0): 100%|██████████| 257/257 [01:01<00:00,  4.19it/s]  


Function update_list_document elapsed time: 0:01:01.388936


In [23]:
# Smoke-test search with a constant query vector (every component 7.0).
# The width comes from the configured dimension rather than a repeated
# literal, so the query stays valid if the collection dimension changes.
query = np.full((1, config["DIMENSION"]), 7.0)
results = proc.search(query, top_k=3)
results

Function search elapsed time: 0:00:00.469084


(['20190421_105512_000', '20191009_064951_000', '20190416_090317_000'],
 [37583.9609375, 37584.5859375, 37584.8984375])

In [24]:
# Repeat the query, this time passing a `filter` of two ids taken from the
# previous result. In the recorded run only those two ids come back, even
# though top_k is 10.
allowed_ids = ["20190421_105512_000", "20190416_090317_000"]
results = proc.search(query, top_k=10, filter=allowed_ids)
results

Function search elapsed time: 0:00:00.225801


(['20190421_105512_000', '20190416_090317_000'],
 [37583.9609375, 37584.8984375])