In [1]:
import os
from pathlib import Path

import numpy as np
import requests
from dotenv import load_dotenv

# Read the local .env file, then pick the service ports out of the environment.
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)

# os.getenv returns None when the variable is missing.
CLIP_PORT = os.getenv("CLIP_PORT")
MILVUS_PORT = os.getenv("MILVUS_PORT")

# Stop here if either port is missing; later cells use MILVUS_PORT.
assert CLIP_PORT is not None, "CLIP_PORT is not set"
assert MILVUS_PORT is not None, "MILVUS_PORT is not set"

from pysearch.milvus import Milvus2Processor as MilvusProcessor

In [2]:
# Global config
global_config = {
    "HOST": "0.0.0.0",
    "PORT": MILVUS_PORT,  # value read from the MILVUS_PORT environment variable
    "INDEX": "lsc23_full_db",
    "RETURN_SIZE": 10,
    "CACHE_DIR": ".cache/",
}

# Milvus config: 768 matches the length of the feature vectors indexed below.
milvus_config = {"DIMENSION": 768}

# Single flat mapping handed to the processor; key order is global first, then Milvus.
config = {**global_config, **milvus_config}

In [3]:
# Build the Milvus processor for the index named in `config`.
# NOTE(review): autoload_collection=False presumably skips loading the
# collection at construction time — confirm against pysearch.
proc = MilvusProcessor(config, autoload_collection=False)
# Display the collection summary; the recorded output reports the name,
# num_entities, the list of collections and the index details.
proc.info()

{'name': 'lsc23_full_db',
 'description': 'Pysearch collection',
 'num_entities': 721823,
 'collections': ['test_index',
  'lsc23_db',
  'lsc23_with2000_db',
  'lsc23_full_db',
  'test'],
 'details': [{'metric_type': 'L2',
   'index_type': 'IVF_FLAT',
   'params': {'nlist': 2048}}]}

In [18]:
# proc.kill("lsc23_full_db")

In [None]:
# Read the contracted image-id list (one id per line) and keep every id that
# does not start with "2000".
# NOTE(review): this cell has no execution count, and In [19] rebuilds
# `using_ids` from image_list_full.txt, so running both cells in order leaves
# only the full-list ids in `using_ids`.
with open("/root/workspace/data/info/image_list_contracted.txt") as f:
    # Iterating the handle yields the same lines as readlines().
    lines = [line.strip() for line in f]

# Drop ids prefixed with "2000" (presumably the year-2000 images — confirm).
new_lines = [line for line in lines if not line.startswith("2000")]

# only use a subset ids
using_ids = set(new_lines)

In [19]:
# Read the full image-id list (one id per line) and keep every id that does
# not start with "2000". The resulting set decides which features get indexed.
with open("/root/workspace/data/info/image_list_full.txt") as f:
    # Iterating the handle yields the same lines as readlines().
    lines = [line.strip() for line in f]

# Drop ids prefixed with "2000" (presumably the year-2000 images — confirm).
new_lines = [line for line in lines if not line.startswith("2000")]

# only use a subset ids
using_ids = set(new_lines)

In [20]:
# proc.kill('lsc23_db')

In [21]:
import pickle

# Load the precomputed feature mapping (filename -> vector, judging by the
# cells that follow). The context manager closes the file handle once the
# pickle has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("/root/workspace/data/features/L14_336_features_128.pkl", "rb") as feature_file:
    data = pickle.load(feature_file)
len(data)

725226

In [22]:
# Re-key the features by the text before the first dot, i.e. drop the file
# extension (000000.jpg -> 000000).
stem_to_feature = {filename.partition('.')[0]: vector for filename, vector in data.items()}
# Keep only the ids selected in `using_ids`.
data = {image_id: stem_to_feature[image_id] for image_id in stem_to_feature if image_id in using_ids}

In [23]:
# Both arrays follow the dict's iteration order, so row i of `features`
# belongs to ids[i].
features = np.array(list(data.values()))
ids = np.array([key.split('.')[0] for key in data])

In [24]:
# Show one sample: the first id, then the shape of its feature vector.
print(ids[0], features[0].shape, sep="\n")

20190205_044043_000
(768,)


In [25]:
# Every id must have exactly one feature row before indexing.
assert len(ids) == features.shape[0], "vectors and ids must have the same length"
features.shape

(721823, 768)

In [26]:
# Insert every feature vector together with its image id through `proc`.
# The recorded run processed 257 batches in about 76 seconds.
# NOTE(review): re-running this cell presumably inserts the same vectors
# again — confirm before re-executing against a populated collection.
proc.index_list_document(features, ids)

(insert count: 159, delete count: 0, upsert count: 0, timestamp: 442121581785513985, success count: 159, err count: 0): 100%|██████████| 257/257 [01:16<00:00,  3.38it/s]  


Function update_list_document elapsed time: 0:01:16.028632


In [5]:
# Smoke-test query: one 768-dimensional vector whose entries are all 7.0.
query = np.full((1, 768), 7.0)
# Ask for the three nearest entries; the recorded output is a pair of lists
# (ids, distances).
results = proc.search(query, top_k=3)
results

Function search elapsed time: 0:00:00.249433


(['20191009_064951_000', '20200501_180733_000', '20190630_155904_000'],
 [37584.5859375, 37585.7890625, 37586.1328125])

In [6]:
# Same query, this time passing two image ids through the `filter` argument.
# NOTE(review): the recorded result is ([], []); check how `filter` is meant
# to be used and whether these ids exist in the collection.
filter_ids = ["20190421_105512_000", "20190416_090317_000"]
results = proc.search(query, top_k=10, filter=filter_ids)
results

Function search elapsed time: 0:00:00.193067


([], [])