In [8]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import numpy as np
import random
import pandas as pd
import time


import utils

In [9]:
# Client for the Qdrant server on this machine. A very large timeout value is
# passed (presumably so that long bulk requests are not cut off -- confirm).
qdrantClient = QdrantClient(
    host="localhost",
    port=6333,
    timeout=10000000,
)

# Small variant of the dataset, kept for quick experiments:
# base_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_base.fvecs")
# query_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_query.fvecs")
# knn_groundtruth = utils.read_ivecs("../../dataset/siftsmall/siftsmall_groundtruth.ivecs")

# Full SIFT dataset: vectors to index, vectors to query with, and the
# ground-truth file that the search results are later compared against.
base_vectors = utils.read_fvecs("../../dataset/sift/sift_base.fvecs")
query_vectors = utils.read_fvecs("../../dataset/sift/sift_query.fvecs")
knn_groundtruth = utils.read_ivecs("../../dataset/sift/sift_groundtruth.ivecs")

Loading file: sift_base.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset sift_base.fvecs is (1000000, 128).
Loading file: sift_query.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset sift_query.fvecs is (10000, 128).
 Loading file: sift_groundtruth.ivecs
    The dimension of the vectors in the file is: 100
    The final shape of the loaded dataset is (10000, 100).


In [10]:
# Dimensionality of the vectors stored in the collection and the name of the
# collection used by every later cell.
vector_size = 128
collection_name = "ann_1M"

# Start from an empty collection on every run. `recreate_collection` drops an
# existing collection with the same name before creating the new one, so a
# separate `delete_collection` call beforehand is not needed.
qdrantClient.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.EUCLID),
)

True

In [5]:
# query_vectors = pd.DataFrame({'vector': query_vectors.tolist()})
# base_vectors = pd.DataFrame({'vector': base_vectors.tolist()})

In [11]:
# Upload the base vectors to the collection in fixed-size batches.
# Point ids are the row indices of `base_vectors`, so the ids returned by a
# search can be compared directly with index-based ground truth.
batch_size = 50000
total_vectors = len(base_vectors)
# Ceiling division: a trailing partial batch still counts as one batch.
num_batches = (total_vectors + batch_size - 1) // batch_size
print(f'Number of batches: {num_batches}')

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    # Clamp the end so the last batch may be shorter than `batch_size`.
    end_idx = min(start_idx + batch_size, total_vectors)

    batch_vectors = base_vectors[start_idx:end_idx]

    qdrantClient.upsert(
        collection_name=collection_name,
        points=models.Batch(
            ids=list(range(start_idx, end_idx)),
            vectors=batch_vectors.tolist()
        )
    )

    # Report progress only after the batch has been sent, using the clamped
    # end index so the counter never exceeds the total on a partial batch.
    print(f'Current progress: {end_idx}/{total_vectors}', end='\r')

Number of batches: 20
Current progress: 1000000/1000000

In [20]:
# Neighbours computed by the project's own helper with the euclidean function
# and k=100. NOTE(review): `truth` is not read by any cell visible here (the
# evaluation below uses `knn_groundtruth`) -- confirm it is still needed.
truth = utils.top_k_neighbors(
    query_vectors,
    base_vectors,
    k=100,
    function='euclidean',
    filtering=False,
)


In [22]:
# Run every query vector against the collection, one request per query, and
# collect the ids of the returned points. `result_ids[i]` holds the ids
# returned for `query_vectors[i]`; `time_span` is reused later to compute QPS.
print('Search function starting')
num_queries = len(query_vectors)
result_ids = []

# perf_counter is a monotonic, high-resolution clock, so the measured interval
# is not affected by system clock adjustments the way time.time() can be.
# Note that the progress print below is inside the timed region.
start_time = time.perf_counter()
for i, query_vector in enumerate(query_vectors, start=1):
    # 1-based counter so the last line reads N/N rather than (N-1)/N.
    print(f'Progress: {i}/{num_queries}', end='\r')
    search_result = qdrantClient.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=100
    )
    # Distinct name for the hits, so the loop variable is not visually shadowed.
    result_ids.append([hit.id for hit in search_result])

end_time = time.perf_counter()
time_span = end_time - start_time
print(f'Search function took {time_span} seconds')


Search function starting
Search function took 43.78459906578064 seconds


In [20]:
# Sanity check: number of result lists collected by the search loop.
# Disabled scratch checks kept for reference:
# np.array(result_ids)
# len(np.intersect1d(knn_groundtruth[0], result_ids[0]))
len(result_ids)

10000

In [23]:
# Compare the ids returned by the search with the ground-truth ids, query by
# query, and report throughput and the overall hit ratio.
true_positives = 0  # returned ids that also appear in the ground truth
n_classified = 0    # total number of ids returned across all queries
for i, retrieved_ids in enumerate(result_ids):
    # intersect1d yields the unique ids present in both sequences.
    true_positives += len(np.intersect1d(knn_groundtruth[i], retrieved_ids))
    n_classified += len(retrieved_ids)
print(true_positives)
print(n_classified)

# Guard both ratios so the cell still reports something when no search
# results were collected or the measured time span is zero.
qps = len(query_vectors) / time_span if time_span > 0 else float('nan')
print(f'QPS = {qps:.4f}')

# The denominator is the number of returned ids. This equals recall only when
# every query returns as many ids as its ground-truth row contains.
average_recall = true_positives / n_classified if n_classified else float('nan')
print(f'Average recall: {average_recall}')

983599
1000000
QPS = 228.3908
Average recall: 0.983599
