In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import numpy as np
import random
import time 
import pandas as pd
import utils

In [2]:
qdrantClient = QdrantClient(host='localhost', port=6333)
base_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_base.fvecs")
query_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_query.fvecs")
knn_groundtruth = utils.read_ivecs("../../dataset/siftsmall/siftsmall_groundtruth.ivecs")

Loading file: siftsmall_base.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_base.fvecs is (10000, 128).
Loading file: siftsmall_query.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_query.fvecs is (100, 128).
 Loading file: siftsmall_groundtruth.ivecs
    The dimension of the vectors in the file is: 100
    The final shape of the loaded dataset is (100, 100).


In [3]:
vector_size = 128
collection_name = "ann_collection"

qdrantClient.delete_collection(collection_name=collection_name)

qdrantClient.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.EUCLID),
)

True

In [4]:
base_vectors = pd.DataFrame({'vector': base_vectors.tolist()})
batch_points = [PointStruct(id=i, vector=elem["vector"]) for i, elem in base_vectors.iterrows()]

operation_info = qdrantClient.upsert(
    collection_name=collection_name,
    wait=True,
    points=batch_points
)
print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


In [5]:
query_vectors = pd.DataFrame({'vector': query_vectors.tolist()})

In [20]:
print(f'Search function starting')
start_time = time.time()

result_ids = []
for _,elem in query_vectors.iterrows():
    vec = elem["vector"]
    search_result = qdrantClient.search(
            collection_name=collection_name, 
            query_vector=vec,
            score_threshold=500,
            limit=10000 
            )
    result_ids.append([elem.id for elem in search_result])

end_time = time.time()
time_span = end_time - start_time
print(f'Search function took {end_time - start_time} seconds')


Search function starting
Search function took 3.185925006866455 seconds


In [13]:
len(search_result)

3177

In [17]:
truth = utils.range_truth(query_vectors, base_vectors, threshold=600)

In [9]:
# df = pd.DataFrame(truth)
# df['size'] = df.iloc[:, 0].apply(lambda x: len(x))
# df

Unnamed: 0,0,size
0,"[6, 11, 20, 22, 24, 25, 26, 27, 29, 33, 34, 35...",6004
1,"[7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 24, 25, ...",6394
2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",5618
3,"[22, 24, 25, 26, 34, 35, 36, 37, 38, 39, 40, 4...",5032
4,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",5602
...,...,...
95,"[6, 8, 9, 11, 13, 17, 19, 21, 22, 24, 25, 26, ...",6832
96,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",6787
97,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",8196
98,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",5292


In [18]:
true_positives = 0
n_classified = 0
for i,elem in enumerate(result_ids):
    true_positives_iter = len(np.intersect1d(truth[i], elem))
    true_positives += true_positives_iter
    n_classified += len(elem)

print(f'QPS = {(len(query_vectors) / time_span):.4f}')
print(f'Average recall: {true_positives/n_classified}')

Average recall: 1.0
