In [17]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import numpy as np
import random
import pandas as pd
import time


import utils

In [18]:
# Connect to the Qdrant instance expected on localhost:6333.
qdrantClient = QdrantClient(
    host='localhost',
    port=6333,
)

# Load the siftsmall base and query vectors (in that order) from their .fvecs files.
base_vectors, query_vectors = (
    utils.read_fvecs(fvecs_path)
    for fvecs_path in (
        "../../dataset/siftsmall/siftsmall_base.fvecs",
        "../../dataset/siftsmall/siftsmall_query.fvecs",
    )
)

# Reference neighbour ids shipped with the dataset (one row of ids per query).
knn_groundtruth = utils.read_ivecs(
    "../../dataset/siftsmall/siftsmall_groundtruth.ivecs"
)

Loading file: siftsmall_base.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_base.fvecs is (10000, 128).
Loading file: siftsmall_query.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_query.fvecs is (100, 128).
 Loading file: siftsmall_groundtruth.ivecs
    The dimension of the vectors in the file is: 100
    The final shape of the loaded dataset is (100, 100).


In [19]:
# Dimensionality of the SIFT vectors and the name of the collection used for the benchmark.
vector_size = 128
collection_name = "ann"

# recreate_collection drops any existing collection with this name before creating
# it again, so a separate delete_collection call beforehand is redundant.
# NOTE(review): newer qdrant-client releases deprecate recreate_collection in favour
# of collection_exists + create_collection -- confirm against the installed version.
qdrantClient.recreate_collection(
    collection_name=collection_name,
    # HNSW index build parameters: m = links per node, ef_construct = size of the
    # candidate list explored while building the graph.
    hnsw_config=models.HnswConfigDiff(
        m=32,
        ef_construct=200
    ),
    # Euclidean distance, matching the 'euclidean' metric used for the brute-force truth.
    vectors_config=VectorParams(size=vector_size, distance=Distance.EUCLID),
)

True

In [20]:
# Wrap each vector set in a single-column DataFrame ('vector' holds one Python list
# per row). The names are rebound in place because later cells expect DataFrames;
# the isinstance guards keep the cell idempotent, since re-running it on an
# already-converted DataFrame would otherwise fail (DataFrame has no .tolist()).
if not isinstance(query_vectors, pd.DataFrame):
    query_vectors = pd.DataFrame({'vector': query_vectors.tolist()})
if not isinstance(base_vectors, pd.DataFrame):
    base_vectors = pd.DataFrame({'vector': base_vectors.tolist()})


In [21]:
# One PointStruct per base vector; the DataFrame index label doubles as the point id,
# so ids line up with row positions in base_vectors.
batch_points = [
    PointStruct(id=point_id, vector=vector)
    for point_id, vector in zip(base_vectors.index, base_vectors["vector"])
]

# Upload everything in a single request and block until the server has applied it.
operation_info = qdrantClient.upsert(
    points=batch_points,
    collection_name=collection_name,
    wait=True,
)
print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


In [22]:
# Reference top-10 neighbours for every query, computed by the project helper with
# the euclidean metric and no filtering.
# NOTE(review): presumably an exact brute-force search -- confirm in utils.top_k_neighbors.
truth = utils.top_k_neighbors(
    query_vectors,
    base_vectors,
    k=10,
    function='euclidean',
    filtering=False,
)
truth

[array([2176, 3752,  882, 4009, 2837,  190, 3615,  816, 1045, 1884]),
 array([2781, 9574, 2492, 1322, 3136, 1038, 9564,  925, 3998, 2183]),
 array([2707, 9938, 2698, 9972, 6995, 6801, 8906, 5232, 6162, 5199]),
 array([9843, 9825, 9574, 9582, 4097, 9576, 9581,  272, 9575, 4096]),
 array([4719, 5164, 1671, 1538, 5897, 4764, 4559,  358, 5775, 4622]),
 array([1097, 1239, 4943, 3227,  804, 2607, 4060, 4443, 4246, 3112]),
 array([2456, 3013, 1682, 8581, 2774, 3530,  924, 2732, 9701, 1916]),
 array([5447, 5868, 3109, 5671, 4597, 7586, 2804, 9394, 2735, 6639]),
 array([9628, 3382, 7828, 1098, 4391, 8518, 3912, 5002,   73, 8441]),
 array([4764, 4719, 5609, 9013, 1538,  688, 7885, 7642, 1671, 7123]),
 array([1218,  645,  693,  804,  496, 6946,  281, 4060, 8027, 4370]),
 array([4697, 4762, 8566, 4726, 6489, 1359, 4640, 1526, 9095, 4739]),
 array([2113,  130,  506,  128, 9692, 1362, 9581,  120, 2128,  483]),
 array([3609, 9815, 9574, 1564, 9663,  924, 1531, 9860, 9602, 5623]),
 array([ 797,  714, 

In [7]:
# Sanity check: number of neighbour ids computed for the first query (k=10 was requested).
len(truth[0])

10

In [21]:
# Display the ground-truth neighbour ids loaded from siftsmall_groundtruth.ivecs.
knn_groundtruth

array([[2176, 3752,  882, ...,  348, 3043, 3687],
       [2781, 9574, 2492, ..., 3849, 2905, 4102],
       [2707, 9938, 2698, ..., 1251, 8564, 8173],
       ...,
       [8825, 9081, 6142, ..., 8178, 5887, 4565],
       [5460, 5439, 5810, ..., 5199, 7483, 5232],
       [8082, 8782, 4767, ...,   11, 2482, 3631]], dtype=int32)

In [10]:
# Cross-check the computed neighbours against the dataset's own ground truth for the
# first query: the sorted ids common to both top-10 lists.
computed_top10 = truth[0]
shipped_top10 = knn_groundtruth[0, :10]
np.intersect1d(computed_top10, shipped_top10)

array([ 190,  816,  882, 1045, 1884, 2176, 2837, 3615, 3752, 4009])

In [25]:
# Display the query set; at this point it is a DataFrame with a single 'vector' column.
query_vectors

Unnamed: 0,vector
0,"[1.0, 3.0, 11.0, 110.0, 62.0, 22.0, 4.0, 0.0, ..."
1,"[40.0, 25.0, 11.0, 0.0, 22.0, 31.0, 6.0, 8.0, ..."
2,"[28.0, 4.0, 3.0, 6.0, 7.0, 2.0, 2.0, 18.0, 19...."
3,"[24.0, 12.0, 14.0, 8.0, 3.0, 12.0, 4.0, 8.0, 8..."
4,"[0.0, 4.0, 47.0, 20.0, 9.0, 2.0, 1.0, 0.0, 41...."
...,...
95,"[60.0, 70.0, 39.0, 33.0, 26.0, 4.0, 0.0, 0.0, ..."
96,"[13.0, 7.0, 4.0, 27.0, 2.0, 0.0, 3.0, 19.0, 91..."
97,"[14.0, 8.0, 0.0, 0.0, 3.0, 49.0, 22.0, 8.0, 39..."
98,"[11.0, 48.0, 10.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2...."


In [28]:
# Benchmark: run every query vector through Qdrant's approximate (HNSW) search and
# record the ids of the returned points.
#
# Outputs used by later cells:
#   result_ids -- one list of point ids per query, in query order
#   time_span  -- elapsed seconds for the whole query loop
print('Search function starting')
result_ids = []

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
# Iterate the column directly: iterrows() would build a Series per row inside the
# timed region and count that overhead as search latency.
for vec in query_vectors["vector"]:
    search_result = qdrantClient.search(
        collection_name=collection_name,
        search_params=models.SearchParams(
            hnsw_ef=256,   # size of the candidate list explored at query time
            exact=False    # use the HNSW index rather than an exact scan
        ),
        query_vector=vec,
        limit=10
    )
    result_ids.append([hit.id for hit in search_result])
end_time = time.perf_counter()

time_span = end_time - start_time
print(f'Search function took {time_span} seconds')


Search function starting
Search function took 0.3360719680786133 seconds


In [18]:
# Score the search results against the reference neighbours.
#
# recall = (returned ids that are true neighbours) / (number of true neighbours).
# The denominator is the size of the truth lists, not the number of returned ids:
# dividing by the returned count is precision and would overstate recall whenever
# a search returns fewer hits than requested.
assert len(truth) == len(result_ids), "truth and result_ids must cover the same queries"

true_positives = 0
n_relevant = 0     # total number of true neighbours across all queries
n_classified = 0   # total number of ids returned by the search (kept for reference)
for expected_ids, returned_ids in zip(truth, result_ids):
    true_positives += len(np.intersect1d(expected_ids, returned_ids))
    n_relevant += len(expected_ids)
    n_classified += len(returned_ids)

# Avoid a ZeroDivisionError when there is nothing to score.
recall = true_positives / n_relevant if n_relevant else float('nan')

print(f'QPS = {(len(query_vectors) / time_span):.4f}')
print(f'Average recall: {recall}')

QPS = 453.7601
Average recall: 1.0
