Connect client


In [25]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct

from fastembed.embedding import DefaultEmbedding
import utils
# import numpy as np
# CONNECT TO CLIENT (FIRST RUN CONTAINER AT PORT 6333)
qdrantClient = QdrantClient(host='localhost', port=6333)

In [None]:
base_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_base.fvecs")
query_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_query.fvecs")
knn_groundtruth = utils.read_ivecs("../../dataset/siftsmall/siftsmall_groundtruth.ivecs")

In [57]:
vector_size = 128
collection_name = "test_collection"

qdrantClient.delete_collection(collection_name=collection_name)

qdrantClient.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.EUCLID),
)


True

Fill database with base vectors (10K now)

In [58]:
batch_points = [PointStruct(id=i, vector=elem) for i, elem in enumerate(base_vectors)]


# Upload the batch
operation_info = qdrantClient.upsert(
    collection_name=collection_name,
    wait=True,
    points=batch_points
)
print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


now look for the knn (k=100 bc its what the ground truth contains)

In [31]:
for elem in query_vectors:
    print(elem)
    break

[  1.   3.  11. 110.  62.  22.   4.   0.  43.  21.  22.  18.   6.  28.
  64.   9.  11.   1.   0.   0.   1.  40. 101.  21.  20.   2.   4.   2.
   2.   9.  18.  35.   1.   1.   7.  25. 108. 116.  63.   2.   0.   0.
  11.  74.  40. 101. 116.   3.  33.   1.   1.  11.  14.  18. 116. 116.
  68.  12.   5.   4.   2.   2.   9. 102.  17.   3.  10.  18.   8.  15.
  67.  63.  15.   0.  14. 116.  80.   0.   2.  22.  96.  37.  28.  88.
  43.   1.   4.  18. 116.  51.   5.  11.  32.  14.   8.  23.  44.  17.
  12.   9.   0.   0.  19.  37.  85.  18.  16. 104.  22.   6.   2.  26.
  12.  58.  67.  82.  25.  12.   2.   2.  25.  18.   8.   2.  19.  42.
  48.  11.]


In [47]:
knn_groundtruth[0]

array([2176, 3752,  882, 4009, 2837,  190, 3615,  816, 1045, 1884,  224,
       3013,  292, 1272, 5307, 4938, 1295,  492, 9211, 3625, 1254, 1292,
       1625, 3553, 1156,  146,  107, 5231, 1995, 9541, 3543, 9758, 9806,
       1064, 9701, 4064, 2456, 2763, 3237, 1317, 3530,  641, 1710, 8887,
       4263, 1756,  598,  370, 2776,  121, 4058, 7245, 1895,  124, 8731,
        696, 4320, 4527, 4050, 2648, 1682, 2154, 1689, 2436, 2005, 3210,
       4002, 2774,  924, 6630, 3449, 9814, 3515, 5375,  287, 1038, 4096,
       4094,  942, 4321,  123, 3814,   97, 4293,  420, 9734, 1916, 2791,
        149, 6139, 9576, 6837, 2952, 3138, 2890, 3066, 2852,  348, 3043,
       3687], dtype=int32)

In [59]:
for vec in query_vectors:
    search_result = qdrantClient.search(
        collection_name="test_collection", query_vector=[0.0 for _ in range(vector_size)], limit=100
    )
    break




In [62]:
import numpy as np

res = [elem.id for elem in search_result]
res = np.array(res, dtype=np.int32)

intersection = np.intersect1d(knn_groundtruth[0], res)
intersection

array([4527], dtype=int32)

In [63]:
res

array([3983, 3614, 3060, 5176, 3830, 1764, 5220, 3583, 7095, 3304, 4029,
        699, 4329, 4527, 3717, 4295, 1443, 5247, 1163, 9212, 2092, 2582,
       2765, 3672, 4145, 1962, 4977, 5296, 4332, 3592, 4088, 4127, 7293,
       2909, 3096, 7796, 7464, 1932, 1897, 6934, 3948, 7235, 8642, 7575,
       1976, 5229, 1524, 1164, 9365, 2639, 2329, 9389, 6449, 1422, 4228,
       4319, 6450,   73, 6238, 3247, 2388, 5141, 2114,  829, 4028, 5994,
       2583, 1602, 4976, 1314, 4280, 4104, 4314, 3061, 2779, 1703, 4300,
       2539, 1462, 6651, 3933, 2597,  408, 1544, 7023, 3522, 2080,  660,
       3203,  411, 1421, 5980, 6030, 2234, 9979, 2944, 6585, 1753, 3372,
       3051], dtype=int32)

In [52]:
import numpy as np


def euclidean_distance(query_vector, base_vectors):
    # Calculate Euclidean distances between query_vector and all base_vectors
    distances = np.sqrt(np.sum((base_vectors - query_vector) ** 2, axis=1))
    return distances

def top_k_neighbors(query_vectors, base_vectors, k=100):
    top_k_indices = []
    for query_vector in query_vectors:
        # Calculate Euclidean distances for each query vector
        distances = euclidean_distance(query_vector, base_vectors)
        
        # Find the indices of the k smallest distances
        k_indices = np.argsort(distances)[:k]
        top_k_indices.append(k_indices)
    
    return np.array(top_k_indices)

In [64]:
knn_indices = top_k_neighbors(query_vectors, base_vectors, k=100)


In [66]:
intersection = np.intersect1d(knn_indices[0], res)
len(intersection)

1