In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import numpy as np
import random
import pandas as pd
import time
import pickle

import utils

In [2]:
# Client for the Qdrant instance on localhost:6333.
# The request timeout is set very high — presumably so that bulk uploads are
# never cut off; confirm the intended value.
qdrantClient = QdrantClient(
    host='localhost',
    port=6333,
    timeout=10000000,
)


In [3]:
import os
from dotenv import load_dotenv
load_dotenv()


def read_dataset():
    """Load the base vectors, query vectors and k-NN ground truth from disk.

    The file locations are read from the environment variables
    ``BASE_VECTORS_PATH``, ``QUERY_VECTORS_PATH`` and ``GROUND_TRUTH_PATH``
    (``load_dotenv()`` is called first so a ``.env`` file can supply them).

    Returns:
        A tuple ``(base_vectors, query_vectors, knn_groundtruth)`` holding the
        values returned by ``utils.read_fvecs`` (first two) and
        ``utils.read_ivecs`` (last).

    Raises:
        EnvironmentError: if any of the three variables is unset or empty.
    """
    load_dotenv()

    # Resolve every path up front so a missing variable is reported by name
    # instead of surfacing as an obscure failure inside the file readers.
    paths = {
        name: os.getenv(name)
        for name in ('BASE_VECTORS_PATH', 'QUERY_VECTORS_PATH', 'GROUND_TRUTH_PATH')
    }
    missing = [name for name, value in paths.items() if not value]
    if missing:
        raise EnvironmentError(
            'Missing environment variable(s): ' + ', '.join(missing)
        )

    base_vectors = utils.read_fvecs(paths['BASE_VECTORS_PATH'])
    query_vectors = utils.read_fvecs(paths['QUERY_VECTORS_PATH'])
    knn_groundtruth = utils.read_ivecs(paths['GROUND_TRUTH_PATH'])
    return base_vectors, query_vectors, knn_groundtruth

In [16]:
# Collection parameters: the loaded SIFT vectors have 128 dimensions.
vector_size = 128
collection_name = "testSIFT"

# recreate_collection drops an existing collection with this name before
# creating it, so a separate delete_collection call beforehand is redundant.
qdrantClient.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

True

In [17]:
# Only the base and query vectors are kept from the dataset files; the ground
# truth returned alongside them is discarded (`_`) and the pickled ground
# truth loaded below is used instead.
baseV, queryV, _ = read_dataset()
# Optional subsetting / DataFrame variants, kept for quick experiments:
# baseV = baseV[:10000]
# queryV = queryV[:100]
# baseV = pd.DataFrame({'vector': baseV[:10000]})
# queryV = pd.DataFrame({'vector': queryV[:100]})

# NOTE: pickle.load can execute arbitrary code — only open trusted files.
with open('ANN_SIFT_COSINE_GT.pkl', 'rb') as f:
    groundTruth = pickle.load(f)

Loading file: siftsmall_base.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_base.fvecs is (10000, 128).
Loading file: siftsmall_query.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_query.fvecs is (100, 128).
 Loading file: siftsmall_groundtruth.ivecs
    The dimension of the vectors in the file is: 100
    The final shape of the loaded dataset is (100, 100).


In [18]:
# Turn every vector into a plain list of built-in Python floats
# (presumably so the client can serialise them; confirm if this is required).
baseV = [list(map(float, row)) for row in baseV]
queryV = [list(map(float, row)) for row in queryV]

In [19]:
# Upload the base vectors in fixed-size batches; each point id is the
# vector's position in baseV.
batch_size = 50000
# Ceiling division: add one batch for a trailing partial chunk.
num_batches = len(baseV) // batch_size + int(len(baseV) % batch_size > 0)
print(f'Number of batches: {num_batches}')

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(baseV))
    # Report the clamped end index so the progress counter can never exceed
    # the total (the unclamped product printed e.g. "50000/10000").
    print(f'Current progress: {end_idx}/{len(baseV)}', end='\r')

    batch_vectors = baseV[start_idx:end_idx]

    qdrantClient.upsert(
        collection_name=collection_name,
        points=models.Batch(
            ids=list(range(start_idx, end_idx)),
            vectors=batch_vectors
        )
    )

Number of batches: 1
Current progress: 50000/10000

In [20]:
# Run one top-100 search per query vector and time the whole loop.
print('Search function starting')
result_ids = []
# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for i, query_vector in enumerate(queryV):
    # 1-based counter so the last update reads N/N.
    # NOTE: this print runs inside the timed region and is part of time_span.
    print(f'Progress: {i + 1}/{len(queryV)}', end='\r')
    search_result = qdrantClient.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=100
    )
    # Keep only the ids of the returned points, in the order returned.
    result_ids.append([hit.id for hit in search_result])

end_time = time.perf_counter()
time_span = end_time - start_time
print(f'Search function took {time_span} seconds')

Search function starting
Search function took 0.2821681499481201 seconds


In [21]:
# Compare the returned ids with the ground truth, query by query.
true_positives = 0
n_classified = 0
for i, returned_ids in enumerate(result_ids):
    # Count ids present in both the ground truth and the search result.
    true_positives += len(np.intersect1d(groundTruth[i], returned_ids))
    n_classified += len(returned_ids)
print(true_positives)
print(n_classified)

# Guard both ratios so an empty result set or a zero-length timing interval
# produces a sentinel value instead of a ZeroDivisionError.
qps = len(queryV) / time_span if time_span > 0 else float('inf')
# NOTE(review): the denominator is the number of returned ids, so this equals
# recall only when each query returns as many ids as its ground-truth entry
# holds — confirm against the layout of groundTruth.
average_recall = true_positives / n_classified if n_classified else float('nan')
print(f'QPS = {qps:.4f}')
print(f'Average recall: {average_recall}')

10000
10000
QPS = 354.3986
Average recall: 1.0
