Connect client


In [29]:
# !pip3 install session-info
# import session_info
# session_info.show()

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import numpy as np
import random
import pandas as pd
import time

import utils

In [2]:
qdrantClient = QdrantClient(host='localhost', port=6333, timeout=10000000)
# base_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_base.fvecs")
# query_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_query.fvecs")
# knn_groundtruth = utils.read_ivecs("../../dataset/siftsmall/siftsmall_groundtruth.ivecs")
base_vectors = utils.read_fvecs("../../dataset/sift/sift_base.fvecs")
query_vectors = utils.read_fvecs("../../dataset/sift/sift_query.fvecs")
knn_groundtruth = utils.read_ivecs("../../dataset/sift/sift_groundtruth.ivecs")

Loading file: sift_base.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset sift_base.fvecs is (1000000, 128).
Loading file: sift_query.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset sift_query.fvecs is (10000, 128).
 Loading file: sift_groundtruth.ivecs
    The dimension of the vectors in the file is: 100
    The final shape of the loaded dataset is (10000, 100).


# Attribute filtering 

Add random boolean attributes to base vectors

In [3]:
base_vectors_with_attributes = pd.DataFrame({'vector': base_vectors.tolist()})
num_rows = len(base_vectors_with_attributes)
base_vectors_with_attributes['attr1'] = [random.choice([True, False]) for _ in range(num_rows)]
base_vectors_with_attributes['attr2'] = [random.choice([True, False]) for _ in range(num_rows)]
base_vectors_with_attributes['attr3'] = [random.choice([True, False]) for _ in range(num_rows)]

Create qdrant collection

In [4]:
vector_size = 128
collection_name = "attribute_filtering_1M"

qdrantClient.delete_collection(collection_name=collection_name)

qdrantClient.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.EUCLID),
)

True

In [5]:
start_time = time.time()
batch_points = [PointStruct(id=i, vector=elem["vector"], payload= {"attr1": elem["attr1"], "attr2": elem["attr2"], "attr3": elem["attr3"]}) for i, elem in base_vectors_with_attributes.iterrows()]
end_time = time.time()
time_span = end_time - start_time
print(f'Points created in {time_span}')

Points created in 52.095218896865845


In [6]:
len(batch_points)

1000000

Create point-structure elements and insert them in DB (see qdrant docs)

In [7]:
batch_size = 50000
num_batches = len(base_vectors) // batch_size + int(len(base_vectors) % batch_size > 0)
print(f'Number of batches: {num_batches}')

for batch_idx in range(num_batches):
    print(f'Current progress: {(batch_idx+1)*batch_size}/{len(base_vectors)}', end='\r')
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(base_vectors))

    batch_points_i = batch_points[start_idx:end_idx]

    operation_info = qdrantClient.upsert(
        collection_name=collection_name,
        wait=True,
        points=batch_points_i
    )

Number of batches: 20
Current progress: 1000000/1000000

In [8]:
qdrantClient.get_collection(collection_name=collection_name)
# Wait until status is green

CollectionInfo(status=<CollectionStatus.YELLOW: 'yellow'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=1100000, indexed_vectors_count=800000, points_count=1100000, segments_count=3, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=128, distance=<Distance.EUCLID: 'Euclid'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})

Add random boolean attributes to query vectors

In [9]:
query_vectors_with_attributes = pd.DataFrame({'vector': query_vectors.tolist()})
num_rows = len(query_vectors_with_attributes)
query_vectors_with_attributes['attr1'] = [random.choice([True, False]) for _ in range(num_rows)]
query_vectors_with_attributes['attr2'] = [random.choice([True, False]) for _ in range(num_rows)]
query_vectors_with_attributes['attr3'] = [random.choice([True, False]) for _ in range(num_rows)]

# truth = utils.top_k_neighbors(query_vectors_with_attributes, base_vectors_with_attributes)


**Calculate ground truth** between query vectors and base vectors (which were inserted in the DB)

In [11]:
import utils

In [10]:
truth = utils.top_k_neighbors(query_vectors_with_attributes, base_vectors_with_attributes, k=100, function='euclidean', filtering=True) 

# truth contains the IDs of the k nearest neighbors that also satisfy the attribute filtering clause
# truth[0] contains a vector with all 100 IDs of the nn from query 0

2760/10000

Now search the knn in qdrant

In [15]:
print(f'Search function starting')
start_time = time.time()

result_ids = []
for _,elem in query_vectors_with_attributes.iterrows():
    # print(elem)
    vec = elem["vector"]
    attr1 = elem["attr1"]
    attr2 = elem["attr2"]
    attr3 = elem["attr3"]
    # print(attr1, attr2, attr3)
    search_result = qdrantClient.search(
        collection_name="attribute_filtering", 
        query_vector=vec, 
        query_filter=models.Filter(
            # must = AND
            must=[
                models.FieldCondition(
                    key="attr1",
                    match=models.MatchValue(
                        value=attr1,
                    ),
                ),
                models.FieldCondition(
                    key="attr2",
                    match=models.MatchValue(
                        value=attr2,
                    ),
                ),
                models.FieldCondition(
                    key="attr3",
                    match=models.MatchValue(
                        value=attr3,
                    ),
                )
            ]
        ),
        limit=100
    )
    result_ids.append([elem.id for elem in search_result])

end_time = time.time()
time_span = end_time - start_time
print(f'Search function took {end_time - start_time} seconds')

Search function starting
Search function took 1.8239409923553467 seconds


In [17]:
true_positives = 0
n_classified = 0
for i,elem in enumerate(result_ids):
    true_positives_iter = len(np.intersect1d(truth[i], elem))
    true_positives += true_positives_iter
    n_classified += len(elem)

print(f'QPS = {(len(query_vectors) / time_span):.4f}')
print(f'Average recall: {true_positives/n_classified}')

QPS = 54.8263
Average recall: 1.0


# Graveyard

In [76]:
base_vectors_with_attributes.iloc[8021]

vector    [3.0, 0.0, 0.0, 0.0, 13.0, 25.0, 23.0, 22.0, 0...
attr1                                                 False
attr2                                                 False
attr3                                                 False
Name: 8021, dtype: object

In [None]:
import numpy as np

res = [elem.id for elem in search_result]
res = np.array(res, dtype=np.int32)

intersection

In [None]:
# for vec in query_vectors:
#     search_result = qdrantClient.search(
#         collection_name="test_collection", query_vector=[0.0 for _ in range(vector_size)], limit=100
#     )
#     break

for vec in query_vectors:
    search_result = qdrantClient.search(
        collection_name="test_collection", query_vector=vec, limit=100
    )
    break


In [None]:
columns_to_match = base_vectors_with_attributes.columns[1:]
print(f"We will be matching the following (boolean) values: {query_vectors_with_attributes[columns_to_match].iloc[0]}")

# Create a boolean mask based on values in the 'r' row
# mask = (base_vectors_with_attributes[columns_to_match] == query_vectors_with_attributes[columns_to_match].iloc[0]).all(axis=1)

# filtered_df = base_vectors_with_attributes[mask]
# filtered_df

import utils
utils

In [None]:
# query_vectors_with_attributes.columns
boolean_columns = query_vectors_with_attributes.columns[query_vectors_with_attributes.dtypes == bool].difference(['vector'])
mask = (base_vectors_with_attributes[boolean_columns] == query_vectors_with_attributes[boolean_columns].iloc[0]).all(axis=1)
mask