Connect the Qdrant client and load the siftsmall dataset


In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import numpy as np
import random
import pandas as pd

from fastembed.embedding import DefaultEmbedding
import utils

In [2]:
# Client for the Qdrant server expected at localhost:6333.
qdrantClient = QdrantClient(host='localhost', port=6333)
# siftsmall dataset, read through the project-local `utils` helpers.
# The load log printed by this cell reports base vectors of shape (10000, 128)
# and query vectors of shape (100, 128).
base_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_base.fvecs")
query_vectors = utils.read_fvecs("../../dataset/siftsmall/siftsmall_query.fvecs")
# Ground-truth file shipped with the dataset, loaded with shape (100, 100).
# NOTE(review): presumably the unfiltered kNN IDs per query; it is not referenced
# again in the visible cells — confirm whether it is still needed.
knn_groundtruth = utils.read_ivecs("../../dataset/siftsmall/siftsmall_groundtruth.ivecs")

Loading file: siftsmall_base.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_base.fvecs is (10000, 128).
Loading file: siftsmall_query.fvecs
    The dimension of the vectors in the file is: 128
    The final shape of the loaded dataset siftsmall_query.fvecs is (100, 128).
 Loading file: siftsmall_groundtruth.ivecs
    The dimension of the vectors in the file is: 100
    The final shape of the loaded dataset is (100, 100).


# Attribute filtering 

Add random boolean attributes to base vectors

In [3]:
# Attach three random boolean attributes (attr1..attr3) to every base vector.
# A dedicated, seeded generator is used so that Restart & Run All reproduces the
# same attributes (and therefore the same filtered results) on every run.
base_attribute_rng = random.Random(42)

base_vectors_with_attributes = pd.DataFrame({'vector': base_vectors.tolist()})
num_rows = len(base_vectors_with_attributes)
for attribute_name in ('attr1', 'attr2', 'attr3'):
    base_vectors_with_attributes[attribute_name] = [
        base_attribute_rng.choice([True, False]) for _ in range(num_rows)
    ]

Create qdrant collection

In [4]:
# Dimensionality of the stored vectors (the load log reports 128-d vectors).
vector_size = 128
collection_name = "attribute_filtering"

# recreate_collection drops an existing collection of the same name before
# creating the new one, so a separate delete_collection call is not needed.
# Distances are Euclidean, matching the 'euclidean' ground-truth computation.
qdrantClient.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.EUCLID),
)

True

Create `PointStruct` elements and insert them into the DB (see the Qdrant docs)

In [5]:
# One PointStruct per base vector: the DataFrame index becomes the point id and
# the three boolean attributes become the payload used for filtering later on.
batch_points = [
    PointStruct(
        id=point_id,
        vector=row["vector"],
        payload={key: row[key] for key in ("attr1", "attr2", "attr3")},
    )
    for point_id, row in base_vectors_with_attributes.iterrows()
]

# Send all points in a single upsert; wait=True is passed so the call reports
# the status of the operation (printed below).
operation_info = qdrantClient.upsert(
    collection_name=collection_name,
    points=batch_points,
    wait=True,
)
print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


In [6]:
# Dump the base vectors together with their attributes to test.csv in the
# current working directory (the file is not read back in the visible cells).
base_vectors_with_attributes.to_csv(path_or_buf='test.csv')

In [61]:
# Preview only the first two points: displaying the whole list would print
# every stored point with its full 128-float vector into the notebook output.
batch_points[:2]

[PointStruct(id=0, vector=[0.0, 16.0, 35.0, 5.0, 32.0, 31.0, 14.0, 10.0, 11.0, 78.0, 55.0, 10.0, 45.0, 83.0, 11.0, 6.0, 14.0, 57.0, 102.0, 75.0, 20.0, 8.0, 3.0, 5.0, 67.0, 17.0, 19.0, 26.0, 5.0, 0.0, 1.0, 22.0, 60.0, 26.0, 7.0, 1.0, 18.0, 22.0, 84.0, 53.0, 85.0, 119.0, 119.0, 4.0, 24.0, 18.0, 7.0, 7.0, 1.0, 81.0, 106.0, 102.0, 72.0, 30.0, 6.0, 0.0, 9.0, 1.0, 9.0, 119.0, 72.0, 1.0, 4.0, 33.0, 119.0, 29.0, 6.0, 1.0, 0.0, 1.0, 14.0, 52.0, 119.0, 30.0, 3.0, 0.0, 0.0, 55.0, 92.0, 111.0, 2.0, 5.0, 4.0, 9.0, 22.0, 89.0, 96.0, 14.0, 1.0, 0.0, 1.0, 82.0, 59.0, 16.0, 20.0, 5.0, 25.0, 14.0, 11.0, 4.0, 0.0, 0.0, 1.0, 26.0, 47.0, 23.0, 4.0, 0.0, 0.0, 4.0, 38.0, 83.0, 30.0, 14.0, 9.0, 4.0, 9.0, 17.0, 23.0, 41.0, 0.0, 0.0, 2.0, 8.0, 19.0, 25.0, 23.0, 1.0], payload={'attr1': True, 'attr2': False, 'attr3': True}),
 PointStruct(id=1, vector=[14.0, 35.0, 19.0, 20.0, 3.0, 1.0, 13.0, 11.0, 16.0, 119.0, 85.0, 5.0, 0.0, 5.0, 24.0, 26.0, 0.0, 27.0, 119.0, 13.0, 3.0, 9.0, 19.0, 0.0, 0.0, 11.0, 73.0, 9.0, 10.0,

Add random boolean attributes to query vectors

In [7]:
# Attach three random boolean attributes (attr1..attr3) to every query vector;
# they act as the filter values each query must match.
# A dedicated, seeded generator keeps the attributes identical across re-runs.
query_attribute_rng = random.Random(43)

query_vectors_with_attributes = pd.DataFrame({'vector': query_vectors.tolist()})
num_rows = len(query_vectors_with_attributes)
for attribute_name in ('attr1', 'attr2', 'attr3'):
    query_vectors_with_attributes[attribute_name] = [
        query_attribute_rng.choice([True, False]) for _ in range(num_rows)
    ]


**Calculate ground truth** between query vectors and base vectors (which were inserted in the DB)

In [8]:
# Ground truth computed by the project-local helper with Euclidean distance,
# k=100 and attribute filtering enabled.
# `truth` holds the IDs of the k nearest neighbors that also satisfy the
# attribute filtering clause; truth[0] is the vector of all 100 neighbor IDs
# for query 0.
truth = utils.top_k_neighbors(
    query_vectors_with_attributes,
    base_vectors_with_attributes,
    k=100,
    function='euclidean',
    filtering=True,
)

Now search for the k nearest neighbors (kNN) in Qdrant, applying each query's attribute filter

In [21]:
# For every query, ask Qdrant for up to 100 nearest neighbors among the points
# whose attr1/attr2/attr3 payload equals the query's own attribute values.
# result_ids[i] is the list of point ids returned for query i.
result_ids = []
for _, query_row in query_vectors_with_attributes.iterrows():
    # `must` means AND: a point qualifies only if all three attributes match.
    attribute_conditions = [
        models.FieldCondition(
            key=attribute,
            match=models.MatchValue(value=query_row[attribute]),
        )
        for attribute in ("attr1", "attr2", "attr3")
    ]
    search_result = qdrantClient.search(
        collection_name="attribute_filtering",
        query_vector=query_row["vector"],
        query_filter=models.Filter(must=attribute_conditions),
        limit=100,
    )
    result_ids.append([hit.id for hit in search_result])


100

In [24]:
# Average recall@k of the Qdrant results against the filtered ground truth.
# Each query's result ids are compared with that same query's ground truth
# (assumes truth[i] holds the neighbor IDs of query i, as the ground-truth
# cell's comment describes for truth[0]).
k = 100  # neighbors requested per query (limit=100 in the search, k=100 in the ground truth)
total_recall = 0.0
for query_index, ids_i in enumerate(result_ids):
    intersection = np.intersect1d(truth[query_index], ids_i)
    total_recall += len(intersection) / k

print(f'The average accuracy is {total_recall/len(result_ids)}')

The average accuracy is 1.0


100

# Graveyard

In [76]:
# Scratch check: display the stored vector and attributes of base row 8021.
base_vectors_with_attributes.iloc[8021]

vector    [3.0, 0.0, 0.0, 0.0, 13.0, 25.0, 23.0, 22.0, 0...
attr1                                                 False
attr2                                                 False
attr3                                                 False
Name: 8021, dtype: object

In [None]:
# Scratch: ids of the most recent `search_result` as an int32 array.
# numpy is already imported as `np` in the notebook's import cell, so it is
# not re-imported here.
res = np.array([elem.id for elem in search_result], dtype=np.int32)

# Display the last `intersection` left over by the accuracy cell.
intersection

In [None]:
# Scratch: unfiltered top-100 search for the first query vector only
# (the one-element slice limits the loop to a single iteration).
# NOTE(review): "test_collection" is not created in the visible cells —
# confirm it exists on the server before running this.
for vec in query_vectors[:1]:
    search_result = qdrantClient.search(
        collection_name="test_collection",
        query_vector=vec,
        limit=100,
    )


In [None]:
# Scratch: every column after the first one, i.e. the attribute columns that
# follow 'vector' in the base DataFrame.
columns_to_match = base_vectors_with_attributes.columns[1:]
# Print the attribute values of the first query (row 0).
print(f"We will be matching the following (boolean) values: {query_vectors_with_attributes[columns_to_match].iloc[0]}")

# Disabled experiment: boolean mask of the base rows whose attribute columns
# all equal the attribute values of query 0.
# mask = (base_vectors_with_attributes[columns_to_match] == query_vectors_with_attributes[columns_to_match].iloc[0]).all(axis=1)

# filtered_df = base_vectors_with_attributes[mask]
# filtered_df

# Evaluating the module as the last expression displays the module object;
# utils is already imported in the notebook's import cell.
import utils
utils

In [None]:
# Scratch: select the boolean-typed columns of the query frame (excluding
# 'vector'), then flag every base row whose values in those columns all equal
# the values of the first query.
is_boolean_column = query_vectors_with_attributes.dtypes == bool
boolean_columns = query_vectors_with_attributes.columns[is_boolean_column].difference(['vector'])

first_query_attributes = query_vectors_with_attributes[boolean_columns].iloc[0]
attribute_matches = base_vectors_with_attributes[boolean_columns] == first_query_attributes
mask = attribute_matches.all(axis=1)
mask