In [5]:
!pip install faiss scann -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sagemaker 2.165.0 requires importlib-metadata<5.0,>=1.4.0, but you have importlib-metadata 6.0.0 which is incompatible.[0m[31m
[0m


## <a href="https://github.com/facebookresearch/faiss">FAISS</a>

In [8]:
from typing import Any, List, Tuple

import numpy as np
import faiss

In [10]:
def search_with_faiss(
    corpus: np.ndarray, query_vector: np.ndarray, k: int = 5
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Perform exact nearest-neighbor search using a FAISS flat L2 index.

    A fresh index is built from `corpus` on every call, so this helper is meant
    for small demos rather than repeated queries against the same corpus.

    Args:
    - corpus (np.array): Vectors to be indexed, shape (num_vectors, dimension).
      Any array-like is accepted; it is converted to contiguous float32.
    - query_vector (np.array): A single vector of shape (dimension,) or a batch
      of shape (num_queries, dimension). Converted to contiguous float32.
    - k (int): Number of nearest neighbors to retrieve. Default is 5.

    Returns:
    - distances (np.array): Squared L2 distances to the nearest neighbors,
      shape (num_queries, k), sorted from closest to farthest.
    - indices (np.array): Row positions in `corpus` of the nearest neighbors,
      shape (num_queries, k).

    Raises:
    - ValueError: If `corpus` is not 2-D, if the query dimensionality does not
      match the corpus, or if `k` is smaller than 1.
    """
    # FAISS only accepts C-contiguous float32 matrices, so normalise inputs up front
    corpus_matrix = np.ascontiguousarray(corpus, dtype=np.float32)
    query_matrix = np.ascontiguousarray(query_vector, dtype=np.float32)

    # Allow a single flat query vector by promoting it to a one-row batch
    if query_matrix.ndim == 1:
        query_matrix = query_matrix.reshape(1, -1)

    if corpus_matrix.ndim != 2:
        raise ValueError(
            f"corpus must be 2-D (num_vectors, dimension), got shape {corpus_matrix.shape}"
        )
    if query_matrix.ndim != 2:
        raise ValueError(
            f"query_vector must be 1-D or 2-D, got shape {query_matrix.shape}"
        )

    dimension = corpus_matrix.shape[1]
    if query_matrix.shape[1] != dimension:
        raise ValueError(
            f"query_vector has dimension {query_matrix.shape[1]}, "
            f"but corpus has dimension {dimension}"
        )
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Initialize an index
    index = faiss.IndexFlatL2(dimension)  # brute-force index over (squared) L2 distance

    # Add vectors to the index
    index.add(corpus_matrix)

    # Perform a k-nearest neighbor search
    distances, indices = index.search(query_matrix, k)

    return distances, indices

In [11]:
# Synthetic data: a seeded corpus of random float32 vectors in [0, 1).
num_samples, dimension = 1000, 100
np.random.seed(42)  # fixed seed so the corpus is identical on every run
corpus = np.random.random(size=(num_samples, dimension)).astype(np.float32)

In [12]:
# Draw a single random query (one row, same dimensionality as the corpus)
query_vector = np.random.random(size=(1, dimension)).astype(np.float32)

# Look up its nearest neighbours in the corpus and show the result
distances, indices = search_with_faiss(corpus, query_vector)
print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)

Indices of nearest neighbors: [[220 490 785 455 416]]
Distances to nearest neighbors: [[11.527527 11.875461 12.248822 12.437688 12.511422]]


## <a href="https://github.com/google-research/google-research/tree/master/scann">ScaNN</a>

In [15]:
import numpy as np
import tensorflow as tf
import scann

2024-03-23 11:03:32.529056: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-23 11:03:37.169729: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-23 11:03:37.177843: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
def search_with_scann(
    corpus: np.ndarray,
    query_vector: np.ndarray,
    num_neighbors: int = 10,
    num_leaves: int = 2000,
    num_leaves_to_search: int = 100,
    training_sample_size: int = 250000,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Perform approximate similarity search using ScaNN with dot-product scoring.

    A new searcher (tree partitioning + asymmetric hashing + exact reordering)
    is built from `corpus` on every call.

    Note that, unlike `search_with_faiss`, this function returns the neighbor
    indices first and the scores second, and the scores are dot products
    (larger means more similar), not L2 distances.

    Args:
    - corpus (np.array): Corpus of vectors to be indexed, shape
      (num_vectors, dimension). Converted to contiguous float32.
    - query_vector (np.array): A single vector of shape (dimension,) or a batch
      of shape (num_queries, dimension). Converted to contiguous float32.
    - num_neighbors (int): Number of nearest neighbors to retrieve. Default is 10.
    - num_leaves (int): Number of leaves in the tree. Default is 2000; clamped
      to the number of corpus vectors.
    - num_leaves_to_search (int): Number of leaves to search. Default is 100;
      clamped to the effective number of leaves.
    - training_sample_size (int): Training sample size. Default is 250000.

    Returns:
    - neighbors (np.array): Indices of the nearest neighbors, one row per query.
    - distances (np.array): Dot-product scores of those neighbors, one row per query.

    Raises:
    - ValueError: If `corpus` is not a non-empty 2-D array, if the query
      dimensionality does not match the corpus, or if `num_neighbors`,
      `num_leaves` or `num_leaves_to_search` is smaller than 1.
    """
    # Normalise inputs to contiguous float32 matrices so lists and float64 arrays also work
    corpus_matrix = np.ascontiguousarray(corpus, dtype=np.float32)
    query_matrix = np.ascontiguousarray(query_vector, dtype=np.float32)

    # search_batched expects a batch, so promote a single flat vector to one row
    if query_matrix.ndim == 1:
        query_matrix = query_matrix.reshape(1, -1)

    if corpus_matrix.ndim != 2 or corpus_matrix.shape[0] == 0:
        raise ValueError(
            f"corpus must be a non-empty 2-D array, got shape {corpus_matrix.shape}"
        )
    if query_matrix.ndim != 2 or query_matrix.shape[1] != corpus_matrix.shape[1]:
        raise ValueError(
            f"query_vector shape {query_matrix.shape} is incompatible with "
            f"corpus dimension {corpus_matrix.shape[1]}"
        )
    if num_neighbors < 1 or num_leaves < 1 or num_leaves_to_search < 1:
        raise ValueError(
            "num_neighbors, num_leaves and num_leaves_to_search must all be at least 1"
        )

    num_samples = corpus_matrix.shape[0]
    # Ensure number of clusters is less than or equal to the number of points
    num_clusters = min(num_leaves, num_samples)
    # Searching more leaves than exist is meaningless, so cap it as well
    leaves_to_search = min(num_leaves_to_search, num_clusters)
    # The exact re-ranking stage must keep at least as many candidates as are returned
    reorder_candidates = max(100, num_neighbors)

    # Create a SCANN object
    builder = scann.scann_ops_pybind.builder(corpus_matrix, num_neighbors, "dot_product")
    searcher = (
        builder.tree(
            num_leaves=num_clusters,
            num_leaves_to_search=leaves_to_search,
            training_sample_size=training_sample_size,
        )
        .score_ah(2, anisotropic_quantization_threshold=0.2)
        .reorder(reorder_candidates)
        .build()
    )

    # Perform nearest neighbor search
    neighbors, distances = searcher.search_batched(query_matrix)

    return neighbors, distances

In [25]:
# Regenerate the seeded corpus and draw one query vector from the same RNG stream
num_samples, dimension = 1000, 100
np.random.seed(42)
corpus = np.random.random(size=(num_samples, dimension)).astype(np.float32)
query_vector = np.random.random(size=(1, dimension)).astype(np.float32)

# Run the ScaNN search and show what it returned
neighbors, distances = search_with_scann(corpus, query_vector)
print("Indices of nearest neighbors:", neighbors)
print("Distances to nearest neighbors:", distances)

Indices of nearest neighbors: [[608 816 220 404 514 625 773 175 152 403]]
Distances to nearest neighbors: [[28.623615 28.37905  28.336159 28.31845  28.276588 28.166014 28.130856
  27.992577 27.746948 27.730839]]


2024-03-23 11:06:45.570479: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 1000
2024-03-23 11:06:45.748446: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:84] PartitionerFactory ran in 177.894647ms.
