In [2]:
from typing import Dict, List, Annotated
import numpy as np
import os
from sklearn.cluster import KMeans
import pickle



DB_SEED_NUMBER = 42
ELEMENT_SIZE = np.dtype(np.float32).itemsize
DIMENSION = 70
class CustomIndex: pass

class VecDB:
    def __init__(self, database_file_path = "saved_db.dat", index_file_path = "index.dat", new_db = True, db_size = None) -> None:
        self.db_path = database_file_path
        self.index_path = index_file_path
        if new_db:
            if db_size is None:
                raise ValueError("You need to provide the size of the database")
            # delete the old DB file if exists
            if os.path.exists(self.db_path):
                os.remove(self.db_path)
            self.generate_database(db_size)
    
    def generate_database(self, size: int) -> None:
        rng = np.random.default_rng(DB_SEED_NUMBER)
        vectors = rng.random((size, DIMENSION), dtype=np.float32)
        self._write_vectors_to_file(vectors)
        self._build_index()
    # np.memmap() Parameters:
    # First argument: File path to store data
    # dtype: Data type (float32 here)
    # mode: File access mode
    # 'w+': Read/write, create if not exists
    # shape: Dimensions of the array
    # returns file contents on disk
    def _write_vectors_to_file(self, vectors: np.ndarray) -> None:
        mmap_vectors = np.memmap(self.db_path, dtype=np.float32, mode='w+', shape=vectors.shape)
        mmap_vectors[:] = vectors[:]
        mmap_vectors.flush()

    def _get_num_records(self) -> int:
        return os.path.getsize(self.db_path) // (DIMENSION * ELEMENT_SIZE)

    def insert_records(self, rows: Annotated[np.ndarray, (int, 70)]):
        num_old_records = self._get_num_records()
        num_new_records = len(rows)
        full_shape = (num_old_records + num_new_records, DIMENSION)
        mmap_vectors = np.memmap(self.db_path, dtype=np.float32, mode='r+', shape=full_shape)
        mmap_vectors[num_old_records:] = rows
        mmap_vectors.flush()
        #TODO: might change to call insert in the index, if you need
        self._build_index()

    def get_one_row(self, row_num: int) -> np.ndarray:
        # This function is only load one row in memory
        try:
            offset = row_num * DIMENSION * ELEMENT_SIZE
            # [0] is necessary because:
            # mmap_vector is 2D array: [[x1, x2, ..., x70]]
            # [0] extracts the single row as 1D: [x1, x2, ..., x70]
            mmap_vector = np.memmap(self.db_path, dtype=np.float32, mode='r', shape=(1, DIMENSION), offset=offset)
            return np.array(mmap_vector[0])
        except Exception as e:
            return f"An error occurred: {e}"

    def get_all_rows(self) -> np.ndarray:
        # Take care this load all the data in memory
        num_records = self._get_num_records()
        vectors = np.memmap(self.db_path, dtype=np.float32, mode='r', shape=(num_records, DIMENSION))
        return np.array(vectors)
    
    def retrieve(self, query: Annotated[np.ndarray, (1, DIMENSION)], top_k = 5):
        scores = []
        query = query.ravel()  # Flattens the query to 1D
        num_records = self._get_num_records()
        file = open(self.index_path,'rb')
        index = pickle.load(file)
        file.close()

        cluster_centers=index.centroids
        labels_list=index.labels_list
        for i,vec in enumerate(cluster_centers):
            score=self._cal_score(query,vec)
            scores.append((score,i))
        # # here we assume that the row number is the ID of each vector
        # for row_num in range(num_records):
        #     vector = self.get_one_row(row_num)
        #     score = self._cal_score(query, vector)
        #     scores.append((score, row_num))
        # here we assume that if two rows have the same score, return the lowest ID
        # Getting the nearest clusters
        n_probe = 5
        cluster_scores = sorted(scores, reverse=True)[:n_probe]
        # Get the vectors of nearest clusters
        top_vector=[]
        # for item in scores:
        top_vector.append(labels_list[cluster_scores[0][1]])
        # print("top_vector",top_vector[0])
        
        # store the vectors of the top cluster
        resulted_vectors=[]
        #loop over top_vectors and cosine similarity
        for row_num in top_vector[0]:
            vector = self.get_one_row(row_num)
            vector = vector.ravel()  # Flattens the vector to 1D
            score = self._cal_score(query, vector)
            resulted_vectors.append((score,row_num))
        # Sort by scores and keep only top_k results
        resulted_vectors = sorted(resulted_vectors, reverse=True)[:top_k]
        # print(resulted_vectors)
        # Extract only the row_num from resulted_vectors
        row_nums = [row_num for _, row_num in resulted_vectors]

        return row_nums  # Return only row_num values
    
    def _cal_score(self, vec1, vec2):
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
        return cosine_similarity

    def _build_index(self, num_clusters=100, num_subspaces=7):
            
            # Placeholder for index building logic
            ### IVF
            # Apply k means clustering to all vectors -> return the cluster centroids 
            # store the clusters and their centroids in a array or dictionary?
            # Within each cluster:
            # loop over each vector 
            # create an array of subspace arrays (parameter) subspaces[[   [subvector1 of array 1],[subvector1 of array 2]   ] , [],...]
            # apply k-means clustering for each subspace[0] subspace[1] etc..
            # Generate the codebook
            
            
            # Build the PQ-IVF index.
            # Args:
            #     num_clusters (int): Number of clusters for the inverted file (IVF).
            #     num_subspaces (int): Number of subspaces for product quantization.
            #     subspace_dim (int): Dimension of each subspace.
            # Sample dataset
            vectors = self.get_all_rows()

            # Step 1: Coarse Quantization (Clustering)
            n_clusters=200
            kmeans = KMeans(n_clusters)
            labels = kmeans.fit_predict(vectors)  # Assign each vector to a cluster
            cluster_centers = kmeans.cluster_centers_
            # Step 2: Construct Posting Lists
            labels_list = {i: [] for i in range(n_clusters)}  # Two clusters: 0 and 1
            for i, label in enumerate(labels):
                labels_list[label].append(i)
            
            CustomIndex.centroids=cluster_centers
            CustomIndex.labels_list=labels_list
            
            filehandler = open(self.index_path,"wb")
            pickle.dump(CustomIndex,filehandler)
            filehandler.close()

            # print(object_file.centroids, object_file.labels_list, sep=', ')
            # Print Clustering Results
            # print("Cluster Centers:", cluster_centers)
            # print("Labels:", labels)
            # print("Posting Lists:", labels_list)

            # # # Query Processing
            # query = np.random.randint(0, 10, (1, 7))  # Integers between 0 and 9
            # nearest_centroid = np.argmin([np.linalg.norm(query - centroid) for centroid in kmeans.cluster_centers_])

            # # Fine Search (Within the posting list of nearest centroid)
            # nearest_vectors = [vectors[i] for i in labels_list[nearest_centroid]]
            # closest_vector = min(nearest_vectors, key=lambda x: np.linalg.norm(query - x))
            # print(f"Actual vector: {query}")
            # print(f"Nearest vector: {closest_vector}")
    

In [3]:
# Create an instance of VecDB and random DB of size 10K
db = VecDB(db_size = 10**7)

  super()._check_params_vs_input(X, default_n_init=10)


In [4]:


# Retrieve similar images for a given query
query_vector = np.random.rand(70) # Query vector of dimension 70
similar_images = db.retrieve(query_vector, top_k=5)
print(similar_images)
# print(db.get_one_row((similar_images[0])))



[4019341, 2539407, 4823272, 1342913, 5605117]


In [5]:
print(db._cal_score(query_vector,db.get_one_row((similar_images[0]))))
print(db._cal_score(query_vector,db.get_one_row((similar_images[1]))))
print(db._cal_score(query_vector,db.get_one_row((similar_images[2]))))
print(db._cal_score(query_vector,db.get_one_row((similar_images[3]))))
print(db._cal_score(query_vector,db.get_one_row((similar_images[4]))))

0.9078978664318471
0.9000237664630433
0.8948934506872064
0.8940165329832732
0.893875506043859


In [6]:

# Example usage
db =VecDB(db_size=1000)
query_vector = np.array([1, 2, 3])
similar_image_vector = np.array([4, 5, 6])

similarity = db._cal_score(query_vector, similar_image_vector)
print("Cosine Similarity Score:", similarity)

  super()._check_params_vs_input(X, default_n_init=10)


Cosine Similarity Score: 0.9746318461970762


In [7]:
import random

#consider this as a high dimensional vector
vec = v = [random.randint(1,20) for i in range(70)]
print(vec)

[3, 2, 4, 20, 16, 16, 1, 4, 6, 14, 12, 1, 7, 19, 1, 7, 9, 2, 14, 4, 10, 3, 8, 12, 4, 19, 12, 10, 18, 6, 4, 1, 2, 17, 17, 5, 7, 8, 16, 17, 20, 11, 12, 4, 17, 14, 4, 15, 6, 12, 20, 16, 2, 2, 5, 10, 1, 8, 6, 3, 3, 7, 4, 14, 2, 2, 16, 7, 17, 2]


In [8]:
chunk_count = 35
vector_size = len(vec)

# vector_size must be divisable by chunk_size
assert vector_size % chunk_count == 0
# length of each subvector will be vector_size/ chunk_count
subvector_size = int(vector_size / chunk_count)

# subvectors
sub_vectors = [vec[row: row+subvector_size] for row in range(0, vector_size, subvector_size)]
sub_vectors

[[3, 2],
 [4, 20],
 [16, 16],
 [1, 4],
 [6, 14],
 [12, 1],
 [7, 19],
 [1, 7],
 [9, 2],
 [14, 4],
 [10, 3],
 [8, 12],
 [4, 19],
 [12, 10],
 [18, 6],
 [4, 1],
 [2, 17],
 [17, 5],
 [7, 8],
 [16, 17],
 [20, 11],
 [12, 4],
 [17, 14],
 [4, 15],
 [6, 12],
 [20, 16],
 [2, 2],
 [5, 10],
 [1, 8],
 [6, 3],
 [3, 7],
 [4, 14],
 [2, 2],
 [16, 7],
 [17, 2]]

In [9]:
# subvectors are then processed and linked to their closest centroids, also known as reproduction values, within the respective subclusters.

k = 56
assert k % chunk_count == 0
k_ = int(k/chunk_count)

from random import randint
# reproduction values
c = []  
for j in range(chunk_count):
    # each j represents a subvector position
    c_j = []
    for i in range(k_):
        # each i represents a cluster/reproduction value position 
       c_ji = [randint(0, 9) for _ in range(subvector_size)]
       c_j.append(c_ji)  # add cluster centroid to subspace list
    
  # add subspace list of centroids
    c.append(c_j)

AssertionError: 

In [None]:
#helper function to calculate euclidean distance
def euclidean(v, u):
    distance = sum((x - y) ** 2 for x, y in zip(v, u)) ** .5
    return distance

#helper function to create unique ids
def nearest(c_j, chunk_j):
    distance = 9e9
    for i in range(k_):
        new_dist = euclidean(c_j[i], chunk_j)
        if new_dist < distance:
            nearest_idx = i
            distance = new_dist
    return nearest_idx

In [None]:
ids = []
# unique centroid IDs for each subvector
for j in range(chunk_count):
    i = nearest(c[j], sub_vectors[j])
    ids.append(i)
print(ids)

[1, 0, 2, 2, 0, 2, 0, 0, 3, 0, 1, 3, 0, 1]


In [None]:
quantized = []
for j in range(chunk_count):
    c_ji = c[j][ids[j]]
    quantized.extend(c_ji)

print(quantized)

[8, 4, 0, 2, 9, 7, 4, 7, 1, 8, 3, 0, 9, 8, 7, 7, 8, 7, 4, 3, 5, 9, 9, 7, 5, 5, 7, 7, 8, 4, 6, 9, 5, 4, 9, 9, 0, 9, 0, 8, 0, 8, 8, 9, 3, 5, 6, 4, 3, 3, 6, 9, 4, 2, 4, 8, 6, 8, 5, 9, 6, 8, 9, 9, 0, 0, 4, 9, 5, 9]


In [4]:
from sklearn.cluster import KMeans
import numpy as np

# Sample dataset
vectors = np.random.randint(0, 100, (100, 7))  # Integers between 0 and 9

# print(vectors)
# Step 1: Coarse Quantization (Clustering)
kmeans = KMeans(n_clusters=2)
print(vectors[:10])
kmeans.fit(vectors[:10])
cluster_centers = kmeans.cluster_centers_
labels = kmeans.predict(vectors)  # Assign each vector to a cluster

# Step 2: Construct Posting Lists
posting_lists = {i: [] for i in range(2)}  # Two clusters: 0 and 1
for i, label in enumerate(labels):
    posting_lists[label].append(i)
    

# Print Clustering Results
print("Cluster Centers:", cluster_centers)
print("Labels:", labels)
print("Posting Lists:", posting_lists)

# Query Processing
query = np.random.randint(0, 10, (1, 7))  # Integers between 0 and 9
nearest_centroid = np.argmin([np.linalg.norm(query - centroid) for centroid in kmeans.cluster_centers_])

# Fine Search (Within the posting list of nearest centroid)
nearest_vectors = [vectors[i] for i in posting_lists[nearest_centroid]]
closest_vector = min(nearest_vectors, key=lambda x: np.linalg.norm(query - x))
print(f"Actual vector: {query}")
print(f"Nearest vector: {closest_vector}")

[[77 43 16 44 93 15 96]
 [86 37 88 34 71 63 87]
 [19 98 36 14 69 61 63]
 [88 89 39  3 77 18 97]
 [78 86 71 68 44 29 76]
 [26  8 91 13 68 88 82]
 [93 48 97  1  3 35 17]
 [ 9 74 93 79  8 66 53]
 [40 38 77 30 58 89 49]
 [94 95 51 96 84 97 26]]
Cluster Centers: [[81.         72.66666667 42.         38.33333333 71.33333333 20.66666667
  89.66666667]
 [52.42857143 56.85714286 76.14285714 38.14285714 51.57142857 71.28571429
  53.85714286]]
Labels: [0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0
 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 1]
Posting Lists: {0: [0, 3, 4, 10, 12, 13, 14, 19, 21, 23, 24, 29, 30, 32, 35, 41, 44, 49, 52, 54, 63, 68, 69, 73, 75, 77, 80, 88, 90, 92, 93], 1: [1, 2, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 20, 22, 25, 26, 27, 28, 31, 33, 34, 36, 37, 38, 39, 40, 42, 43, 45, 46, 47, 48, 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 70, 71, 72, 74, 76, 78, 

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Sample dataset
vectors = np.random.randint(0, 100, (100, 10))  # Integers between 0 and 9
# print(vectors)
part1 = vectors[:,:5]
part2 = vectors[:,5:]
# print(part1)
# print(part2)
n_clusters = 3
# print(vectors)
# Step 1: Coarse Quantization (Clustering)
kmeansp1 = KMeans(n_clusters)
kmeansp1.fit(part1)
cluster_centersp1 = kmeansp1.cluster_centers_

kmeansp2 = KMeans(n_clusters)
kmeansp2.fit(part2)
cluster_centersp2 = kmeansp2.cluster_centers_



labelsp1 = kmeansp1.predict(part1)  # Assign each vector's part1 to a cluster
labelsp2 = kmeansp2.predict(part2)  # Assign each vector's part2 to a cluster

print("labelsp1",labelsp1)
print("labelsp2",labelsp2)

# Step 2: Construct Posting Lists for Cartesian Product of Clusters
# Initialize posting lists for all cluster pairs
posting_lists = {(i, j): [] for i in range(n_clusters) for j in range(n_clusters)}

# Assign each vector to the corresponding posting list
for idx, (label1, label2) in enumerate(zip(labelsp1, labelsp2)):
    posting_lists[(label1, label2)].append(idx)

# Display the posting lists
for key, indices in posting_lists.items():
    print(f"Cluster Pair {key}: {indices}")
    

# # Print Clustering Results
# print("Cluster Centers:", cluster_centers)
# print("Labels:", labels)
# print("Posting Lists:", posting_lists)

# # Query Processing
query = np.random.randint(0, 100, (1, 10))  # Integers between 0 and 9
print(query)
query_part1 = query[:,:5]
query_part2 = query[:,5:]
k=5
# Compute distances to all u's (centroids of part1)
#calculate distance
# distances_u = kmeansp1.predict(query_part1)
distances_u = [np.linalg.norm(query_part1 - centroid) for centroid in kmeansp1.cluster_centers_]

nearest_u_indices = np.argsort(distances_u)[:k]  # Indices of top-k closest u's
# Compute distances to all v's (centroids of part2)
# distances_v = kmeansp2.predict(query_part2)
distances_v = [np.linalg.norm(query_part2 - centroid) for centroid in kmeansp2.cluster_centers_]

nearest_v_indices = np.argsort(distances_v)[:k]  # Indices of top-k closest v's
print ("Nearest U Indices = ",nearest_u_indices," Nearest V indices = ", nearest_v_indices)

# Combine u's and v's and find the pair with the smallest combined distance
min_distance = float('inf')
best_pairs = []
for u_idx in nearest_u_indices:
    for v_idx in nearest_v_indices:
        combined_distance = distances_u[u_idx] + distances_v[v_idx]
        # if combined_distance < min_distance:
        best_pairs.append((combined_distance,(u_idx,v_idx)))
        # best_pairs.append((u_idx,v_idx))
best_pairs = sorted(best_pairs, reverse=True)


print(f"Nearest u,v: {best_pairs}")
print(f"Nearest vectors: {posting_lists[best_pairs[0][1]]}")



# nearest_centroid = np.argmin([np.linalg.norm(query - centroid) for centroid in kmeans.cluster_centers_])

# # Fine Search (Within the posting list of nearest centroid)
# nearest_vectors = [vectors[i] for i in posting_lists[nearest_centroid]]
# closest_vector = min(nearest_vectors, key=lambda x: np.linalg.norm(query - x))
# print(f"Actual vector: {query}")
# print(f"Nearest vector: {closest_vector}")

labelsp1 [2 2 0 0 0 1 0 0 1 1 1 2 0 1 2 0 1 1 1 0 2 2 1 0 1 1 2 0 1 0 2 1 1 1 2 0 2
 0 0 1 2 0 1 0 2 2 0 0 1 2 1 1 2 2 2 2 0 0 2 2 2 2 1 2 2 2 0 2 2 1 1 2 0 1
 2 0 2 1 1 1 0 1 0 0 0 1 1 1 1 0 2 1 2 2 2 1 2 0 1 0]
labelsp2 [2 2 2 0 2 1 1 2 1 0 2 2 0 1 1 0 1 2 2 0 1 1 0 0 1 0 2 2 1 0 2 1 0 2 0 2 1
 0 2 0 2 1 0 2 1 1 1 0 1 2 2 1 0 2 2 2 2 1 2 0 2 1 1 0 0 1 1 0 2 2 2 0 2 1
 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 1 2 1 2 1 1 1 1 2 2 0]
Cluster Pair (0, 0): [3, 12, 15, 19, 23, 29, 37, 47, 99]
Cluster Pair (0, 1): [6, 41, 46, 57, 66, 80, 89]
Cluster Pair (0, 2): [2, 4, 7, 27, 35, 38, 43, 56, 72, 75, 82, 83, 84, 97]
Cluster Pair (1, 0): [9, 22, 25, 32, 39, 42]
Cluster Pair (1, 1): [5, 8, 13, 16, 24, 28, 31, 48, 51, 62, 73, 77, 87, 91, 95]
Cluster Pair (1, 2): [10, 17, 18, 33, 50, 69, 70, 78, 79, 81, 85, 86, 88, 98]
Cluster Pair (2, 0): [34, 52, 59, 63, 64, 67, 71]
Cluster Pair (2, 1): [14, 20, 21, 36, 44, 45, 61, 65, 93, 94, 96]
Cluster Pair (2, 2): [0, 1, 11, 26, 30, 40, 49, 53, 54, 55, 58, 60, 68, 74

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

class ProductQuantization:
    def __init__(self, m, k):
        """
        Initialize the Product Quantization class.
        :param m: Number of subspaces (split vector into m parts)
        :param k: Number of clusters per subspace
        """
        self.m = m
        self.k = k
        self.codebooks = []

    def fit(self, data):
        """
        Fit the quantizer to the data.
        :param data: Dataset of shape (n_samples, d_features)
        """
        n_samples, d_features = data.shape
        assert d_features % self.m == 0, "Number of features must be divisible by m"
        
        self.subvector_dim = d_features // self.m
        self.codebooks = []
        
        for i in range(self.m):
            sub_data = data[:, i * self.subvector_dim:(i + 1) * self.subvector_dim]
            kmeans = KMeans(n_clusters=self.k, random_state=42).fit(sub_data)
            self.codebooks.append(kmeans.cluster_centers_)
    
    def encode(self, data): #build the index
        """
        Encode the dataset into quantized indices.
        :param data: Dataset of shape (n_samples, d_features)
        :return: Encoded indices of shape (n_samples, m)
        """
        n_samples, d_features = data.shape
        codes = np.zeros((n_samples, self.m), dtype=np.int32) # Stores the nearest centroid for each subspace: if m = 4 -> [[1,5,1,6],[9,1,5,3],...]
        
        for i in range(self.m):
            sub_data = data[:, i * self.subvector_dim:(i + 1) * self.subvector_dim] #partition the data into subspaces
            distances = cdist(sub_data, self.codebooks[i]) #
            codes[:, i] = np.argmin(distances, axis=1)
        
        return codes #codes contain the compressed representation of the data
    
    def decode(self, codes): #used 
        """
        Decode the quantized indices back to approximate vectors.
        :param codes: Encoded indices of shape (n_samples, m)
        :return: Approximate vectors of shape (n_samples, d_features)
        """
        n_samples = codes.shape[0]
        decoded_vectors = np.zeros((n_samples, self.m * self.subvector_dim))
        
        for i in range(self.m):
            decoded_vectors[:, i * self.subvector_dim:(i + 1) * self.subvector_dim] = self.codebooks[i][codes[:, i]]
        
        return decoded_vectors

    def search(self, query, codes, top_k=1):
        """
        Perform approximate nearest neighbor search.
        :param query: Query vector of shape (1, d_features)
        :param codes: Encoded indices of the dataset
        :param top_k: Number of nearest neighbors to return
        :return: Indices of top_k nearest neighbors
        """
        distances = np.zeros(codes.shape[0])
        
        for i in range(self.m):
            sub_query = query[:, i * self.subvector_dim:(i + 1) * self.subvector_dim]
            sub_distances = cdist(sub_query, self.codebooks[i])
            distances += sub_distances[0, codes[:, i]]
        
        return np.argsort(distances)[:top_k]

# Example usage
if __name__ == "__main__":
    # Generate synthetic data
    data = np.random.random((1000, 16))  # 1000 samples, 16 dimensions
    
    query = np.random.random((1, 16))   # 1 query vector
    print(query)
    pq = ProductQuantization(m=8, k=256)  # 4 subspaces, 256 clusters per subspace
    pq.fit(data)
    
    codes = pq.encode(data)
    reconstructed_data = pq.decode(codes)
    
    # Perform search
    neighbors = pq.search(query, codes, top_k=5)
    print(data)
    print(codes)
    print(reconstructed_data)
    print("Nearest neighbors:", neighbors)


[[0.94669777 0.14702176 0.03122288 0.93221853 0.8269393  0.78845497
  0.30489522 0.9944374  0.97145154 0.72643284 0.11840847 0.83041497
  0.25410993 0.29975197 0.16046107 0.2680251 ]]


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[[0.09518839 0.35055304 0.9398079  ... 0.95536698 0.62127548 0.25900692]
 [0.60531046 0.0299943  0.18020337 ... 0.84442837 0.30599367 0.31717393]
 [0.12436448 0.70804765 0.40053623 ... 0.01026024 0.98808191 0.81107594]
 ...
 [0.83905272 0.73320902 0.38375223 ... 0.27363777 0.37699509 0.06085107]
 [0.06235347 0.81673454 0.83788639 ... 0.13500675 0.61006013 0.20705075]
 [0.5731672  0.76549188 0.87305659 ... 0.68015527 0.97821466 0.63104061]]
[[ 55 168  80 ...  63  69 117]
 [157 240 141 ... 103 160 169]
 [111 134  58 ...  23  49   5]
 ...
 [227  34 191 ... 211 147  18]
 [ 77  66 245 ... 150 134 118]
 [ 51 120  81 ... 228 224  70]]
[[0.09794463 0.35227004 0.91964504 ... 0.95014297 0.62753523 0.24410977]
 [0.61093234 0.01673905 0.15575159 ... 0.85667647 0.30498966 0.29733311]
 [0.12194592 0.71991971 0.39669213 ... 0.01502664 0.98060759 0.81912355]
 ...
 [0.82988276 0.71941122 0.39528657 ... 0.29586202 0.37226255 0.06036116]
 [0.07032296 0.81166198 0.84902497 ... 0.12305148 0.57763475 0.1852

In [6]:
from vec_db import VecDB
from imi import IMI

db = VecDB(db_size = 10**3)
IMI.build_index(db)


UnboundLocalError: cannot access local variable 'n_clusters' where it is not associated with a value

In [None]:
IMI.retrieve(db,query)