In [2]:
import numpy as np

In [20]:
embeddings = np.load('minilm_mean_vectors.npz')['vectors'][:500]

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# embeds = model.encode(["cat", "cats", "american election", "dogs"])
hash = L2Hash(embeddings.shape[1], 1, 5)
xs = hash.hash(embeddings)
xs

'[0 1 1 1 0][1 1 1 0 1][0 1 1 1 0][0 1 0 1 0][0 1 1 1 0][0 0 1 1 1][0 1 1 1 0][1 1 0 0 1][1 0 1 1 0][0 1 1 0 1][1 1 1 1 1][0 1 0 0 0][1 1 1 1 0][0 0 0 1 1][1 1 1 1 1][1 1 1 1 1][1 1 0 1 1][0 0 1 1 1][0 0 1 1 0][1 1 1 0 1][1 1 1 1 1][1 1 1 1 1][0 1 1 1 0][0 1 1 0 0][1 1 1 0 1][0 1 1 1 1][0 1 1 0 0][1 1 1 1 0][0 1 0 1 1][0 0 1 1 1][0 0 1 1 1][1 1 0 1 0][0 1 0 1 1][0 1 1 1 1][0 1 1 1 0][1 1 1 0 0][0 1 1 1 1][0 0 1 1 0][1 1 1 1 1][1 1 1 1 1][1 1 1 0 1][1 0 1 0 1][1 1 1 1 1][1 1 1 1 0][0 1 1 1 1][1 1 1 1 1][1 1 1 1 1][1 1 1 1 1][1 0 0 0 1][1 1 1 1 0][0 1 1 0 1][1 1 1 1 1][0 1 1 1 1][1 1 1 1 0][1 1 1 1 1][0 1 1 1 1][1 1 1 1 0][0 1 0 1 1][1 1 1 1 0][0 1 1 1 1][1 0 1 1 1][1 1 1 1 1][1 1 1 1 1][1 0 1 1 1][0 0 0 0 1][0 1 1 1 1][0 1 1 1 1][1 1 1 1 0][1 1 1 1 0][0 1 1 1 0][1 1 0 1 1][0 0 0 1 1][1 1 1 1 1][0 0 1 0 1][0 1 0 1 1][0 1 1 1 1][0 1 1 1 0][0 1 1 1 0][1 0 1 1 1][1 1 1 1 0][0 1 0 0 1][1 1 1 1 1][0 1 0 1 1][1 1 1 1 0][0 1 0 1 1][1 1 1 1 1][0 1 1 1 1][1 1 1 0 1][1 1 1 1 1][0 1 1 1 0][1 1 1 1 

In [None]:
class L2Hash:
    def __init__(self, dim, r, nbits, seed=1):
        self.seed = seed
        self.nbits = nbits
        
        gen = np.random.RandomState(seed)
        self.a = gen.normal(0, 1, (nbits, dim))
        self.b = gen.uniform(0.0, r)
        self.r = r

    def hash(self, vectors):
        normalized_vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
        hash_values = (np.dot(normalized_vectors, self.a.T) + self.b) / self.r
        hash_binary = (hash_values >= 0).astype(int)
        return

In [None]:
class LSHIndex:
    def __init__(self, indices, vectors, r=5, num_bins=20, num_projections=5, seed=1):
        # Add indexes into vectors (these would be lost when binning)
        indexed_vectors = np.hstack((indices[:, np.newaxis], vectors))
        # Store the vector dimension
        self._dim = vectors.shape[1]

        # Created hash codes by applying our projections
        self._hasher = L2Hash(self._dim, r, seed, num_projections)
        self._r = r
        self._num_projections = num_projections
        self._seed = seed
        hash_codes = self._hasher.hash(vectors)

        # Create bins based on the hash codes (lowest and highest code are the boundaries)
        self._bins = self.__create_bins(hash_codes, num_bins)
        self._binned_vectors = self.__hashes_to_bins(indexed_vectors, hash_codes, self._bins)
        self._num_bins = num_bins
    
    # LSH is probabilistic, select most common bin (or randomly if more than one)
    def __select_bin(self, codes):
        values, counts = np.unique(codes, return_counts=True)
        max_count = np.max(counts)
        most_common_values = values[counts == max_count]
        return np.random.choice(most_common_values)

    # Create the bins used for the index
    def __create_bins(self, hashes, bins):
        minval = np.min(hashes)
        maxval = np.max(hashes)
        return np.linspace(start=minval, stop=maxval, num=bins)

    # Sort the hash codes into bins
    def __hashes_to_bins(self, vectors, hash_codes, bins):
        bin_index = np.digitize(hash_codes, bins)
        bin_index = np.apply_along_axis(self.__select_bin, 1, bin_index)-1

        bins_dict = dict()
        for i in range(bins.shape[0]):
            bins_dict[i] = vectors[bin_index == i]
        return bins_dict

    def __find_k_neighbours(self, target, K):
        neighbours = self._binned_vectors[target]

        if(neighbours.shape[0] < K):
            low_bin = target-1
            high_bin = target+1

            while neighbours.shape[0] < K:
                if low_bin >= 0:
                    neighbours = np.concatenate((neighbours, self._binned_vectors[low_bin]), axis=0)
                    low_bin -= 1
                if high_bin < self._num_bins:
                    neighbours = np.concatenate((neighbours, self._binned_vectors[high_bin]), axis=0)
                    high_bin += 1
                if low_bin < 0 and high_bin >= self._num_bins:
                    break
        
        return neighbours[:K]

    def search(self, vector, K=10):
        hash_code = self._hasher.hash([vector])
        bin_ids = np.digitize(hash_code, self._bins) - 1
        bin_id = self.__select_bin(bin_ids)

        candidate_vectors = self.__find_k_neighbours(bin_id, K)
        l2_distances = np.sum((candidate_vectors[:,1:] - vector) ** 2, axis=1)
        
        sorted_indices = np.argsort(l2_distances)
        result_indices = candidate_vectors[:,0][sorted_indices]
        
        return result_indices, l2_distances[sorted_indices]

    # Save function for storing the index as a npz file
    def save(self, path):
        np.savez_compressed(
            path,
            properties = {
                "dim": self._dim,
                "r": self._r,
                "seed": self._seed,
                "bins": self._bins,
                "num_bins": self._num_bins,
                "num_projections": self._num_projections
            },
            binned_vectors = self._binned_vectors
        )

    # Loading the object from a npz file (to avoid having to rebuild it every time)
    @classmethod
    def load(cls, path):
        data = np.load(path, allow_pickle=True)
        instance = cls.__new__(cls)
        properties = data["properties"].item()

        instance._dim = properties["dim"]
        instance._r = properties["r"]
        instance._seed = properties["seed"]
        instance._num_projections = properties["num_projections"]
        instance._hasher = L2Hash(instance._dim, instance._r, instance._seed, instance._num_projections)

        instance._bins = properties["bins"]
        instance._num_bins = properties["num_bins"]
        instance._binned_vectors = data["binned_vectors"].item()
        return instance

    def __str__(self):
        return f"LSHIndex ({', '.join([f'bin({i}) = {self._binned_vectors[i].shape[0]}' for i in self._binned_vectors if self._binned_vectors[i].shape[0] > 0])})"

## Steps

- We created an LSHIndex class, it takes vectors, indices, and the number of projections and bins.
- Projections are created, these are used to create hash_codes of each vector.
- These hash_codes are then used to create bin boundaries, and bins are created.
- The hash_codes are then put into these bins

### Notes
- Note that if the vectors are already normalized to 1, cosine similarity is just the dot product

In [None]:
index = LSHIndex(np.arange(embeddings.shape[0]), embeddings, 1, 100, 100)
print(index)
index.save('testindex')

LSHIndex (bin(32) = 1, bin(33) = 8, bin(34) = 26, bin(35) = 70, bin(36) = 184, bin(37) = 416, bin(38) = 1008, bin(39) = 2237, bin(40) = 3792, bin(41) = 6375, bin(42) = 10388, bin(43) = 15881, bin(44) = 21864, bin(45) = 29517, bin(46) = 37057, bin(47) = 44237, bin(48) = 49326, bin(49) = 51735, bin(50) = 53096, bin(51) = 51678, bin(52) = 48696, bin(53) = 42511, bin(54) = 35739, bin(55) = 28770, bin(56) = 20999, bin(57) = 15211, bin(58) = 10259, bin(59) = 6527, bin(60) = 3810, bin(61) = 1852, bin(62) = 1004, bin(63) = 446, bin(64) = 207, bin(65) = 69, bin(66) = 22, bin(67) = 15, bin(69) = 4)


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd

washington_titles = np.load('washington_idtitle', allow_pickle=True)["title"]
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def find_documents(index, query, K=10):
    query_vector = model.encode(query)
    
    indices = index.search(query_vector, K)[0]
    return washington_titles.iloc[indices]



In [None]:
%%time
pd.set_option('display.max_colwidth', None)

find_documents(index, "President USA", 60)[:10]

CPU times: total: 15.6 ms
Wall time: 13 ms


968     Has Obama taken Bush’s ‘preemption’ strategy to another level?
656                              Obama closes the book on the 9/11 era
77                                  Why Obama will (won’t) win in 2012
1031                                                              None
945                                    Hungary’s rush toward autocracy
253         Marco Rubio has what Mitt Romney needs in a vice president
371        Iran intensifies efforts to influence policy in Afghanistan
450         Hosni Mubarak should be executed, Egyptian prosecutors say
20         Argentine sports obsession spawns sports journalism schools
638                                 A pledge that compromises our oath
Name: title, dtype: object

In [None]:
%%time
pd.set_option('display.max_colwidth', None)

find_documents(index, "President USA", 600000)[:50]

CPU times: total: 1min 34s
Wall time: 1min 51s


92                                            Donald Trump sworn in as 45th president of the United States
3173                                              Trump’s election threatens human rights around the world
4760                                                                          A president unlike any other
3594                               Strong presidencies may threaten democracy. Luckily, we don’t have one.
4789               When Lincoln saved the union and freed the slaves, five ex-presidents tried to stop him
4347                                                                   Trump makes America disdained again
3009                        ‘Maximalist: America in the World from Truman to Obama’ by Stephen Sestanovich
2507                                                Donald Trump and the expanding power of the presidency
1828                  What did the Founders have in mind for the presidency? Here’s what you need to know.
2533                                 