In [1]:
import pickle
import faiss

In [2]:
def load_data(path='stackexchange.pickle'):
    """Load the pickled dataset from disk and return the unpickled object.

    Args:
        path: Location of the pickle file. Defaults to 'stackexchange.pickle'
            in the current working directory, which is what the notebook
            has always read.

    Returns:
        Whatever object was pickled into the file. The rest of the notebook
        indexes it with the keys "vector" and "name".

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at pickle files you created or fully trust.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Read the pickled dataset once and keep direct handles to its two arrays.
data = load_data()
vectors, names = data["vector"], data["name"]
# Bare trailing expression so the notebook renders the loaded dict.
data

{'name': array(['question_id:0', 'question_id:1', 'question_id:2', ...,
        'question_id:11280893', 'question_id:11280894',
        'question_id:11280895'], dtype='<U20'),
 'vector': array([[ 5.9850649e-03,  8.6651370e-02,  1.6981743e-02, ...,
         -7.0949502e-02,  1.1380416e-02, -5.2825913e-02],
        [-9.0043023e-02,  9.1169104e-02, -9.2082359e-03, ...,
          2.4855049e-02,  6.4666881e-03,  9.2397273e-02],
        [ 8.6565606e-02, -8.6956300e-02,  8.6640947e-02, ...,
         -8.6726159e-02, -8.6673632e-02,  8.6663313e-02],
        ...,
        [-1.0912500e-01, -8.7910935e-02, -4.0837310e-02, ...,
         -4.0971446e-01,  3.0658851e+00,  4.9759048e-01],
        [ 2.3090811e-01, -4.8821247e-03, -2.4868276e-02, ...,
         -1.0963874e-04,  5.8465521e-04,  1.3642834e-03],
        [ 2.2516649e+00,  2.1858168e+00, -4.2523792e-01, ...,
         -3.1400056e+00, -4.5083749e-01,  7.7036595e-01]], dtype=float32)}

In [3]:
# Run FAISS's matrix diagnostics on the embeddings and show one remark per
# list entry (the report is a single newline-separated string).
matrix_stats = faiss.MatrixStats(data['vector'])
matrix_stats.comments.split("\n")

['analyzing 11280896 vectors of size 64',
 'no NaN or Infs in data',
 '11280825 vectors are distinct (100.00%)',
 'vector 1359211 has 19 copies',
 'range of L2 norms=[2.09727e-44, 2.16837e+18] (19 null vectors)',
 'vectors have very large differences in norms, is this normal?',
 'matrix contains 0.00 % 0 entries',
 'no constant dimensions',
 'no dimension has a too large mean',
 'stddevs per dimension are in [1.27641e+11 7.98059e+14]',
 '']

In [4]:
class LSH:
    """Nearest-neighbour label lookup backed by ``faiss.IndexLSH``.

    Typical use: construct with the vectors and their labels, call
    ``build()`` once, then call ``query()`` as often as needed.

    Attributes set by ``__init__``:
        dimension: Number of columns in ``vectors``.
        vectors: The input matrix as float32.
        labels: Sequence indexed by row position; ``labels[i]`` names row ``i``.

    ``self.index`` only exists after ``build()`` has been called.
    """

    def __init__(self, vectors, labels):
        """Store the vectors (as float32) and their row-aligned labels.

        Args:
            vectors: 2-D array exposing ``.shape`` and ``.astype``; one row
                per item.
            labels: Sequence with one entry per row of ``vectors``.
        """
        self.dimension = vectors.shape[1]
        # copy=False avoids duplicating the whole matrix when it is already
        # float32; a converted copy is still made for any other dtype.
        self.vectors = vectors.astype('float32', copy=False)
        self.labels = labels

    def build(self, num_bits=8):
        """Create the LSH index and add every stored vector to it.

        Args:
            num_bits: Forwarded to ``faiss.IndexLSH`` as its second argument
                (presumably the hash length in bits - confirm against the
                FAISS docs). NOTE(review): if so, 8 bits allows at most 256
                distinct codes, so large collections will have many ties;
                consider a larger value for meaningful rankings.
        """
        self.index = faiss.IndexLSH(self.dimension, num_bits)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return the labels of up to ``k`` neighbours of the first query row.

        Args:
            vectors: Query matrix handed straight to ``index.search``; only
                the result row for the first query vector is used.
            k: Maximum number of neighbours to return.

        Returns:
            A list of labels, in the order the index reported them. It can be
            shorter than ``k`` when the index reports fewer hits.

        Raises:
            AttributeError: If called before ``build()``.
        """
        _, indices = self.index.search(vectors, k)
        # FAISS marks "no result" slots with id -1. Without this filter the
        # -1 would index labels from the end and silently return the last
        # label as if it were a neighbour.
        return [self.labels[i] for i in indices[0] if i >= 0]

In [5]:
# Wrap the embeddings and their labels, then build the LSH index.
# num_bits=8 is spelled out here; it is the same value build() defaults to.
index = LSH(vectors, names)
index.build(num_bits=8)

In [6]:
from random import randrange

# Pick one row uniformly at random to use as the query.
# randrange(n) already yields 0..n-1. The previous "- 1" shifted the range to
# -1..n-2: at -1 the slice below was vectors[-1:0], i.e. empty, which made the
# search fail, and the last row could never be chosen.
randIndex = randrange(len(vectors))
# Slice (not index) so the query keeps its 2-D shape: one row, all columns.
search_vector = vectors[randIndex:randIndex+1]
print(f"Top 10 similar questions to {names[randIndex]} are:\n")
print(index.query(search_vector))

Top 10 similar questions to question_id:10698682 are:

['question_id:1741', 'question_id:2411', 'question_id:2233', 'question_id:764', 'question_id:2539', 'question_id:2724', 'question_id:2751', 'question_id:2767', 'question_id:1713', 'question_id:643']
