In [25]:
from LSH import LSHIndex
import numpy as np
import timeit
import math

In [28]:
# Load document IDs and titles through a single archive handle. The context
# manager closes the underlying .npz file, which bare np.load(...) calls leave open.
# SECURITY NOTE: allow_pickle=True unpickles object arrays and can execute
# arbitrary code -- only load archives that come from a trusted source.
with np.load('../Data/washington_idtitle.npz', allow_pickle=True) as idtitle_archive:
    file_ids = idtitle_archive['id']
    file_titles = idtitle_archive['title']

# Embedding vectors. vector_from_docid indexes this array with positions found in
# file_ids, so the two arrays must be row-aligned.
with np.load('../Data/minilm_mean_vectors.npz') as vector_archive:
    file_vectors = vector_archive['vectors']

# Get documentID by index
def get_docid(i):
    """Return the document IDs stored at the given positions of ``file_ids``.

    Args:
        i: Array-like of positions into ``file_ids``. Values are cast to ``int``,
            so float-typed index arrays are accepted.

    Returns:
        numpy.ndarray holding the entries of ``file_ids`` at those positions.
    """
    positions = np.asarray(i).astype(int)
    # np.asarray reuses file_ids when it already is an ndarray, instead of
    # copying the whole ID table on every call as np.array(...) does.
    return np.asarray(file_ids)[positions]

# Get title by index
def get_doctitle(i):
    """Return the document titles stored at the given positions of ``file_titles``.

    Args:
        i: Array-like of positions into ``file_titles``. Values are cast to
            ``int``, so float-typed index arrays are accepted.

    Returns:
        numpy.ndarray holding the entries of ``file_titles`` at those positions.
    """
    positions = np.asarray(i).astype(int)
    # np.asarray reuses file_titles when it already is an ndarray, instead of
    # copying the whole title table on every call as np.array(...) does.
    return np.asarray(file_titles)[positions]

# Get vector from documentID
def vector_from_docid(docid):
    """Return the row of ``file_vectors`` that belongs to ``docid``.

    The position is the first index at which ``file_ids`` equals ``docid``.

    Args:
        docid: Document ID to look up in ``file_ids``.

    Returns:
        The entry of ``file_vectors`` at the matching position.

    Raises:
        IndexError: If ``docid`` does not occur in ``file_ids``.
    """
    matches = np.where(np.asarray(file_ids) == docid)[0]
    if matches.size == 0:
        # Same exception type the bare [0][0] lookup raised, but with a message
        # that names the missing ID instead of "index 0 is out of bounds".
        raise IndexError(f"document id {docid!r} not found in file_ids")
    return file_vectors[matches[0]]
    

# Get nearest documents from another document ID
def nearest_documents(index, docid, K=200):
    """Find up to ``K`` documents near the document ``docid`` and score them.

    Args:
        index: Object exposing ``search(vector, K=...)`` that returns a pair
            ``(indices, distances)`` of NumPy arrays.
        docid: ID of the query document; its vector comes from
            ``vector_from_docid``.
        K: Maximum number of neighbours to return.

    Returns:
        Tuple ``(doc_ids, scores)``. ``doc_ids`` is ``get_docid`` applied to the
        kept neighbour indices. ``scores`` are min-max normalised distances
        flipped into [0, 1], so the smallest distance scores 1.0 and the largest
        0.0. Both are empty when no non-zero-distance neighbour is found.
    """
    query_vector = vector_from_docid(docid)

    # Retrieve more than needed, so K results remain once the queried document
    # has been removed.
    neighbour_idx, distances = index.search(query_vector, K=K + 20)
    neighbour_idx = np.asarray(neighbour_idx)
    distances = np.asarray(distances)

    # Remove the query document itself (the evaluation penalises it) by dropping
    # every hit with distance 0, then actually truncate to K. The previous
    # `filtered_index[:K]` was a discarded expression, so up to K+20 results leaked.
    # NOTE(review): truncation keeps the first K hits, which assumes index.search
    # returns neighbours nearest-first -- TODO confirm.
    keep = distances != 0
    neighbour_idx = neighbour_idx[keep][:K]
    distances = distances[keep][:K]

    if distances.size == 0:
        # Nothing left to score; max()/min() would raise on an empty array.
        return get_docid(neighbour_idx), distances.astype(float)

    nearest, farthest = distances.min(), distances.max()
    spread = farthest - nearest
    if spread == 0:
        # All remaining hits are equally far away; avoid 0/0 -> NaN.
        scores = np.ones_like(distances, dtype=float)
    else:
        # 1 - normalised distance. The earlier `d.max() - ...` mixed a raw
        # distance with a unit-less value; the ordering of scores is unchanged.
        scores = 1.0 - (distances - nearest) / spread
    return get_docid(neighbour_idx), scores


In [30]:
from bs4 import BeautifulSoup

def get_topics(path):
    """Read a topics file and return ``(topic_number, docid)`` pairs.

    The file is parsed with BeautifulSoup's "xml" parser. For every ``top``
    element, the topic number is the integer after ``': '`` in the ``num``
    element's text, and the document ID is the text of the ``docid`` element.

    Args:
        path: Path of the topics file.

    Returns:
        List of ``(int, str)`` tuples, one per ``top`` element, in file order.
    """
    with open(path, "r") as topics_file:
        raw_markup = topics_file.read()

    parsed = BeautifulSoup(raw_markup, "xml")

    pairs = []
    for top in parsed.find_all('top'):
        number_text = top.find('num').text.split(': ')[1]
        pairs.append((int(number_text), top.find('docid').text))
    return pairs

def create_RUN(index, name, K=200, num=0, topics_path='TREC/topics.backgroundlinking18.txt'):
    """Write a run file with the nearest documents for every topic.

    For each ``(topic_number, docid)`` pair from ``get_topics(topics_path)``,
    the neighbours from ``nearest_documents`` are written one per line as
    ``"<topic> Q0 <doc> <rank> <score> <name>"`` to ``RUNS/<name>_runfile.txt``.
    Ranks start at 0 and follow the order returned by ``nearest_documents``.

    Args:
        index: Search index forwarded to ``nearest_documents``.
        name: Run tag; used in the output file name and as the last column.
        K: Number of neighbours requested per topic.
        num: Unused. Kept only for backward compatibility; it used to be
            silently overwritten by the loop variable of the same name.
        topics_path: Topics file to read; defaults to the previously
            hard-coded path.
    """
    import os  # local import: only needed to create the output directory

    topics = get_topics(topics_path)

    run_path = f'RUNS/{name}_runfile.txt'
    # Create the output directory up front so open() cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(run_path), exist_ok=True)

    with open(run_path, 'w') as run_file:
        for topic_num, docid in topics:
            nearest, scores = nearest_documents(index, docid, K=K)
            for rank, (doc, score) in enumerate(zip(nearest, scores)):
                run_file.write(f"{topic_num} Q0 {doc} {rank} {score} {name}\n")
            print(f'Finished topic: {topic_num}')

# Write the run file RUNS/test1_runfile.txt for all topics.
# NOTE(review): in the notebook as shown, `index` is first assigned in a later
# cell (the LSHIndex(...) cell), so on a fresh kernel that cell must run before
# this one -- otherwise this call raises NameError.
create_RUN(index, 'test1')

Finished topic: 321
Finished topic: 336
Finished topic: 341
Finished topic: 347
Finished topic: 350
Finished topic: 362
Finished topic: 363
Finished topic: 367
Finished topic: 375
Finished topic: 378
Finished topic: 393
Finished topic: 397
Finished topic: 400
Finished topic: 408
Finished topic: 414
Finished topic: 422
Finished topic: 426
Finished topic: 427
Finished topic: 433
Finished topic: 439
Finished topic: 442
Finished topic: 445
Finished topic: 626
Finished topic: 646
Finished topic: 690
Finished topic: 801
Finished topic: 802
Finished topic: 803
Finished topic: 804
Finished topic: 805
Finished topic: 806
Finished topic: 807
Finished topic: 808
Finished topic: 809
Finished topic: 810
Finished topic: 811
Finished topic: 812
Finished topic: 813
Finished topic: 814
Finished topic: 815
Finished topic: 816
Finished topic: 817
Finished topic: 818
Finished topic: 819
Finished topic: 820
Finished topic: 821
Finished topic: 822
Finished topic: 823
Finished topic: 824
Finished topic: 825


In [None]:
# Measure the average time per query over `reps` repetitions
def time_per_query(index, query, reps=50, K=200):
    """Return the mean wall-clock time of one ``index.search(query, K)`` call.

    Args:
        index: Object exposing ``search(query, K)``.
        query: Query passed unchanged to ``search``.
        reps: Number of times the search is executed.
        K: Second positional argument passed to ``search``.

    Returns:
        Total time reported by ``timeit`` divided by ``reps``.
    """
    timer = timeit.Timer(
        stmt="index.search(query, K)",
        globals={"index": index, "query": query, "K": K},
    )
    total_seconds = timer.timeit(number=reps)
    return total_seconds / reps

# Measure index performance
def measure_index(index, queries, K=200):
    """Time every query against ``index`` and summarise the timings.

    After timing, the neighbours of one fixed document ID are fetched via
    ``nearest_documents`` and printed.

    Args:
        index: Search index passed to ``time_per_query`` and ``nearest_documents``.
        queries: Sized iterable of query vectors.
        K: Number of neighbours requested per search.

    Returns:
        Dict with ``num_queries``, ``min_time``, ``max_time`` and ``mean_time``,
        where the times are the per-query averages from ``time_per_query``.
    """
    per_query_seconds = []
    for query in queries:
        per_query_seconds.append(time_per_query(index, query, K=K))

    summary = {
        'num_queries': len(queries),
        'min_time': min(per_query_seconds),
        'max_time': max(per_query_seconds),
        'mean_time': sum(per_query_seconds) / len(queries),
    }

    # Print the neighbours of one fixed document.
    docs, doc_scores = nearest_documents(index, '988147454a2b8eafd1535cd673dd04ba', K=K)
    print(docs, doc_scores)

    return summary

In [None]:
# Load the embedding matrix; the context manager closes the archive's file handle.
with np.load('../Data/minilm_mean_vectors.npz') as embedding_archive:
    embeddings = embedding_archive['vectors']
n = embeddings.shape[0]

# Draw 20 distinct query vectors with a fixed seed so the timings are
# reproducible. The previous np.random.choice call was unseeded and sampled
# with replacement, so the same vector could be timed more than once.
rng = np.random.default_rng(42)
queries = embeddings[rng.choice(n, size=min(20, n), replace=False)]

# NOTE(review): presumably the suggested value for one of the LSHIndex
# constructor arguments (7 is passed when the index is built) -- TODO confirm.
print(f"Recommended: {math.ceil(math.log2(pow(2*n, 1/3)))}")

Recommended: 7


In [None]:
# Build the LSH index over all embedding vectors.
# NOTE(review): the meaning of the constants 1.3, 7 and 100 is defined by
# LSHIndex, which is not visible here. 7 matches the "Recommended" value printed
# above and 1.3 is presumably the `r` discussed under Measurements -- TODO confirm.
vector_dim = embeddings.shape[1]
index = LSHIndex(vector_dim, 1.3, 7, 100)

# Each vector is registered under its row position in `embeddings`.
row_ids = np.arange(embeddings.shape[0])
index.add(row_ids, embeddings)

In [None]:
# Bare last expression so the notebook displays the returned timing summary;
# measure_index additionally prints the neighbours of one fixed document.
measure_index(index, queries)

['1e45a6cafa3239451c8262692fc68508' '41443585a94f09fca4c6f9023ef8f504'
 '76762d49a9a3e3055385ff1a99177848' 'abac86220c721694c86ea52f5730f157'
 '5a4e0287e3705e2794e6e8f20d7d7547' '130dd4c2f2a5a5e4d4989aad98aacb66'
 '3cb7f5aee3d09daf18158a7270f9f4fc' '9edc6efb21233c41d766b74ed219b2d8'
 '8b4077d6a9f83b30ae76811d5433f252' 'd88842ade2dd20e5a68c5be5735ae977'
 'ad2923b049e2a9a74e8f0e95899febe4' 'f62067caf4c8f56dda2543480fa74aa2'
 '4f0a7df6-2065-11e2-9cd5-b55c38388962'
 'b26e9318-760c-11e1-883d-f22537a8ca20' '3b0dffd704f86ecfe351670a84f0fce5'
 '9cc8d14a04bc1ba4d736cbdf9f9fe39d' '47e132051c06e7ef27008e66422d758c'
 '96c45a2ab04f0927dc5a2d543eac7050' '418a41696a2d7ebac01f66eb782b5350'
 'bf5e2298-26ef-11e7-b503-9d616bd5a305' '1ff5530d8c7f3c4128e69209759f86f0'
 '9bf10c388b7fb9ab3f33db2b03c39d8c' 'b94f2794a723c0a01a3ca057286b187a'
 'f1088bb8c50b58781d0590f31b057e98' 'ff9bd4a8-55b7-11e2-89de-76c1c54b1418'
 'da7972cb9b6ec71b73a64b08ff5d3337' '192f34cfd7daa1500c98a73852d275bc'
 'b0fc29cc466008c115d40bb

{'num_queries': 20,
 'min_time': 0.003146218000911176,
 'max_time': 0.08785019000060856,
 'mean_time': 0.03659964179992676}

## Measurements

- We measure the min, max, and mean query time over 20 queries sampled from the source vectors.
If the vectors are distributed evenly over the bins, the min and max times will be close to the mean.
So when we select a value for r, we should check these three values to confirm that the distribution over the bins is good.

