#### Helper functions for TREC topics and documents

In [1]:
from LSH import LSHIndex
import numpy as np
import timeit
import math

In [2]:
# Open each archive once and close it again afterwards. Indexing the opened
# archive returns the array itself, so the data stays usable after closing.
# NOTE: allow_pickle=True can run arbitrary code while unpickling -- only use
# it with archives that come from a trusted source.
with np.load('../Data/washington_idtitle.npz', allow_pickle=True) as idtitle_archive:
    file_ids = idtitle_archive['id']
    file_titles = idtitle_archive['title']

with np.load('../Data/minilm_mean_vectors.npz') as vector_archive:
    file_vectors = vector_archive['vectors']

# Get documentID by index
def get_docid(i):
    """Return the document ID(s) stored at position(s) ``i`` of ``file_ids``.

    Parameters
    ----------
    i : int or array-like of numbers
        Position(s) into ``file_ids``. Values are cast to ``int`` before
        indexing, so float-typed position arrays are accepted.

    Returns
    -------
    The entries of ``file_ids`` at the requested positions.
    """
    # np.asarray accepts plain lists and scalars (which have no .astype) and,
    # unlike np.array, does not copy the whole ID array on every call.
    indices = np.asarray(i).astype(int)
    return np.asarray(file_ids)[indices]

# Get title by index
def get_doctitle(i):
    """Return the document title(s) stored at position(s) ``i`` of ``file_titles``.

    Parameters
    ----------
    i : int or array-like of numbers
        Position(s) into ``file_titles``. Values are cast to ``int`` before
        indexing, so float-typed position arrays are accepted.

    Returns
    -------
    The entries of ``file_titles`` at the requested positions.
    """
    # np.asarray accepts plain lists and scalars (which have no .astype) and,
    # unlike np.array, does not copy the whole title array on every call.
    indices = np.asarray(i).astype(int)
    return np.asarray(file_titles)[indices]

# Get vector from documentID
def vector_from_docid(docid):
    """Return the row of ``file_vectors`` that belongs to document ``docid``.

    The position of ``docid`` inside ``file_ids`` is used to index
    ``file_vectors``. If the ID occurs more than once, the first occurrence
    is used.

    Raises
    ------
    IndexError
        If ``docid`` does not occur in ``file_ids``.
    """
    matches = np.where(np.asarray(file_ids) == docid)[0]
    if len(matches) == 0:
        # Same exception type the bare ``[0][0]`` lookup raised before, but
        # with a message that names the missing ID.
        raise IndexError(f"document ID {docid!r} not found in file_ids")
    return file_vectors[matches[0]]
    

# Get nearest documents from another document ID
def nearest_documents(index, docid, K=200):
    """Return IDs and similarity scores of the documents nearest to ``docid``.

    Parameters
    ----------
    index
        Object with a ``search(vector, K=...)`` method returning a pair
        ``(positions, distances)``.
        NOTE(review): both are assumed to be NumPy arrays -- confirm against
        the LSHIndex implementation.
    docid
        ID of the query document; its vector is looked up with
        ``vector_from_docid``.
    K : int
        Maximum number of neighbours to return.

    Returns
    -------
    (ids, scores)
        ``ids`` are the document IDs of the kept neighbours. ``scores`` are
        the distances rescaled to ``[0, 1]`` with min-max normalisation and
        inverted, so the smallest distance scores 1 and the largest scores 0.
    """
    query_vector = vector_from_docid(docid)

    # Retrieve more than needed, so the queried document can be removed
    i, d = index.search(query_vector, K=K+20)

    # Remove the document itself from the results (since the evaluation will
    # penalize it otherwise) by removing all documents with distance 0
    keep = np.where(d != 0)
    i = i[keep][:K]
    d = d[keep][:K]

    if len(d) == 0:
        # Nothing survived the filter; min()/max() would raise on an empty array.
        return get_docid(i), np.empty(0, dtype=float)

    lowest = d.min()
    spread = d.max() - lowest
    if spread == 0:
        # All remaining distances are equal (this includes a single hit).
        # The min-max formula would divide by zero and yield NaN scores, so
        # every tied document gets the top score instead.
        scores = np.ones(len(d), dtype=float)
    else:
        scores = 1 - (d - lowest) / spread
    return get_docid(i), scores


#### Helper functions for TREC RUN files

In [3]:
from bs4 import BeautifulSoup
import subprocess
import pandas as pd

In [4]:
# Get the topics used for evaluation from a topics xml file
def get_topics(path):
    """Parse a topics file into a list of ``(topic_number, docid)`` tuples.

    Every ``<top>`` element contributes one tuple: the integer that follows
    ``': '`` in the text of its ``<num>`` child, and the text of its
    ``<docid>`` child.
    """
    with open(path, "r") as input_file:
        soup = BeautifulSoup(input_file.read(), "xml")

    parsed = []
    for topic in soup.find_all('top'):
        # The <num> text has the form '<label>: <number>'; keep the number part.
        number = int(topic.find('num').text.split(': ')[1])
        parsed.append((number, topic.find('docid').text))
    return parsed

# Create a RUN file given an index
def create_RUN(index, name, K=200, num=0, topics_path='Trec/topics.backgroundlinking18.txt'):
    """Write the run file ``Trec/Runs/{name}_RUN.txt`` for ``index``.

    For every topic returned by ``get_topics(topics_path)``, the nearest
    documents of the topic's document are retrieved, and one line per hit is
    written in the form ``<topic> Q0 <docid> <rank> <score> <name>``. Ranks
    start at 1.

    Parameters
    ----------
    index
        Index object passed on to ``nearest_documents``.
    name : str
        Run tag; also determines the output file name.
    K : int
        Maximum number of documents written per topic.
    num
        Unused. Kept only so existing calls that pass it keep working; the
        original loop variable of the same name always overwrote it.
    topics_path : str
        Topics file to read; defaults to the path that was previously
        hard-coded.
    """
    topics = get_topics(topics_path)

    with open(f'Trec/Runs/{name}_RUN.txt', 'w') as f:
        for topic_num, docid in topics:
            nearest, scores = nearest_documents(index, docid, K=K)
            for rank, (doc, score) in enumerate(zip(nearest, scores), start=1):
                f.write(f"{topic_num} Q0 {doc} {rank} {score} {name}\n")

# Evaluate a RUN file using the given qrels
def eval_RUN(qrels, runs, output):
    """Run ``./Trec/trec_eval`` through ``wsl`` and save its stdout to ``output``.

    The tool is invoked with ``-c -M1000`` and the measures ``map``, ``P.10``,
    ``ndcg_cut.5`` and ``recall.100`` on the given qrels and run files. Its
    stderr is printed. Any exception raised along the way is printed rather
    than propagated.
    """
    command = ['wsl', './Trec/trec_eval', '-c', '-M1000']
    for measure in ('map', 'P.10', 'ndcg_cut.5', 'recall.100'):
        command += ['-m', measure]
    command += [qrels, runs]

    try:
        completed = subprocess.run(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   text=True)
        print(completed.stderr)
        with open(output, 'w') as report:
            report.write(completed.stdout)
    except Exception as err:
        print(err)

# Retrieve the NDCG@5 score from an eval file
def retrieve_NDCG(eval_file):
    """Return the ``ndcg_cut_5`` value stored in an evaluation file.

    The file is read as whitespace-separated rows of three columns (metric,
    scope, value); the value of the first row whose metric is ``ndcg_cut_5``
    is returned.

    Raises
    ------
    IndexError
        If the file has no ``ndcg_cut_5`` row.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    df = pd.read_csv(eval_file, sep=r'\s+', header=None, names=["Metric", "All", "Value"])
    values = df.loc[df["Metric"] == "ndcg_cut_5", "Value"].values
    if len(values) == 0:
        # Same exception type as the bare ``[0]`` lookup, with a usable message.
        raise IndexError(f"metric 'ndcg_cut_5' not found in {eval_file!r}")
    return values[0]

#### Our measurement functions

In [5]:
# Measure the average time per query over reps repetitions
def time_per_query(index, query, reps=50, K=200):
    """Return the average duration of one ``index.search(query, K)`` call.

    The call is executed ``reps`` times through ``timeit``, and the total
    time it reports is divided by ``reps``.
    """
    namespace = {"index": index, "query": query, "K": K}
    total = timeit.timeit("index.search(query, K)", globals=namespace, number=reps)
    return total / reps

# Measure index performance
def measure_index(index, queries, name, K=50):
    """Collect timing statistics and the NDCG@5 score for ``index``.

    Each query is timed with ``time_per_query``. Then a run file is created
    under ``Trec/Runs/``, evaluated into ``Trec/Evals/``, and its
    ``ndcg_cut_5`` value is read back. Progress is printed along the way.

    Returns
    -------
    dict
        Keys ``num_queries``, ``min_time``, ``max_time``, ``mean_time`` and
        ``ndcg_5``.
    """
    print("started...", end='')

    times = []
    for query in queries:
        times.append(time_per_query(index, query, K=K))
    print("got times...", end='')

    res = {
        'num_queries': len(queries),
        'min_time': min(times),
        'max_time': max(times),
        'mean_time': sum(times)/len(queries),
    }

    run_path = f'Trec/Runs/{name}_RUN.txt'
    eval_path = f'Trec/Evals/{name}.txt'
    create_RUN(index, name, K=K)
    eval_RUN('Trec/qrels.backgroundlinking18.txt', run_path, eval_path)
    res['ndcg_5'] = retrieve_NDCG(eval_path)
    print("done!")

    return res

#### The actual measurements

To decide how many bins to use, we looked into the Rice rule and the Freedman–Diaconis rule; however, both are designed for histogram binning.

Measure:
- One line for each `r` in `[0.5, 0.8, 1.3, 2, 5]` (5 lines in total)
- x-axis: `#bins`
- y-axis: `NDCG@5` or `mean search time`

In [6]:
# Reuse the vectors already loaded into `file_vectors` (same archive, same
# key) instead of reading the file from disk a second time.
embeddings = file_vectors
n = embeddings.shape[0]

# Sample 20 distinct vectors as the queries. A seeded generator makes the
# sample, and therefore the measurements, reproducible across kernel
# restarts; replace=False keeps the same vector from being drawn twice.
QUERY_SEED = 42
rng = np.random.default_rng(QUERY_SEED)
queries = embeddings[rng.choice(n, size=20, replace=False)]

In [15]:
import pickle

# Values swept for the two LSHIndex parameters. `rs` stays a NumPy array so
# the generated names keep their float formatting (e.g. 'r2.0_x6'), which is
# what the stored results are keyed by.
rs = np.array([0.5, 0.8, 1.3, 2, 5])
xs = np.arange(6, 12)

RESULTS_PATH = 'parameter_optimisation.pkl'

# Resume from earlier measurements when present; start empty on a first run.
# NOTE: pickle.load can execute arbitrary code -- only load a results file
# that this notebook wrote itself.
try:
    with open(RESULTS_PATH, 'rb') as f:
        total_res = pickle.load(f)
except FileNotFoundError:
    total_res = {}

for r in rs:
    for x in xs:
        name = f'r{r}_x{x}'

        # Skip configurations that were measured in an earlier run.
        if name in total_res:
            print(f"{name} already measured!")
            continue

        index = LSHIndex(embeddings.shape[1], r, x, 100)
        index.add(np.arange(embeddings.shape[0]), embeddings)

        print(f"Measuring r{r} x{x}: ", end='')

        dic = measure_index(index, queries, name)
        total_res[name] = dic
        # Show only the new entry instead of dumping the whole dict each time.
        print(dic)

        # Persist after every configuration so an interrupted sweep keeps
        # the measurements made so far.
        with open(RESULTS_PATH, 'wb') as f:
            pickle.dump(total_res, f)

with open(RESULTS_PATH, 'wb') as f:
    pickle.dump(total_res, f)
    print('total_res stored')

r0.5_x6 already measured!
r0.5_x7 already measured!
r0.5_x8 already measured!
r0.5_x9 already measured!
r0.5_x10 already measured!
r0.5_x11 already measured!
r0.8_x6 already measured!
r0.8_x7 already measured!
r0.8_x8 already measured!
r0.8_x9 already measured!
r0.8_x10 already measured!
r0.8_x11 already measured!
r1.3_x6 already measured!
r1.3_x7 already measured!
r1.3_x8 already measured!
r1.3_x9 already measured!
r1.3_x10 already measured!
r1.3_x11 already measured!
r2.0_x6 already measured!
r2.0_x7 already measured!
r2.0_x8 already measured!
r2.0_x9 already measured!
r2.0_x10 already measured!
r2.0_x11 already measured!
r5.0_x6 already measured!
r5.0_x7 already measured!
r5.0_x8 already measured!
r5.0_x9 already measured!
r5.0_x10 already measured!
r5.0_x11 already measured!
total_res stored


## Measurements

- We measure the min, max, and mean search time over 20 queries sampled from the source vectors.
If the vectors are distributed well over the bins, the min and max times will be close to the mean.
So when we select a value for `r`, we check these values as an indicator of a good distribution over the bins.



In [16]:
# Display every stored measurement, keyed by the 'r{r}_x{x}' configuration name.
total_res

{'r0.5_x6': {'num_queries': 20,
  'min_time': 0.016373948000837118,
  'max_time': 0.28149472599849107,
  'mean_time': 0.09390679139981514,
  'ndcg_5': 0.1668},
 'r0.5_x7': {'num_queries': 20,
  'min_time': 0.005667737999465317,
  'max_time': 0.08985833599930629,
  'mean_time': 0.03328375279984903,
  'ndcg_5': 0.1247},
 'r0.5_x8': {'num_queries': 20,
  'min_time': 0.0019153680000454187,
  'max_time': 0.057285086000338194,
  'mean_time': 0.01759525760007091,
  'ndcg_5': 0.0918},
 'r0.5_x9': {'num_queries': 20,
  'min_time': 0.0022914760001003743,
  'max_time': 0.03249052600003779,
  'mean_time': 0.008412997000035831,
  'ndcg_5': 0.0912},
 'r0.5_x10': {'num_queries': 20,
  'min_time': 0.00258402599953115,
  'max_time': 0.02361889399820939,
  'mean_time': 0.005662937099812553,
  'ndcg_5': 0.0553},
 'r0.5_x11': {'num_queries': 20,
  'min_time': 0.0052177079999819395,
  'max_time': 0.013893606001511216,
  'mean_time': 0.007432482300093396,
  'ndcg_5': 0.0676},
 'r1.3_x6': {'num_queries': 20,