# Testing out Faiss

In [18]:
import os
import time
import math
import random
import numpy as np
import json
from sklearn.preprocessing import normalize
import faiss

# res = faiss.StandardGpuResources()  # use a single GPU

def dist2sim(d):
    """Convert a distance value (scalar or array) into ``1 - d / 2``.

    NOTE(review): presumably ``d`` is the squared L2 distance between
    unit-length vectors as returned by the Faiss indexes below, in which
    case the result is their cosine similarity -- confirm.
    """
    halved = d / 2
    return 1 - halved

In [19]:
def get_index(index_type, dim, m=48, ef_construction=128):
    """Create an empty CPU Faiss index for ``dim``-dimensional vectors.

    Parameters
    ----------
    index_type : str
        ``'hnsw'`` builds a ``faiss.IndexHNSWFlat``; ``'l2'`` builds a
        ``faiss.IndexFlatL2``.
    dim : int
        Vector dimensionality handed to the Faiss constructor.
    m : int, optional
        Second constructor argument of ``faiss.IndexHNSWFlat`` (the HNSW
        ``M`` parameter). Ignored for ``'l2'``.
    ef_construction : int, optional
        Value assigned to ``index.hnsw.efConstruction``. Ignored for ``'l2'``.

    Returns
    -------
    The newly created (empty) Faiss index.

    Raises
    ------
    ValueError
        If ``index_type`` is not one of the supported names.
    """
    # GPU transfer (faiss.index_cpu_to_gpu) is intentionally not used here;
    # every index stays on the CPU.
    if index_type == 'hnsw':
        index = faiss.IndexHNSWFlat(dim, m)
        index.hnsw.efConstruction = ef_construction
        return index
    if index_type == 'l2':
        return faiss.IndexFlatL2(dim)
    # A bare ``raise`` here would only produce "No active exception to
    # reraise"; tell the caller what was wrong instead.
    raise ValueError(
        "Unknown index_type {!r}; expected 'hnsw' or 'l2'".format(index_type)
    )

In [20]:
def populate(index, fvecs, batch_size=10000):
    """Add the rows of ``fvecs`` to ``index`` in batches and return ``index``.

    Each batch is passed through ``normalize`` before ``index.add``. After
    every batch the batch's starting row and the seconds spent on that
    batch are printed.
    """
    total_rows = fvecs.shape[0]
    n_batches = math.ceil(total_rows / batch_size)
    batch_starts = [batch_no * batch_size for batch_no in range(n_batches)]

    for start in batch_starts:
        stop = min(start + batch_size, total_rows)
        began = time.time()
        index.add(normalize(fvecs[start:stop]))
        print(start, time.time() - began)

    return index

In [254]:
def populate_and_index_map(index, vectors, ids=None):
    """Wrap ``index`` in a ``faiss.IndexIDMap`` and add ``vectors`` under ``ids``.

    Parameters
    ----------
    index
        Faiss index to wrap.
    vectors
        2-D array of vectors; passed through ``normalize`` before insertion.
    ids : optional
        External ids, one per row of ``vectors``. When omitted, the
        notebook-level ``ids_for_faiss_idmap`` array is used.

    Returns
    -------
    The ``faiss.IndexIDMap`` wrapper containing the added vectors.
    """
    if ids is None:
        # Looked up at call time: a default of ``ids=ids_for_faiss_idmap``
        # is evaluated when the ``def`` runs, which fails on a fresh kernel
        # because the array is only created further down the notebook.
        ids = ids_for_faiss_idmap
    mapper = faiss.IndexIDMap(index)
    # Use the ``ids`` argument (previously ignored in favour of the global).
    mapper.add_with_ids(normalize(vectors), ids)
    return mapper

In [33]:
# Empty HNSW index used for the synthetic-vector experiment.
dim, index_type = 768, 'hnsw'
index = get_index(index_type=index_type, dim=dim)

In [31]:
import numpy as np

# Synthetic database: nb vectors of dimension d, drawn uniformly from
# [-1, 1) and scaled down by 10, stored as float32.
d = 768                          # dimension
nb = 10000                       # database size
# nq = 10                         # nb of queries
np.random.seed(42)             # make reproducible
xb = (np.random.uniform(low=-1, high=1, size=(nb, d)) / 10).astype('float32')
# xb[:, 0] += np.arange(nb) / 1000.
# xq = (np.random.uniform(-1, 1, (nq, d))/10).astype('float32')
# xq[:, 0] += np.arange(nq) / 1000.

In [6]:
# Synthetic query vectors, generated the same way as the database vectors.
nq = 10                         # nb of queries
xq = (np.random.uniform(low=-1, high=1, size=(nq, d)) / 10).astype('float32')

In [7]:
index = populate(index, xb)

0 0.07434678077697754
1000 0.14951086044311523
2000 0.19786739349365234
3000 0.28281164169311523
4000 0.33390212059020996
5000 0.39216113090515137
6000 0.4473230838775635
7000 0.502324104309082
8000 0.5784924030303955
9000 0.5920150279998779


In [8]:
index.is_trained

True

In [9]:
index.ntotal

10000

In [10]:
# Time a top-k search of the synthetic queries against `index`.
# The queries are normalised inside the timed region, so the reported time
# includes normalisation as well as the search itself.
k = 4
s = time.time()
dists, idxs = index.search(normalize(xq), k)
print((time.time() - s) / len(xq))  # average wall-clock seconds per query
# Neighbour ids and converted similarity scores for the first query only.
print(idxs[0], dist2sim(dists[0]))

0.0002969503402709961
[2922  208 9244 8449] [0.12403476 0.11176467 0.10933161 0.09883845]


In [11]:
# faiss.write_index(index, "mock_vectors.index")

In [15]:
# laoaded_index = faiss.read_index("mock_vectors.index")

In [16]:
# laoaded_index.hnsw.efSearch =  256

In [17]:
# k = 10
# s = time.time()
# dists, idxs = laoaded_index.search(normalize(xq), k)
# print((time.time() - s) / len(xq))
# print(idxs[0], dist2sim(dists[0]))

# Real vectors

In [122]:
import pandas as pd

In [123]:
data = pd.read_pickle("FINAL_data_and_embeddings/data_with_title_embeddings.pkl")

In [124]:
data.sample()

Unnamed: 0,id,title,authors,venue,year,n_citation,page_start,page_end,doc_type,publisher,volume,issue,fos,doi,references,indexed_abstract,abstract,cleaned_abstract_sentences,cleaned_title,title_embedding
43658,2254177447,Semantic Image Segmentation with Task-Specific...,"[{'name': 'Liang-Chieh Chen', 'id': '212789804...",{'raw': 'computer vision and pattern recogniti...,2016,56,4545,4554,Conference,IEEE,,,"[{'name': 'Computer vision', 'w': 0.440761358}...",10.1109/CVPR.2016.492,"[825165083, 845365781, 1495267108, 1529410181,...","{'IndexLength': 134, 'InvertedIndex': {'Deep':...",Deep convolutional neural networks (CNNs) are ...,[deep convolutional neural networks cnns are t...,semantic image segmentation with task specific...,"[0.7282408, -0.56314397, -0.5906662, 0.9875755..."


In [125]:
data_with_retrofitted = pd.read_pickle("FINAL_data_and_embeddings/final_embeddings/data_with_retrofitted_embeddings.pkl")

In [126]:
data_with_retrofitted.sample()

Unnamed: 0,id,references,merged,separate,retrofitted_merged,retrofitted_separate
96964,1988737268,"[15605318, 1489448163, 1559371027, 1564628430,...","[0.4173065113524596, -0.23755234324683747, -0....","[0.38529178500175476, -0.45145511627197266, -0...","[0.03568840776127606, -0.026547973741589244, -...","[0.031883103531993646, -0.03341143669524094, -..."


In [127]:
data_with_retrofitted.iloc[67890]

id                                                             1961300657
references                                                   [2097192888]
merged                  [-0.325588196516037, 0.23439041152596474, -0.0...
separate                [-0.33311566710472107, 0.3387480080127716, 0.0...
retrofitted_merged      [-0.028069935572412445, 0.020207500826895, -0....
retrofitted_separate    [-0.026874356915329287, 0.027328750253076968, ...
Name: 67890, dtype: object

In [128]:
# Copy the embedding columns from `data_with_retrofitted` into `data`.
# Assignment aligns on the DataFrame index, so both frames are expected to
# share the same row labels.
for target_col, source_col in (
    ("retro_merged", "retrofitted_merged"),
    ("retro_separate", "retrofitted_separate"),
    ("merged", "merged"),
    ("separate", "separate"),
):
    data[target_col] = data_with_retrofitted[source_col]

In [132]:
# Remove row 63376 (strange abstract with no readable text) and renumber.
# NOTE: not idempotent -- after the index is reset, label 63376 refers to
# the following row, so running this cell twice drops a second row.
data = data.drop(index=[63376]).reset_index(drop=True)

In [141]:
def correct_retrofitted_separate(row):
    """Return the row's retrofitted "separate" embedding, or a fallback.

    Parameters
    ----------
    row
        Object exposing ``separate`` and ``retro_separate`` attributes
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    ``row.retro_separate`` when it is a non-empty vector without NaNs;
    otherwise ``row.separate``.
    """
    separate = row.separate
    retro = row.retro_separate

    # Inspect every component (not just the first) and tolerate a missing
    # value stored as a scalar NaN / None or an empty sequence.
    retro_values = np.asarray(retro, dtype=float)
    if retro_values.size == 0 or np.isnan(retro_values).any():
        return separate
    return retro

In [143]:
# Row-wise: keep the retrofitted vector where usable, else the plain one.
data["retro_separate_corrected"] = data.apply(correct_retrofitted_separate, axis=1)

In [199]:
# data.to_pickle("data_with_all_corrected_sbert_embeddings.pkl")

In [149]:
# {63376, 63411, 69199, 69201, 69219, 69225, 79760}
# data.iloc[69218]

In [14]:
# data.to_pickle("data_with_all_sbert_embeddings.pkl")

In [22]:
# m_e = np.array(data["merged"][0])

In [150]:
# Stack the per-row "merged" vectors into one 2-D float32 matrix.
merged_embeddings = np.asarray([np.asarray(vec) for vec in data["merged"]], dtype=np.float32)

In [151]:
# Stack the per-row "separate" vectors into one 2-D float32 matrix.
separate_embeddings = np.asarray([np.asarray(vec) for vec in data["separate"]], dtype=np.float32)

In [152]:
# Stack the corrected retrofitted "separate" vectors into a float32 matrix.
separate_retro_embeddings = np.asarray(
    [np.asarray(vec) for vec in data["retro_separate_corrected"]], dtype=np.float32
)

In [153]:
# Stack the retrofitted "merged" vectors into one 2-D float32 matrix.
merged_retro_embeddings = np.asarray(
    [np.asarray(vec) for vec in data["retro_merged"]], dtype=np.float32
)

In [155]:
separate_embeddings.shape

(127716, 768)

In [1]:
merged_embeddings.shape

NameError: name 'merged_embeddings' is not defined

In [157]:
separate_retro_embeddings.shape

(127716, 768)

In [158]:
merged_retro_embeddings.shape

(127716, 768)

In [160]:
dim, index_type = 768, 'hnsw'

# One independent, empty index per embedding variant.
(
    merged_embeddings_index,
    separate_embeddings_index,
    retro_separate_embeddings_index,
    retro_merged_embeddings_index,
) = (get_index(index_type, dim) for _ in range(4))

In [161]:
# Fill each index with its embedding matrix, then persist it to disk.
# `populate` returns the index it was given, so no rebinding is required.
for faiss_index, vectors, out_path in (
    (merged_embeddings_index, merged_embeddings, "merged_embeddings_faiss.index"),
    (separate_embeddings_index, separate_embeddings, "separate_embeddings_faiss.index"),
    (retro_separate_embeddings_index, separate_retro_embeddings, "retro_separate_embeddings_faiss.index"),
    (retro_merged_embeddings_index, merged_retro_embeddings, "retro_merged_embeddings_faiss.index"),
):
    populate(faiss_index, vectors)
    faiss.write_index(faiss_index, out_path)

0 0.031844139099121094
1000 0.059911251068115234
2000 0.07129979133605957
3000 0.08702206611633301
4000 0.09132814407348633
5000 0.10360360145568848
6000 0.1086282730102539
7000 0.11718153953552246
8000 0.13128876686096191
9000 0.12652325630187988
10000 0.1412053108215332
11000 0.16633915901184082
12000 0.14229154586791992
13000 0.15894556045532227
14000 0.1808767318725586
15000 0.17592525482177734
16000 0.19408035278320312
17000 0.1464383602142334
18000 0.16267681121826172
19000 0.17173290252685547
20000 0.1857616901397705
21000 0.1758744716644287
22000 0.19088363647460938
23000 0.20973491668701172
24000 0.18694233894348145
25000 0.19225358963012695
26000 0.21924233436584473
27000 0.20987367630004883
28000 0.21449804306030273
29000 0.21845579147338867
30000 0.20595479011535645
31000 0.2428264617919922
32000 0.32498955726623535
33000 0.2404172420501709
34000 0.23930644989013672
35000 0.2664647102355957
36000 0.2369229793548584
37000 0.26708507537841797
38000 0.22872424125671387
39000 0

In [82]:
# Row numbers of `separate_retro_embeddings` that contain at least one NaN.
set(np.argwhere(np.isnan(separate_retro_embeddings))[:, 0])


{63376, 63411, 69199, 69201, 69219, 69225, 79760}

In [83]:
# Row numbers of `separate_embeddings` that contain at least one NaN.
set(np.argwhere(np.isnan(separate_embeddings))[:, 0])

{63376}

In [55]:
# Sample 100 query rows (with a fixed seed) from the merged embeddings.
random.seed(2020)
# random.randint(0, n) includes n itself, which is one past the last valid
# row and would raise IndexError below; randrange(n) stops at n - 1.
q_idx = [random.randrange(merged_embeddings.shape[0]) for _ in range(100)]

# Time a top-k search; normalisation of the queries is part of the timing.
k = 10
s = time.time()
dists, idxs = merged_embeddings_index.search(normalize(merged_embeddings[q_idx]), k)
print((time.time() - s) / len(q_idx))  # average wall-clock seconds per query
# Neighbour row numbers and similarity scores for the first query.
print(idxs[0], dist2sim(dists[0]))

7.747650146484375e-05
[111806  46256 110282    474  99043  15540  91718  86581  25530  71084] [0.7220207  0.7070291  0.6979676  0.6947185  0.69459105 0.6912974
 0.6819054  0.68079495 0.6784862  0.67215836]


In [56]:
# faiss.write_index(merged_embeddings_index, "merged_embeddings_faiss.index")

In [162]:
retro_merged_embeddings_index.ntotal

127716

In [163]:
retro_separate_embeddings_index.ntotal

127716

In [164]:
merged_embeddings_index.ntotal

127716

In [165]:
separate_embeddings_index.ntotal

127716

In [214]:
# External ids (one per row of `data`) to attach to the vectors in Faiss.
# The dtype is pinned to int64: Faiss expects 64-bit ids, and several ids in
# this dataset exceed 2**31 - 1, so a platform-default 32-bit integer array
# would not be able to hold them.
ids_for_faiss_idmap = np.array([int(m) for m in data.id], dtype=np.int64)

In [233]:
np.save("ids_for_faiss_idmap.npy", ids_for_faiss_idmap)

In [244]:
test_index = get_index(index_type, dim)

In [245]:
test_index

<faiss.swigfaiss.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7fc1d2c557b0> >

In [246]:
test_index2 = faiss.IndexIDMap(test_index)

In [247]:
test_index2.add_with_ids(normalize(merged_embeddings[:100000]), ids_for_faiss_idmap[:100000])

In [248]:
# Sample 100 query rows (with a fixed seed) from the merged embeddings.
random.seed(2020)
# random.randint(0, n) includes n itself, which is one past the last valid
# row and would raise IndexError below; randrange(n) stops at n - 1.
q_idx = [random.randrange(merged_embeddings.shape[0]) for _ in range(100)]

# Search the id-mapped index; returned ids are the external ids passed to
# add_with_ids rather than row positions.
# NOTE(review): k equals the number of vectors added to test_index2 although
# only the first 50 results are printed -- confirm this is intended.
k = 100000
s = time.time()
dists, idxs = test_index2.search(normalize(merged_embeddings[q_idx]), k)
print((time.time() - s) / len(q_idx))  # average wall-clock seconds per query
print(idxs[0][:50], dist2sim(dists[0][:50]))

0.00024907350540161134
[2127650263 2085583171 2160684493 1595483645 2142544755 2194321275
 2340427832 2170805580 2770298516 2130105540 2751118800 2164479831
 1993309788 1892722218 2467980042 2123967136 2121557314 2214145768
  208252816 2029269644 2269778407 2130318956 2074668887 1504448809
 2593658277 2015103117 2006273250 2767050701 2564726951 2588610957
 1983801113   53987483 2006859604 1987326241 2118459920 1989549063
 1987409251 1965417459 2040990302  105964407 2146571341 2113592754
 2201912979 1931792391 2007347635 1909733559 1825869920 2044170013
 2104602264 2786465559] [1.         0.82983524 0.82756734 0.8228171  0.8228043  0.8221962
 0.8199305  0.8198794  0.8188399  0.8173102  0.81725645 0.8157397
 0.8154375  0.8141068  0.8128588  0.81061065 0.8100102  0.8098298
 0.8095945  0.80849373 0.8082956  0.80810225 0.80793345 0.80726826
 0.8071704  0.8067992  0.806648   0.8056046  0.8052933  0.8052643
 0.80484784 0.8044739  0.802582   0.80163527 0.8013518  0.79907703
 0.79825926 0.79802

In [249]:
# Same queries against the underlying (unmapped) index, for comparison with
# the id-mapped wrapper: results here are internal row positions.
k = 100000
s = time.time()
dists, idxs = test_index.search(normalize(merged_embeddings[q_idx]), k)
print((time.time() - s) / len(q_idx))  # average wall-clock seconds per query
# First 50 neighbours and similarity scores of the first query.
print(idxs[0][:50], dist2sim(dists[0][:50]))

0.00024988651275634763
[81221 91605 13887 13422 58312  5242 92633  8334 18152 24050 39959 17565
 79906 99871 56659  3701 96907 50798 43613 97405  6513 84949 65146 44767
 51473 87039 57179 56075 73405 10997 31671 33542 52356 60175 70093 50863
 98783 26499 59594  5763  5988 28529 42379 61695 33629 18981 28784 40593
  9460 28848] [1.         0.82983524 0.82756734 0.8228171  0.8228043  0.8221962
 0.8199305  0.8198794  0.8188399  0.8173102  0.81725645 0.8157397
 0.8154375  0.8141068  0.8128588  0.81061065 0.8100102  0.8098298
 0.8095945  0.80849373 0.8082956  0.80810225 0.80793345 0.80726826
 0.8071704  0.8067992  0.806648   0.8056046  0.8052933  0.8052643
 0.80484784 0.8044739  0.802582   0.80163527 0.8013518  0.79907703
 0.79825926 0.79802686 0.7977789  0.796767   0.79630464 0.79609615
 0.7957384  0.79560864 0.7955977  0.79520804 0.7951923  0.79352576
 0.7934384  0.79310787]


In [227]:
test_index.ntotal

5

In [250]:
faiss.write_index(test_index2, "index_mapped.index")

In [251]:
test_mapped = faiss.read_index("index_mapped.index")

In [252]:
# Same queries against the index read back from "index_mapped.index", to
# check that the id mapping survives a write/read round trip.
k = 100000
s = time.time()
dists, idxs = test_mapped.search(normalize(merged_embeddings[q_idx]), k)
print((time.time() - s) / len(q_idx))  # average wall-clock seconds per query
# First 50 neighbours and similarity scores of the first query.
print(idxs[0][:50], dist2sim(dists[0][:50]))

0.0003398799896240234
[2127650263 2085583171 2160684493 1595483645 2142544755 2194321275
 2340427832 2170805580 2770298516 2130105540 2751118800 2164479831
 1993309788 1892722218 2467980042 2123967136 2121557314 2214145768
  208252816 2029269644 2269778407 2130318956 2074668887 1504448809
 2593658277 2015103117 2006273250 2767050701 2564726951 2588610957
 1983801113   53987483 2006859604 1987326241 2118459920 1989549063
 1987409251 1965417459 2040990302  105964407 2146571341 2113592754
 2201912979 1931792391 2007347635 1909733559 1825869920 2044170013
 2104602264 2786465559] [1.         0.82983524 0.82756734 0.8228171  0.8228043  0.8221962
 0.8199305  0.8198794  0.8188399  0.8173102  0.81725645 0.8157397
 0.8154375  0.8141068  0.8128588  0.81061065 0.8100102  0.8098298
 0.8095945  0.80849373 0.8082956  0.80810225 0.80793345 0.80726826
 0.8071704  0.8067992  0.806648   0.8056046  0.8052933  0.8052643
 0.80484784 0.8044739  0.802582   0.80163527 0.8013518  0.79907703
 0.79825926 0.798026

In [253]:
# Write stuff with mapped ids

In [264]:
dim, index_type = 768, 'hnsw'

# Fresh, empty indexes (one per embedding variant) for the id-mapped build.
(
    merged_embeddings_index,
    separate_embeddings_index,
    retro_separate_embeddings_index,
    retro_merged_embeddings_index,
) = (get_index(index_type, dim) for _ in range(4))

In [265]:
# faiss.write_index does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs("Mapped_indeces", exist_ok=True)

# Build an id-mapped index for each embedding variant and persist it.
merged_embeddings_index_mapped = populate_and_index_map(merged_embeddings_index, merged_embeddings)
faiss.write_index(merged_embeddings_index_mapped, "Mapped_indeces/merged_embeddings_faiss.index")

separate_embeddings_index_mapped = populate_and_index_map(separate_embeddings_index, separate_embeddings)
faiss.write_index(separate_embeddings_index_mapped, "Mapped_indeces/separate_embeddings_faiss.index")

retro_separate_embeddings_index_mapped = populate_and_index_map(retro_separate_embeddings_index, separate_retro_embeddings)
faiss.write_index(retro_separate_embeddings_index_mapped, "Mapped_indeces/retro_separate_embeddings_faiss.index")

retro_merged_embeddings_index_mapped = populate_and_index_map(retro_merged_embeddings_index, merged_retro_embeddings)
faiss.write_index(retro_merged_embeddings_index_mapped, "Mapped_indeces/retro_merged_embeddings_faiss.index")

In [266]:
# Sample 100 query rows (with a fixed seed) from the merged embeddings.
random.seed(2020)
# random.randint(0, n) includes n itself, which is one past the last valid
# row and would raise IndexError below; randrange(n) stops at n - 1.
q_idx = [random.randrange(merged_embeddings.shape[0]) for _ in range(100)]

# NOTE(review): the queries come from `merged_embeddings` while the index
# holds `merged_retro_embeddings` -- confirm the cross-embedding lookup is
# intentional.
k = 100
s = time.time()
dists, idxs = retro_merged_embeddings_index_mapped.search(normalize(merged_embeddings[q_idx]), k)
print((time.time() - s) / len(q_idx))  # average wall-clock seconds per query
# First 50 neighbour ids and similarity scores of the first query.
print(idxs[0][:50], dist2sim(dists[0][:50]))

0.00012667179107666015
[2127650263 2588610957 2085583171 2003664523 2552241273 2078639473
 2164479831 2416041116 2007347635 2593658277 2083281082 2770379238
 1836465849 2194321275 1983801113 2221677593 2770298516 1513988862
 2185898173 2739330054 1595483645 2123565204 2806970737 2006273250
 2767050701 2252143850 2724651715 2620761940 2130750514 2122480991
 2053978470 2123967136 2085606725 2201467212 2269778407 2062465628
 2148112459 2757631751  121023703 2524545028 2467980042 2135719924
 2201912979 1729873854 1488195763 2132295085 2291575139 2111119385
 1574909006 1557517019] [0.956792   0.85677725 0.8562635  0.8536861  0.8522803  0.8499435
 0.8491037  0.8487227  0.8480271  0.8466462  0.8465451  0.8452147
 0.84481    0.8442543  0.8438333  0.84381014 0.84300447 0.8428029
 0.8425627  0.8424239  0.8423993  0.8416184  0.8412725  0.839786
 0.8392     0.8384499  0.8379636  0.8369426  0.83535814 0.8352574
 0.83433306 0.8336283  0.83348763 0.83330584 0.8332734  0.83310306
 0.8322484  0.8318646