# Introduction to Faiss

## Imports

In [None]:
import numpy as np
import polars as pl

from sentence_transformers import SentenceTransformer
import faiss


## Introduction to Faiss

### Data Loading

In [None]:
# NOTE: this cell is intentionally disabled (every line is commented out).
# When enabled it reads each TSV listed in `data`, collects the values of
# `column_1` and `column_2` into `sentences`, and reports the number of
# unique entries.
# NOTE(review): presumably this is how ../data/sentences.txt was originally
# produced — confirm, then either delete this cell or move it to a script.
# data = [
#     "../data/2012_MSRpar.train.tsv",
#     "../data/2012_MSRpar.test.tsv",
#     "../data/2012_OnWN.test.tsv",
#     "../data/2013_OnWN.test.tsv",
#     "../data/2014_OnWN.test.tsv",
#     "../data/2014_images.test.tsv.txt",
#     "../data/2015_images.test.tsv",
# ]

# sentences = []
# for d in data:
#     print(f"path: {d}")
#     pl_data = pl.read_csv(d, separator="\t", has_header=False, quote_char=None, ignore_errors=True)
#     sentences.extend(pl_data['column_1'].to_list())
#     sentences.extend(pl_data['column_2'].to_list())

# len(set(sentences))

In [62]:
# Load the corpus as a single-column frame. The file has no header row, the
# newline is passed as the separator, and quote handling is switched off so
# quotation marks inside a sentence are kept as literal text.
sentences_df = pl.read_csv(
    "../data/sentences.txt",
    has_header=False,
    separator="\n",
    quote_char=None,
)
sentences_df.head(2)

column_1
str
"""A group of four children danci…"
"""The Conference Board said its …"


In [63]:
# Pull the only column out of the frame as a plain Python list, report how
# many entries it has, and preview the first five.
sentences = sentences_df.get_column("column_1").to_list()
print(len(sentences))
sentences[:5]

14504


['A group of four children dancing in a backyard.',
 'The Conference Board said its measure of business confidence, which had fallen to 53 in the first quarter of 2003, improved to 60 in the most recent second quarter.',
 'a person eating a meal, often in a restaurant',
 'When you crossed the line, you violated the constitutional right," said Charles Weisselberg, who teaches law at the University of California, Berkeley.',
 "Ross Garber, Rowland's legal counsel, said the governor would have no comment on the condo deal."]

In [64]:
# Drop duplicates and non-string entries (e.g. nulls) while keeping the
# original file order. `dict.fromkeys` preserves first-seen order, whereas
# `set` iteration order for strings varies between interpreter sessions
# (hash randomization). A stable order matters because the position of a
# sentence in this list is the ID of its vector in the embedding matrix and
# in the Faiss index; with an unstable order, embeddings saved to disk could
# not be matched back to their sentences in a later session.
sentences = [text for text in dict.fromkeys(sentences) if isinstance(text, str)]
print(len(sentences))

14504


### Embedding Generation

In [65]:
# Load the pretrained sentence encoder and embed every sentence in one call.
# The trailing expression displays the embedding matrix shape:
# one row per sentence, one column per embedding dimension.
model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(14504, 768)

In [66]:
# Write the embedding matrix to disk in NumPy's .npy format.
embeddings_path = "../data/sentence_embeddings.npy"
np.save(embeddings_path, sentence_embeddings)

### IndexFlatL2

In [67]:
# The index is created with the embedding width (last axis of the matrix).
d = sentence_embeddings.shape[-1]
index = faiss.IndexFlatL2(d)
# Display whether the index reports itself as trained before any vectors are added.
index.is_trained

True

In [68]:
# Add every sentence embedding to the index, then display `ntotal`, the
# number of vectors the index now holds.
index.add(sentence_embeddings) # type: ignore
index.ntotal

14504

In [69]:
# Search setup: `k` is the number of nearest neighbors to return and `xq` is
# the encoded query, produced with the same model as the indexed sentences.
k = 4
query_texts = ["Someone sprints with a football"]
xq = model.encode(query_texts)

In [70]:
%%time
# Time a k-nearest-neighbor lookup for the query. `I` holds the IDs of the
# matched vectors (used below as positions in `sentences`); `D` holds the
# corresponding distances as returned by Faiss's search call.
D, I = index.search(xq, k)
print(I)

[[10382  7600  7312  4212]]
CPU times: total: 0 ns
Wall time: 3.14 ms


In [93]:
# IDs returned for the first (and only) query, as plain Python ints.
rows = I[0].tolist()

# Show the matched sentences, one per line.
print("\n".join(sentences[idx] for idx in rows))

A group of football players is running in the field
A group of people playing football is running in the field
Two groups of people are playing football
A person playing football is running past an official carrying a football


In [94]:
# Retrieve the stored vectors for the k hits. Faiss stores and returns
# float32 vectors, so the buffer is allocated as float32 rather than NumPy's
# float64 default; otherwise the reconstructed vectors are silently upcast
# and would need a cast before being passed back to Faiss.
vecs = np.zeros((k, d), dtype=np.float32)
# Row i of `vecs` is the vector stored under the i-th returned ID.
for i, val in enumerate(rows):
    vecs[i, :] = index.reconstruct(val)

vecs.shape

(4, 768)

## END