In [1]:
import sys
sys.path.append('../../')

import faiss
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, util
import nltk
from src.classes.document import Document
from pydantic_core import from_json

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
CORPUS_LOCATION = "../../data/corpus_jsonl/corpus.jsonl"

with open(CORPUS_LOCATION) as corpus_file:
    lines = corpus_file.readlines()
    documents = [Document.model_validate_json(line) for line in lines]

k = 100

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\keith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\keith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
doc_ids = []
corpus_sentences = []
for doc in documents:
    doc_sentences = nltk.sent_tokenize(doc.contents)
    doc_ids.extend([doc.id]*len(doc_sentences))
    corpus_sentences.extend(doc_sentences)
doc_ids_np = np.array(doc_ids)
corpus_sentences_np = np.array(corpus_sentences)
    

In [8]:
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
embeddings = np.array(model.encode(corpus_sentences_np,convert_to_tensor=True,batch_size=128,device=device).to('cpu'))

  embeddings = np.array(model.encode(corpus_sentences_np,convert_to_tensor=True,batch_size=128,device=device).to('cpu'))


In [9]:
faiss.normalize_L2(embeddings)
quantizer = faiss.IndexFlatL2(embeddings.shape[1])
index = faiss.IndexIVFFlat(quantizer,embeddings.shape[1],embeddings.shape[0] // k)
index.train(embeddings)
index.add(embeddings)

In [10]:
query = np.array(model.encode(nltk.sent_tokenize("robot going to 3d imaging. make better motions")))
D, I =index.search(query, 10)
np.unique(I.flatten())


array([   906,    926,   3396,   6099,   6421,   7102,   7970,  14826,
        14827,  14833,  23970,  26229,  33354,  33503,  34201,  42667,
        42668, 192360, 201297, 223543])

In [11]:
top_doc_ids = np.unique(doc_ids_np[I.flatten()])
corpus_sentences_np[doc_ids_np == top_doc_ids[0]]
doc_sims = []
for id in top_doc_ids:
    doc_sims.append((str(id),float(max([model.similarity(query_s,embedding)  for query_s in query for embedding in embeddings[doc_ids_np==id]])))) 
doc_sims
sorted(doc_sims, key=lambda x: x[1], reverse=True)

[('257079127', 0.6689109802246094),
 ('259108523', 0.6504673957824707),
 ('12538994', 0.5457663536071777),
 ('263909429', 0.531419038772583),
 ('258832670', 0.5285805463790894),
 ('252595883', 0.5279020071029663),
 ('257280094', 0.5250652432441711),
 ('257279944', 0.5247530937194824),
 ('256105572', 0.5152944326400757),
 ('252780848', 0.5089223384857178),
 ('226226846', 0.4998963475227356),
 ('258823353', 0.4717143774032593),
 ('263605591', 0.4712994694709778),
 ('252780361', 0.4662661552429199),
 ('53235839', 0.4593668580055237),
 ('38294295', 0.4399349093437195),
 ('4071564', 0.4391446113586426)]

In [14]:
faiss.write_index(index,"../../indexes/colbert_index/index.faiss")
np.savez_compressed("../../indexes/colbert_index/embeddings", embeddings = embeddings, doc_ids = doc_ids_np, sentences = corpus_sentences_np)

In [15]:
from pyserini.index.lucene import LuceneIndexReader
from pyserini.search.lucene import LuceneSearcher