In [1]:
from data_handling import get_data
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import faiss
from os.path import exists

# Data

In [None]:
# Fetch the dataset through the project helper, then preview its first rows.
data = get_data()
data.head(n=5)

In [None]:
# Build the text to embed as "<title> <abstract>". Missing values are treated as
# empty strings so that one absent field does not turn the whole row's text into
# NaN (which would break the string operations and the encoder further down).
data['text'] = (
    data['title'].fillna('') + " " + data['abstract'].fillna('')
).str.strip()

# Model

In [None]:
# Load the pretrained sentence-embedding model and place it on the GPU.
model = SentenceTransformer(
    'all-mpnet-base-v2',
    device='cuda',
)
# Bare expression: show the model's repr as the cell output.
model

#### Max Sequence Length

In [None]:
# Distribution of whitespace-separated word counts per document, log-scaled y axis.
# split() with no argument splits on any whitespace run, so newlines and repeated
# spaces do not distort the count.
word_counts = data['text'].str.split().str.len()
ax = word_counts.hist(bins=30)
ax.set_yscale('log')
# The red line marks the model's maximum sequence length. That limit is expressed
# in tokens, so comparing it with word counts is only a rough guide.
ax.axvline(model.max_seq_length, color='red', label='model max_seq_length')
ax.set(xlabel='Words per document', ylabel='Number of documents')
ax.legend();

# Faiss

In [None]:
# Sanity check: embed a single document and inspect the shape of the result.
first_text = data['text'].values[0]
model.encode(first_text).shape

In [None]:
class FaissIdx:
    """FAISS inner-product index on GPU 0, paired with the model that embeds its documents.

    Attributes:
        model: Object exposing ``encode(texts)``; used to turn texts into vectors.
        index: The FAISS index, held on GPU device 0.
        ctr: Number of vectors in the index as of the last add or load.
    """

    def __init__(self, model, dim=768):
        """Create an empty flat inner-product index on GPU 0.

        Args:
            model: Embedding model whose ``encode`` output has ``dim`` columns.
            dim: Dimensionality of the vectors stored in the index.
        """
        self.model = model
        # Defined up front so the attribute exists before any add/load happens.
        self.ctr = 0
        # A single resources object is created once and reused for every
        # CPU -> GPU transfer instead of allocating a new one each time.
        self._gpu_res = faiss.StandardGpuResources()
        self.index = self._to_gpu(faiss.IndexFlatIP(dim))

    def _to_gpu(self, cpu_index):
        """Return ``cpu_index`` transferred to GPU device 0."""
        return faiss.index_cpu_to_gpu(self._gpu_res, 0, cpu_index)

    def add_doc(self, document_text, batch_size=256):
        """Encode ``document_text`` in batches and append the vectors to the index.

        Args:
            document_text: Sliceable sequence of texts (anything supporting
                ``len`` and ``[start:stop]``).
            batch_size: Number of texts encoded and added per step.
        """
        batch_starts = range(0, len(document_text), batch_size)
        for start in tqdm(batch_starts, desc="Adding documents to index", unit="batch"):
            batch = document_text[start:start + batch_size]
            self.index.add(self.model.encode(batch))
        self.ctr = self.index.ntotal

    def load_index(self, index_path):
        """Replace the current index with the one stored at ``index_path``.

        The file is read as a CPU index and then moved to the GPU. This method
        works on ``self`` only; it does not depend on any notebook-level variable.
        """
        cpu_index = faiss.read_index(index_path)
        self.index = self._to_gpu(cpu_index)
        self.ctr = self.index.ntotal

    def save_index(self, index_path):
        """Write the index to ``index_path``.

        A CPU copy is produced for serialization; ``self.index`` itself stays on
        the GPU, so no transfer back is needed afterwards.
        """
        faiss.write_index(faiss.index_gpu_to_cpu(self.index), index_path)
    
# Size the index from the model itself so the two cannot disagree on embedding width.
index = FaissIdx(model, dim=model.get_sentence_embedding_dimension())

In [None]:
# Reuse a previously saved index when one is on disk. The existence check and the
# load use the same path, so the check guards the file that is actually read.
# NOTE(review): once an index has been loaded here, running the "Add to index"
# step again appends the same documents a second time.
INDEX_PATH = 'index.faiss'
if exists(INDEX_PATH):
    print("Loading index...")
    index.load_index(INDEX_PATH)

#### Add to index

In [None]:
#text_data = text_data[:1000]

In [None]:
# Texts to embed, taken from the `text` column in row order.
text_data = data['text'].values
# Uncomment to index only the first 1000 texts (quick experiments):
#text_data = text_data[:1000]

In [None]:
# Encode and index the corpus only when the index is still empty. If an index was
# already loaded from disk, adding the same texts again would store every vector
# twice and break the position -> row correspondence used when reading results.
if index.index.ntotal == 0:
    index.add_doc(text_data)
else:
    print(f"Index already contains {index.index.ntotal} vectors; skipping add.")

In [None]:
# Persist the populated index to disk.
index.save_index(index_path='index.faiss')

In [None]:
# Read the saved index back from disk.
index.load_index(index_path='index.faiss')

In [None]:
# Free-text query used to search the index for similar documents.
query = "Neural networks"

In [None]:
# Embed the query as a single-row matrix, then fetch its 3 nearest neighbours.
# D holds the scores and I the matching positions in the index.
query_embedding = index.model.encode(query).reshape(1, -1)
D, I = index.index.search(query_embedding, 3)

In [None]:
# FAISS ids are positions in insertion order, and -1 is returned when there are
# fewer than k hits. Filter on positional bounds to match the positional `iloc`
# lookup, instead of rebuilding a list of index labels for every hit.
n_docs = len(data)
[(data['title'].iloc[idx], score) for idx, score in zip(I[0], D[0]) if 0 <= idx < n_docs]

In [None]:
# Shapes of the id array (I) and the score array (D) returned by the search above.
I.shape, D.shape