In [1]:
import os
from rank_bm25 import BM25Okapi

### Retrieval using BM25 Module

In [2]:
# Step 1: Load text files into a list
def load_texts_from_directory(directory_path):
    """Read every ``.txt`` file directly inside ``directory_path``.

    Entries are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the order of the returned lists (and
    of anything indexed by them) could differ between runs or machines.

    Args:
        directory_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A ``(documents, file_names)`` tuple of two parallel lists:
        ``documents[k]`` is the full text of the file named ``file_names[k]``.
        Both lists are empty when the directory holds no ``.txt`` files.

    Raises:
        OSError: If ``directory_path`` cannot be listed or a file cannot be
            opened (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    file_names = []
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, file_name)
        # A directory whose name happens to end in ".txt" cannot be read as text.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Append to both lists together so they can never get out of step.
        documents.append(text)
        file_names.append(file_name)
    return documents, file_names

In [3]:
# Step 2: Preprocess documents (tokenization)
def preprocess_documents(documents):
    """Tokenize each document by splitting it on whitespace.

    Args:
        documents: Iterable of document strings.

    Returns:
        A list with one token list per document, in input order. No
        lower-casing or punctuation stripping is applied.
    """
    tokenized = []
    for text in documents:
        tokenized.append(text.split())
    return tokenized

In [4]:
# Step 3: Compute BM25 similarity
def compute_bm25_similarity(preprocessed_docs, corpus, file_names):
    """Score every document against every other document with BM25.

    Each tokenized document is used in turn as the query against an index
    built from all of the documents.

    Args:
        preprocessed_docs: List of token lists, one per document.
        corpus: Unused; kept so existing callers keep working.
        file_names: Unused; kept so existing callers keep working.

    Returns:
        A list with one entry per document. Entry ``i`` is whatever
        ``BM25Okapi.get_scores`` returns for document ``i`` used as the query
        (presumably one score per indexed document - confirm against the
        rank_bm25 docs). An empty list is returned for an empty corpus.
    """
    # An empty corpus cannot be indexed: the BM25Okapi constructor fails on it
    # (average document length divides by the corpus size), so bail out early.
    if len(preprocessed_docs) == 0:
        return []

    bm25 = BM25Okapi(preprocessed_docs)
    return [bm25.get_scores(query) for query in preprocessed_docs]

In [5]:
# Step 4: Extract the most similar documents
def similarity(similarity_matrix, file_names, i, top_k=5):
    """Return the names of the documents scoring highest against document ``i``.

    Args:
        similarity_matrix: Indexable of rows; ``similarity_matrix[i][j]`` is
            the score of document ``j`` for document ``i``.
        file_names: Names parallel to the columns of ``similarity_matrix``.
        i: Row index of the document to look up.
        top_k: Maximum number of names to return (default 5, the previously
            hard-coded value).

    Returns:
        Up to ``top_k`` file names ordered from lowest to highest score, so
        the best match comes last. Document ``i`` itself is not filtered out;
        it appears in the result whenever its own score is among the highest.
        An empty list is returned when ``top_k`` is zero or negative.
    """
    # Guard explicitly: ``ranked[-0:]`` would return the whole list, not nothing.
    if top_k <= 0:
        return []

    row = similarity_matrix[i]
    pairs = [(file_names[j], row[j]) for j in range(len(row))]
    # sorted() is stable, so equal scores keep their original column order.
    ranked = sorted(pairs, key=lambda pair: pair[1])
    return [name for name, _score in ranked[-top_k:]]

In [6]:
# Folder that holds the plain-text papers.
directory_path = "./txt_papers"

# Read the corpus from disk, then split every document into tokens.
documents, file_names = load_texts_from_directory(directory_path)
preprocessed_docs = preprocess_documents(documents)

# Score every document against every other one with BM25, then report the
# top matches for each file.
similarity_matrix = compute_bm25_similarity(
    preprocessed_docs=preprocessed_docs,
    corpus=documents,
    file_names=file_names,
)
for i in range(len(file_names)):
    similar_docs = similarity(similarity_matrix, file_names, i)
    print(f"For Document {file_names[i]} most similar docs are", similar_docs)

For Document 1706.03762.txt most similar docs are ['1703.03130.txt', '1701.06538.txt', '1609.08144.txt', '1705.03122v2.txt', '1706.03762.txt']
For Document 1607.06450.txt most similar docs are ['1702.00887.txt', '1701.06538.txt', '1705.03122v2.txt', 'srivastava14a.txt', '1607.06450.txt']
For Document 1601.06733.txt most similar docs are ['1606.04199.txt', '1702.00887.txt', 'D16-1244.txt', '1703.03130.txt', '1601.06733.txt']
For Document 1308.0850.txt most similar docs are ['1602.02410.txt', '1607.06450.txt', '1609.08144.txt', 'srivastava14a.txt', '1308.0850.txt']
For Document 1511.08228.txt most similar docs are ['1609.08144.txt', 'srivastava14a.txt', '1701.06538.txt', '1602.02410.txt', '1511.08228.txt']
For Document 1508.07909.txt most similar docs are ['1705.03122v2.txt', '1703.03906.txt', '1606.04199.txt', '1609.08144.txt', '1508.07909.txt']
For Document 1701.06538.txt most similar docs are ['1706.03762.txt', '1602.02410.txt', '1705.03122v2.txt', '1609.08144.txt', '1701.06538.txt']


### Retrieval using BERT

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 2: Encode documents using BERT
def encode_documents(documents, model, tokenizer, device):
    """Embed each document as the model's hidden state at sequence position 0.

    Documents are encoded one at a time. The tokenizer is asked to truncate
    to at most 512 tokens, so only the beginning of a long document
    contributes to its embedding.

    Args:
        documents: Iterable of document strings.
        model: Callable accepting the tokenizer's outputs as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable turning a string into a mapping of tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``np.array`` built from one vector per document, in input order.
    """
    vectors = []
    for text in documents:
        encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
        on_device = {}
        for name, tensor in encoded.items():
            on_device[name] = tensor.to(device)
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            hidden = model(**on_device).last_hidden_state
            # Position 0 of the sequence (the [CLS] token for BERT-style tokenizers).
            first_token = hidden[:, 0, :].squeeze()
            vectors.append(first_token.cpu().numpy())
    return np.array(vectors)

# Step 3: Compute similarity matrix
def compute_similarity_matrix(embeddings, file_names):
    """Return the pairwise cosine similarity between all embeddings.

    Args:
        embeddings: Array of document vectors, one row per document.
        file_names: Unused; accepted so callers can pass it alongside.

    Returns:
        The result of ``cosine_similarity(embeddings)``.
    """
    return cosine_similarity(embeddings)


# Step 4: Extract the most similar documents
def similarity(similarity_matrix, file_names, i, top_k=5):
    """Return the names of the documents most similar to document ``i``.

    Args:
        similarity_matrix: Indexable of rows; ``similarity_matrix[i][j]`` is
            the similarity between documents ``i`` and ``j``.
        file_names: Names parallel to the columns of ``similarity_matrix``.
        i: Row index of the document to look up.
        top_k: Maximum number of names to return (default 5, the previously
            hard-coded value).

    Returns:
        Up to ``top_k`` file names ordered from lowest to highest similarity,
        so the best match comes last. Document ``i`` itself is not filtered
        out; it appears in the result whenever its own score is among the
        highest. An empty list is returned when ``top_k`` is zero or negative.
    """
    # Guard explicitly: ``ranked[-0:]`` would return the whole list, not nothing.
    if top_k <= 0:
        return []

    row = similarity_matrix[i]
    scored = [(file_names[j], row[j]) for j in range(len(row))]
    # sorted() is stable, so equal scores keep their original column order.
    ranked = sorted(scored, key=lambda item: item[1])
    return [name for name, _score in ranked[-top_k:]]

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import torch
# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [9]:
directory_path = "./txt_papers"
model_name = "bert-base-uncased"

# Pick the device first so the model can be moved as soon as it is loaded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# NOTE: `documents` and `file_names` are not created in this cell; they must
# already exist in the kernel from the earlier loading cell.
embeddings = encode_documents(documents, model, tokenizer, device)
similarity_matrix = compute_similarity_matrix(embeddings, file_names)

# Report the top matches for every file.
for i in range(len(file_names)):
    similar_docs = similarity(similarity_matrix, file_names, i)
    print(f"For Document {file_names[i]} most similar docs are", similar_docs)

For Document 1706.03762.txt most similar docs are ['1701.06538.txt', '1512.00567.txt', '1703.10722.txt', '1606.04199.txt', '1706.03762.txt']
For Document 1607.06450.txt most similar docs are ['1409.0473.txt', '1412.3555.txt', '1705.03122v2.txt', 'srivastava14a.txt', '1607.06450.txt']
For Document 1601.06733.txt most similar docs are ['1409.0473.txt', '1508.04025.txt', '1702.00887.txt', '1703.03906.txt', '1601.06733.txt']
For Document 1308.0850.txt most similar docs are ['1608.05859.txt', '1409.0473.txt', '1705.03122v2.txt', 'srivastava14a.txt', '1308.0850.txt']
For Document 1511.08228.txt most similar docs are ['1406.1078.txt', 'D16-1244.txt', '1701.06538.txt', 'srivastava14a.txt', '1511.08228.txt']
For Document 1508.07909.txt most similar docs are ['1701.06538.txt', 'D16-1244.txt', '1705.03122v2.txt', '1409.0473.txt', '1508.07909.txt']
For Document 1701.06538.txt most similar docs are ['D16-1244.txt', '1606.04199.txt', 'N16-1118.txt', '1511.08228.txt', '1701.06538.txt']
For Document 1