Abstracts = documents for similarity search.

Query = user input.

Goal: display the title of each matched paper (not just its abstract) in the final output.

In [1]:
import pandas as pd

In [3]:
# Read the arXiv ML papers CSV and keep only the two columns this notebook uses.
df = pd.read_csv('data/ML-Arxiv-Papers.csv').loc[:, ['title', 'abstract']]

In [4]:
# Preview the first rows to check that the title/abstract columns loaded.
df.head()

Unnamed: 0,title,abstract
0,Learning from compressed observations,The problem of statistical learning is to co...
1,Sensor Networks with Random Links: Topology De...,"In a sensor network, in practice, the commun..."
2,The on-line shortest path problem under partia...,The on-line shortest path problem is conside...
3,A neural network approach to ordinal regression,Ordinal regression is an important type of l...
4,Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close r...


In [127]:
# Work with the first 500 abstracts only. Because this is a prefix of `df` in
# the original row order, position i here is also row i of `df`.
abstracts = df['abstract'].iloc[:500].values

In [128]:
# Sanity check: number of abstracts that will be embedded and indexed.
len(abstracts)

500

In [129]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [130]:
# Bi-encoder: turns a text into an embedding vector. The same model is used for
# the abstracts and for the query, so both live in one embedding space.
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')

### Encode Documents with Bi-Encoder

In [131]:
# Step 1: Embed every abstract once; the result is a 2-D NumPy array with one
# row per abstract.
abstracts_embeddings = bi_encoder.encode(abstracts, convert_to_numpy=True)

# Step 2: Build a flat FAISS index that compares vectors by L2 distance.
# The index dimension is the embedding width (number of columns). Vectors are
# added in order, so the ids FAISS returns are row positions in `abstracts`.
index = faiss.IndexFlatL2(abstracts_embeddings.shape[1])
index.add(abstracts_embeddings)

In [132]:
abstracts_embeddings.shape

(500, 384)

### Cross-Encoder for Reranking

In [133]:
from sentence_transformers import CrossEncoder

# Cross-encoder: scores a (query, abstract) pair jointly. Used further down to
# rerank the candidates returned by the bi-encoder/FAISS stage.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


### Search Pipeline

In [134]:
# Example query used to walk through the pipeline one stage at a time.
query = 'problem of statistical learning'
# Number of nearest abstracts to fetch from the FAISS index before reranking.
top_k = 3

In [135]:
# Bi-Encoder stage: embed the query, then look up its nearest abstracts.
query_embedding = bi_encoder.encode(query, convert_to_numpy=True)
# FAISS searches a batch of vectors, so add a leading batch axis of size 1.
_, top_k_indices = index.search(query_embedding[np.newaxis, :], top_k)
# Row 0 of the result holds the neighbour ids for our single query.
candidates = df.iloc[top_k_indices[0]].copy()
candidates

Unnamed: 0,title,abstract
117,Statistical Learning of Arbitrary Computable C...,Statistical learning theory chiefly studies ...
288,Equations of States in Statistical Learning fo...,Many learning machines that have hierarchica...
111,Learning Low-Density Separators,"We define a novel, basic, unsupervised learn..."


In [136]:
# Cross-Encoder Reranking: pair the query with every candidate abstract.
pairs = [[query, candidate_text] for candidate_text in candidates['abstract'].tolist()]

In [137]:
# One cross-encoder score per (query, abstract) pair, in the same order as `pairs`.
scores = cross_encoder.predict(pairs)
scores

array([5.4768353, 3.9468727, 2.9747014], dtype=float32)

In [138]:
# Attach each candidate's cross-encoder score as a new column.
candidates = candidates.assign(scores=scores)

In [139]:
# Order the candidates from highest to lowest cross-encoder score.
candidates = candidates.sort_values(by='scores', ascending=False)
candidates

Unnamed: 0,title,abstract,scores
117,Statistical Learning of Arbitrary Computable C...,Statistical learning theory chiefly studies ...,5.476835
288,Equations of States in Statistical Learning fo...,Many learning machines that have hierarchica...,3.946873
111,Learning Low-Density Separators,"We define a novel, basic, unsupervised learn...",2.974701


In [140]:
# Search Pipeline
def search(query, top_k=5):
    """Retrieve the papers most relevant to ``query`` and rerank them.

    The search runs in two stages:

    1. Bi-encoder: embed the query and fetch its nearest abstract embeddings
       from the FAISS ``index``.
    2. Cross-encoder: score every (query, abstract) pair and order the
       candidates by that score.

    Parameters
    ----------
    query : str
        Free-text user query.
    top_k : int, default 5
        Maximum number of papers to return. Values larger than the number of
        indexed abstracts are capped to that number.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` (``title`` and ``abstract``) plus a
        ``scores`` column, sorted by descending score. Empty when the index
        holds no vectors.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    # When asked for more neighbours than it holds, FAISS pads the id array
    # with -1. Fed to ``df.iloc``, a -1 would silently select the *last* row
    # of ``df`` (a paper that was never indexed), so never ask for more than
    # the index contains.
    n_neighbours = min(int(top_k), index.ntotal)
    if n_neighbours == 0:
        return df.iloc[:0].assign(scores=np.empty(0, dtype=np.float32))

    # Bi-Encoder stage
    query_embedding = bi_encoder.encode(query, convert_to_numpy=True)
    _distances, neighbour_ids = index.search(np.array([query_embedding]), n_neighbours)

    # Row 0 belongs to our single query; drop any remaining -1 padding.
    hit_positions = neighbour_ids[0]
    hit_positions = hit_positions[hit_positions >= 0]
    if hit_positions.size == 0:
        return df.iloc[:0].assign(scores=np.empty(0, dtype=np.float32))

    # FAISS ids are row positions in the indexed abstracts, which were taken
    # in order from the top of ``df``, hence positional ``iloc`` lookup.
    candidates = df.iloc[hit_positions]

    # Cross-Encoder Reranking
    pairs = [[query, abstract] for abstract in candidates['abstract'].values]
    scores = cross_encoder.predict(pairs)

    # ``assign`` and ``sort_values`` both return new frames, so ``df`` itself
    # is never mutated and no ``inplace`` operations are needed.
    return candidates.assign(scores=scores).sort_values('scores', ascending=False)

In [141]:
search(query)

Unnamed: 0,title,abstract,scores
117,Statistical Learning of Arbitrary Computable C...,Statistical learning theory chiefly studies ...,5.476835
288,Equations of States in Statistical Learning fo...,Many learning machines that have hierarchica...,3.946873
111,Learning Low-Density Separators,"We define a novel, basic, unsupervised learn...",2.974701
269,Introduction to Machine Learning: Class Notes ...,Introduction to Machine learning covering St...,-0.372652
467,A Generalization of the Chow-Liu Algorithm and...,We extend the Chow-Liu algorithm for general...,-8.988293
