In [1]:
from data_handling import get_data
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import faiss
from os.path import exists

# Data

In [8]:
# Load the dataset through the project's get_data helper and preview the first rows
data = get_data()
data.head()

Unnamed: 0,abstract,title,url
0,A fully differential calculation in perturba...,Calculation of prompt diphoton production cros...,https://arxiv.org/abs/0704.0001
1,"We describe a new algorithm, the $(k,\ell)$-...",Sparsity-certifying Graph Decompositions,https://arxiv.org/abs/0704.0002
2,The evolution of Earth-Moon system is descri...,The evolution of the Earth-Moon system based o...,https://arxiv.org/abs/0704.0003
3,We show that a determinant of Stirling cycle...,A determinant of Stirling cycle numbers counts...,https://arxiv.org/abs/0704.0004
4,In this paper we show how to compute the $\L...,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,https://arxiv.org/abs/0704.0005


In [9]:
# Text to embed for each paper: its title followed by its abstract, separated by one space
data['text'] = data['title'].str.cat(data['abstract'], sep=" ")

# Model

In [2]:
# Instantiate the 'all-mpnet-base-v2' sentence-transformers model;
# the bare `model` expression displays its module structure
model = SentenceTransformer('all-mpnet-base-v2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

#### Max Sequence Length

In [None]:
# Distribution of whitespace-delimited word counts per document.
# str.split() with no argument splits on any run of whitespace (spaces, newlines),
# so repeated spaces do not inflate the count with empty strings.
word_counts = data['text'].str.split().str.len()

fig, ax = plt.subplots()
word_counts.hist(bins=30, ax=ax)
ax.set_yscale('log')
# The model truncates at 384 *tokens*; word counts are only a rough lower bound on
# token counts, so documents near or past this line are likely truncated.
ax.axvline(384, color='red', label='model max_seq_length (384 tokens)')
ax.set(xlabel='words per document', ylabel='number of documents (log scale)',
       title='Document length vs. model sequence limit')
ax.legend();

# Faiss

In [None]:
# Embed the first document only, to inspect the shape of a single embedding
model.encode(data['text'].iloc[0]).shape

In [3]:
class FaissIdx:
    """Flat inner-product FAISS index held on GPU 0 and filled with model embeddings.

    Attributes:
        model: encoder object exposing ``encode(texts)``; used to embed documents.
        index: the FAISS index (GPU copy) that vectors are added to and searched in.
        ctr: number of vectors currently stored, kept in sync with ``index.ntotal``.
    """

    def __init__(self, model, dim=768):
        """Create an empty index.

        Args:
            model: encoder whose ``encode`` output rows have ``dim`` components.
            dim: dimensionality of the vectors stored in the index.
        """
        self.model = model

        # One GPU resource handle per instance, reused for every CPU -> GPU transfer
        # instead of allocating a fresh one on each load/save.
        self._gpu_res = faiss.StandardGpuResources()

        # Build the index on the CPU, then move it to GPU 0
        self.index = self._to_gpu(faiss.IndexFlatIP(dim))
        self.ctr = 0

    def _to_gpu(self, cpu_index):
        """Return a copy of ``cpu_index`` placed on GPU 0."""
        return faiss.index_cpu_to_gpu(self._gpu_res, 0, cpu_index)

    def add_doc(self, document_text, batch_size=256):
        """Encode ``document_text`` in batches and append the vectors to the index.

        Args:
            document_text: sliceable sequence of texts (e.g. list or array of str).
            batch_size: number of texts encoded and added per step.
        """
        for start in tqdm(range(0, len(document_text), batch_size),
                          desc="Adding documents to index", unit="batch"):
            batch = document_text[start:start + batch_size]
            self.index.add(self.model.encode(batch))

        # Keep the counter in sync so it is valid without a prior load_index call
        self.ctr = self.index.ntotal

    def load_index(self, index_path):
        """Replace the current index with the one stored at ``index_path``.

        The file is read as a CPU index and then moved to GPU 0; the previous
        contents of ``self.index`` are discarded.
        """
        self.index = self._to_gpu(faiss.read_index(index_path))
        self.ctr = self.index.ntotal

    def save_index(self, index_path):
        """Write the index to ``index_path``.

        A CPU copy is made for writing; ``self.index`` itself stays on the GPU.
        """
        faiss.write_index(faiss.index_gpu_to_cpu(self.index), index_path)
    
# Notebook-wide index instance, backed by the sentence-transformer loaded above
index = FaissIdx(model=model)

In [4]:
# Load the saved index from 'index.faiss' and show how many vectors it holds
# (load_index sets ctr from the loaded index's ntotal)
index.load_index('index.faiss')
index.ctr

565241

In [None]:
# If a saved index exists, load it. The existence check and the load must use the
# same path (the file written by save_index is 'index.faiss').
index_path = 'index.faiss'
if exists(index_path):
    print("Loading index...")
    index.load_index(index_path)

#### Add to index

In [None]:
#text_data = text_data[:1000]

In [None]:
# Array of the combined title + abstract strings to be indexed
text_data = data['text'].values
# Uncomment to index only the first 1000 documents for a quick trial run
#text_data = text_data[:1000]

In [None]:
# Encode every text and add the resulting vectors to the index, batch by batch
index.add_doc(text_data)

In [None]:
# Write the current index to 'index.faiss'
index.save_index('index.faiss')

In [None]:
# Load the index back from 'index.faiss' (replaces the in-memory index)
index.load_index('index.faiss')

In [5]:
# Free-text query to look up in the index
query = "Neural networks"

In [6]:
# Embed the query as a single-row matrix, then fetch the 3 best-scoring entries:
# D holds the scores, I the matching row ids
query_vec = index.model.encode(query).reshape(1, -1)
D, I = index.index.search(query_vec, 3)

In [10]:
# Pair each hit's title with its score. Ids are used positionally (data.iloc), so
# keep only ids that are valid row positions; this also drops negative ids and
# avoids rebuilding a list of the whole DataFrame index for every hit.
[(data.iloc[idx].title, score) for idx, score in zip(I[0], D[0]) if 0 <= idx < len(data)]

[('Neural Networks for Complex Data', 0.5609818),
 ('Neural Networks for Handwritten English Alphabet Recognition', 0.5540364),
 ('Computer Model of a "Sense of Humour". II. Realization in Neural\n  Networks',
  0.54914415)]

In [14]:
# Search for the 15 documents most similar to the query
query = "Finding New Physics without learning about it: Anomaly Detection as a tool for Searches at Colliders"
D, I = index.index.search(index.model.encode(query).reshape(1, -1), 15)
# Ids are used positionally (data.iloc), so keep only ids that are valid row
# positions instead of rebuilding a list of the whole DataFrame index per hit.
[(data.iloc[idx].title, score) for idx, score in zip(I[0], D[0]) if 0 <= idx < len(data)]

[('Automatic anomaly detection in high energy collider data', 0.777464),
 ('Semi-Supervised Anomaly Detection - Towards Model-Independent Searches\n  of New Physics',
  0.7767359),
 ('New physics searches for the LHC', 0.71976817),
 ('Exotic Searches at LHC and Tevatron', 0.7023321),
 ('Model Independent Searches for New Physics at the Fermilab Tevatron\n  Collider',
  0.69643986),
 ('Non-SUSY Searches at the Tevatron', 0.69399834),
 ('Searches for new physics at the Tevatron', 0.6828933),
 ('Model Independent Search For New Physics At The Tevatron', 0.6795386),
 ('Considerations in the Interpretation of Cosmological Anomalies', 0.6762587),
 ('Search for Anomalous Production of Events with Two Photons and\n  Additional Energetic Objects at CDF',
  0.6749262),
 ('Large Jet Multiplicities and New Physics at the LHC', 0.67225295),
 ('Results Of A Model-Independent Global Search For New Physics At CDF',
  0.6642769),
 ('Global Search for New Physics with 2.0/fb at CDF', 0.66402626),
 ('Int

In [None]:
# Read the JSON-lines file data/arxiv.json into a DataFrame (lines=True: one record per line)
original = pd.read_json("data/arxiv.json", lines=True)

In [17]:
# Title of the paper stored at row position 124321
data['title'].iloc[124321]

'A Super Bubble Candidate in the Galactic Center and a Local Enhancement\n  G359.77-0.09'

In [18]:
# Search for the 15 documents most similar to a (truncated) title from the corpus
query = "A Super Bubble Candidate in the Galactic Center and a Local Enhancement"
D, I = index.index.search(index.model.encode(query).reshape(1, -1), 15)
# Ids are used positionally (data.iloc), so keep only ids that are valid row
# positions instead of rebuilding a list of the whole DataFrame index per hit.
[(data.iloc[idx].title, score) for idx, score in zip(I[0], D[0]) if 0 <= idx < len(data)]

[('A Super Bubble Candidate in the Galactic Center and a Local Enhancement\n  G359.77-0.09',
  0.79220134),
 ('Fermi Bubbles and Bubble-like emission from the Galactic Plane', 0.7287574),
 ('Giant Gamma-ray Bubbles from Fermi-LAT: AGN Activity or Bipolar Galactic\n  Wind?',
  0.71087974),
 ("What's in a Fermi Bubble: a quasar episode in the Galactic centre",
  0.6959878),
 ('A diffuse bubble-like radio-halo source MRC 0116+111: imprint of AGN\n  feedback in a low-mass cluster of galaxies',
  0.68884975),
 ('The Fermi Bubbles: Giant, Multi-Billion-Year-Old Reservoirs of Galactic\n  Center Cosmic Rays',
  0.6878665),
 ('Kinematical and Physical properties of a 700 pc large bubble in NGC 6946',
  0.687588),
 ('Massive Star Forming Regions in the Galaxy using the Spitzer GLIMPSE\n  Survey',
  0.68454164),
 ('The Fermi Bubbles: Supersonic AGN Jets with Anisotropic Cosmic Ray\n  Diffusion',
  0.6836057),
 ('The Fermi Bubbles. I. Possible Evidence for Recent AGN Jet Activity in\n  the Galaxy'

In [None]:
# Shapes of the id array (I) and score array (D) returned by the last search
I.shape, D.shape

In [1]:
import pandas as pd

In [6]:
# Peek at one raw row of the CSV: skip the first 4 lines, read a single row,
# and treat nothing as header or index column
pd.read_csv('data/arxiv_processed.csv', nrows=1, skiprows=4, index_col=False, header=None)

Unnamed: 0,0,1,2
0,We show that a determinant of Stirling cycle...,A determinant of Stirling cycle numbers counts...,https://arxiv.org/abs/0704.0004


In [7]:
# Load the processed arXiv CSV with pandas' default parsing options
df = pd.read_csv('data/arxiv_processed.csv')

In [8]:
# Column names of the loaded frame
df.columns

Index(['abstract', 'title', 'url'], dtype='object')