In [None]:
!pip3 install numpy
!pip3 install matplotlib
!pip3 install pandas
!pip3 install pymupdf
!pip3 install PyPDF2
!pip3 install pytorch
!pip3 install gensim
!pip3 install nltk
!pip3 install scipy
!pip3 install sklearn
!pip3 install lxml
!pip3 install requests beautifulsoup4
!pip3 install google-cloud-storage

# Problem
Design a system that takes a research paper (PDF) as input and generates a list of the “top k authors” (k is chosen by the person submitting the paper) who could potentially review that paper.

# Dataset

https://www.kaggle.com/datasets/Cornell-University/arxiv

List files:
gsutil ls gs://arxiv-dataset/arxiv/

Download pdfs from March 2020:
gsutil cp -r gs://arxiv-dataset/arxiv/arxiv/pdf/2003/ ./a_local_directory/

Download all the source files:
gsutil cp -r gs://arxiv-dataset/arxiv/  ./a_local_directory/

# Get sample documents
```
sample_docs = [
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00001v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00003v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00004v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00005v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00006v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00007v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00007v2.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00008v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00009v1.pdf'
]
```

In [None]:
from bs4 import BeautifulSoup
import requests

# URL template for a paper's arXiv abstract page; `{}` is filled with the paper id.
meta_url="https://arxiv.org/abs/{}"
# URL template for a paper's PDF on arXiv; `{}` is a placeholder (presumably the
# paper id as well — it is not referenced in the visible part of the notebook).
pdf_url="https://arxiv.org/pdf/{}.pdf"

def extract_metadata(paper_id, timeout=30):
  """Scrape title, authors and abstract from a paper's arXiv abstract page.

  Args:
    paper_id: identifier substituted into `meta_url` (e.g. '2309.00001').
    timeout: seconds to wait for the HTTP response before giving up.

  Returns:
    Tuple `(paper_title, paper_authors, paper_abstract)`. Title and abstract
    are whitespace-stripped strings; authors is a list with one name per
    link found inside the authors element.

  Raises:
    requests.HTTPError: the abstract page answered with an error status.
    ValueError: the page lacks the expected title/authors/abstract elements.
  """
  # Make a GET request to fetch the raw HTML content; fail fast on HTTP errors
  # instead of trying to parse an error page.
  response = requests.get(meta_url.format(paper_id), timeout=timeout)
  response.raise_for_status()

  # Parse the html content
  soup = BeautifulSoup(response.text, "lxml")
  soup_title = soup.find("h1", attrs={"class": ("title", "mathjax")})
  soup_authors = soup.find("div", attrs={"class": "authors"})
  soup_abstract = soup.find("blockquote", attrs={"class": ("abstract", "mathjax")})
  if soup_title is None or soup_authors is None or soup_abstract is None:
    raise ValueError("Unexpected arXiv page layout for paper {}".format(paper_id))

  # The text node is picked by position; presumably the earlier children are the
  # "Title:" / "Abstract:" descriptor markup — TODO confirm against the live page.
  # str() detaches the value from the parse tree so the soup can be freed.
  paper_title = str(soup_title.contents[1]).strip()
  paper_abstract = str(soup_abstract.contents[2]).strip()

  # find_all() returns every author link; find() would return only the first
  # one, so each paper would end up with a single author.
  paper_authors = [link.text.strip() for link in soup_authors.find_all("a")]
  return (paper_title, paper_authors, paper_abstract)

In [None]:
import os

sample_dir = './.data/sample_docs'
try:
  os.makedirs(sample_dir, exist_ok = True)
except:
  pass

sample_docs = [
  'gs://arxiv-dataset/arxiv/arxiv/pdf/1405/1405.4053v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00001v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00003v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00004v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00005v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00006v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00007v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00007v2.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00008v1.pdf',
  'gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00009v1.pdf'
]
sample_doc_names = [(x.split('/')[-1].split('v')[0], os.path.join(sample_dir, x.split('/')[-1])) for x in sample_docs]

for doc_url in sample_docs:
  !gsutil cp "$doc_url" "$sample_dir"

Copying gs://arxiv-dataset/arxiv/arxiv/pdf/1405/1405.4053v1.pdf...
/ [0 files][    0.0 B/143.1 KiB]                                                / [1 files][143.1 KiB/143.1 KiB]                                                
Operation completed over 1 objects/143.1 KiB.                                    
Copying gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00001v1.pdf...
/ [1 files][ 25.1 MiB/ 25.1 MiB]                                                
Operation completed over 1 objects/25.1 MiB.                                     
Copying gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00003v1.pdf...
/ [1 files][ 94.3 KiB/ 94.3 KiB]                                                
Operation completed over 1 objects/94.3 KiB.                                     
Copying gs://arxiv-dataset/arxiv/arxiv/pdf/2309/2309.00004v1.pdf...
/ [1 files][585.4 KiB/585.4 KiB]                                                
Operation completed over 1 objects/585.4 KiB.                                

In [None]:
import PyPDF2

class Publication:
  """Plain record for one paper.

  The constructor stores its four arguments unchanged as the attributes
  `title`, `authors`, `abstract` and `paragraphs`.
  """

  def __init__(self, title, authors, abstract, paragraphs):
    # Metadata fields first, then the container for body text.
    self.title, self.authors, self.abstract = title, authors, abstract
    self.paragraphs = paragraphs

class PublicationMetaLoader:
  """Creates a `Publication` from metadata only, looked up by paper id."""

  def __init__(self, paper_id):
    # Identifier handed to extract_metadata() when load() is called.
    self.paper_id = paper_id

  def load(self):
    """Return a `Publication` built from `extract_metadata(self.paper_id)`.

    The resulting publication carries an empty `paragraphs` list.
    """
    metadata = extract_metadata(self.paper_id)
    title, authors, abstract = metadata
    return Publication(title, authors, abstract, [])

class PublicationLoader:
  """Loads a `Publication` from paper metadata plus the first page of a local PDF."""

  def __init__(self, paper_id, file_path):
    """Remember the paper id (for the metadata lookup) and the path of the PDF file."""
    self.file_path = file_path
    self.paper_id = paper_id

  def load(self):
    """Return a `Publication` whose `paragraphs` holds the text of the PDF's first page.

    Returns:
      The publication produced by `PublicationMetaLoader(self.paper_id).load()`
      with `paragraphs` set to a one-element list containing the first page's
      text, or to an empty list when the PDF has no pages.

    Raises:
      OSError: the PDF file cannot be opened.
    """
    # The context manager closes the file even if parsing or extraction raises.
    with open(self.file_path, 'rb') as f:
      pdfReader = PyPDF2.PdfReader(f)
      pages = pdfReader.pages
      # Guard against a zero-page document instead of failing on pages[0].
      paragraphs = [self.to_text(pages[0])] if len(pages) > 0 else []

    # The metadata lookup happens after the file has been released.
    pub = PublicationMetaLoader(self.paper_id).load()
    pub.paragraphs = paragraphs
    return pub

  def to_text(self, page):
    """Return the text of `page` as reported by its `extract_text()` method."""
    return page.extract_text()


In [None]:
import pandas as pd

# Build one metadata-only Publication per sample document. A paper whose
# lookup fails is reported (error followed by its id) and left out of `pubs`.
pubs = []
for entry in sample_doc_names:
  paper_id = entry[0]
  try:
    publication = PublicationMetaLoader(paper_id).load()
  except Exception as error:
    print(error, paper_id)
  else:
    pubs.append(publication)

# Number of publications that loaded successfully.
print(len(pubs))



10


In [None]:
import fitz
# Rebuild the visual text lines of the first sample PDF: spans whose line
# bounding boxes share the same (integer) top coordinate are joined into one line.
doc = fitz.open(sample_doc_names[0][1])

prev_y0 = None   # top coordinate of the line being assembled; None = none started
lines = []       # finished lines, in reading order
line_text = []   # words of the line currently being assembled
fonts = []       # (font, size) of every span in the line currently being assembled

for page in doc:
  for block in page.get_text('dict')["blocks"]:
    # Only type-0 blocks are processed; other block types are skipped.
    if block['type'] != 0:
      continue

    for line in block['lines']:
      y0 = int(line['bbox'][1])

      # A new vertical position closes the line collected so far. The
      # `line_text` check avoids emitting an empty string before any text
      # has been seen.
      if y0 != prev_y0 and line_text:
        lines.append(' '.join(line_text))
        line_text = []
        fonts = []

      for s in line['spans']:
        fonts.append((s['font'], s['size']))
        line_text.extend(s['text'].split(' '))
      prev_y0 = y0

  # Flush at the page boundary: without this the last line of the document is
  # never stored, and a line at the same height on the next page would be
  # merged into it.
  if line_text:
    lines.append(' '.join(line_text))
    line_text = []
    fonts = []
  prev_y0 = None


In [None]:
import nltk
# Fetch the NLTK 'punkt' and 'stopwords' packages (presumably the data behind
# the nltk.tokenize / nltk.corpus.stopwords imports used later — confirm).
nltk.download('punkt')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

import re
import numpy as np
import pandas as pd

def author_set(pubs):
  """Return the distinct entries found in the `authors` of all `pubs`.

  The result is a list built from a set, so its order is arbitrary.
  """
  unique_names = set().union(*(pub.authors for pub in pubs))
  return list(unique_names)

def cosine_score(X1, X2):
  """Cosine similarity between two vectors (1.0 = same direction, 0.0 = orthogonal).

  Args:
    X1, X2: array-likes with the same number of elements; they are flattened
      to 1-D before comparison.

  Returns:
    The cosine similarity as a float. If either vector is all zeros the
    similarity is undefined and 0.0 is returned instead of NaN.
  """
  u = np.asarray(X1, dtype=float).ravel()
  v = np.asarray(X2, dtype=float).ravel()
  # A zero vector has no direction; avoid the division by a zero norm.
  if not u.any() or not v.any():
    return 0.0
  # scipy's `cosine` is a *distance* (1 - similarity); convert it so that the
  # value matches what the function name and the "sim_score" column promise.
  return 1.0 - cosine(u, v)

# Module-level TF-IDF vectorizer with English stop words removed. It is (re)fitted
# by to_embeddings() and stays available afterwards for transforming other texts.
vectorizer = TfidfVectorizer(stop_words='english')
def to_embeddings(pubs):
  """Fit the shared `vectorizer` on the lower-cased abstracts of `pubs`.

  Returns:
    Tuple `(X, feature_names)`: the matrix produced by `fit_transform` (one row
    per publication) and the vectorizer's output feature names.
  """
  lowered_abstracts = []
  for pub in pubs:
    lowered_abstracts.append(pub.abstract.lower())

  # Fitting happens here, so every call replaces the previously learned vocabulary.
  tfidf_matrix = vectorizer.fit_transform(lowered_abstracts)
  feature_names = vectorizer.get_feature_names_out()
  return (tfidf_matrix, feature_names)

# Tunable constants for this experiment.
TRAIN_FRACTION = .8      # leading share of `pubs` used to fit the TF-IDF vocabulary
REFERENCE_FILL = 0.001   # value of every component of the reference vector
TOP_MATCHES = 2          # closest training rows reported per test paper

# Split the publications: the first TRAIN_FRACTION for training, the rest for testing.
total_len = len(pubs)
train_pubs = pubs[0:int(TRAIN_FRACTION*total_len)]
test_pubs = pubs[len(train_pubs):]

uniq_authors = author_set(train_pubs)

X_train, V = to_embeddings(train_pubs)
# Constant vector every document is scored against.
reference_vector = np.full(X_train.shape[1], REFERENCE_FILL)
refernce_vector = reference_vector  # original (misspelled) name kept as an alias

# Score every training paper once; the value is shared by all of its authors.
train_scores = [
  cosine_score(reference_vector, X_train[i].toarray()[0])
  for i in range(len(train_pubs))
]

print(X_train.shape, len(V), train_scores[0])

# One row per (paper, author) pair. The frame is built in a single step from a
# list of records instead of being grown row by row.
records = [
  {"doc_index": i, "doc_title": pub.title, "sim_score": train_scores[i], "author": author}
  for i, pub in enumerate(train_pubs)
  for author in pub.authors
]
train_df = pd.DataFrame(
  records, columns=["doc_index", "doc_title", "sim_score", "author"]
).sort_values(by="sim_score")

# The vectorizer was fitted on the training abstracts only; test abstracts are
# projected onto that vocabulary.
X_test = vectorizer.transform([pub.abstract.lower() for pub in test_pubs])

print(train_df)

# NOTE(review): papers are matched by how close their scalar scores against one
# fixed reference vector are. Two unrelated papers can share that score, so
# consider scoring each test vector directly against every row of X_train.
for t in X_test:
  score = cosine_score(reference_vector, t.toarray()[0])
  # Positions (in train_df's current order) of the rows with the closest score.
  nearest = (train_df['sim_score'] - score).abs().to_numpy().argsort()[:TOP_MATCHES]
  print(score, train_df.iloc[nearest])


(8, 421) 421 0.6767738956621577
   doc_index                                          doc_title  sim_score  \
6          6  When Measures are Unreliable: Imperceptible Ad...   0.597698   
7          7  When Measures are Unreliable: Imperceptible Ad...   0.597698   
1          1  QuanAnts Machine: A Quantum Algorithm for Biom...   0.613642   
5          5                          Dual Radar SAR Controller   0.639775   
4          4  High Spectral Spatial Resolution Synthetic Hyp...   0.667388   
0          0  Distributed Representations of Sentences and D...   0.676774   
3          3     José Díaz Bejarano (1933-2019). A Bibliography   0.695475   
2          2  Laser-assisted inelastic electron scattering b...   0.784275   

              author  
6         Yuchen Sun  
7         Yuchen Sun  
1  Phuong-Nam Nguyen  
5       Josiah Smith  
4          Yajie Sun  
0         Quoc V. Le  
3       J.M. Vaquero  
2     Gabriela Buica  
0.8427848456449878    doc_index                           