In [3]:
!pip install feedparser
!pip install beautifulsoup4



## Paper parser

In [4]:
import feedparser
from bs4 import BeautifulSoup

In [5]:
feed = feedparser.parse('http://arxiv.org/rss/cs')
len(feed['entries'])

531

In [6]:
papers = []
for entry in feed['entries']:
    
    soup = BeautifulSoup(entry['author'], "html.parser")
    
    paper_id = entry.id.split('/')[-1]
    paper_title = entry['title'].split('.')[0]
    paper_abs = entry['summary'][3:-4]
    
    authors = []
    for tag in soup.find_all('a', href=True):
        authors.append(tag.contents[0])
    
    paper_entry = {
        'id': paper_id,
        'title': paper_title,
        'authors': authors,
        'abstract': paper_abs
    }
    
    papers.append(paper_entry)

In [7]:
authors_list = [
    # --- general
    'Geoffrey E. Hinton',
    # --- nlp
    'Graham Neubig',
    'Christopher D. Manning',  
    'Thomas Wolf',
    'Alexander M. Rush',
    'Dan Jurafsky',
    # --- conv-nets (and related stuff)
    'Jifeng Dai',
    'Ross Girshick',
    'Vladlen Koltun',
    # --- math, equivariance
    'Taco Cohen',
    'Max Welling',
]

In [8]:
key_words = [
    # --- nlp
    'bert',
    'syntax',
    'semantics'
    # --- math-charged
    'equivariant',
    'manifold',
    'sheaf',
    'homology',
]
key_words_pairs = [
    # --- nlp
    ('common', 'sense'),
    ('grammatical', 'error'),   
    # --- math
    ('category', 'theory'),
    ('diffential', 'geometry'),
    ('lie', 'group'),
    ('field', 'theory'),
    ('optimal', 'transport')
]
key_words_triples = [
    # --- nlp
    ('grammatical', 'error', 'correction')
]

key_words_pairs = [' '.join(ks) for ks in key_words_pairs]
key_words_triples = [' '.join(ks) for ks in key_words_triples]

In [9]:
print("=== author based search:\n")
for p in papers:
    for a in p['authors']:
        if a in authors_list:
            print(a)
            print(p['title'])
            print("https://arxiv.org/pdf/" + p['id'] + ".pdf")
            print()
            break

=== author based search:

Jifeng Dai
DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral Planning States for Autonomous Driving
https://arxiv.org/pdf/2312.09245.pdf

Graham Neubig
An In-depth Look at Gemini's Language Abilities
https://arxiv.org/pdf/2312.11444.pdf

Jifeng Dai
A Survey of Reasoning with Foundation Models
https://arxiv.org/pdf/2312.11562.pdf



In [10]:
interesting_items = []
for p in papers:

    words_title = [t.lower() for t in p['title'].split(" ")]
    pairs_title = [w1 + ' ' + w2 for (w1, w2) in zip(words_title[:-1], words_title[1:])]
    triples_title = [' '.join(ws) for ws in zip(words_title[:-2], words_title[1:-1], words_title[2:])]    
    
    words_abs = [t.lower().strip(',:;.?!') for t in p['abstract'].replace("\n"," ").split(" ")]
    pairs_abs = [w1 + ' ' + w2 for (w1, w2) in zip(words_abs[:-1], words_abs[1:])]
    triples_abs = [' '.join(ws) for ws in zip(words_abs[:-2], words_abs[1:-1], words_abs[2:])]
    
    found = False
    for k in key_words:  
        if k in words_title or k in words_abs:
            found=True
            break
    if found:
        interesting_items.append(p) 
        continue
        
    # pairs   
    found = False
    for kp in key_words_pairs:
        if kp in pairs_title or kp in pairs_abs:
            found=True
            break
    if found:
        interesting_items.append(p)
        continue
        
    # triples   
    found = False
    for kt in key_words_triples:
        if kt in triples_title or kt in triples_abs:
            found=True
            break
    if found:
        interesting_items.append(p)   
        continue

print("=== title/abstract based search:\n")         
for p in interesting_items:
    print(p['title'])
    print(', '.join(p['authors']))    
    print("https://arxiv.org/pdf/" + p['id'] + ".pdf")
    #print(p['abstract'])
    print()        

=== title/abstract based search:

Dynamic Syntax Mapping: A New Approach to Unsupervised Syntax Parsing
Buvarp Gohsh, Woods Ali, Anders Michael
https://arxiv.org/pdf/2312.14966.pdf

Stacked tensorial neural networks for reduced-order modeling of a parametric partial differential equation
Caleb G. Wagner
https://arxiv.org/pdf/2312.14979.pdf

Latents2Semantics: Leveraging the Latent Space of Generative Models for Localized Style Manipulation of Face Images
Snehal Singh Tomar, A.N. Rajagopalan
https://arxiv.org/pdf/2312.15037.pdf

Unsupervised Auditory and Semantic Entrainment Models with Deep Neural Networks
Jay Kejriwal, Stefan Benus, Lina M. Rojas-Barahona
https://arxiv.org/pdf/2312.15098.pdf

On the Impact of Multiple Source Code Representations on Software Engineering Tasks -- An Empirical Study
Karthik Chandra Swarna, Noble Saji Mathews, Dheeraj Vagavolu, Sridhar Chimalakonda
https://arxiv.org/pdf/2106.10918.pdf

A Survey on Generative Diffusion Model
Hanqun Cao, Cheng Tan, Zhangyan