In [84]:
# !pip install feedparser
# !pip install beautifulsoup4
# !pip install lxml

## Paper parser

In [85]:
import feedparser
from bs4 import BeautifulSoup

In [86]:
feed = feedparser.parse('http://arxiv.org/rss/cs')
len(feed['entries'])

0

In [77]:
papers = []
for entry in feed['entries']:
    
    soup = BeautifulSoup(entry['author'], "html.parser")
    
    paper_id = entry.id.split('/')[-1]
    paper_title = entry['title'].split('.')[0]
    paper_abs = entry['summary'][3:-4]
    
    authors = []
    for tag in soup.find_all('a', href=True):
        authors.append(tag.contents[0])
    
    paper_entry = {
        'id': paper_id,
        'title': paper_title,
        'authors': authors,
        'abstract': paper_abs
    }
    
    papers.append(paper_entry)

In [78]:
authors_list = [
    # --- general
    'Geoffrey E. Hinton',
    # --- nlp
    'Graham Neubig',
    'Christopher D. Manning',  
    'Thomas Wolf',
    'Alexander M. Rush',
    'Dan Jurafsky',
    # --- conv-nets (and related stuff)
    'Jifeng Dai',
    'Ross Girshick',
    'Vladlen Koltun',
    # --- math, equivariance
    'Taco Cohen',
    'Max Welling',
]

In [79]:
key_words = [
    # --- nlp
    'bert',
    'syntax',
    'semantics'
    # --- math-charged
    'equivariant',
    'manifold',
    'sheaf',
    'homology',
]

In [80]:
key_words_pairs = [
    # --- nlp
    ('common', 'sense'),
    ('grammatical', 'error'),   
    # --- math
    ('category', 'theory'),
    ('diffential', 'geometry'),
    ('lie', 'group'),
    ('field', 'theory'),
    ('optimal', 'transport')
]
key_words_triples = [
    # --- nlp
    ('grammatical', 'error', 'correction')
]

key_words_pairs = [' '.join(ks) for ks in key_words_pairs]
key_words_triples = [' '.join(ks) for ks in key_words_triples]

In [81]:
print("=== author based search:\n")
for p in papers:
    for a in p['authors']:
        if a in authors_list:
            print(a)
            print(p['title'])
            print("https://arxiv.org/pdf/" + p['id'] + ".pdf")
            print()
            break

=== author based search:

Noah A. Smith
Time is Encoded in the Weights of Finetuned Language Models
https://arxiv.org/pdf/2312.13401.pdf

Jifeng Dai
A Survey of Reasoning with Foundation Models: Concepts, Methodologies, and Outlook
https://arxiv.org/pdf/2312.11562.pdf



In [82]:
interesting_items = []
for p in papers:

    words_title = [t.lower() for t in p['title'].split(" ")]
    pairs_title = [w1 + ' ' + w2 for (w1, w2) in zip(words_title[:-1], words_title[1:])]
    triples_title = [' '.join(ws) for ws in zip(words_title[:-2], words_title[1:-1], words_title[2:])]    
    
    words_abs = [t.lower().strip(',:;.?!') for t in p['abstract'].replace("\n"," ").split(" ")]
    pairs_abs = [w1 + ' ' + w2 for (w1, w2) in zip(words_abs[:-1], words_abs[1:])]
    triples_abs = [' '.join(ws) for ws in zip(words_abs[:-2], words_abs[1:-1], words_abs[2:])]
    
    found = False
    for k in key_words:  
        if k in words_title or k in words_abs:
            found=True
            break
    if found:
        interesting_items.append(p) 
        continue
        
    # pairs   
    found = False
    for kp in key_words_pairs:
        if kp in pairs_title or kp in pairs_abs:
            found=True
            break
    if found:
        interesting_items.append(p)
        continue
        
    # triples   
    found = False
    for kt in key_words_triples:
        if kt in triples_title or kt in triples_abs:
            found=True
            break
    if found:
        interesting_items.append(p)   
        continue

print("=== title/abstract based search:\n")         
for p in interesting_items:
    print(p['title'])
    print(', '.join(p['authors']))    
    print("https://arxiv.org/pdf/" + p['id'] + ".pdf")
    #print(p['abstract'])
    print()        

=== title/abstract based search:

Not All Steps are Equal: Efficient Generation with Progressive Diffusion Models
Wenhao Li, Xiu Su, Shan You, Tao Huang, Fei Wang, Chen Qian, Chang Xu
https://arxiv.org/pdf/2312.13307.pdf

Generate E-commerce Product Background by Integrating Category Commonality and Personalized Style
Haohan Wang, Wei Feng, Yang Lu, Yaoyu Li, Zheng Zhang, Jingjing Lv, Xin Zhu, Junjie Shen, Zhangang Lin, Lixing Bo, Jingping Shao
https://arxiv.org/pdf/2312.13309.pdf

Unlocking Pre-trained Image Backbones for Semantic Image Synthesis
Tariq Berrada, Jakob Verbeek, Camille Couprie, Karteek Alahari
https://arxiv.org/pdf/2312.13314.pdf

ShowRoom3D: Text to High-Quality 3D Room Generation Using 3D Priors
Weijia Mao, Yan-Pei Cao, Jia-Wei Liu, Zhongcong Xu, Mike Zheng Shou
https://arxiv.org/pdf/2312.13324.pdf

How to Prune Your Language Model: Recovering Accuracy on the "Sparsity May Cry'' Benchmark
Eldar Kurtic, Torsten Hoefler, Dan Alistarh
https://arxiv.org/pdf/2312.13547.pdf