In [None]:
import html
import re
import string
import urllib

import bs4
import feedparser
import numpy as np
import requests
from IPython.core.display import display, HTML

# URL template for an arXiv subject's "/new" listing page (an HTML page, not an
# API endpoint); `{}` is filled with the subject code by `get_new_papers`.
BASE_URL = "http://arxiv.org/list/{}/new"

In [None]:
def _entry_field(entry, tag, css_class, label):
    """Return the text of one labelled field inside a listing entry, or "" if absent."""
    # `find` only looks inside `entry`; `find_next` would keep scanning the rest
    # of the document and could return a field belonging to a later paper.
    node = entry.find(tag, {"class": css_class})
    if node is None:
        return ""
    # Drop the leading descriptor (e.g. "Title:") that precedes the value.
    return node.text.split(label)[-1].strip()


def get_new_papers(subject, timeout=30):
    """Scrape the "/new" listing page of one arXiv subject.

    Args:
        subject: Subject code substituted into ``BASE_URL`` (e.g. ``"cs.LG"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict mapping each arXiv id (str) to a paper dict with the keys
        ``"id"``, ``"title"``, ``"authors"`` (list of str), ``"subjects"``
        (set of str) and ``"abstract"`` (str with newlines replaced by spaces).
        Fields missing from an entry come back empty (``""``, ``[]``, ``set()``).

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        IndexError: If an identifier element contains no numeric id.
    """
    url = BASE_URL.format(subject)

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # Papers are enclosed in `dd` tags.
    entries = soup.find_all("dd")
    # Identifiers live in separate `span.list-identifier` elements and are
    # paired with the entries purely by position.
    ids = [re.findall(r'\d*\.\d+|\d+', num.text)[0]
           for num in soup.find_all('span', {'class': 'list-identifier'})]

    papers = {}
    for arxiv_id, entry in zip(ids, entries):
        paper = {}
        paper["id"] = arxiv_id

        paper["title"] = _entry_field(entry, "div", "list-title", "Title:")

        authors = _entry_field(entry, "div", "list-authors", "Authors:")
        paper["authors"] = authors.split(', \n') if authors else []

        subjects = _entry_field(entry, "div", "list-subjects", "Subjects:")
        paper["subjects"] = set(subjects.split('; ')) if subjects else set()

        # Not every entry carries an abstract paragraph.
        abstract = entry.find('p', {'class': 'mathjax'})
        abstract = "" if abstract is None else abstract.text
        paper["abstract"] = abstract.replace('\n', ' ')

        papers[arxiv_id] = paper
    return papers


HIGHLIGHT_TEMPLATE = '<span style="color:blue">{}</span>'


def highlight(x, text):
    """Wrap every occurrence of the word `x` in `text` with the highlight markup.

    `text` is split on single spaces. A token matches when it equals `x`
    case-insensitively after punctuation is stripped from both of its ends;
    the token is wrapped as-is, punctuation included.

    Args:
        x: Word to highlight.
        text: Text to search.

    Returns:
        `text` with each matching token wrapped in `HIGHLIGHT_TEMPLATE`.
    """
    target = x.lower()

    def mark(token):
        # Compare on the normalised token but keep the original in the output.
        if token.strip(string.punctuation).lower() == target:
            return HIGHLIGHT_TEMPLATE.format(token)
        return token

    return ' '.join(mark(token) for token in text.split(' '))
    

def is_interesting(paper, authors=None, keywords=None):
    """Tell whether a paper matches the author or keyword watch lists.

    Args:
        paper: Paper dict with ``"authors"`` (list of str), ``"abstract"`` and
            ``"title"`` (str).
        authors: Author names to watch for (exact match). Defaults to the
            module-level ``AUTHORS``.
        keywords: Keywords to watch for. Defaults to the module-level
            ``KEYWORDS``.

    Returns:
        True if any of the paper's authors is on the author list, or if any
        keyword occurs (case-insensitive substring) in the abstract or title.
    """
    if authors is None:
        authors = AUTHORS
    if keywords is None:
        keywords = KEYWORDS

    if any(author in authors for author in paper["authors"]):
        return True

    abstract = paper["abstract"].lower()
    title = paper["title"].lower()
    for keyword in keywords:
        # Lowercase once so the title check is case-insensitive too.
        needle = keyword.lower()
        if needle in abstract or needle in title:
            return True
    return False


def highlight_paper(paper, authors=None, keywords=None):
    """Return a copy of `paper` with watch-list matches wrapped in highlight markup.

    The input dict and its author list are left untouched, so the raw paper
    can still be compared against the watch lists afterwards.

    Args:
        paper: Paper dict with at least ``"authors"`` (list of str) and
            ``"abstract"`` (str).
        authors: Author names to highlight (exact match). Defaults to the
            module-level ``AUTHORS``.
        keywords: Keywords to highlight in the abstract. Defaults to the
            module-level ``KEYWORDS``.

    Returns:
        A shallow copy of `paper` whose ``"authors"`` and ``"abstract"``
        entries carry the highlight markup.
    """
    if authors is None:
        authors = AUTHORS
    if keywords is None:
        keywords = KEYWORDS

    highlighted = dict(paper)
    highlighted["authors"] = [
        HIGHLIGHT_TEMPLATE.format(author) if author in authors else author
        for author in paper["authors"]
    ]

    abstract = paper["abstract"]
    for keyword in keywords:
        abstract = highlight(keyword, abstract)
    highlighted["abstract"] = abstract
    return highlighted


def render_paper(paper):
    """Render one paper as an HTML fragment.

    Args:
        paper: Paper dict with ``"id"``, ``"title"``, ``"authors"`` (list of
            str), ``"abstract"`` (str) and ``"subjects"`` (iterable of str).

    Returns:
        HTML string holding the title, authors, abstract, subjects and links
        to the arXiv abstract page and PDF.
    """
    # Title, subjects and id are scraped plain text, so escape them for HTML.
    title = html.escape(paper["title"])
    # Sort so the output does not depend on set iteration order.
    subjects = ', '.join(html.escape(subject) for subject in sorted(paper["subjects"]))
    arxiv_id = html.escape(str(paper["id"]))

    # NOTE: authors and abstract are inserted verbatim because they may already
    # contain highlight markup; any other HTML in them is passed through as well.
    render = ""
    render += "<h3>{}</h3>".format(title)
    render += "<b>{}</b><br>".format(', '.join(paper["authors"]))
    render += "{}<br>".format(paper["abstract"])
    render += "<i>{}</i><br>".format(subjects)
    render += '<a href="https://arxiv.org/abs/{}" target="_blank">[arXiv]</a> '.format(arxiv_id)
    render += '<a href="https://arxiv.org/pdf/{}.pdf" target="_blank">[pdf]</a><br>'.format(arxiv_id)
    return render

## User Preferences

In [None]:
# arXiv subject codes whose "/new" listings are fetched.
CATEGORIES = [
    "cs.AI",
    "cs.CL",
    "cs.GT",
    "cs.CV",
    "cs.LG",
    "cs.MA",
    "cs.SI",
    "cs.NE",
    "stat.ML",
]

# Author watch list: a paper is starred when one of its authors matches a name
# here exactly.
AUTHORS = [
    "Satinder Singh",
    "Honglak Lee",
    "Aaron Courville",
    "Joelle Pineau",
    "Yoshua Bengio",
    "Ryan Lowe",
    "Nando de Freitas",
    "Junhyuk Oh",
    "Pieter Abbeel",
    "David Silver",
    "Scott Reed",
    "Alex Graves",
    "Jacob Andreas",
]

# Keyword watch list: a paper is starred when one of these occurs in its
# abstract or title.
KEYWORDS = [
    "reinforcement",
    "lifelong",
    "continual",
    "education",
    "multi-agent",
    "memory",
    "time",
    "meta-learning",
]

## Gather papers

In [None]:
# Merge the listings of every watched category into one dict keyed by arXiv id;
# a paper appearing in several listings is kept once (the last fetch wins).
papers = {}
for cat in CATEGORIES:
    cat_papers = get_new_papers(cat)
    papers.update(cat_papers)
print("Found {} papers.".format(len(papers)))

# Classify every paper exactly once, on the raw scraped data. Highlighted
# author names are wrapped in markup and no longer match the watch list, so
# classifying again after highlighting could list a starred paper a second
# time under "Other Papers".
starred_papers = []
other_papers = []
for paper in papers.values():
    if is_interesting(paper):
        starred_papers.append(paper)
    else:
        other_papers.append(paper)

# Render the interesting papers first, with watch-list matches highlighted.
page = ""
page += '<link href="https://fonts.googleapis.com/css?family=Lato" rel="stylesheet">'
page += "<style>.output_html *{font-family: 'Lato', sans-serif;}</style>"
page += "<h1>Starred Papers</h1>"
for paper in starred_papers:
    page += render_paper(highlight_paper(paper))

# Render the rest without highlighting.
page += "<br><br><h1>Other Papers</h1>"
for paper in other_papers:
    page += render_paper(paper)

display(HTML(page))