-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 7220487
Showing
8 changed files
with
995 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
|
||
# NIPS papers pretty html | ||
|
||
This is a set of scripts for creating a nice preview page (see here: http://cs.stanford.edu/~karpathy/prettynips/ ) for all papers published at NIPS. I hope these scripts can be useful to others for creating similar pages for other conferences. They show how one can manipulate PDFs, extract image thumbnails, analyze word frequencies, etc.
|
||
#### Installation | ||
|
||
0. Clone this repository into $FOLDER: `git clone <url-of-this-repository>`
|
||
1. Download nips25offline from `http://books.nips.cc/nips25.html` and move it into $FOLDER. | ||
|
||
2. Install ImageMagick: `sudo apt-get install imagemagick` | ||
|
||
3. Run `pdftowordcloud.py` (to generate top words for each paper. Output saved in topwords.p as pickle) | ||
|
||
4. Run `pdftothumbs.py` (to generate tiny thumbnails for all papers. Outputs saved in thumbs/ folder) | ||
|
||
5. Run `scrape.py` (to generate paperid, title, authors list by scraping NIPS .html page) | ||
|
||
6. Finally, run `generatenice.py` (to create the nipsnice.html page) | ||
|
||
#### Licence | ||
|
||
WTFPL licence |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# creates the nice .html page | ||
# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run | ||
|
||
import cPickle as pickle | ||
|
||
# load the pickle of papers scraped from the HTML page (result of scrape.py) | ||
paperdict = pickle.load(open( "papers.p", "rb" )) | ||
print "Loaded %d papers from papers.p (generated by scrape.py)" % (len(paperdict), ) | ||
|
||
# load the top word frequencies (result of pdftowordcloud.py) | ||
topdict = pickle.load(open("topwords.p", "rb")) | ||
print "Loaded %d entries from topwords.p (generated by pdftowordcloud.py)" % (len(topdict), ) | ||
|
||
# build up the string | ||
html = open("nipsnice_template.html", "r").read() | ||
s = "" | ||
for p in paperdict: | ||
|
||
# get title, author | ||
title, author = paperdict[p] | ||
|
||
# get top words | ||
topwords = topdict.get(p, []) | ||
t = [x[0] for x in topwords] | ||
tcat = ", ".join(t) | ||
|
||
# get path to thumbnails for this paper | ||
thumbpath = "thumbs/NIPS2012_%s.pdf.jpg" % (p, ) | ||
|
||
# get links to PDF, supplementary and bibtex on NIPS servers | ||
pdflink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.pdf" % (p, ) | ||
bibtexlink = "http://books.nips.cc/papers/files/nips25/bibhtml/NIPS2012_%s.html" % (p, ) | ||
supplink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.extra.zip" % (p, ) | ||
|
||
s += """ | ||
<div class="apaper"> | ||
<div class="paperdesc"> | ||
<span class="ts">%s</span><br /> | ||
<span class="as">%s</span><br /><br /> | ||
</div> | ||
<div class="dllinks"> | ||
<a href="%s">[pdf] </a> | ||
<a href="%s">[bibtex] </a> | ||
<a href="%s">[supplementary]<br /></a> | ||
</div> | ||
<img src = "%s"><br /> | ||
<span class="tt">TOP 100 words: %s</span> | ||
</div> | ||
""" % (title, author, pdflink, bibtexlink, supplink, thumbpath, tcat) | ||
|
||
newhtml = html.replace("RESULTTABLE", s) | ||
|
||
f = open("nipsnice.html", "w") | ||
f.write(newhtml) | ||
f.close() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>NIPS 2012 Accepted Papers</title>

<style>
/* CSS */

body {
  margin: 0;
  padding: 0;
  font-family: arial;
  background-color: #F6F3E5;
}
/* author list */
.as {
  font-size: 12px;
  color: #900;
}
/* paper title */
.ts {
  font-weight: bold;
  font-size: 14px;
}
/* top-words line */
.tt {
  color: #009;
  font-size: 13px;
}
h1 {
  font-size: 20px;
  padding: 0;
  margin: 0;
}
#titdiv {
  width: 100%;
  height: 90px;
  background-color: #840000;
  color: white;

  padding-top: 20px;
  padding-left: 20px;

  border-bottom: 1px solid #540000;
}

#maindiv {
  width: 1000px;
  padding: 15px;
  margin-left: auto;
  margin-right: auto;

  border-left: solid 1px #D6D3C5;
  border-right: solid 1px #D6D3C5;

  background-color: white;
}

.apaper {
  margin-top: 25px;
  min-height: 300px;
}

.paperdesc {
  float: left;
}

.dllinks {
  float: right;
}

#titdiv a:link{ color: white; }
#titdiv a:visited{ color: white; }

#maindiv a:link{ color: #666; }
#maindiv a:visited{ color: #600; }

</style>

</head>

<body>

<div id="titdiv">
<h1>NIPS 2012 papers</h1>
(in nicer format than <a href="http://books.nips.cc/nips25.html">this</a>)<br />
maintained by <a href="https://twitter.com/karpathy">@karpathy</a><br/>
source code on <a href="">github</a>
</div>

<div id="maindiv">
<!-- the keyword below will be replaced by content from the python script generatenice.py -->
RESULTTABLE
</div>

<br /><br /><br /><br /><br /><br />
</body>

</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# go over all pdfs in NIPS and use imagemagick to convert | ||
# them all to a sequence of thumbnail images | ||
# requires sudo apt-get install imagemagick | ||
|
||
import os | ||
|
||
relpath = "nips25offline/content/" | ||
allFiles = os.listdir(relpath) | ||
pdfs = [x for x in allFiles if x.endswith(".pdf")] | ||
|
||
for i,f in enumerate(pdfs): | ||
paperid = f[9:-4] | ||
fullpath = relpath + f | ||
|
||
print "processing %s, %d/%d" % (paperid, i, len(pdfs)) | ||
|
||
# this is a mouthful... | ||
# take first 8 pages of the pdf ([0-7]), since 9th page are references | ||
# tile them horizontally, use JPEG compression 80, trim the borders for each image | ||
cmd = "montage %s[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (fullpath, "thumbs/" + f + ".jpg") | ||
print "EXEC: " + cmd | ||
os.system(cmd) | ||
|
||
|
||
# an alternate, more roundabout alternative that is worse and requires temporary files, yuck! | ||
#cmd = "convert -thumbnail x200 %s[0-7] test.png" % (fullpath, ) | ||
# os.system(cmd) | ||
#cmd = "montage -mode concatenate -quality 80 -tile x1 test-*.png %s" % ("thumbs/" + f + ".jpg", ) | ||
# os.system(cmd) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# go over all pdfs in NIPS, get all the words from each, discard stop words, | ||
# count frequencies of all words, retain top 100 for each PDF and dump a | ||
# pickle of results into topwords.p | ||
|
||
import os | ||
from string import punctuation | ||
from operator import itemgetter | ||
import re | ||
import cPickle as pickle | ||
|
||
N= 100 # how many top words to retain | ||
|
||
# load in stopwords (i.e. boring words, these we will ignore) | ||
stopwords = open("stopwords.txt", "r").read().split() | ||
stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2] | ||
|
||
# get list of all PDFs supplied by NIPS | ||
relpath = "nips25offline/content/" | ||
allFiles = os.listdir(relpath) | ||
pdfs = [x for x in allFiles if x.endswith(".pdf")] | ||
|
||
# go over every PDF, use pdftotext to get all words, discard boring ones, and count frequencies | ||
topdict = {} # dict of paperid -> [(word, frequency),...] | ||
for i,f in enumerate(pdfs): | ||
paperid = f[9:-4] | ||
fullpath = relpath + f | ||
|
||
print "processing %s, %d/%d" % (paperid, i, len(pdfs)) | ||
|
||
# create text file | ||
cmd = "pdftotext %s %s" % (fullpath, "out.txt") | ||
print "EXEC: " + cmd | ||
os.system(cmd) | ||
|
||
txtlst = open("out.txt").read().split() # get all words in a giant list | ||
words = [x.lower() for x in txtlst if re.match('^[\w-]+$', x) is not None] # take only alphanumerics | ||
words = [x for x in words if len(x)>2 and (not x in stopwords)] # remove stop words | ||
|
||
# count up frequencies of all words | ||
wcount = {} | ||
for w in words: wcount[w] = wcount.get(w, 0) + 1 | ||
top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N] # sort and take top N | ||
|
||
topdict[paperid] = top # save to our dict | ||
|
||
# dump to pickle | ||
pickle.dump(topdict, open("topwords.p", "wb")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# scrape the NIPS25.html file looking for authors names, titles | ||
# and create a database of all papers. This is necessary because | ||
# extracting the authors and titles from PDFs directly is tricky. | ||
|
||
from HTMLParser import HTMLParser | ||
import cPickle as pickle | ||
|
||
class Paper:
    """Plain record for one scraped NIPS paper: id, title and author list."""

    def __init__(self):
        # every field starts empty; the HTML parser fills them in as it
        # encounters the corresponding pieces of the page
        self.paper = ""    # the id of the paper
        self.title = ""    # the title of the paper
        self.authors = ""  # the author list of the paper
|
||
# create a subclass of HTMLParser and override handler methods | ||
# this is an event-driven parser so we maintain a state etc. | ||
# this is super hacky and tuned to the specifics of the .html | ||
# page provided by NIPS. | ||
class MyHTMLParser(HTMLParser): | ||
def __init__(self): | ||
HTMLParser.__init__(self) | ||
self.firstPaperEncountered = False | ||
self.curPaper = Paper() | ||
self.allPapers = [] | ||
|
||
def handle_starttag(self, tag, attrs): | ||
if not tag == 'a': return | ||
|
||
# attrs is a list of (key, value) pairs | ||
for k,v in attrs: | ||
if k == 'name': | ||
print "New paper: " + v | ||
|
||
if self.firstPaperEncountered: | ||
# push current paper to stack | ||
self.allPapers.append(self.curPaper) | ||
|
||
# this signals new paper being read | ||
self.curPaper = Paper() # start a new paper | ||
self.curPaper.paper = v[1:] # for some reason first character is P, then follows the 4-digit ID | ||
self.firstPaperEncountered = True | ||
|
||
def handle_endtag(self, tag): | ||
if not self.firstPaperEncountered: return | ||
|
||
def handle_data(self, data): | ||
if not self.firstPaperEncountered: return | ||
|
||
# there are many garbage data newlines, get rid of it | ||
s = data.strip() | ||
if len(s) == 0: return | ||
|
||
# title is first data encountered, then authors | ||
if self.curPaper.title == "": | ||
self.curPaper.title = data | ||
print 'title ' + data | ||
return | ||
|
||
if self.curPaper.authors == "": | ||
self.curPaper.authors = data | ||
print 'authors ' + data | ||
return | ||
|
||
|
||
parser = MyHTMLParser()
with open('nips25offline/nips25.html') as fin:
    page = fin.read()
parser.feed(page)

# BUGFIX: the parser only appends curPaper to allPapers when the *next*
# paper's anchor is encountered, so the very last paper on the page is still
# pending when feed() returns -- flush it here so it is not lost
if parser.firstPaperEncountered:
    parser.allPapers.append(parser.curPaper)

outdict = {}
for p in parser.allPapers:
    outdict[p.paper] = (p.title, p.authors)

# dump a dictionary indexed by paper id that points to (title, authors) tuple
with open("papers.p", "wb") as fout:
    pickle.dump(outdict, fout)
|
Oops, something went wrong.