initial commit of scripts
karpathy committed Nov 24, 2012
0 parents commit 7220487
Showing 8 changed files with 995 additions and 0 deletions.
24 changes: 24 additions & 0 deletions Readme.md

# NIPS papers pretty html

This is a set of scripts for creating a nice preview page (see it here: http://cs.stanford.edu/~karpathy/prettynips/ ) for all papers published at NIPS. I hope these scripts are useful to others who want to create similar pages for other conferences. They show how one can manipulate PDFs, extract image thumbnails, analyze word frequencies, etc.

#### Installation

0. Clone this repository into $FOLDER with `git clone` (use the repository path above).

1. Download nips25offline from `http://books.nips.cc/nips25.html` and move it into $FOLDER.

2. Install ImageMagick: `sudo apt-get install imagemagick`

3. Run `pdftowordcloud.py` to generate the top words for each paper (output saved as a pickle in topwords.p).

4. Run `pdftothumbs.py` to generate tiny thumbnails for all papers (output saved in the thumbs/ folder).

5. Run `scrape.py` to generate the paper id, title and author list by scraping the NIPS .html page (output saved in papers.p).

6. Finally, run `generatenice.py` to create the nipsnice.html page. (For convenience, a driver sketch chaining steps 3-6 follows below.)
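
A minimal driver sketch chaining steps 3-6. It is not part of this repository (`run_all.py` is a hypothetical name) and assumes the scripts and nips25offline all sit in $FOLDER:

```python
# run_all.py -- hypothetical convenience driver, not part of this repo
import os

# the order matters: generatenice.py consumes the outputs of the other three
for script in ["pdftowordcloud.py", "pdftothumbs.py", "scrape.py", "generatenice.py"]:
    print "running " + script
    if os.system("python " + script) != 0:
        print "error while running " + script
        break
```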

#### Licence

WTFPL
58 changes: 58 additions & 0 deletions generatenice.py
# creates the nice .html page
# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run

import cPickle as pickle

# load the pickle of papers scraped from the HTML page (result of scrape.py)
paperdict = pickle.load(open( "papers.p", "rb" ))
print "Loaded %d papers from papers.p (generated by scrape.py)" % (len(paperdict), )

# load the top word frequencies (result of pdftowordcloud.py)
topdict = pickle.load(open("topwords.p", "rb"))
print "Loaded %d entries from topwords.p (generated by pdftowordcloud.py)" % (len(topdict), )

# build up the string
html = open("nipsnice_template.html", "r").read()
s = ""
for p in paperdict:

    # get title, author
    title, author = paperdict[p]

    # get top words
    topwords = topdict.get(p, [])
    t = [x[0] for x in topwords]
    tcat = ", ".join(t)

    # get the path to the thumbnail for this paper
    thumbpath = "thumbs/NIPS2012_%s.pdf.jpg" % (p, )

    # get links to the PDF, supplementary material and bibtex on the NIPS servers
    pdflink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.pdf" % (p, )
    bibtexlink = "http://books.nips.cc/papers/files/nips25/bibhtml/NIPS2012_%s.html" % (p, )
    supplink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.extra.zip" % (p, )

    s += """
    <div class="apaper">
    <div class="paperdesc">
    <span class="ts">%s</span><br />
    <span class="as">%s</span><br /><br />
    </div>
    <div class="dllinks">
    <a href="%s">[pdf] </a>
    <a href="%s">[bibtex] </a>
    <a href="%s">[supplementary]<br /></a>
    </div>
    <img src="%s"><br />
    <span class="tt">TOP 100 words: %s</span>
    </div>
    """ % (title, author, pdflink, bibtexlink, supplink, thumbpath, tcat)

newhtml = html.replace("RESULTTABLE", s)

f = open("nipsnice.html", "w")
f.write(newhtml)
f.close()
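
# aside (a sketch, not in the original script): titles and authors are pasted
# into the HTML verbatim, so names containing characters like & or < would
# render oddly. A more careful version could escape them inside the loop, e.g.:
#
#   from cgi import escape
#   title, author = escape(title), escape(author)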

96 changes: 96 additions & 0 deletions nipsnice_template.html
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>NIPS 2012 Accepted Papers</title>

<style>
/* CSS */

body {
margin: 0;
padding: 0;
font-family: arial;
background-color: #F6F3E5;
}
.as {
font-size: 12px;
color: #900;
}
.ts {
font-weight: bold;
font-size: 14px;
}
.tt {
color: #009;
font-size: 13px;
}
h1 {
font-size: 20px;
padding: 0;
margin: 0;
}
#titdiv {
width: 100%;
height: 90px;
background-color: #840000;
color: white;

padding-top: 20px;
padding-left: 20px;

border-bottom: 1px solid #540000;
}

#maindiv {
width: 1000px;
padding: 15px;
margin-left: auto;
margin-right: auto;

border-left: solid 1px #D6D3C5;
border-right: solid 1px #D6D3C5;

background-color: white;
}

.apaper {
margin-top: 25px;
min-height: 300px;
}

.paperdesc {
float: left;
}

.dllinks {
float: right;
}

#titdiv a:link{ color: white; }
#titdiv a:visited{ color: white; }

#maindiv a:link{ color: #666; }
#maindiv a:visited{ color: #600; }

</style>

</head>

<body>

<div id="titdiv">
<h1>NIPS 2012 papers</h1>
(in a nicer format than <a href="http://books.nips.cc/nips25.html">this</a>)<br />
maintained by <a href="https://twitter.com/karpathy">@karpathy</a><br/>
source code on <a href="">github</a>
</div>

<div id="maindiv">
<!-- the keyword below will be replaced by content from the python script generatenice.py -->
RESULTTABLE
</div>

<br /><br /><br /><br /><br /><br />
</body>

</html>
29 changes: 29 additions & 0 deletions pdftothumbs.py
# go over all pdfs in NIPS and use imagemagick to convert
# them all to a sequence of thumbnail images
# requires sudo apt-get install imagemagick

import os

relpath = "nips25offline/content/"
allFiles = os.listdir(relpath)
pdfs = [x for x in allFiles if x.endswith(".pdf")]
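
# make sure the output folder exists (a small addition, not in the original
# script, which assumed thumbs/ had already been created)
if not os.path.exists("thumbs"):
    os.makedirs("thumbs")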

for i, f in enumerate(pdfs):
    paperid = f[9:-4]
    fullpath = relpath + f

    print "processing %s, %d/%d" % (paperid, i, len(pdfs))

    # this is a mouthful...
    # take the first 8 pages of the pdf ([0-7]), since the 9th page is usually references;
    # tile them horizontally, use JPEG quality 80, and trim the borders of each image
    cmd = "montage %s[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (fullpath, "thumbs/" + f + ".jpg")
    print "EXEC: " + cmd
    os.system(cmd)
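    # for a hypothetical paper id such as 0753, this expands to:
    # montage nips25offline/content/NIPS2012_0753.pdf[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim thumbs/NIPS2012_0753.pdf.jpg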


# an alternative, more roundabout approach that is worse and requires
# temporary files, yuck!
# cmd = "convert -thumbnail x200 %s[0-7] test.png" % (fullpath, )
# os.system(cmd)
# cmd = "montage -mode concatenate -quality 80 -tile x1 test-*.png %s" % ("thumbs/" + f + ".jpg", )
# os.system(cmd)
47 changes: 47 additions & 0 deletions pdftowordcloud.py
# go over all pdfs in NIPS, get all the words from each, discard stop words,
# count frequencies of all words, retain top 100 for each PDF and dump a
# pickle of results into topwords.p

import os
from string import punctuation
from operator import itemgetter
import re
import cPickle as pickle

N = 100 # how many top words to retain

# load in stopwords (i.e. boring words that we will ignore)
stopwords = open("stopwords.txt", "r").read().split()
stopwords = [x.strip(punctuation) for x in stopwords if len(x) > 2]

# get list of all PDFs supplied by NIPS
relpath = "nips25offline/content/"
allFiles = os.listdir(relpath)
pdfs = [x for x in allFiles if x.endswith(".pdf")]

# go over every PDF, use pdftotext to get all words, discard boring ones, and count frequencies
topdict = {} # dict of paperid -> [(word, frequency),...]
for i, f in enumerate(pdfs):
    paperid = f[9:-4]
    fullpath = relpath + f

    print "processing %s, %d/%d" % (paperid, i, len(pdfs))

    # convert the pdf to a text file
    cmd = "pdftotext %s %s" % (fullpath, "out.txt")
    print "EXEC: " + cmd
    os.system(cmd)

    txtlst = open("out.txt").read().split() # get all words in a giant list
    words = [x.lower() for x in txtlst if re.match('^[\w-]+$', x) is not None] # keep only tokens of word characters and hyphens
    words = [x for x in words if len(x) > 2 and (not x in stopwords)] # remove stop words

    # count up the frequencies of all words
    wcount = {}
    for w in words: wcount[w] = wcount.get(w, 0) + 1
    top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N] # sort and take the top N

    topdict[paperid] = top # save to our dict

# dump to pickle
pickle.dump(topdict, open("topwords.p", "wb"))
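
# aside (a sketch, not in the original script): on Python 2.7+ the manual
# counting loop above could be replaced with collections.Counter, e.g.:
#
#   from collections import Counter
#   top = Counter(words).most_common(N)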
74 changes: 74 additions & 0 deletions scrape.py
# scrape the nips25.html file looking for author names and titles,
# and create a database of all papers. This is necessary because
# extracting the authors and titles directly from the PDFs is tricky.

from HTMLParser import HTMLParser
import cPickle as pickle

class Paper:
    def __init__(self):
        self.paper = ""   # the id of the paper
        self.title = ""   # the title of the paper
        self.authors = "" # the author list of the paper

# create a subclass of HTMLParser and override handler methods
# this is an event-driven parser so we maintain a state etc.
# this is super hacky and tuned to the specifics of the .html
# page provided by NIPS.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.firstPaperEncountered = False
        self.curPaper = Paper()
        self.allPapers = []

    def handle_starttag(self, tag, attrs):
        if not tag == 'a': return

        # attrs is a list of (key, value) pairs
        for k, v in attrs:
            if k == 'name':
                print "New paper: " + v

                if self.firstPaperEncountered:
                    # push the current paper onto the list
                    self.allPapers.append(self.curPaper)

                # this anchor signals a new paper being read
                self.curPaper = Paper() # start a new paper
                self.curPaper.paper = v[1:] # for some reason the first character is P, then follows the 4-digit ID
                self.firstPaperEncountered = True

    def handle_endtag(self, tag):
        if not self.firstPaperEncountered: return

    def handle_data(self, data):
        if not self.firstPaperEncountered: return

        # there are many garbage newlines in the data; get rid of them
        s = data.strip()
        if len(s) == 0: return

        # the title is the first data encountered, then the authors
        if self.curPaper.title == "":
            self.curPaper.title = data
            print 'title ' + data
            return

        if self.curPaper.authors == "":
            self.curPaper.authors = data
            print 'authors ' + data
            return


parser = MyHTMLParser()
f = open('nips25offline/nips25.html').read()
parser.feed(f)

# the parser only pushes a paper when the next one begins, so the last
# paper is still sitting in curPaper; push it as well
if parser.firstPaperEncountered:
    parser.allPapers.append(parser.curPaper)

outdict = {}
for p in parser.allPapers:
    outdict[p.paper] = (p.title, p.authors)

# dump a dictionary indexed by paper id that points to a (title, authors) tuple
pickle.dump(outdict, open("papers.p", "wb"))
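
# quick sanity check (a small addition, not in the original script):
# reload the pickle and report how many papers were saved
check = pickle.load(open("papers.p", "rb"))
print "saved %d papers to papers.p" % (len(check), )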
