initial commit of scripts
karpathy committed Nov 24, 2012
0 parents commit 7220487
Showing 8 changed files with 995 additions and 0 deletions.
24 changes: 24 additions & 0 deletions Readme.md

# NIPS papers pretty html

This is a set of scripts for creating a nice preview page (see it here: http://cs.stanford.edu/~karpathy/prettynips/ ) for all papers published at NIPS. I hope these scripts are useful to others who want to create similar pages for other conferences. They show how one can manipulate PDFs, extract image thumbnails, analyze word frequencies, etc.

#### Installation

0. Clone this repository into $FOLDER with `git clone` (use the repository path above).

1. Download nips25offline from `http://books.nips.cc/nips25.html` and move it into $FOLDER.

2. Install ImageMagick: `sudo apt-get install imagemagick`

3. Run `pdftowordcloud.py` to generate the top words for each paper (output saved as a pickle in topwords.p).

4. Run `pdftothumbs.py` to generate tiny thumbnails for all papers (output saved in the thumbs/ folder).

5. Run `scrape.py` to generate the paper id, title and author list by scraping the NIPS .html page (output saved in papers.p).

6. Finally, run `generatenice.py` to create the nipsnice.html page. (For convenience, a driver sketch chaining steps 3-6 follows below.)
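
A minimal driver sketch chaining steps 3-6. It is not part of this repository (`run_all.py` is a hypothetical name) and assumes the scripts and nips25offline all sit in $FOLDER:

```python
# run_all.py -- hypothetical convenience driver, not part of this repo
import os

# the order matters: generatenice.py consumes the outputs of the other three
for script in ["pdftowordcloud.py", "pdftothumbs.py", "scrape.py", "generatenice.py"]:
    print "running " + script
    if os.system("python " + script) != 0:
        print "error while running " + script
        break
```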

#### Licence

WTFPL
58 changes: 58 additions & 0 deletions generatenice.py
# creates the nice .html page
# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run

import cPickle as pickle

# load the pickle of papers scraped from the HTML page (result of scrape.py)
paperdict = pickle.load(open( "papers.p", "rb" ))
print "Loaded %d papers from papers.p (generated by scrape.py)" % (len(paperdict), )

# load the top word frequencies (result of pdftowordcloud.py)
topdict = pickle.load(open("topwords.p", "rb"))
print "Loaded %d entries from topwords.p (generated by pdftowordcloud.py)" % (len(topdict), )

# build up the string
html = open("nipsnice_template.html", "r").read()
s = ""
for p in paperdict:

    # get title, author
    title, author = paperdict[p]

    # get top words
    topwords = topdict.get(p, [])
    t = [x[0] for x in topwords]
    tcat = ", ".join(t)

    # get the path to the thumbnail for this paper
    thumbpath = "thumbs/NIPS2012_%s.pdf.jpg" % (p, )

    # get links to the PDF, supplementary material and bibtex on the NIPS servers
    pdflink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.pdf" % (p, )
    bibtexlink = "http://books.nips.cc/papers/files/nips25/bibhtml/NIPS2012_%s.html" % (p, )
    supplink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.extra.zip" % (p, )

    s += """
    <div class="apaper">
    <div class="paperdesc">
    <span class="ts">%s</span><br />
    <span class="as">%s</span><br /><br />
    </div>
    <div class="dllinks">
    <a href="%s">[pdf] </a>
    <a href="%s">[bibtex] </a>
    <a href="%s">[supplementary]<br /></a>
    </div>
    <img src="%s"><br />
    <span class="tt">TOP 100 words: %s</span>
    </div>
    """ % (title, author, pdflink, bibtexlink, supplink, thumbpath, tcat)

newhtml = html.replace("RESULTTABLE", s)

f = open("nipsnice.html", "w")
f.write(newhtml)
f.close()
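
# aside (a sketch, not in the original script): titles and authors are pasted
# into the HTML verbatim, so names containing characters like & or < would
# render oddly. A more careful version could escape them inside the loop, e.g.:
#
#   from cgi import escape
#   title, author = escape(title), escape(author)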

96 changes: 96 additions & 0 deletions nipsnice_template.html
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>NIPS 2012 Accepted Papers</title>

<style>
/* CSS */

body {
margin: 0;
padding: 0;
font-family: arial;
background-color: #F6F3E5;
}
.as {
font-size: 12px;
color: #900;
}
.ts {
font-weight: bold;
font-size: 14px;
}
.tt {
color: #009;
font-size: 13px;
}
h1 {
font-size: 20px;
padding: 0;
margin: 0;
}
#titdiv {
width: 100%;
height: 90px;
background-color: #840000;
color: white;

padding-top: 20px;
padding-left: 20px;

border-bottom: 1px solid #540000;
}

#maindiv {
width: 1000px;
padding: 15px;
margin-left: auto;
margin-right: auto;

border-left: solid 1px #D6D3C5;
border-right: solid 1px #D6D3C5;

background-color: white;
}

.apaper {
margin-top: 25px;
min-height: 300px;
}

.paperdesc {
float: left;
}

.dllinks {
float: right;
}

#titdiv a:link{ color: white; }
#titdiv a:visited{ color: white; }

#maindiv a:link{ color: #666; }
#maindiv a:visited{ color: #600; }

</style>

</head>

<body>

<div id="titdiv">
<h1>NIPS 2012 papers</h1>
(in a nicer format than <a href="http://books.nips.cc/nips25.html">this</a>)<br />
maintained by <a href="https://twitter.com/karpathy">@karpathy</a><br/>
source code on <a href="">github</a>
</div>

<div id="maindiv">
<!-- the keyword below will be replaced by content from the python script generatenice.py -->
RESULTTABLE
</div>

<br /><br /><br /><br /><br /><br />
</body>

</html>
29 changes: 29 additions & 0 deletions pdftothumbs.py
# go over all pdfs in NIPS and use imagemagick to convert
# them all to a sequence of thumbnail images
# requires sudo apt-get install imagemagick

import os

relpath = "nips25offline/content/"
allFiles = os.listdir(relpath)
pdfs = [x for x in allFiles if x.endswith(".pdf")]
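
# make sure the output folder exists (a small addition, not in the original
# script, which assumed thumbs/ had already been created)
if not os.path.exists("thumbs"):
    os.makedirs("thumbs")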

for i, f in enumerate(pdfs):
    paperid = f[9:-4]
    fullpath = relpath + f

    print "processing %s, %d/%d" % (paperid, i, len(pdfs))

    # this is a mouthful...
    # take the first 8 pages of the pdf ([0-7]), since the 9th page is usually references;
    # tile them horizontally, use JPEG quality 80, and trim the borders of each image
    cmd = "montage %s[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (fullpath, "thumbs/" + f + ".jpg")
    print "EXEC: " + cmd
    os.system(cmd)
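    # for a hypothetical paper id such as 0753, this expands to:
    # montage nips25offline/content/NIPS2012_0753.pdf[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim thumbs/NIPS2012_0753.pdf.jpg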


# an alternative, more roundabout approach that is worse and requires
# temporary files, yuck!
# cmd = "convert -thumbnail x200 %s[0-7] test.png" % (fullpath, )
# os.system(cmd)
# cmd = "montage -mode concatenate -quality 80 -tile x1 test-*.png %s" % ("thumbs/" + f + ".jpg", )
# os.system(cmd)
47 changes: 47 additions & 0 deletions pdftowordcloud.py
# go over all pdfs in NIPS, get all the words from each, discard stop words,
# count frequencies of all words, retain top 100 for each PDF and dump a
# pickle of results into topwords.p

import os
from string import punctuation
from operator import itemgetter
import re
import cPickle as pickle

N = 100 # how many top words to retain

# load in stopwords (i.e. boring words that we will ignore)
stopwords = open("stopwords.txt", "r").read().split()
stopwords = [x.strip(punctuation) for x in stopwords if len(x) > 2]

# get list of all PDFs supplied by NIPS
relpath = "nips25offline/content/"
allFiles = os.listdir(relpath)
pdfs = [x for x in allFiles if x.endswith(".pdf")]

# go over every PDF, use pdftotext to get all words, discard boring ones, and count frequencies
topdict = {} # dict of paperid -> [(word, frequency),...]
for i, f in enumerate(pdfs):
    paperid = f[9:-4]
    fullpath = relpath + f

    print "processing %s, %d/%d" % (paperid, i, len(pdfs))

    # convert the pdf to a text file
    cmd = "pdftotext %s %s" % (fullpath, "out.txt")
    print "EXEC: " + cmd
    os.system(cmd)

    txtlst = open("out.txt").read().split() # get all words in a giant list
    words = [x.lower() for x in txtlst if re.match('^[\w-]+$', x) is not None] # keep only tokens of word characters and hyphens
    words = [x for x in words if len(x) > 2 and (not x in stopwords)] # remove stop words

    # count up the frequencies of all words
    wcount = {}
    for w in words: wcount[w] = wcount.get(w, 0) + 1
    top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N] # sort and take the top N

    topdict[paperid] = top # save to our dict

# dump to pickle
pickle.dump(topdict, open("topwords.p", "wb"))
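
# aside (a sketch, not in the original script): on Python 2.7+ the manual
# counting loop above could be replaced with collections.Counter, e.g.:
#
#   from collections import Counter
#   top = Counter(words).most_common(N)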
74 changes: 74 additions & 0 deletions scrape.py
# scrape the nips25.html file looking for author names and titles,
# and create a database of all papers. This is necessary because
# extracting the authors and titles directly from the PDFs is tricky.

from HTMLParser import HTMLParser
import cPickle as pickle

class Paper:
    def __init__(self):
        self.paper = ""   # the id of the paper
        self.title = ""   # the title of the paper
        self.authors = "" # the author list of the paper

# create a subclass of HTMLParser and override handler methods
# this is an event-driven parser so we maintain a state etc.
# this is super hacky and tuned to the specifics of the .html
# page provided by NIPS.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.firstPaperEncountered = False
        self.curPaper = Paper()
        self.allPapers = []

    def handle_starttag(self, tag, attrs):
        if not tag == 'a': return

        # attrs is a list of (key, value) pairs
        for k, v in attrs:
            if k == 'name':
                print "New paper: " + v

                if self.firstPaperEncountered:
                    # push the current paper onto the list
                    self.allPapers.append(self.curPaper)

                # this anchor signals a new paper being read
                self.curPaper = Paper() # start a new paper
                self.curPaper.paper = v[1:] # for some reason the first character is P, then follows the 4-digit ID
                self.firstPaperEncountered = True

    def handle_endtag(self, tag):
        if not self.firstPaperEncountered: return

    def handle_data(self, data):
        if not self.firstPaperEncountered: return

        # there are many garbage newlines in the data; get rid of them
        s = data.strip()
        if len(s) == 0: return

        # the title is the first data encountered, then the authors
        if self.curPaper.title == "":
            self.curPaper.title = data
            print 'title ' + data
            return

        if self.curPaper.authors == "":
            self.curPaper.authors = data
            print 'authors ' + data
            return


parser = MyHTMLParser()
f = open('nips25offline/nips25.html').read()
parser.feed(f)

# the parser only pushes a paper when the next one begins, so the last
# paper is still sitting in curPaper; push it as well
if parser.firstPaperEncountered:
    parser.allPapers.append(parser.curPaper)

outdict = {}
for p in parser.allPapers:
    outdict[p.paper] = (p.title, p.authors)

# dump a dictionary indexed by paper id that points to a (title, authors) tuple
pickle.dump(outdict, open("papers.p", "wb"))
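
# quick sanity check (a small addition, not in the original script):
# reload the pickle and report how many papers were saved
check = pickle.load(open("papers.p", "rb"))
print "saved %d papers to papers.p" % (len(check), )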
