diff --git a/README.md b/README.md
index cb68e38..37ba1cd 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,15 @@ Paper Machines is an open-source extension for the [Zotero](http://www.zotero.or
 ## Prerequisites
-In order to run Paper Machines, you will need the following (note that Python and Java are installed automatically on Mac OS X):
+In order to run Paper Machines, you will need the following (Python and Java are installed automatically on Mac OS X 10.7 and above):
 * [Zotero](http://www.zotero.org/) with PDF indexing tools installed (see the Search pane of Zotero's Preferences)
 * a corpus of documents with high-quality metadata (recommended: at least 1,000 for topic modeling purposes)
-* Python ([download for Windows](http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi))
-* Java ([download for Windows/Mac/Linux/etc.](http://java.com/en/download/index.jsp))
+* Python 2.7 ([download page](http://www.python.org/download/releases/2.7.3)) \[N.B. Mac OS 10.6 users must download this version of Python\]
+* Java ([download page](http://java.com/en/download/index.jsp))
 ## Installation
-Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the XPI file. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
+Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the XPI file. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
 ## Usage
 To begin, right-click (control-click for Mac) on the collection you wish to analyze and select "Extract Texts for Paper Machines." Once the extraction process is complete, this right-click menu will offer several different processes that may be run on a collection, each with an accompanying visualization. Once these processes have been run, selecting "Export Output of Paper Machines..." will allow you to choose which visualizations to export.
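Since the README change above pins the Python requirement to 2.7, a quick way for a user to confirm that both prerequisites are actually reachable is a short self-check. The snippet below is only an illustrative sketch and is not part of Paper Machines (the `check_prerequisites` helper name is invented); it assumes `java` is on the PATH, as a normal Java install arranges.

```python
#!/usr/bin/env python2.7
# Illustrative sketch only (not shipped with Paper Machines): confirm that the
# prerequisites listed above -- Python 2.7 and a Java runtime -- are available.
import os
import subprocess
import sys


def check_prerequisites():
    ok = True
    if sys.version_info[:2] != (2, 7):
        print "Python 2.7 required, found %d.%d" % sys.version_info[:2]
        ok = False
    try:
        with open(os.devnull, 'w') as devnull:
            # `java -version` exits 0 (and prints to stderr) when Java is installed
            if subprocess.call(["java", "-version"], stdout=devnull, stderr=devnull) != 0:
                ok = False
    except OSError:
        print "Java was not found on the PATH"
        ok = False
    return ok


if __name__ == "__main__":
    sys.exit(0 if check_prerequisites() else 1)
```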
diff --git a/chrome/content/papermachines/options.xul b/chrome/content/papermachines/options.xul index 85e29e9..e522e60 100644 --- a/chrome/content/papermachines/options.xul +++ b/chrome/content/papermachines/options.xul @@ -18,6 +18,7 @@ + @@ -40,6 +41,10 @@ + + diff --git a/chrome/content/papermachines/papermachines.js b/chrome/content/papermachines/papermachines.js index 4ca75b2..2b19370 100755 --- a/chrome/content/papermachines/papermachines.js +++ b/chrome/content/papermachines/papermachines.js @@ -18,6 +18,7 @@ Zotero.PaperMachines = { install_dir: null, tagCloudReplace: true, processors_dir: null, + python_exe: null, processors: ["wordcloud", "phrasenet", "mallet", "mallet_classify", "geoparse", "dbpedia", "export-output"], processNames: null, // see locale files prompts: null, @@ -231,9 +232,11 @@ Zotero.PaperMachines = { this.log_dir = this._getOrCreateDir("logs", this.out_dir); this.args_dir = this._getOrCreateDir("args"); + Components.utils.import("chrome://papermachines/content/Preferences.js"); Components.utils.import("chrome://papermachines/content/strptime.js"); + this.python_exe = this.findPythonExecutable(); var stoplist_lang = Preferences.get("extensions.papermachines.general.lang") || "en"; @@ -252,7 +255,7 @@ Zotero.PaperMachines = { Components.utils.import("resource://gre/modules/AddonManager.jsm"); AddonManager.getAddonByID("papermachines@chrisjr.org", function(addon) { - Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI().QueryInterface(Components.interfaces.nsIFileURL).file); + Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI("").QueryInterface(Components.interfaces.nsIFileURL).file); }); // Connect to (and create, if necessary) papermachines.sqlite in the Zotero directory @@ -389,8 +392,8 @@ Zotero.PaperMachines = { return; } - var proc_file = Zotero.PaperMachines.processors_dir.clone(); - proc_file.append(processor + ".pyw"); + var processor_file = Zotero.PaperMachines.processors_dir.clone(); + processor_file.append(processor + ".py"); var proc = Components.classes["@mozilla.org/process/util;1"] .createInstance(Components.interfaces.nsIProcess); @@ -413,7 +416,7 @@ Zotero.PaperMachines = { var argFile = Zotero.PaperMachines._getOrCreateFile(argsHashFilename, Zotero.PaperMachines.args_dir); Zotero.File.putContents(argFile, args_str); - var procArgs = [argFile.path]; + var procArgs = [processor_file.path, argFile.path]; outFile.append(processor + thisID + "-" + args_hash + ".html"); @@ -431,9 +434,13 @@ Zotero.PaperMachines = { } }; - var observer = new this.processObserver(processor, processPath, callback); + var observer = new Zotero.PaperMachines.processObserver(processor, processPath, callback); + + var python_exe_file = Zotero.PaperMachines._getLocalFile(Zotero.PaperMachines.python_exe); + + Zotero.PaperMachines.LOG("running " + python_exe_file.leafName + " " + procArgs.join(" ")); - proc.init(proc_file); + proc.init(python_exe_file); proc.runAsync(procArgs, procArgs.length, observer); }, replaceTagsBoxWithWordCloud: function (uri) { @@ -678,7 +685,7 @@ Zotero.PaperMachines = { }, traverseItemGroup: function (itemGroup) { var itemGroups = []; - if ("isLibrary" in itemGroup && itemGroup.isLibrary()) { + if (typeof itemGroup.isLibrary == "function" && itemGroup.isLibrary()) { if (itemGroup.id == "L") { itemGroups.push(ZoteroPane.collectionsView._dataItems[0][0]); var collectionKeys = Zotero.DB.columnQuery("SELECT key from collections WHERE libraryID IS NULL;"); @@ -687,7 +694,7 @@ Zotero.PaperMachines = { } } } else { - if 
("isCollection" in itemGroup && itemGroup.isCollection()) { + if (typeof itemGroup.isCollection == "function" && itemGroup.isCollection()) { itemGroups.push(itemGroup); var currentCollection = ("ref" in itemGroup) ? itemGroup.ref : itemGroup; if (currentCollection.hasChildCollections()) { @@ -696,7 +703,7 @@ Zotero.PaperMachines = { itemGroups.push(Zotero.PaperMachines.traverseItemGroup(children[i])); } } - } else if ("isGroup" in itemGroup && itemGroup.isGroup()) { + } else if (typeof itemGroup.isGroup == "function" && itemGroup.isGroup()) { if (itemGroup.ref.hasCollections()) { var children = itemGroup.ref.getCollections(); for (var i in children) { @@ -922,7 +929,7 @@ Zotero.PaperMachines = { Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [tagsFile.path, item.id, tagsFile.path.replace("_tags.txt", ".txt"), dir.leafName]); }, _updateBundledFilesCallback: function (installLocation) { - this.install_dir = installLocation; + Zotero.PaperMachines.install_dir = installLocation; var xpiZipReader, isUnpacked = installLocation.isDirectory(); if(!isUnpacked) { xpiZipReader = Components.classes["@mozilla.org/libjar/zip-reader;1"] @@ -941,12 +948,12 @@ Zotero.PaperMachines = { procs_dir.append("papermachines"); procs_dir.append("processors"); - this._copyAllFiles(procs_dir, this.processors_dir); + this._copyAllFiles(procs_dir, Zotero.PaperMachines.processors_dir); } - this.aux_dir = this._getOrCreateDir("support", this.processors_dir); + Zotero.PaperMachines.aux_dir = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.processors_dir); - var new_aux = this._getOrCreateDir("support", this.out_dir); - this._copyAllFiles(this.aux_dir, new_aux); + var new_aux = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.out_dir); + Zotero.PaperMachines._copyAllFiles(Zotero.PaperMachines.aux_dir, new_aux); }, _copyOrMoveAllFiles: function (copy_or_move, source, target, recursive) { var files = source.directoryEntries; @@ -960,10 +967,6 @@ Zotero.PaperMachines = { } if (copy_or_move) { f.copyTo(target, f.leafName); - if (f.leafName.indexOf(".pyw") != -1) { - var regpy = f.leafName.replace(".pyw", ".py"); - f.copyTo(target, regpy); - } } else { f.moveTo(target, f.leafName); } @@ -1500,7 +1503,7 @@ Zotero.PaperMachines = { win.gBrowser.selectedTab = win.gBrowser.addTab(url); } }, - openPreferences : function() { + openPreferences: function() { if (!this._preferencesWindow || this._preferencesWindow.closed) { var instantApply = Application.prefs.get("browser.preferences.instantApply"); var features = "chrome,titlebar,toolbar,centerscreen" + @@ -1512,6 +1515,41 @@ Zotero.PaperMachines = { this._preferencesWindow.focus(); }, + findPythonExecutable: function () { + var python_exe = Preferences.get("extensions.papermachines.general.python_exe"); + if (!python_exe) { + var environment = Components.classes["@mozilla.org/process/environment;1"] + .getService(Components.interfaces.nsIEnvironment); + var path = environment.get("PATH"), + python_name = "pythonw", + directories = []; + + if (Zotero.platform == "Win32") { + python_name += ".exe"; + directories = ["C:\\Python27\\"]; + } else { + python_name += "2.7"; + directories = ["/usr/bin", "/usr/local/bin", "/sw/bin", "/opt/local/bin"]; + } + + for (var i = 0, n = directories.length; i < n; i++) { + var executable = Zotero.PaperMachines._getLocalFile(directories[i]); + executable.append(python_name); + if (executable.exists()) { + python_exe = 
executable.path; + break; + } + } + + if (python_exe) { + Preferences.set("extensions.papermachines.general.python_exe", python_exe); + } else { + Zotero.PaperMachines.ERROR("Python not found! Please enter the path to Python 2.7 in the Paper Machines preference window.") + } + } + return python_exe; + + }, evtListener: function (evt) { var node = evt.target, doc = node.ownerDocument; @@ -1542,12 +1580,17 @@ Zotero.PaperMachines.processObserver.prototype = { observe: function(subject, topic, data) { switch (topic) { case "process-failed": - Zotero.PaperMachines.LOG("Process " + this.processName + " failed.") + Zotero.PaperMachines.LOG("Process " + this.processName + " failed."); this.callback(false); break; case "process-finished": - Zotero.PaperMachines.LOG("Process " + this.processName + " finished.") - this.callback(true); + Zotero.PaperMachines.LOG("Process " + this.processName + " finished with exit value " + subject.exitValue); + if (subject.exitValue != 0) { // something went awry + Zotero.PaperMachines.ERROR("Process " + this.processName + " failed."); + this.callback(false); + } else { + this.callback(true); + } break; } this.unregister(); diff --git a/chrome/content/papermachines/processors/dbpedia.py b/chrome/content/papermachines/processors/dbpedia.py new file mode 100755 index 0000000..5645f2f --- /dev/null +++ b/chrome/content/papermachines/processors/dbpedia.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python2.7 +import sys, os, json, logging, urllib, urllib2, codecs, traceback +import textprocessor + + +class DBpedia(textprocessor.TextProcessor): + """ + annotates texts using DBpedia Spotlight + """ + + def _basic_params(self): + self.name = "dbpedia" + self.dry_run = False + self.require_stopwords = False + + def _get_annotated(self, text, confidence = 0.2, support = 20): + values = {'text': text[0:10000].encode('utf-8'), + 'confidence': confidence, + 'support': support} + data = urllib.urlencode(values) + req = urllib2.Request(self.url, data, self.headers) + response = urllib2.urlopen(req) + annotation = response.read() + encoding = req.headers.get('content-type', 'charset=utf8').split('charset=')[-1] + + return unicode(annotation, encoding) + + def process(self): + """ + create JSON files with named entity recognition by DBpedia + """ + + logging.info("beginning annotation") + + self.url = "http://spotlight.dbpedia.org/rest/annotate" + self.headers = {'Accept': 'application/json', 'content-type': 'application/x-www-form-urlencoded'} + + annotated = {} + if not self.dry_run: + for filename in self.files: + logging.info("processing " + filename) + self.update_progress() + try: + annotated_filename = filename.replace(".txt", "_dbpedia.json") + if os.path.exists(annotated_filename): + annotated[annotated_filename] = filename + else: + with codecs.open(filename, 'r', encoding='utf-8') as f: + annotation = self._get_annotated(f.read()) + if len(annotation) > 0: + annotated[annotated_filename] = filename + with codecs.open(annotated_filename, 'w', encoding='utf-8') as out: + out.write(annotation) + except (KeyboardInterrupt, SystemExit): + raise + except: + logging.error(traceback.format_exc()) + else: + for filename in self.files: + annotated_filename = filename.replace(".txt", "_dbpedia.json") + if os.path.exists(annotated_filename): + annotated[annotated_filename] = filename + + uris_to_docs = {} + for json_annotation, filename in annotated.iteritems(): + itemID = self.metadata[filename]["itemID"] + notes = json.load(file(json_annotation)) + entities = notes.get("Resources", []) + 
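Stepping back from the individual processors for a moment: the papermachines.js changes above replace the old self-executing `.pyw` scripts with an explicit interpreter launch, so the detected Python 2.7 executable is run with the processor script and a JSON arguments file as its two arguments. The sketch below only illustrates that contract under stated assumptions (all paths and the collection name are made up); the layout of the arguments file follows the `TextProcessor.__init__` parsing that appears later in this diff.

```python
# Sketch of the new launch contract: python_exe <processor>.py <args.json>.
# Paths and names here are hypothetical; the argument order mirrors
# textprocessor.py (cwd, metadata CSV, output dir, collection name, extras).
import json
import subprocess
import tempfile

args = [
    "/path/to/processors",         # cwd: directory holding the processor scripts
    "/path/to/output/MYCOLL.csv",  # CSV listing extracted text files and metadata
    "/path/to/output",             # out_dir for the generated HTML/JSON
    "My Collection",               # human-readable collection name
    # optional extra arguments follow; named options may be appended as "json", "{...}"
]

with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
    json.dump(args, f)
    args_path = f.name

# Equivalent of proc.init(python_exe_file); proc.runAsync([processor_file.path, argFile.path], ...)
subprocess.call(["pythonw2.7", "/path/to/processors/wordcloud.py", args_path])
```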
for entity in entities: + uri = entity.get("@URI", "http://dbpedia.org/resource/") + if not uri in uris_to_docs: + uris_to_docs[uri] = {} + if not itemID in uris_to_docs[uri]: + uris_to_docs[uri][itemID] = 0 + uris_to_docs[uri][itemID] += 1 + + filtered_uris = {} + weights = [] + for uri, items in uris_to_docs.iteritems(): + weights.append(sum(items.values())) + weights.sort() + min_weight = weights[max(-100, -len(weights))] + + for uri, items in uris_to_docs.iteritems(): + if sum(items.values()) > min_weight: + filtered_uris[uri] = items + + + + # params = {"DATA": json.dumps(uris_to_docs)} + params = {"URIS_TO_DOCS": json.dumps(filtered_uris)} + self.write_html(params) + + logging.info("finished") + + +if __name__ == "__main__": + try: + processor = DBpedia(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/extract.py b/chrome/content/papermachines/processors/extract.py new file mode 100755 index 0000000..5dedf1a --- /dev/null +++ b/chrome/content/papermachines/processors/extract.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python2.7 +import sys, os, json, re, cStringIO, logging, traceback, codecs, urllib, subprocess +from HTMLParser import HTMLParser +import textprocessor + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return u''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + +class Extract(textprocessor.TextProcessor): + """ + Extract text from PDF or HTML files + """ + + def _basic_params(self): + self.name = "extract" + self.pdftotext = self.extra_args[0] + + + def process(self): + logging.info("starting to process") + + itemIDs = {} + for filename in self.files: + id = self.metadata[filename]["itemID"] + if id not in itemIDs: + itemIDs[id] = [] + itemIDs[id].append(filename) + + saved = [] + for itemID, filenames in itemIDs.iteritems(): + try: + out_file = self.metadata[filenames[0]]["outfile"] + out_dir = os.path.dirname(out_file) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + text = u'' + for filename in filenames: + if filename.lower().endswith(".txt"): + text += codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read() + elif filename.lower().endswith(".html"): + text += strip_tags(codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()) + elif filename.lower().endswith(".pdf"): + import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-'] + import_proc = subprocess.Popen(import_args, stdout = subprocess.PIPE) + text += import_proc.communicate()[0].decode('utf-8') + with codecs.open(out_file, 'w', encoding="utf-8") as f: + f.write(text) + saved.append({"itemID": itemID, "collection": self.metadata[filename]["collection"], "filename": out_file}) + self.update_progress() + except: + logging.error(traceback.format_exc()) + if self.progress_initialized: + self.progress_file.write('<1000>\n') + json_out = os.path.join(self.out_dir, self.name + self.collection + ".json") + with codecs.open(json_out, 'wb', encoding='utf-8') as f: + json.dump(saved, f) + params = {"SUCCEEDED": str(len(saved)), "TOTAL": str(len(itemIDs.keys()))} + self.write_html(params) + +if __name__ == "__main__": + try: + processor = Extract(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git 
a/chrome/content/papermachines/processors/geoparse.py b/chrome/content/papermachines/processors/geoparse.py new file mode 100755 index 0000000..f5733f5 --- /dev/null +++ b/chrome/content/papermachines/processors/geoparse.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python2.7 +import sys, os, json, logging, traceback, base64, time, codecs +import cPickle as pickle +from lib.placemaker import placemaker +from lib.placemaker.placemaker_api import placemaker_api_key +import textprocessor + + +class Geoparse(textprocessor.TextProcessor): + """ + Geoparsing using Yahoo! Placemaker + """ + + def _basic_params(self): + self.name = "geoparse" + self.dry_run = False + self.require_stopwords = False + + def process(self): + """ + create a JSON file with geographical data extracted from texts + """ + + self.name = "geoparse" + + p = placemaker(base64.b64decode(placemaker_api_key)) + + geo_parsed = {} + places_by_woeid = {} + + for filename in self.files: + logging.info("processing " + filename) + self.update_progress() + + file_geoparsed = filename.replace(".txt", "_geoparse.json") + + if os.path.exists(file_geoparsed): + geoparse_obj = json.load(file(file_geoparsed)) + elif not self.dry_run: + geoparse_obj = {'places_by_woeid': {}, 'references': {}} + try: + # id = self.metadata[filename]['itemID'] + str_to_parse = self.metadata[filename]['place'] + last_index = len(str_to_parse) + str_to_parse += codecs.open(filename, 'r', encoding='utf8').read()[0:(48000 - last_index)] #50k characters, shortened by initial place string + + city = None + places = [] + + p.find_places(str_to_parse.encode('utf8', 'ignore')) + for woeid, referenced_place in p.referencedPlaces.iteritems(): + place = referenced_place["place"] + geoparse_obj['places_by_woeid'][woeid] = {'name': place.name, 'type': place.placetype, 'coordinates': [place.centroid.longitude, place.centroid.latitude]} + + for reference in referenced_place["references"]: + if reference.start < last_index: + city = woeid + else: + places.append(woeid) + if not woeid in geoparse_obj['references']: + geoparse_obj['references'][woeid] = [] + geoparse_obj['references'][woeid].append((reference.start - last_index, reference.end - last_index)) + + geoparse_obj['places'] = places + geoparse_obj['city'] = city + json.dump(geoparse_obj, file(file_geoparsed, 'w')) + time.sleep(0.2) + except (KeyboardInterrupt, SystemExit): + raise + except: + logging.error(traceback.format_exc()) + + geo_parsed[filename] = geoparse_obj.get('places', []) + self.metadata[filename]['city'] = geoparse_obj.get('city') + for woeid, data in geoparse_obj.get('places_by_woeid', {}).iteritems(): + places_by_woeid[int(woeid)] = data + + places = {} + for filename, woeids in geo_parsed.iteritems(): + year = self.metadata[filename]["year"] + for woeid in woeids: + if woeid in places_by_woeid: + if woeid not in places: + places[woeid] = {} + places[woeid]["name"] = places_by_woeid[woeid]["name"] + places[woeid]["type"] = places_by_woeid[woeid]["type"] + places[woeid]["coordinates"] = places_by_woeid[woeid]["coordinates"] + places[woeid]["weight"] = {year: 1} + else: + if year not in places[woeid]["weight"]: + places[woeid]["weight"][year] = 1 + else: + places[woeid]["weight"][year] += 1 + + self.places_by_woeid = places_by_woeid + max_country_weight = 0 + + for place in sorted(places.keys()): + if places[place]["type"] == "Country": + country_sum = sum(places[place]["weight"].values()) + if country_sum > max_country_weight: + max_country_weight = country_sum + + placeIDsToNames = {k: v["name"] for k, v in 
places_by_woeid.iteritems()} + placeIDsToCoords = {k: v["coordinates"] for k, v in places_by_woeid.iteritems()} + + linksByYear = {} + sources = {} + + for filename in self.files: + if self.metadata[filename].get('city') is None or len(geo_parsed[filename]) < 2: + continue + try: + title = os.path.basename(filename) + itemID = self.metadata[filename]['itemID'] + year = self.metadata[filename]['year'] + if year not in linksByYear: + linksByYear[year] = {} + source = self.metadata[filename]['city'] + if source != None: + if source not in sources: + sources[source] = {} + if year not in sources[source]: + sources[source][year] = 0 + sources[source][year] += 1 + targets = geo_parsed[filename] + for target in targets: + edge = str(source) + ',' + str(target) + if edge not in linksByYear[year]: + linksByYear[year][edge] = 0 + linksByYear[year][edge] += 1 + except: + logging.info(traceback.format_exc()) + + years = sorted(linksByYear.keys()) + groupedLinksByYear = [] + + for year in years: + groupedLinksByYear.append([]) + for edge in linksByYear[year]: + weight = linksByYear[year][edge] + source, target = [int(x) for x in edge.split(',')] + groupedLinksByYear[-1].append({'source': source, 'target': target, 'year': year, 'weight': weight}) + + + params = {"PLACEIDSTOCOORDS": json.dumps(placeIDsToCoords), + "PLACEIDSTONAMES": json.dumps(placeIDsToNames), + "PLACESMENTIONED": json.dumps({k : v["weight"] for k, v in places.iteritems() if v["type"] != "Country"}), + "TEXTSFROMPLACE": json.dumps(sources), + "COUNTRIES": json.dumps({v["name"] : v["weight"] for k, v in places.iteritems() if v["type"] == "Country"}), + "MAX_COUNTRY_WEIGHT": str(max_country_weight), + "STARTDATE": str(min([int(x["year"]) for x in self.metadata.values() if x["year"].isdigit() and x["year"] != "0000"])), + "ENDDATE": str(max([int(x["year"]) for x in self.metadata.values() if x["year"].isdigit()])), + "LINKS_BY_YEAR": json.dumps(groupedLinksByYear) + } + self.write_html(params) + + logging.info("finished") + + +if __name__ == "__main__": + try: + processor = Geoparse(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/lib/merge_jstor.py b/chrome/content/papermachines/processors/lib/merge_jstor.py index f400cca..9f18770 100755 --- a/chrome/content/papermachines/processors/lib/merge_jstor.py +++ b/chrome/content/papermachines/processors/lib/merge_jstor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2.7 import csv, sys, os, shutil, logging diff --git a/chrome/content/papermachines/processors/mallet.py b/chrome/content/papermachines/processors/mallet.py new file mode 100755 index 0000000..3eb9a53 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python2.7 +import sys, os, shutil, logging, tempfile, time, subprocess, math, re, urllib, json, codecs, csv, traceback, platform +import xml.etree.ElementTree as et +from lib.porter2 import stem +import copy +import textprocessor + +class Mallet(textprocessor.TextProcessor): + """ + Base class for MALLET functionality + """ + + def _basic_params(self): + self.name = "mallet" + + def _import_dfr_metadata(self, dfr_dir): + citation_file = os.path.join(dfr_dir, "citations.CSV") + citations = {} + for rowdict in self.parse_csv(citation_file): + doi = rowdict.pop("id") + citations[doi] = rowdict + self.metadata[doi] = {'title': citations[doi].get("title", ""), 'year': 
citations[doi].get('pubdate','')[0:4], 'label': "jstor", 'itemID': doi} + return citations + + def _import_dfr(self, dfr_dir): + citations = self._import_dfr_metadata(dfr_dir) + + wordcounts_dir = os.path.join(dfr_dir, "wordcounts") + for doi in citations.keys(): + try: + this_text = '' + for rowdict in self.parse_csv(os.path.join(wordcounts_dir, "wordcounts_" + doi.replace('/','_') + ".CSV")): + word = rowdict["WORDCOUNTS"] + if word in self.stopwords: + continue + if self.stemming: + prestem = word + if word not in self.stemmed: + self.stemmed[prestem] = stem(prestem) + word = self.stemmed[prestem] + count = int(rowdict["WEIGHT"]) + + this_text += (word + u' ') * count + if len(this_text) < 20: + continue + yield doi, this_text + except: + logging.error(doi) + logging.error(traceback.format_exc()) + + def _import_files(self): + if self.stemming: + self.stemmed = {} + self.docs = [] + with codecs.open(self.texts_file, 'w', encoding='utf-8') as f: + for filename in self.files: + with codecs.open(filename, 'r', encoding='utf-8') as input_file: + text = input_file.read() + text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE) + if self.stemming: + newtext = u'' + for word in text.split(): + if word not in self.stemmed: + self.stemmed[word] = stem(word) + newtext += self.stemmed[word] + u' ' + text = newtext + f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n') + self.docs.append(filename) + if self.dfr: + for doi, text in self._import_dfr(self.dfr_dir): + f.write(u'\t'.join([doi, self.metadata[doi]["label"], text]) + u'\n') + self.docs.append(doi) + with codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'w', encoding='utf-8') as dmap: + dmap.writelines([x + u'\n' for x in self.docs]) + self.doc_count = len(self.docs) + + def _tfidf_filter(self, top_terms = None): + min_df = getattr(self, "min_df", 5) + vocab = {} + inverse_vocab = {} + df = {} + tf = {} + tf_all_docs = {} + tfidf = {} + self.index = {} + + i = 0 + with codecs.open(self.texts_file, 'r', encoding='utf-8') as f: + for line in f: + j = 0 + filename = "" + for part in line.split(u'\t'): + if j == 0: + filename = part + elif j == 2: + tf_for_doc = {} + flen = 0 + for word in part.split(): + if len(word) < 3: + continue + flen += 1 + if word not in vocab: + vocab[word] = i + tf_for_doc[i] = 1 + tf[i] = 0 + df[i] = 1 + i += 1 + else: + index = vocab[word] + if index not in tf_for_doc: + tf_for_doc[index] = 0 + df[index] += 1 + tf_for_doc[index] += 1 + tf_all_docs[filename] = copy.deepcopy(tf_for_doc) + for word_index in tf_for_doc.keys(): + tf_val = float(tf_for_doc[word_index])/flen + if tf_val > tf[word_index]: + tf[word_index] = tf_val + j += 1 + self.tf_all_docs = tf_all_docs + for index in vocab.values(): + tfidf[index] = tf[index] * math.log10(float(self.doc_count)/df[index]) + tfidf_values = tfidf.values() + + if top_terms is None: + top_terms = min(int(len(vocab.keys()) * 0.7), 5000) + min_score = sorted(tfidf_values, reverse=True)[min(top_terms, len(tfidf_values) - 1)] + + os.rename(self.texts_file, self.texts_file + '-pre_tf-idf') + inverse_vocab = {v : k for k, v in vocab.iteritems()} + new_vocab = {} + + with codecs.open(self.texts_file, 'w', encoding='utf-8') as f: + for filename, freqs in tf_all_docs.iteritems(): + text = u'' + flen = 0 + thisfile_vocab = [] + for index, count in freqs.iteritems(): + if tfidf[index] < min_score or df[index] < min_df: + continue + word = inverse_vocab[index] + if word in self.stopwords: + continue + if word not in new_vocab: + new_vocab[word] = 
0 + new_vocab[word] += count + thisfile_vocab.append(word) + text += (word + u' ') * count + flen += count + if flen > 25: + f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n') + for word in thisfile_vocab: + if word not in self.index: + self.index[word] = [] + self.index[word].append(self.metadata[filename]["itemID"]) + else: + self.docs.remove(filename) + with codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'w', encoding='utf-8') as dmap: + dmap.writelines([x + u'\n' for x in self.docs]) + logging.info("tf-idf complete; retained {:} of {:} words; minimum tf-idf score: {:}".format(len(new_vocab.keys()), len(vocab.keys()), min_score)) + + def _setup_mallet_command(self): + self.mallet_cp_dir = os.path.join(self.cwd, "lib", "mallet-2.0.7", "dist") + if self.sys == "Windows": + classpath_sep = u';' + else: + classpath_sep = u':' + + self.mallet_classpath = os.path.join(self.mallet_cp_dir, "mallet.jar") + classpath_sep + os.path.join(self.mallet_cp_dir, "mallet-deps.jar") + + self.mallet = "java -Xmx1g -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8".split(' ') + self.mallet += ["-classpath", self.mallet_classpath] + + self.mallet_out_dir = os.path.join(self.out_dir, self.name + self.collection) + + if not self.dry_run: + if os.path.exists(self.mallet_out_dir): + shutil.rmtree(self.mallet_out_dir) + os.makedirs(self.mallet_out_dir) + + self.progress_filename = os.path.join(self.out_dir, self.name + self.collection + "progress.txt") + self.progress_file = file(self.progress_filename, 'w') + + def _import_texts(self): + + logging.info("copying texts into single file") + self.texts_file = os.path.join(self.mallet_out_dir, self.collection + ".txt") + + if not os.path.exists(self.texts_file): + if not self.dry_run: + self._import_files() + else: + if len(self.extra_args) > 0 and self.dfr: + self._import_dfr_metadata(self.dfr_dir) + self.docs = [] + self.index = {} + with codecs.open(self.texts_file, 'r', 'utf-8') as f: + for line in f: + fields = line.split(u'\t') + filename = fields[0] + self.docs.append(filename) + this_vocab = set() + for word in fields[2].split(): + this_vocab.add(word) + for word in this_vocab: + if word not in self.index: + self.index[word] = [] + self.index[word].append(self.metadata[filename]["itemID"]) + self.doc_count = len(self.docs) + + def _setup_mallet_instances(self, sequence=True, tfidf = False, stemming = True): + self.stemming = stemming + + self._setup_mallet_command() + self._import_texts() + + self.instance_file = os.path.join(self.mallet_out_dir, self.collection + ".mallet") + + logging.info("beginning text import") + + if tfidf and not self.dry_run: + self._tfidf_filter() + + with codecs.open(os.path.join(self.mallet_out_dir, "metadata.json"), 'w', encoding='utf-8') as meta_file: + json.dump(self.metadata, meta_file) + + import_args = self.mallet + ["cc.mallet.classify.tui.Csv2Vectors", + "--remove-stopwords", + "--stoplist-file", self.stoplist, + "--input", self.texts_file, + "--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$", + "--token-regex", '[\p{L}\p{M}]+', + "--output", self.instance_file] + if sequence: + import_args.append("--keep-sequence") + + if not self.dry_run and not os.path.exists(self.instance_file): + import_return = subprocess.call(import_args, stdout=self.progress_file) + + def process(self): + """ + Should be redefined! 
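For reference, the `_tfidf_filter` above scores each vocabulary term by its highest per-document relative frequency times an inverse document frequency, then keeps roughly the top 5,000 terms (or 70% of the vocabulary, whichever is smaller) that also occur in at least `min_df` documents. Written out, the score it computes is the following; this is a reading of the code above, not a separate specification.

```latex
% tf-idf score as implemented in _tfidf_filter: f_{t,d} is the count of term t
% in document d, |d| the token count of d, N the number of documents, and
% df(t) the number of documents containing t.
\[
\operatorname{tf}(t) = \max_{d} \frac{f_{t,d}}{|d|}, \qquad
\operatorname{idf}(t) = \log_{10} \frac{N}{\operatorname{df}(t)}, \qquad
\operatorname{tfidf}(t) = \operatorname{tf}(t) \cdot \operatorname{idf}(t)
\]
```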
+ """ + pass + +if __name__ == "__main__": + try: + processor = Mallet(track_progress = False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_classify-file.py b/chrome/content/papermachines/processors/mallet_classify-file.py new file mode 100755 index 0000000..6321d10 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_classify-file.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback, time, subprocess, codecs, json +import mallet + +class MalletClassifierTest(mallet.Mallet): + """ + Train a classifier + """ + def _basic_params(self): + self.dry_run = False + self.name = "mallet_classify-file" + self.mallet_classifier = self.extra_args[0] + self.dfr = len(self.extra_args) > 1 + if self.dfr: + self.dfr_dir = self.extra_args[1] + self.stemming = True + + def process(self): + + self._setup_mallet_command() + self._import_texts() + + self.classified_filename = os.path.join(self.mallet_out_dir, "classified") + + process_args = self.mallet + ["cc.mallet.classify.tui.Csv2Classify", + "--input", self.texts_file, + "--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$", + "--classifier", self.mallet_classifier, + "--output", self.classified_filename] + + logging.info("begin classifying texts") + + start_time = time.time() +# if not self.dry_run: + classifier_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file) + + finished = "Classifier finished in " + str(time.time() - start_time) + " seconds" + logging.info(finished) + + classifications = {} + for line in codecs.open(self.classified_filename, 'r', encoding='utf-8'): + try: + line_parts = line.split('\t') + filename = line_parts.pop(0) + probs = {y[0]: float(y[1]) for y in self.xpartition(line_parts)} + classifications[filename] = self.argmax(probs) + except: + logging.error(traceback.format_exc()) + + outfile_name = os.path.join(self.out_dir, "mallet_classify-file" + self.collection + ".json") + + with codecs.open(outfile_name, 'w', encoding='utf-8') as f: + json.dump(classifications, f) + + params = {'CLASSIFIED': json.dumps(classifications)} + + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletClassifierTest(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda.py b/chrome/content/papermachines/processors/mallet_lda.py new file mode 100755 index 0000000..257072a --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, tempfile, time, subprocess, math, re, urllib, json, codecs, csv, traceback +import xml.etree.ElementTree as et +from itertools import izip +import mallet + +class MalletLDA(mallet.Mallet): + """ + Perform LDA using MALLET + """ + + def _basic_params(self): + self.categorical = False + self.template_name = "mallet_lda" + self.name = "mallet_lda" + self.topics = 50 + self.dry_run = False + self.dfr = len(self.extra_args) > 0 + if self.dfr: + self.dfr_dir = self.extra_args[0] + + def _stdev(self, X): + n = float(len(X)) + xbar = float(sum(X)) / n + variances = [math.pow(float(x) - xbar, 2.0) for x in X] + return math.sqrt((1.0 / (n - 1.0)) * sum(variances)) + + def _cov(self, X, Y): + n = float(len(X)) + xbar = sum(X) / n + ybar = sum(Y) / n + return (1.0/(n-1.0)) * sum([((x-xbar) * (y-ybar)) for 
x, y in zip(X, Y)]) + + def _find_proportions(self, topics): + self.proportions = {} + for i in range(len(topics)): + self.proportions[i] = float(sum(topics[i])) / len(topics[i]) + + def _find_stdevs(self, topics): + self.stdevs = {} + for i in range(len(topics)): + self.stdevs[i] = self._stdev(topics[i]) + + def _find_correlations(self, topics): + self.correlations = {} + for i in range(len(topics)): + for j in range(i+1,len(topics)): + self.correlations[str(i) + ',' + str(j)] = self._cov(topics[i], topics[j]) / (self.stdevs[i] * self.stdevs[j]) + + def _sort_into_intervals(self): + years = set() + fname_to_year = {} + + fnames = self.metadata.keys() + for filename in fnames: + x = self.metadata[filename] + if x['year'].isdigit() and x['year'] != '0000': + year = int(x['year']) + else: + year = 2012 + years.add(year) + fname_to_year[filename] = year + + years = sorted(years) + self.intervals = years + self.fname_to_interval = fname_to_year + self.fname_to_index = {fname: years.index(year) for fname, year in fname_to_year.iteritems()} + + def process(self): + """ + run LDA, creating an output file divided by time + """ + + if self.named_args is not None: + self.tfidf = self.named_args["tfidf"] + self.min_df = int(self.named_args["min_df"]) + self.stemming = self.named_args["stemming"] + self.topics = int(self.named_args["topics"]) + self.iterations = int(self.named_args["iterations"]) + self.alpha = self.named_args["alpha"] + self.beta = self.named_args["beta"] + self.symmetric_alpha = str(self.named_args["symmetric_alpha"]).lower() + self.optimize_interval = self.named_args["optimize_interval"] + self.burn_in = int(self.named_args["burn_in"]) + else: + self.tfidf = True + self.min_df = 5 + self.topics = 50 + self.stemming = True + self.iterations = 1000 + self.alpha = "50.0" + self.beta = "0.01" + self.burn_in = 200 + self.symmetric_alpha = "true" + self.optimize_interval = 0 + + + self._setup_mallet_instances(sequence=True, tfidf=self.tfidf, stemming=self.stemming) + + self.mallet_files = {'state': os.path.join(self.mallet_out_dir, "topic-state.gz"), + 'doc-topics': os.path.join(self.mallet_out_dir, "doc-topics.txt"), + 'topic-keys': os.path.join(self.mallet_out_dir, "topic-keys.txt"), + 'word-topics': os.path.join(self.mallet_out_dir, "word-topics.txt"), + 'diagnostics-file': os.path.join(self.mallet_out_dir, "diagnostics-file.txt")} + process_args = self.mallet + ["cc.mallet.topics.tui.TopicTrainer", + "--input", self.instance_file, + "--num-topics", str(self.topics), + "--num-iterations", str(self.iterations), + "--optimize-interval", str(self.optimize_interval), + "--optimize-burn-in", str(self.burn_in), + "--use-symmetric-alpha", self.symmetric_alpha, + "--alpha", self.alpha, + "--beta", self.beta, + "--output-state", self.mallet_files['state'], + "--output-doc-topics", self.mallet_files['doc-topics'], + "--output-topic-keys", self.mallet_files['topic-keys'], + "--diagnostics-file", self.mallet_files['diagnostics-file'], + "--word-topic-counts-file", self.mallet_files['word-topics']] + + logging.info("begin LDA") + + start_time = time.time() + if not self.dry_run: + lda_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file) + + logging.info("LDA complete in " + str(time.time() - start_time) + " seconds") + + coherence = {} + wordProbs = {} + allocationRatios = {} + with file(self.mallet_files['diagnostics-file']) as diagnostics: + tree = et.parse(diagnostics) + for elem in tree.iter("topic"): + topic = elem.get("id") + coherence[topic] = 
float(elem.get("coherence")) + allocationRatios[topic] = float(elem.get("allocation_ratio")) + wordProbs[topic] = [] + for word in elem.iter("word"): + wordProbs[topic].append({'text': word.text, 'prob': word.get("prob")}) + + labels = {x[0]: {"label": x[2:5], "fulltopic": wordProbs[x[0]], "allocation_ratio": allocationRatios[x[0]]} for x in [y.split() for y in file(self.mallet_files['topic-keys']).readlines()]} + + weights_by_topic = [] + doc_metadata = {} + + self._sort_into_intervals() + + for i in range(self.topics): + weights_by_topic.append([{'x': str(j), 'y': [], 'topic': i} for j in self.intervals]) + + for line in file(self.mallet_files['doc-topics']): + try: + values = line.split('\t') + + id = values.pop(0) + if id.startswith("#doc"): + continue + filename = self.docs[int(id)] + del values[0] + + itemid = self.metadata[filename]["itemID"] + + doc_metadata[itemid] = {"label": self.metadata[filename]["label"], "title": self.metadata[filename]["title"]} + + freqs = {int(y[0]): float(y[1]) for y in self.xpartition(values)} + main_topic = None + topic_max = 0.0 + for i in freqs.keys(): + weights_by_topic[i][self.fname_to_index[filename]]['y'].append({"itemID": itemid, "ratio": freqs[i]}) + if freqs[i] > topic_max: + main_topic = i + topic_max = freqs[i] + doc_metadata[itemid]["main_topic"] = main_topic + except KeyboardInterrupt: + sys.exit(1) + except: + logging.error(traceback.format_exc()) + + topics_by_year = [] + for topic in weights_by_topic: + topic_sums = [] + for year in topic: + sum = 0.0 + if len(year['y']) != 0: + for doc in year['y']: + sum += doc['ratio'] + topic_sums.append(sum / float(len(year['y']))) + else: + topic_sums.append(0) + topics_by_year.append(topic_sums) + + self.topics_by_year = topics_by_year + self._find_proportions(topics_by_year) + try: + self._find_stdevs(topics_by_year) + self._find_correlations(topics_by_year) + except: + self.stdevs = {} + self.correlations = {} + + self.template_filename = os.path.join(self.cwd, "templates", self.template_name + ".html") + + params = {"CATEGORICAL": "true" if self.categorical else "false", + "TOPICS_DOCS": json.dumps(weights_by_topic, separators=(',',':')), + "DOC_METADATA": json.dumps(doc_metadata, separators=(',',':')), + "TOPIC_LABELS": json.dumps(labels, separators=(',',':')), + "TOPIC_COHERENCE": json.dumps(coherence, separators=(',',':')), + "TOPIC_PROPORTIONS": json.dumps(self.proportions, separators=(',',':')), + "TOPIC_STDEVS": json.dumps(self.stdevs, separators=(',',':')), + "TOPIC_CORRELATIONS": json.dumps(self.correlations, separators=(',',':')) + } + + index = getattr(self, "index", "{}") + params["###INDEX###"] = json.dumps(index, separators=(',',':')) + + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletLDA(track_progress = False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda_MI.py b/chrome/content/papermachines/processors/mallet_lda_MI.py new file mode 100755 index 0000000..550a465 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda_MI.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python2.7 +import sys, os, codecs, logging, traceback, json, math +import mallet_lda + +class MalletLDAMutualInformation(mallet_lda.MalletLDA): + """ + Calculate mutual information for groups of topics + """ + def _basic_params(self): + self.name = "mallet_lda_MI" + self.categorical = True + self.template_name = "mallet_lda_MI" + self.dry_run = False + 
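The `mallet_lda_MI` processor introduced here compares pairs of subcollections: in the `_mutualInformation` method that follows, each time interval contributes one observation per group (the dominant topic for that group in that interval), and the script estimates the mutual information between the two groups' dominant-topic distributions. The quantity it approximates is the usual discrete mutual information, stated here for orientation; the code below remains the authoritative version.

```latex
% Discrete mutual information over dominant topics x, y of two subcollections,
% with p(x,y) the fraction of shared intervals producing that pair and
% p(x), p(y) the corresponding marginals (base-2 log, as in the code).
\[
I(X;Y) \;=\; \sum_{x} \sum_{y} p(x,y) \, \log_{2} \frac{p(x,y)}{p(x)\,p(y)}
\]
```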
self.mallet_out_dir = self.extra_args[0] + + def _mutualInformation(self, X, Y): + probs = {} + marginal_x = {} + marginal_y = {} + + n = 0 + for interval, x_topic_vals in X.iteritems(): + if not interval in Y: + continue + y_topic_vals = Y[interval] + + if len(x_topic_vals.keys()) == 0 or len(y_topic_vals.keys()) == 0: + continue + + # what is being most discussed in each group? + x = self.argmax(x_topic_vals) + y = self.argmax(y_topic_vals) + + if not x in marginal_x: + marginal_x[x] = 0 + marginal_x[x] += 1 + if not y in marginal_y: + marginal_y[y] = 0 + marginal_y[y] += 1 + + if not x in probs: + probs[x] = {} + if not y in probs[x]: + probs[x][y] = 0 + probs[x][y] += 1 + n += 1 + + n_x = float(sum(marginal_x.values())) + for x in marginal_x.keys(): + marginal_x[x] /= n_x + + n_y = float(sum(marginal_y.values())) + for y in marginal_y.keys(): + marginal_y[y] /= n_y + + for x, y_probs in probs.iteritems(): + for y in y_probs.keys(): + probs[x][y] /= float(n) + + mi = 0.0 + for x, y_probs in probs.iteritems(): + for y in y_probs.keys(): + mi += (probs[x][y] * math.log(probs[x][y] / (marginal_x[x] * marginal_y[y]), 2)) + return mi + + def process(self): + self.metadata = json.load(codecs.open(os.path.join(self.mallet_out_dir, "metadata.json"), 'r', encoding='utf-8')) + self.files = self.metadata.keys() + + self.classify_file = os.path.join(self.out_dir, "mallet_classify-file" + self.collection + ".json") + if os.path.exists(self.classify_file): + with codecs.open(self.classify_file, 'r', encoding='utf-8') as f: + self.classified = json.load(f) + for filename in self.files: + label = self.classified.get(filename) + if label is not None: + self.metadata[filename]["label"] = label + + self.labels = set([x["label"] for x in self.metadata.values()]) + + self.doc_topics = os.path.join(self.mallet_out_dir, "doc-topics.txt") + self.docs = [x.strip() for x in codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'r', encoding='utf-8')] + + self._sort_into_intervals() + self.labels_years_topics = {} + + for label in self.labels: + self.labels_years_topics[label] = {i: {} for i in self.intervals} + + for line in file(self.doc_topics): + try: + values = line.split('\t') + + id = values.pop(0) + if id.startswith("#doc"): + continue + filename = self.docs[int(id)] + del values[0] + + itemid = self.metadata[filename]["itemID"] + + label = self.metadata[filename]["label"] + + freqs = {int(y[0]): float(y[1]) for y in self.xpartition(values)} + main_topic = None + topic_max = 0.0 + for i in freqs.keys(): + if freqs[i] > topic_max: + main_topic = i + topic_max = freqs[i] + if main_topic is None: + continue + if not main_topic in self.labels_years_topics[label][self.fname_to_interval[filename]]: + self.labels_years_topics[label][self.fname_to_interval[filename]][main_topic] = 0 + self.labels_years_topics[label][self.fname_to_interval[filename]][main_topic] += 1 + except KeyboardInterrupt: + sys.exit(1) + except: + logging.error(traceback.format_exc()) + + self.MIs = {} + labels = sorted(self.labels) + n = len(labels) + for i in range(n): + for j in range(i+1,n): + X = self.labels_years_topics[labels[i]] + Y = self.labels_years_topics[labels[j]] + + # all_topics = [] + + # for A in [X,Y]: + # this_set = set() + # for interval, topic_vals in A.iteritems(): + # this_set.update([topic for topic, val in topic_vals.iteritems() if val > 0]) + # all_topics.append(this_set) + + # topics_of_interest = all_topics[0].intersection(all_topics[1]) + + result = self._mutualInformation(X, Y) + self.MIs[str(i) + ',' + 
str(j)] = result + + self.nodes = [] + self.edges = [] + node_index = {} + + for key, mi in self.MIs.iteritems(): + a, b = [int(x) for x in key.split(',')] + for i in [a,b]: + if i not in node_index: + node_index[i] = len(self.nodes) + self.nodes.append(labels[i]) + edge = {"source": node_index[a], "target": node_index[b], "mi": mi} + self.edges.append(edge) + + params = {"NODES": json.dumps(self.nodes), "EDGES": json.dumps(self.edges)} + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletLDAMutualInformation(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda_categorical.py b/chrome/content/papermachines/processors/mallet_lda_categorical.py new file mode 100755 index 0000000..1a18502 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda_categorical.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback +import mallet_lda + +class MalletSubcollections(mallet_lda.MalletLDA): + """ + Set topic modeling to categorical view by default + """ + def _basic_params(self): + self.name = "mallet_lda_categorical" + self.categorical = True + self.template_name = "mallet_lda" + self.dry_run = False + self.topics = 50 + self.dfr = len(self.extra_args) > 0 + if self.dfr: + self.dfr_dir = self.extra_args[0] + +if __name__ == "__main__": + try: + processor = MalletSubcollections(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda_jstor.py b/chrome/content/papermachines/processors/mallet_lda_jstor.py new file mode 100755 index 0000000..86d74b3 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda_jstor.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback +from zipfile import ZipFile +from lib.merge_jstor import merge_dfr_dirs +import mallet_lda + +class MalletJSTOR(mallet_lda.MalletLDA): + """ + Alias to distinguish mallet queries with JSTOR attached + """ + def _extractAll(self, zipName, dest): + z = ZipFile(zipName) + z.extractall(dest, filter(lambda f: not f.endswith('/'), z.namelist())) + + def _basic_params(self): + self.name = "mallet_lda_jstor" + self.categorical = False + self.template_name = "mallet_lda" + self.dry_run = False + self.topics = 50 + self.dfr = True + dfr_dirs = [] + for dfr_path in self.extra_args: + if dfr_path.lower().endswith(".zip"): + dfr_dir = os.path.basename(dfr_path).replace(".zip","") + this_dfr_dir = os.path.join(self.out_dir, dfr_dir) + self._extractAll(dfr_path, this_dfr_dir) + dfr_dirs.append(this_dfr_dir) + if len(dfr_dirs) > 1: + self.dfr_dir = merge_dfr_dirs(dfr_dirs) + else: + self.dfr_dir = dfr_dirs[0] + + +if __name__ == "__main__": + try: + processor = MalletJSTOR(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_train-classifier.py b/chrome/content/papermachines/processors/mallet_train-classifier.py new file mode 100755 index 0000000..5bc139a --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_train-classifier.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback, time, subprocess +import mallet + +class MalletClassifier(mallet.Mallet): + """ + Train a classifier + """ + def _basic_params(self): + self.dry_run 
= False + self.name = "mallet_train-classifier" + self.dfr = False + + def process(self): + self._setup_mallet_instances(sequence=False) + + self.mallet_output = os.path.join(self.mallet_out_dir, "trained.classifier") + process_args = self.mallet + ["cc.mallet.classify.tui.Vectors2Classify", + "--input", self.instance_file, + "--output-classifier", self.mallet_output, + "--trainer", "NaiveBayes", + "--noOverwriteProgressMessages", "true"] + + logging.info("begin training classifier") + + start_time = time.time() + if not self.dry_run: + classifier_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file) + + finished = "Classifier trained in " + str(time.time() - start_time) + " seconds" + logging.info(finished) + + params = {'DONE': finished} + + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletClassifier(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/phrasenet.py b/chrome/content/papermachines/processors/phrasenet.py new file mode 100755 index 0000000..a5aefd5 --- /dev/null +++ b/chrome/content/papermachines/processors/phrasenet.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python2.7 +import sys, os, json, re, tempfile, cStringIO, logging, traceback, codecs +import textprocessor + +class PhraseNet(textprocessor.TextProcessor): + """ + Generate phrase net + cf. http://www-958.ibm.com/software/data/cognos/manyeyes/page/Phrase_Net.html + """ + + def _basic_params(self): + self.name = "phrasenet" + + def _findPhrases(self, pattern): + self.nodes = {} + self.edges = {} + for filename in self.files: + self.update_progress() + with codecs.open(filename, 'r', encoding='utf8') as f: + logging.info("processing " + filename) + for re_match in pattern.finditer(f.read()): + match = [w.lower() for w in re_match.groups()] + if any([word in self.stopwords for word in match]): + continue + + for word in match: + if not word in self.nodes: + self.nodes[word] = 1 + else: + self.nodes[word] += 1 + + edge = match[0] + self.edgesep + match[1] + if not edge in self.edges: + self.edges[edge] = 1 + else: + self.edges[edge] += 1 + + def process(self): + logging.info("starting to process") + + stopfile = os.path.join(self.cwd, "stopwords.txt") + logging.info("reading stopwords from " + stopfile) + self.stopwords = [line.strip() for line in file(stopfile)] + + self.edgesep = ',' + + wordregex = "(\w+)" + + if len(self.extra_args) > 0: + pattern_str = self.extra_args[0] + else: + pattern_str = "x and y" + + if pattern_str.count('x') == 1 and pattern_str.count('y') == 1: + pattern = pattern_str.replace('x', wordregex) + pattern = pattern.replace('y', wordregex) + else: + pattern = pattern_str + + logging.info("extracting phrases according to pattern "+ repr(pattern)) + + self._findPhrases(re.compile(pattern)) + + logging.info("generating JSON") + + used_nodes = set() + + jsondata = {'nodes': [], 'edges': []} + + top_edges = self.edges.keys() + top_edges.sort(key=lambda x: self.edges[x]) + top_edges.reverse() + top_edges = top_edges[:50] + + for edge in top_edges: + words = edge.split(',') + used_nodes.update(words) + + nodeindex = dict(zip(used_nodes, range(len(used_nodes)))) + + for edge in top_edges: + weight = self.edges[edge] + words = edge.split(',') + jsondata['edges'].append({'source': nodeindex[words[0]], 'target': nodeindex[words[1]], 'weight': weight}) + + for node in used_nodes: + jsondata['nodes'].append({'index': 
nodeindex[node], 'name': node, 'freq': self.nodes[node]}) + + params = {"DATA": json.dumps(jsondata), "PATTERN": json.dumps(pattern_str)} + self.write_html(params) + +if __name__ == "__main__": + try: + processor = PhraseNet(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/textprocessor.py b/chrome/content/papermachines/processors/textprocessor.py new file mode 100755 index 0000000..5ca1cf2 --- /dev/null +++ b/chrome/content/papermachines/processors/textprocessor.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python2.7 +import sys, os, csv, logging, tempfile, traceback, urllib, codecs, json, operator, platform +from itertools import izip + +class TextProcessor: + """ + Base class for text processing in Paper Machines + """ + + def __init__(self, track_progress=True): + self.sys = platform.system() + + # take in command line options + + self.args_filename = sys.argv[1] + self.args_basename = os.path.basename(self.args_filename).replace(".json", "") + + with codecs.open(self.args_filename, 'r', encoding='utf-8') as args_file: + args = json.load(args_file) + + self.cwd = args[0] + csv_file = args[1] + self.out_dir = args[2] + self.collection_name = args[3] + self.extra_args = args[4:] + + if "json" in self.extra_args: + json_starts_at = self.extra_args.index("json") + self.named_args = json.loads(self.extra_args[json_starts_at + 1]) + self.extra_args = self.extra_args[:json_starts_at] + else: + self.named_args = None + + self.collection = os.path.basename(csv_file).replace(".csv","") + + self.require_stopwords = True # load stopwords by default + + # call a function to set processor name, etc. + self._basic_params() + + if self.require_stopwords: + self.stoplist = os.path.join(self.cwd, "stopwords.txt") + self.stopwords = [x.strip() for x in codecs.open(self.stoplist, 'r', encoding='utf-8').readlines()] + + self.out_filename = os.path.join(self.out_dir, self.name + self.collection + "-" + self.args_basename + ".html") + + # logging.basicConfig(filename=os.path.join(self.out_dir, "logs", self.name + ".log"), level=logging.INFO) + logging.basicConfig(filename=self.out_filename.replace(".html", ".log"), filemode='w', level=logging.INFO) + + fh = logging.FileHandler(os.path.join(self.out_dir, "logs", self.name + ".log")) + formatter = logging.Formatter('%(name)s: %(levelname)-8s %(message)s') + fh.setFormatter(formatter) + + logging.getLogger('').addHandler(fh) + + logging.info("command: " + ' '.join([x.replace(' ','''\ ''') for x in sys.argv])) + + self.metadata = {} + + for rowdict in self.parse_csv(csv_file): + filename = rowdict.pop("filename") + self.metadata[filename] = rowdict + + self.files = self.metadata.keys() + if track_progress: + self.track_progress = True + self.progress_initialized = False + + def _basic_params(self): + self.name = "textprocessor" + + def parse_csv(self, filename, dialect=csv.excel, **kwargs): + with file(filename, 'rb') as f: + csv_rows = self.unicode_csv_reader(f, dialect=dialect, **kwargs) + header = csv_rows.next() + for row in csv_rows: + if len(row) > 0: + rowdict = dict(zip(header, row)) + yield rowdict + + def unicode_csv_reader(self, utf8_data, dialect=csv.excel, **kwargs): + csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs) + for row in csv_reader: + yield [unicode(cell, 'utf-8') for cell in row] + + def update_progress(self): + if self.track_progress: + if not self.progress_initialized: + self.progress_filename = 
os.path.join(self.out_dir, self.name + self.collection + "progress.txt") + self.progress_file = file(self.progress_filename, 'w') + self.count = 0 + self.total = len(self.files) + self.progress_initialized = True + + self.count += 1 + self.progress_file.write('<' + str(int(self.count*1000.0/float(self.total))) + '>\n') + self.progress_file.flush() + + def xpartition(self, seq, n=2): + return izip(*(iter(seq),) * n) + + def argmax(self, obj): + if hasattr(obj, "index"): + return obj.index(max(obj)) + elif hasattr(obj, "iteritems"): + return max(obj.iteritems(), key=operator.itemgetter(1))[0] + + def write_html(self, user_params): + logging.info("writing HTML") + params = {"COLLECTION_NAME": self.collection_name, "DOC_METADATA": json.dumps({v["itemID"]: v for k, v in self.metadata.iteritems()})} + params.update(user_params) + try: + template_filename = getattr(self, "template_filename", os.path.join(self.cwd, "templates", self.name + ".html")) + + with codecs.open(self.out_filename, 'w', encoding='utf-8') as outfile: + with codecs.open(template_filename, 'r', encoding='utf-8') as template: + template_str = template.read() + for k, v in params.iteritems(): + template_str = template_str.replace(k, v) + outfile.write(template_str) + except: + logging.error(traceback.format_exc()) + + def process(self): + """ + Example process -- should be overridden + """ + output = file(os.path.join(self.out_dir, self.name + '.txt'), 'w') + for filename in self.files: + output.write(' '.join([filename, self.metadata[filename]]) + '\n') + output.close() + +if __name__ == "__main__": + try: + processor = TextProcessor(track_progress = True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/wordcloud.py b/chrome/content/papermachines/processors/wordcloud.py new file mode 100755 index 0000000..163e2a3 --- /dev/null +++ b/chrome/content/papermachines/processors/wordcloud.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python2.7 +import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math +import textprocessor +from lib.porter2 import stem + +class WordCloud(textprocessor.TextProcessor): + """ + Generate word cloud + """ + def _basic_params(self): + self.name = "wordcloud" + self.width = "300" + self.height = "150" + self.fontsize = "[10,32]" + self.n = 50 + self.tfidf_scoring = False + + def _findTfIdfScores(self, scale=True): + self.freqs = {} + self.tf_by_doc = {} + self.max_tf = {} + self.df = {} + for filename in self.files: + with codecs.open(filename, 'r', encoding = 'utf8') as f: + logging.info("processing " + filename) + flen = 0 + self.tf_by_doc[filename] = {} + for line in f: + for stem in self._tokenizeAndStem(line): + flen += 1 + if stem not in self.tf_by_doc[filename]: + self.tf_by_doc[filename][stem] = 0 + if stem not in self.df: + self.df[stem] = 0 + self.df[stem] += 1 + self.tf_by_doc[filename][stem] += 1 + # max_tf_d = max(self.tf_by_doc[filename].values()) + for stem in self.tf_by_doc[filename].keys(): + if stem not in self.freqs: + self.freqs[stem] = 0 + self.freqs[stem] += self.tf_by_doc[filename][stem] + if scale: + self.tf_by_doc[filename][stem] /= float(flen) #max_tf_d + this_tf = self.tf_by_doc[filename][stem] + else: + this_tf = self.tf_by_doc[filename][stem] / float(flen) + + if stem not in self.max_tf or self.max_tf[stem] < this_tf: + self.max_tf[stem] = this_tf + self.update_progress() + n = float(len(self.files)) + self.idf = {term: math.log10(n/df) for term, df in 
diff --git a/chrome/content/papermachines/processors/wordcloud_chronological.py b/chrome/content/papermachines/processors/wordcloud_chronological.py
new file mode 100755
index 0000000..c9d987c
--- /dev/null
+++ b/chrome/content/papermachines/processors/wordcloud_chronological.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python2.7
+import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
+from datetime import datetime, timedelta
+import wordcloud_multiple
+
+class WordCloudChronological(wordcloud_multiple.MultipleWordClouds):
+    """
+    Generate word clouds based on time interval
+    """
+    def _basic_params(self):
+        self.name = "wordcloud_chronological"
+        self.template_filename = os.path.join(self.cwd, "templates", "wordcloud_multiple.html")
+        self.width = "483"
+        self.height = "300"
+        self.fontsize = "[10,32]"
+        self.n = 100
+        self.tfidf_scoring = False
+        self.MWW = False
+        self.dunning = False
+        if len(self.extra_args) == 1:
+            self.interval = self.extra_args[0]
+        elif len(self.extra_args) > 1:
+            if self.extra_args[0] == "tfidf":
+                self.tfidf_scoring = True
+            elif self.extra_args[0] == "mww":
+                self.tfidf_scoring = True
+                self.MWW = True
+            elif self.extra_args[0] == "dunning":
+                self.tfidf_scoring = True
+                self.dunning = True
+            self.interval = self.extra_args[1]
+        else:
+            self.interval = "90"
+
+    def _split_into_labels(self):
+        datestr_to_datetime = {}
+        for filename in self.metadata.keys():
+            date_str = self.metadata[filename]["date"]
+            cleaned_date = date_str[0:10]
+            if "-00" in cleaned_date:
+                cleaned_date = cleaned_date[0:4] + "-01-01"
+            datestr_to_datetime[date_str] = datetime.strptime(cleaned_date, "%Y-%m-%d")
+        datetimes = sorted(datestr_to_datetime.values())
+        start_date = datetimes[0]
+        end_date = datetimes[-1]
+
+        if self.interval.isdigit():
+            interval = timedelta(int(self.interval))
+        else:
+            interval = timedelta(90)
+
+        intervals = []
+        interval_names = []
+        start = end = start_date
+        while end <= end_date:
+            end += interval
+            intervals.append((start,end))
+            interval_names.append(start.isoformat()[0:10].replace('-','/') + '-' + end.isoformat()[0:10].replace('-','/'))
+            start = end
+
+        for filename, metadata in self.metadata.iteritems():
+            label = ""
+            for i in range(len(intervals)):
+                interval = intervals[i]
+                if interval[0] <= datestr_to_datetime[metadata["date"]] < interval[1]:
+                    label = interval_names[i]
+                    break
+            if label not in self.labels:
+                self.labels[label] = set()
+            self.labels[label].add(filename)
+
+if __name__ == "__main__":
+    try:
+        processor = WordCloudChronological(track_progress = True)
+        processor.process()
+    except:
+        logging.error(traceback.format_exc())
\ No newline at end of file
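`_split_into_labels` buckets documents into fixed-width windows (default 90 days) between the earliest and latest dates, normalizing partial dates such as `1851-00-00` to January 1. A simplified sketch of the same bucketing, not part of the patch and with invented function names:

```python
# Illustrative sketch of the date bucketing in
# WordCloudChronological._split_into_labels: contiguous fixed-width windows
# from the earliest to the latest document date.
from datetime import datetime, timedelta

def _clean(date_str):
    d = date_str[:10]
    return d[:4] + "-01-01" if "-00" in d else d

def bucket_by_interval(dates, days=90):
    """dates: iterable of 'YYYY-MM-DD...' strings -> {window label: [date strings]}"""
    parsed = {d: datetime.strptime(_clean(d), "%Y-%m-%d") for d in dates}
    ordered = sorted(parsed.values())
    step = timedelta(days)
    windows, lo = [], ordered[0]
    while lo <= ordered[-1]:
        windows.append((lo, lo + step))
        lo += step
    labels = {}
    for d, dt in parsed.iteritems():
        for lo, hi in windows:
            if lo <= dt < hi:
                labels.setdefault("%s/%s" % (lo.date(), hi.date()), []).append(d)
                break
    return labels
```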
diff --git a/chrome/content/papermachines/processors/wordcloud_large.py b/chrome/content/papermachines/processors/wordcloud_large.py
new file mode 100755
index 0000000..3300d33
--- /dev/null
+++ b/chrome/content/papermachines/processors/wordcloud_large.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python2.7
+import sys, os, logging, traceback, codecs
+import wordcloud
+
+class LargeWordCloud(wordcloud.WordCloud):
+    """
+    Generate large word cloud
+    """
+    def _basic_params(self):
+        self.width = "960"
+        self.height = "500"
+        self.fontsize = "[10,72]"
+        self.name = "wordcloud_large"
+        self.n = 150
+        self.tfidf_scoring = len(self.extra_args) > 0
+
+if __name__ == "__main__":
+    try:
+        processor = LargeWordCloud(track_progress=True)
+        processor.process()
+    except:
+        logging.error(traceback.format_exc())
\ No newline at end of file
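Each processor is launched with one argument, the path to a JSON args file laid out as `[cwd, csv_file, out_dir, collection_name, extra_args...]` (see `TextProcessor.__init__` above). A hypothetical manual invocation, with every path below made up:

```python
# Hypothetical example of driving a processor by hand; the paths are invented.
# The JSON layout mirrors what TextProcessor.__init__ expects.
import json, subprocess, tempfile

args = ["/path/to/processors",              # cwd: where stopwords.txt and templates/ live
        "/path/to/out/my_collection.csv",   # metadata CSV with a 'filename' column
        "/path/to/out",                     # output directory (expects a logs/ subdirectory)
        "My Collection",                    # human-readable collection name
        "tfidf"]                            # any extra arg switches LargeWordCloud to tf-idf scoring

with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
    json.dump(args, f)

subprocess.call(["python2.7", "wordcloud_large.py", f.name])
```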
diff --git a/chrome/content/papermachines/processors/wordcloud_multiple.py b/chrome/content/papermachines/processors/wordcloud_multiple.py
new file mode 100755
index 0000000..e7119a8
--- /dev/null
+++ b/chrome/content/papermachines/processors/wordcloud_multiple.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python2.7
+import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
+import wordcloud
+
+class MultipleWordClouds(wordcloud.WordCloud):
+    """
+    Generate word clouds based on labels
+    """
+    def _basic_params(self):
+        self.name = "wordcloud_multiple"
+        self.width = "300"
+        self.height = "150"
+        self.fontsize = "[10,32]"
+        self.n = 50
+        self.tfidf_scoring = False
+        self.MWW = False
+        self.dunning = False
+        if len(self.extra_args) > 0:
+            if self.extra_args[0] == "tfidf":
+                self.tfidf_scoring = True
+            elif self.extra_args[0] == "mww":
+                self.tfidf_scoring = True
+                self.MWW = True
+            elif self.extra_args[0] == "dunning":
+                self.tfidf_scoring = True
+                self.dunning = True
+
+    def _rank_simple(self, vector):
+        return sorted(range(len(vector)), key=vector.__getitem__)
+
+    def _rank(self, seq):
+        n = len(seq)
+        ivec = self._rank_simple(seq)
+        svec = [seq[rank] for rank in ivec]
+        last_obs = svec[0]
+        new_vec = [1]*n
+        dupe_indices = set()
+
+        for i in xrange(1, n):
+            if svec[i] == last_obs:
+                dupe_indices.add(i-1)
+                dupe_indices.add(i)
+            else:
+                if len(dupe_indices) > 0:
+                    averank = (sum(dupe_indices) / float(len(dupe_indices))) + 1
+                    for j in dupe_indices:
+                        new_vec[j] = averank
+                    new_vec[i] = i + 1
+                    dupe_indices = set()
+                else:
+                    new_vec[i] = i + 1
+            last_obs = svec[i]
+        ranks = {svec[i]: rank for i, rank in enumerate(new_vec)}
+        return ranks
+
+    def _mannWhitney(self, A, B):
+        all_obs = A + B
+        n_a = len(A)
+        n_b = len(B)
+        n_ab = len(all_obs)
+
+        ranks = self._rank(all_obs)
+        t_a = sum([ranks[obs] for obs in A])
+        mu_a = float(n_a * (n_ab + 1)) / 2
+        t_a_max = (n_a * n_b) + (n_a * (len(A) + 1))/2
+        u_a = t_a_max - t_a
+        s = math.sqrt(float(n_a * n_b * (n_ab + 1))/12)
+        if t_a > mu_a:
+            z_a = (t_a - mu_a - 0.5)/ s
+        else:
+            z_a = (t_a - mu_a + 0.5)/ s
+        rho = u_a / (n_a*n_b)
+        return rho
+
+    def _dunning_held_out(self, word, label_set, other_set):
+        sets = [label_set, other_set]
+        count_total = [0.0, 0.0, 0.0, 0.0]
+        for i in range(len(sets)):
+            for filename in sets[i]:
+                if word in self.tf_by_doc[filename]:
+                    count_total[i] += self.tf_by_doc[filename][word]
+                count_total[i + 2] += sum(self.tf_by_doc[filename].values())
+        # count_total[i] = sum([word_weights[word] for filename, word_weights in self.tf_by_doc.iteritems() if filename in sets[i] and word in word_weights])
+        # count_total[i + 2] = sum([sum(word_weights.values()) for filename, word_weights in self.tf_by_doc.iteritems() if filename in sets[i]])
+        a, b, c, d = [float(x) for x in count_total]
+        if any([x == 0 for x in count_total]):
+            return 0
+        E1 = c*((a+b)/(c+d))
+        E2 = d*((a+b)/(c+d))
+        G2 = 2.0*((a*math.log(a/E1)) + (b*math.log(b/E2)))
+        return G2
+
+    def _dunning(self, word, label_set):
+        count_total = [0.0, self.freqs[word], 0.0, self.total_word_count]
+        for filename in label_set:
+            if word in self.tf_by_doc[filename]:
+                count_total[0] += self.tf_by_doc[filename][word]
+            count_total[2] += sum(self.tf_by_doc[filename].values())
+        a, b, c, d = [float(x) for x in count_total]
+        if any([x == 0 for x in count_total]):
+            return 0
+        E1 = c*((a+b)/(c+d))
+        E2 = d*((a+b)/(c+d))
+        G2 = 2.0*((a*math.log(a/E1)) + (b*math.log(b/E2)))
+        return G2
+
+    def _held_out(self, word, label_set, other_set):
+        ranks_by_set = [[],[]]
+        sets = [label_set, other_set]
+        appears_in_label_set = False
+        for i in range(len(sets)):
+            for filename in sets[i]:
+                if word in self.tf_by_doc[filename]:
+                    ranks_by_set[i].append(self.tf_by_doc[filename][word])
+                    if i == 0:
+                        appears_in_label_set = True
+                    # ranks_by_set[i].append(self.tf_by_doc[filename][word] * self.idf[word])
+                else:
+                    ranks_by_set[i].append(0)
+        if not appears_in_label_set:
+            return 0.0
+        else:
+            return self._mannWhitney(ranks_by_set[0], ranks_by_set[1])
+
+    def _split_into_labels(self):
+        for filename, data in self.metadata.iteritems():
+            if data["label"] not in self.labels:
+                self.labels[data["label"]] = set()
+            self.labels[data["label"]].add(filename)
+
+    def process(self):
+        logging.info("splitting into labeled sets")
+        self.labels = {}
+        self._split_into_labels()
+
+        clouds = {}
+
+        all_files = set(self.files)
+        if self.tfidf_scoring:
+            if self.dunning:
+                self._findTfIdfScores(scale=False)
+            else:
+                self._findTfIdfScores()
+            # self.top_tfidf_words = [item["text"] for item in self._topN(self.filtered_freqs, 150)]
+            self.top_tfidf_words = self.filtered_freqs.keys()
+
+        self.label_order = sorted(self.labels.keys())
+        for label in self.label_order:
+            filenames = self.labels[label]
+            logging.info("finding word frequencies for " + str(label))
+            if self.tfidf_scoring and self.MWW:
+                label_set = set(filenames)
+                other_set = all_files - label_set
+                word_rho = {}
+                for word in self.top_tfidf_words:
+                    word_rho[word] = self._held_out(word, label_set, other_set)
+                clouds[label] = self._topN(word_rho)
+            elif self.tfidf_scoring and self.dunning:
+                label_set = set(filenames)
+                other_set = all_files - label_set
+                word_G2 = {}
+                self.total_word_count = sum(self.freqs.values())
+                for word in self.top_tfidf_words:
+                    G2 = self._dunning_held_out(word, label_set, other_set)
+                    # G2 = self._dunning(word, label_set)
+                    if G2 > 15.13: # critical value for p < 0.001
+                        word_G2[word] = G2
+                clouds[label] = self._topN(word_G2)
+
+            elif self.tfidf_scoring:
+                tf_maxes = {}
+                for filename in filenames:
+                    for term, weight in self.tf_by_doc[filename].iteritems():
+                        if term not in tf_maxes:
+                            tf_maxes[term] = weight
+                        else:
+                            if weight > tf_maxes[term]:
+                                tf_maxes[term] = weight
+                tfidf_for_labelset = {term: weight * self.idf[term] for term, weight in tf_maxes.iteritems()}
+                filtered_freqs_for_labelset = {term: freq for term, freq in self.filtered_freqs.iteritems() if term in tfidf_for_labelset}
+                clouds[label] = self._topN(filtered_freqs_for_labelset)
+            else:
+                clouds[label] = self._findWordFreqs(filenames)
+
+        params = {"CLOUDS": json.dumps(clouds),
+                  "ORDER": json.dumps(self.label_order),
+                  "WIDTH": self.width,
+                  "HEIGHT": self.height,
+                  "FONTSIZE": self.fontsize
+        }
+
+        self.write_html(params)
+
+
+if __name__ == "__main__":
+    try:
+        processor = MultipleWordClouds(track_progress = True)
+        processor.process()
+    except:
+        logging.error(traceback.format_exc())
\ No newline at end of file
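In the Dunning branch, `a` and `b` are a term's counts inside and outside the labeled set, `c` and `d` are the total token counts of each side, and a term is kept when G² exceeds 15.13, the critical value for p < 0.001. A standalone restatement of the two-term G² used in `_dunning_held_out` above (not part of the patch; the function name and the example numbers are invented):

```python
# Standalone restatement of the G2 computation in MultipleWordClouds._dunning_held_out:
# a, b = term counts inside / outside the labeled set; c, d = total tokens on each side.
import math

def dunning_g2(a, b, c, d):
    a, b, c, d = float(a), float(b), float(c), float(d)
    if 0 in (a, b, c, d):
        return 0.0
    e1 = c * (a + b) / (c + d)   # expected count inside the labeled set
    e2 = d * (a + b) / (c + d)   # expected count outside it
    return 2.0 * (a * math.log(a / e1) + b * math.log(b / e2))

# e.g. a word seen 40 times in 10,000 tokens of one label but only 5 times in
# 20,000 tokens elsewhere scores well above the 15.13 cutoff:
# dunning_g2(40, 5, 10000, 20000) is roughly 60.5
```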
diff --git a/chrome/locale/en-US/papermachines/papermachines.dtd b/chrome/locale/en-US/papermachines/papermachines.dtd
index 22d7dcc..88e787b 100755
--- a/chrome/locale/en-US/papermachines/papermachines.dtd
+++ b/chrome/locale/en-US/papermachines/papermachines.dtd
@@ -63,6 +63,8 @@
+
+
diff --git a/defaults/preferences/defaults.js b/defaults/preferences/defaults.js
index 1832f34..cbd78ad 100644
--- a/defaults/preferences/defaults.js
+++ b/defaults/preferences/defaults.js
@@ -6,6 +6,8 @@
 pref("extensions.papermachines.general.extract_html", true);
 pref("extensions.papermachines.general.extract_notes", true);
 pref("extensions.papermachines.general.extract_tags", true);
+pref("extensions.papermachines.general.python_exe", "");
+
 pref("extensions.papermachines.import.title", "Issue");
 pref("extensions.papermachines.import.pubtitle", "The Daily News");
 pref("extensions.papermachines.import.guessdate", false);
diff --git a/install.rdf b/install.rdf
index becbabd..d740a31 100755
--- a/install.rdf
+++ b/install.rdf
@@ -5,7 +5,7 @@
     papermachines@chrisjr.org
     Paper Machines
-    0.2.4
+    0.2.5
     A Zotero extension for analysis and visualization in the digital humanities.
     Chris Johnson-Roberson
     http://chrisjr.github.com/papermachines/
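The new `extensions.papermachines.general.python_exe` preference defaults to an empty string and is meant to hold the path of the Python 2.7 interpreter used to run the processors above. A quick sanity check one could run against a configured value (illustrative only, not part of the patch; the function name is invented):

```python
# Illustrative check that a configured python_exe value actually points at a
# Python 2.7 interpreter, which is what the .py processors above require.
import subprocess

def is_python27(python_exe):
    try:
        # Python 2 prints its version to stderr, so fold it into stdout.
        out = subprocess.check_output([python_exe, "--version"], stderr=subprocess.STDOUT)
    except (OSError, subprocess.CalledProcessError):
        return False
    return out.strip().startswith("Python 2.7")
```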