add a search for Python executable

papermachines · Oct 11, 2012 · cca635b · cca635b
1 parent 3887bff
commit cca635b
Show file tree

Hide file tree

Showing 23 changed files with 1,883 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -6,15 +6,15 @@ Paper Machines is an open-source extension for the [Zotero](http://www.zotero.or
 
 ## Prerequisites
 
-In order to run Paper Machines, you will need the following (note that Python and Java are installed automatically on Mac OS X):
+In order to run Paper Machines, you will need the following (Python and Java are installed automatically on Mac OS X 10.7 and above):
 
 * [Zotero](http://www.zotero.org/) with PDF indexing tools installed (see the Search pane of Zotero's Preferences)
 * a corpus of documents with high-quality metadata (recommended: at least 1,000 for topic modeling purposes)
-* Python ([download for Windows](http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi))
-* Java ([download for Windows/Mac/Linux/etc.](http://java.com/en/download/index.jsp))
+* Python 2.7 ([download page](http://www.python.org/download/releases/2.7.3)) \[N.B. Mac OS X 10.6 users must download this version of Python\]
+* Java ([download page](http://java.com/en/download/index.jsp))
 
 ## Installation
-Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.4.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
+Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.5.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
 
 ## Usage
 To begin, right-click (control-click for Mac) on the collection you wish to analyze and select "Extract Texts for Paper Machines." Once the extraction process is complete, this right-click menu will offer several different processes that may be run on a collection, each with an accompanying visualization. Once these processes have been run, selecting "Export Output of Paper Machines..." will allow you to choose which visualizations to export.

diff --git a/chrome/content/papermachines/options.xul b/chrome/content/papermachines/options.xul
@@ -18,6 +18,7 @@
     <preference id="pref_extract_txt" name="extensions.papermachines.general.extract_txt" type="bool"/>
     <preference id="pref_extract_notes" name="extensions.papermachines.general.extract_notes" type="bool"/>
     <preference id="pref_extract_tags" name="extensions.papermachines.general.extract_tags" type="bool"/>
+    <preference id="pref_python_exe" name="extensions.papermachines.general.python_exe" type="unichar"/>
   </preferences>
 
    <vbox>
@@ -40,6 +41,10 @@
       <checkbox preference="pref_extract_notes" label="&papermachines.prefs.general.extract_notes;" id="extract_notes"/>
       <checkbox preference="pref_extract_tags" label="&papermachines.prefs.general.extract_tags;" id="extract_tags"/>
     </groupbox>
+    <hbox align="center">
+      <label control="python_exe" value="&papermachines.prefs.general.python_exe;"/>
+      <textbox preference="pref_python_exe" id="python_exe" maxlength="64"/>
+    </hbox>
     <separator class="groove-thin"/>
     <label value="&papermachines.prefs.after_close;"/>
   </vbox>

diff --git a/chrome/content/papermachines/papermachines.js b/chrome/content/papermachines/papermachines.js
@@ -18,6 +18,7 @@ Zotero.PaperMachines = {
 	install_dir: null,
 	tagCloudReplace: true,
 	processors_dir: null,
+	python_exe: null,
 	processors: ["wordcloud", "phrasenet", "mallet", "mallet_classify", "geoparse", "dbpedia", "export-output"],
 	processNames: null, // see locale files
 	prompts: null,
@@ -231,9 +232,11 @@ Zotero.PaperMachines = {
 		this.log_dir = this._getOrCreateDir("logs", this.out_dir);
 		this.args_dir = this._getOrCreateDir("args");
 
+
 		Components.utils.import("chrome://papermachines/content/Preferences.js");
 		Components.utils.import("chrome://papermachines/content/strptime.js");
 
+		this.python_exe = this.findPythonExecutable();
 
 		var stoplist_lang = Preferences.get("extensions.papermachines.general.lang") || "en";
 
@@ -252,7 +255,7 @@ Zotero.PaperMachines = {
 		Components.utils.import("resource://gre/modules/AddonManager.jsm");
 		AddonManager.getAddonByID("papermachines@chrisjr.org",
 			function(addon) {
-				Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI().QueryInterface(Components.interfaces.nsIFileURL).file);
+				Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI("").QueryInterface(Components.interfaces.nsIFileURL).file);
 			});
 
 		// Connect to (and create, if necessary) papermachines.sqlite in the Zotero directory
@@ -389,8 +392,8 @@ Zotero.PaperMachines = {
 			return;
 		}
 
-		var proc_file = Zotero.PaperMachines.processors_dir.clone();
-		proc_file.append(processor + ".pyw");
+		var processor_file = Zotero.PaperMachines.processors_dir.clone();
+		processor_file.append(processor + ".py");
 
 		var proc = Components.classes["@mozilla.org/process/util;1"]
 			.createInstance(Components.interfaces.nsIProcess);
@@ -413,7 +416,7 @@ Zotero.PaperMachines = {
 		var argFile = Zotero.PaperMachines._getOrCreateFile(argsHashFilename, Zotero.PaperMachines.args_dir);
 		Zotero.File.putContents(argFile, args_str);
 
-		var procArgs = [argFile.path];
+		var procArgs = [processor_file.path, argFile.path];
 
 		outFile.append(processor + thisID + "-" + args_hash + ".html");
 
@@ -431,9 +434,13 @@ Zotero.PaperMachines = {
 			}
 		};
 
-		var observer = new this.processObserver(processor, processPath, callback);
+		var observer = new Zotero.PaperMachines.processObserver(processor, processPath, callback);
+
+		var python_exe_file = Zotero.PaperMachines._getLocalFile(Zotero.PaperMachines.python_exe);
+
+		Zotero.PaperMachines.LOG("running " + python_exe_file.leafName + " " + procArgs.join(" "));
 
-		proc.init(proc_file);
+		proc.init(python_exe_file);
 		proc.runAsync(procArgs, procArgs.length, observer);
 	},
 	replaceTagsBoxWithWordCloud: function (uri) {
@@ -678,7 +685,7 @@ Zotero.PaperMachines = {
 	},
 	traverseItemGroup: function (itemGroup) {
 		var itemGroups = [];
-		if ("isLibrary" in itemGroup && itemGroup.isLibrary()) {
+		if (typeof itemGroup.isLibrary == "function" && itemGroup.isLibrary()) {
 			if (itemGroup.id == "L") {
 				itemGroups.push(ZoteroPane.collectionsView._dataItems[0][0]);
 				var collectionKeys = Zotero.DB.columnQuery("SELECT key from collections WHERE libraryID IS NULL;");
@@ -687,7 +694,7 @@ Zotero.PaperMachines = {
 				}
 			}
 		} else {
-			if ("isCollection" in itemGroup && itemGroup.isCollection()) {
+			if (typeof itemGroup.isCollection == "function" && itemGroup.isCollection()) {
 				itemGroups.push(itemGroup);
 				var currentCollection = ("ref" in itemGroup) ? itemGroup.ref : itemGroup;
 				if (currentCollection.hasChildCollections()) {
@@ -696,7 +703,7 @@ Zotero.PaperMachines = {
 						itemGroups.push(Zotero.PaperMachines.traverseItemGroup(children[i]));
 					}
 				}
-			} else if ("isGroup" in itemGroup && itemGroup.isGroup()) {
+			} else if (typeof itemGroup.isGroup == "function" && itemGroup.isGroup()) {
 				if (itemGroup.ref.hasCollections()) {
 					var children = itemGroup.ref.getCollections();
 					for (var i in children) {
@@ -922,7 +929,7 @@ Zotero.PaperMachines = {
 		Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [tagsFile.path, item.id, tagsFile.path.replace("_tags.txt", ".txt"), dir.leafName]);
 	},
 	_updateBundledFilesCallback: function (installLocation) {
-		this.install_dir = installLocation;
+		Zotero.PaperMachines.install_dir = installLocation;
 		var xpiZipReader, isUnpacked = installLocation.isDirectory();
 		if(!isUnpacked) {
 			xpiZipReader = Components.classes["@mozilla.org/libjar/zip-reader;1"]
@@ -941,12 +948,12 @@ Zotero.PaperMachines = {
 			procs_dir.append("papermachines");
 			procs_dir.append("processors");
 
-			this._copyAllFiles(procs_dir, this.processors_dir);
+			this._copyAllFiles(procs_dir, Zotero.PaperMachines.processors_dir);
 		}
-		this.aux_dir = this._getOrCreateDir("support", this.processors_dir);
+		Zotero.PaperMachines.aux_dir = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.processors_dir);
 
-		var new_aux = this._getOrCreateDir("support", this.out_dir);
-		this._copyAllFiles(this.aux_dir, new_aux);
+		var new_aux = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.out_dir);
+		Zotero.PaperMachines._copyAllFiles(Zotero.PaperMachines.aux_dir, new_aux);
 	},
 	_copyOrMoveAllFiles: function (copy_or_move, source, target, recursive) {
 		var files = source.directoryEntries;
@@ -960,10 +967,6 @@ Zotero.PaperMachines = {
 				}
 				if (copy_or_move) {
 					f.copyTo(target, f.leafName);
-					if (f.leafName.indexOf(".pyw") != -1) {
-						var regpy = f.leafName.replace(".pyw", ".py");
-						f.copyTo(target, regpy);
-					}
 				} else {
 					f.moveTo(target, f.leafName);
 				}
@@ -1500,7 +1503,7 @@ Zotero.PaperMachines = {
 			win.gBrowser.selectedTab = win.gBrowser.addTab(url);			
 		}
 	},
-	openPreferences : function() {
+	openPreferences: function() {
 	  if (!this._preferencesWindow || this._preferencesWindow.closed) {
 	    var instantApply = Application.prefs.get("browser.preferences.instantApply");
 	    var features = "chrome,titlebar,toolbar,centerscreen" +
@@ -1512,6 +1515,41 @@ Zotero.PaperMachines = {
 
 	  this._preferencesWindow.focus();
 	},
+	/**
+	 * Determine the path of the Python interpreter used to run the processors.
+	 *
+	 * The "extensions.papermachines.general.python_exe" preference wins when it
+	 * is set. Otherwise a list of candidate directories is probed and the first
+	 * existing executable is stored back into that preference.
+	 *
+	 * @returns {string} Path of the interpreter, or the (falsy) preference value
+	 *   when nothing was found; an error is reported in that case.
+	 */
+	findPythonExecutable: function () {
+		var pref_name = "extensions.papermachines.general.python_exe";
+		var python_exe = Preferences.get(pref_name);
+		if (python_exe) {
+			return python_exe;
+		}
+
+		var names, directories;
+		if (Zotero.platform == "Win32") {
+			// The Windows executable name carries no version number, so only the
+			// default Python 2.7 install location is trusted here.
+			names = ["pythonw.exe"];
+			directories = ["C:\\Python27\\"];
+		} else {
+			// Fall back to plain "python2.7" for installs that do not provide
+			// a "pythonw2.7" binary.
+			names = ["pythonw2.7", "python2.7"];
+			directories = ["/usr/bin", "/usr/local/bin", "/sw/bin", "/opt/local/bin"];
+
+			// The names above are version-qualified, so any PATH entry that
+			// contains one of them is an acceptable interpreter.
+			var environment = Components.classes["@mozilla.org/process/environment;1"]
+				.getService(Components.interfaces.nsIEnvironment);
+			var path = environment.get("PATH");
+			if (path) {
+				directories = directories.concat(path.split(":"));
+			}
+		}
+
+		// Name-major order: a "pythonw2.7" anywhere is preferred over "python2.7".
+		for (var i = 0; i < names.length && !python_exe; i++) {
+			for (var j = 0; j < directories.length && !python_exe; j++) {
+				if (!directories[j]) {
+					continue; // empty PATH component
+				}
+				try {
+					var executable = Zotero.PaperMachines._getLocalFile(directories[j]);
+					executable.append(names[i]);
+					if (executable.exists()) {
+						python_exe = executable.path;
+					}
+				} catch (e) {
+					// Deliberately skipped: a relative or malformed PATH entry
+					// cannot be turned into a file object and is not a candidate.
+				}
+			}
+		}
+
+		if (python_exe) {
+			Preferences.set(pref_name, python_exe);
+		} else {
+			Zotero.PaperMachines.ERROR("Python not found! Please enter the path to Python 2.7 in the Paper Machines preference window.");
+		}
+		return python_exe;
+
+	},
 	evtListener: function (evt) {
 		var node = evt.target, doc = node.ownerDocument;
 
@@ -1542,12 +1580,17 @@ Zotero.PaperMachines.processObserver.prototype = {
   observe: function(subject, topic, data) {
 	switch (topic) {
 		case "process-failed":
-			Zotero.PaperMachines.LOG("Process " + this.processName + " failed.")
+			Zotero.PaperMachines.LOG("Process " + this.processName + " failed.");
 			this.callback(false);
 			break;
 		case "process-finished":
-			Zotero.PaperMachines.LOG("Process " + this.processName + " finished.")
-			this.callback(true);
+			Zotero.PaperMachines.LOG("Process " + this.processName + " finished with exit value " + subject.exitValue);
+			if (subject.exitValue != 0) { // something went awry
+				Zotero.PaperMachines.ERROR("Process " + this.processName + " failed.");
+				this.callback(false);
+			} else {
+				this.callback(true);				
+			}
 			break;
 	}
 	this.unregister();

diff --git a/chrome/content/papermachines/processors/dbpedia.py b/chrome/content/papermachines/processors/dbpedia.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python2.7
+import sys, os, json, logging, urllib, urllib2, codecs, traceback
+import textprocessor
+
+
+class DBpedia(textprocessor.TextProcessor):
+	"""
+	annotates texts using DBpedia Spotlight
+
+	Each input text is posted to the Spotlight "annotate" endpoint and the JSON
+	reply is cached beside the text as "<name>_dbpedia.json". The cached
+	annotations are then tallied into a {URI: {itemID: count}} mapping that is
+	passed to write_html().
+	"""
+
+	def _basic_params(self):
+		"""Set this processor's name and option flags."""
+		self.name = "dbpedia"
+		self.dry_run = False
+		self.require_stopwords = False
+
+	def _get_annotated(self, text, confidence = 0.2, support = 20):
+		"""
+		POST the first 10,000 characters of `text` to self.url and return the
+		response body decoded to unicode.
+
+		`confidence` and `support` are sent as form fields of the same names.
+		Network and decoding errors propagate to the caller.
+		"""
+		values = {'text': text[0:10000].encode('utf-8'),
+			'confidence': confidence,
+			'support': support}
+		data = urllib.urlencode(values)
+		req = urllib2.Request(self.url, data, self.headers)
+		response = urllib2.urlopen(req)
+		try:
+			annotation = response.read()
+			# The charset has to come from the *response* headers; the request's
+			# own content-type (form-urlencoded) carries no charset at all.
+			encoding = response.headers.getparam('charset') or 'utf-8'
+		finally:
+			response.close()
+
+		return unicode(annotation, encoding)
+
+	def process(self):
+		"""
+		create JSON files with named entity recognition by DBpedia
+
+		Reads self.files and self.metadata (presumably populated by the
+		TextProcessor base class -- confirm there). Files that already have a
+		cached annotation are not sent again. A failure on one file is logged
+		and does not stop the remaining files.
+		"""
+
+		logging.info("beginning annotation")
+
+		self.url = "http://spotlight.dbpedia.org/rest/annotate"
+		self.headers = {'Accept': 'application/json', 'content-type': 'application/x-www-form-urlencoded'}
+
+		# maps annotation file -> source text file
+		annotated = {}
+		if not self.dry_run:
+			for filename in self.files:
+				logging.info("processing " + filename)
+				self.update_progress()
+				try:
+					annotated_filename = filename.replace(".txt", "_dbpedia.json")
+					if os.path.exists(annotated_filename):
+						annotated[annotated_filename] = filename
+					else:
+						with codecs.open(filename, 'r', encoding='utf-8') as f:
+							annotation = self._get_annotated(f.read())
+							if len(annotation) > 0:
+								annotated[annotated_filename] = filename
+								with codecs.open(annotated_filename, 'w', encoding='utf-8') as out:
+									out.write(annotation)
+				except (KeyboardInterrupt, SystemExit):
+					raise
+				except:
+					# best effort: log and continue with the next file
+					logging.error(traceback.format_exc())
+		else:
+			# dry run: only pick up annotations that are already cached
+			for filename in self.files:
+				annotated_filename = filename.replace(".txt", "_dbpedia.json")
+				if os.path.exists(annotated_filename):
+					annotated[annotated_filename] = filename
+
+		# uri -> {itemID: number of times the entity was spotted in that item}
+		uris_to_docs = {}
+		for json_annotation, filename in annotated.iteritems():
+			itemID = self.metadata[filename]["itemID"]
+			# "with" guarantees the handle is closed even if the JSON is invalid
+			with open(json_annotation) as annotation_file:
+				notes = json.load(annotation_file)
+			entities = notes.get("Resources", [])
+			for entity in entities:
+				uri = entity.get("@URI", "http://dbpedia.org/resource/")
+				if not uri in uris_to_docs:
+					uris_to_docs[uri] = {}
+				if not itemID in uris_to_docs[uri]:
+					uris_to_docs[uri][itemID] = 0
+				uris_to_docs[uri][itemID] += 1
+
+		filtered_uris = {}
+		weights = []
+		for uri, items in uris_to_docs.iteritems():
+			weights.append(sum(items.values()))
+		weights.sort()
+
+		# With no annotations at all there is no threshold to compute; indexing
+		# the empty list would raise IndexError, so the filter is skipped.
+		if weights:
+			# weight of the 100th heaviest URI (or of the lightest, if fewer)
+			min_weight = weights[max(-100, -len(weights))]
+
+			# NOTE(review): the comparison is strict, so URIs that tie with the
+			# threshold are dropped -- including every URI when all weights are
+			# equal. Left unchanged; confirm whether that is intended.
+			for uri, items in uris_to_docs.iteritems():
+				if sum(items.values()) > min_weight:
+					filtered_uris[uri] = items
+
+
+
+		# params = {"DATA": json.dumps(uris_to_docs)}
+		params = {"URIS_TO_DOCS": json.dumps(filtered_uris)}
+		self.write_html(params)
+
+		logging.info("finished")
+
+
+# Script entry point: run the processor and report failure through the exit
+# status, which is what the extension's process observer inspects.
+if __name__ == "__main__":
+	try:
+		processor = DBpedia(track_progress=True)
+		processor.process()
+	except (KeyboardInterrupt, SystemExit):
+		raise
+	except:
+		logging.error(traceback.format_exc())
+		# Without a non-zero status the caller would treat the crash as success.
+		sys.exit(1)