Permalink
Browse files

add a search for Python executable

  • Loading branch information...
1 parent 3887bff commit cca635b8aff4efff9b39b9fd454bb821d4f682a1 @corajr corajr committed Oct 11, 2012
View
@@ -6,15 +6,15 @@ Paper Machines is an open-source extension for the [Zotero](http://www.zotero.or
## Prerequisites
-In order to run Paper Machines, you will need the following (note that Python and Java are installed automatically on Mac OS X):
+In order to run Paper Machines, you will need the following (Python and Java are installed automatically on Mac OS X 10.7 and above):
* [Zotero](http://www.zotero.org/) with PDF indexing tools installed (see the Search pane of Zotero's Preferences)
* a corpus of documents with high-quality metadata (recommended: at least 1,000 for topic modeling purposes)
-* Python ([download for Windows](http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi))
-* Java ([download for Windows/Mac/Linux/etc.](http://java.com/en/download/index.jsp))
+* Python 2.7 ([download page](http://www.python.org/download/releases/2.7.3)) \[N.B. Mac OS X 10.6 users must download this version of Python\]
+* Java ([download page](http://java.com/en/download/index.jsp))
## Installation
-Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.4.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
+Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.5.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
## Usage
To begin, right-click (control-click for Mac) on the collection you wish to analyze and select "Extract Texts for Paper Machines." Once the extraction process is complete, this right-click menu will offer several different processes that may be run on a collection, each with an accompanying visualization. Once these processes have been run, selecting "Export Output of Paper Machines..." will allow you to choose which visualizations to export.
@@ -18,6 +18,7 @@
<preference id="pref_extract_txt" name="extensions.papermachines.general.extract_txt" type="bool"/>
<preference id="pref_extract_notes" name="extensions.papermachines.general.extract_notes" type="bool"/>
<preference id="pref_extract_tags" name="extensions.papermachines.general.extract_tags" type="bool"/>
+ <preference id="pref_python_exe" name="extensions.papermachines.general.python_exe" type="unichar"/>
</preferences>
<vbox>
@@ -40,6 +41,10 @@
<checkbox preference="pref_extract_notes" label="&papermachines.prefs.general.extract_notes;" id="extract_notes"/>
<checkbox preference="pref_extract_tags" label="&papermachines.prefs.general.extract_tags;" id="extract_tags"/>
</groupbox>
+ <hbox align="center">
+ <label control="python_exe" value="&papermachines.prefs.general.python_exe;"/>
+ <textbox preference="pref_python_exe" id="python_exe" maxlength="64"/>
+ </hbox>
<separator class="groove-thin"/>
<label value="&papermachines.prefs.after_close;"/>
</vbox>
@@ -18,6 +18,7 @@ Zotero.PaperMachines = {
install_dir: null,
tagCloudReplace: true,
processors_dir: null,
+ python_exe: null,
processors: ["wordcloud", "phrasenet", "mallet", "mallet_classify", "geoparse", "dbpedia", "export-output"],
processNames: null, // see locale files
prompts: null,
@@ -231,9 +232,11 @@ Zotero.PaperMachines = {
this.log_dir = this._getOrCreateDir("logs", this.out_dir);
this.args_dir = this._getOrCreateDir("args");
+
Components.utils.import("chrome://papermachines/content/Preferences.js");
Components.utils.import("chrome://papermachines/content/strptime.js");
+ this.python_exe = this.findPythonExecutable();
var stoplist_lang = Preferences.get("extensions.papermachines.general.lang") || "en";
@@ -252,7 +255,7 @@ Zotero.PaperMachines = {
Components.utils.import("resource://gre/modules/AddonManager.jsm");
AddonManager.getAddonByID("papermachines@chrisjr.org",
function(addon) {
- Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI().QueryInterface(Components.interfaces.nsIFileURL).file);
+ Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI("").QueryInterface(Components.interfaces.nsIFileURL).file);
});
// Connect to (and create, if necessary) papermachines.sqlite in the Zotero directory
@@ -389,8 +392,8 @@ Zotero.PaperMachines = {
return;
}
- var proc_file = Zotero.PaperMachines.processors_dir.clone();
- proc_file.append(processor + ".pyw");
+ var processor_file = Zotero.PaperMachines.processors_dir.clone();
+ processor_file.append(processor + ".py");
var proc = Components.classes["@mozilla.org/process/util;1"]
.createInstance(Components.interfaces.nsIProcess);
@@ -413,7 +416,7 @@ Zotero.PaperMachines = {
var argFile = Zotero.PaperMachines._getOrCreateFile(argsHashFilename, Zotero.PaperMachines.args_dir);
Zotero.File.putContents(argFile, args_str);
- var procArgs = [argFile.path];
+ var procArgs = [processor_file.path, argFile.path];
outFile.append(processor + thisID + "-" + args_hash + ".html");
@@ -431,9 +434,13 @@ Zotero.PaperMachines = {
}
};
- var observer = new this.processObserver(processor, processPath, callback);
+ var observer = new Zotero.PaperMachines.processObserver(processor, processPath, callback);
+
+ var python_exe_file = Zotero.PaperMachines._getLocalFile(Zotero.PaperMachines.python_exe);
+
+ Zotero.PaperMachines.LOG("running " + python_exe_file.leafName + " " + procArgs.join(" "));
- proc.init(proc_file);
+ proc.init(python_exe_file);
proc.runAsync(procArgs, procArgs.length, observer);
},
replaceTagsBoxWithWordCloud: function (uri) {
@@ -678,7 +685,7 @@ Zotero.PaperMachines = {
},
traverseItemGroup: function (itemGroup) {
var itemGroups = [];
- if ("isLibrary" in itemGroup && itemGroup.isLibrary()) {
+ if (typeof itemGroup.isLibrary == "function" && itemGroup.isLibrary()) {
if (itemGroup.id == "L") {
itemGroups.push(ZoteroPane.collectionsView._dataItems[0][0]);
var collectionKeys = Zotero.DB.columnQuery("SELECT key from collections WHERE libraryID IS NULL;");
@@ -687,7 +694,7 @@ Zotero.PaperMachines = {
}
}
} else {
- if ("isCollection" in itemGroup && itemGroup.isCollection()) {
+ if (typeof itemGroup.isCollection == "function" && itemGroup.isCollection()) {
itemGroups.push(itemGroup);
var currentCollection = ("ref" in itemGroup) ? itemGroup.ref : itemGroup;
if (currentCollection.hasChildCollections()) {
@@ -696,7 +703,7 @@ Zotero.PaperMachines = {
itemGroups.push(Zotero.PaperMachines.traverseItemGroup(children[i]));
}
}
- } else if ("isGroup" in itemGroup && itemGroup.isGroup()) {
+ } else if (typeof itemGroup.isGroup == "function" && itemGroup.isGroup()) {
if (itemGroup.ref.hasCollections()) {
var children = itemGroup.ref.getCollections();
for (var i in children) {
@@ -922,7 +929,7 @@ Zotero.PaperMachines = {
Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [tagsFile.path, item.id, tagsFile.path.replace("_tags.txt", ".txt"), dir.leafName]);
},
_updateBundledFilesCallback: function (installLocation) {
- this.install_dir = installLocation;
+ Zotero.PaperMachines.install_dir = installLocation;
var xpiZipReader, isUnpacked = installLocation.isDirectory();
if(!isUnpacked) {
xpiZipReader = Components.classes["@mozilla.org/libjar/zip-reader;1"]
@@ -941,12 +948,12 @@ Zotero.PaperMachines = {
procs_dir.append("papermachines");
procs_dir.append("processors");
- this._copyAllFiles(procs_dir, this.processors_dir);
+ this._copyAllFiles(procs_dir, Zotero.PaperMachines.processors_dir);
}
- this.aux_dir = this._getOrCreateDir("support", this.processors_dir);
+ Zotero.PaperMachines.aux_dir = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.processors_dir);
- var new_aux = this._getOrCreateDir("support", this.out_dir);
- this._copyAllFiles(this.aux_dir, new_aux);
+ var new_aux = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.out_dir);
+ Zotero.PaperMachines._copyAllFiles(Zotero.PaperMachines.aux_dir, new_aux);
},
_copyOrMoveAllFiles: function (copy_or_move, source, target, recursive) {
var files = source.directoryEntries;
@@ -960,10 +967,6 @@ Zotero.PaperMachines = {
}
if (copy_or_move) {
f.copyTo(target, f.leafName);
- if (f.leafName.indexOf(".pyw") != -1) {
- var regpy = f.leafName.replace(".pyw", ".py");
- f.copyTo(target, regpy);
- }
} else {
f.moveTo(target, f.leafName);
}
@@ -1500,7 +1503,7 @@ Zotero.PaperMachines = {
win.gBrowser.selectedTab = win.gBrowser.addTab(url);
}
},
- openPreferences : function() {
+ openPreferences: function() {
if (!this._preferencesWindow || this._preferencesWindow.closed) {
var instantApply = Application.prefs.get("browser.preferences.instantApply");
var features = "chrome,titlebar,toolbar,centerscreen" +
@@ -1512,6 +1515,41 @@ Zotero.PaperMachines = {
this._preferencesWindow.focus();
},
+ findPythonExecutable: function () {
+ var python_exe = Preferences.get("extensions.papermachines.general.python_exe");
+ if (!python_exe) {
+ var environment = Components.classes["@mozilla.org/process/environment;1"]
+ .getService(Components.interfaces.nsIEnvironment);
+ var path = environment.get("PATH"),
+ python_name = "pythonw",
+ directories = [];
+
+ if (Zotero.platform == "Win32") {
+ python_name += ".exe";
+ directories = ["C:\\Python27\\"];
+ } else {
+ python_name += "2.7";
+ directories = ["/usr/bin", "/usr/local/bin", "/sw/bin", "/opt/local/bin"];
+ }
+
+ for (var i = 0, n = directories.length; i < n; i++) {
+ var executable = Zotero.PaperMachines._getLocalFile(directories[i]);
+ executable.append(python_name);
+ if (executable.exists()) {
+ python_exe = executable.path;
+ break;
+ }
+ }
+
+ if (python_exe) {
+ Preferences.set("extensions.papermachines.general.python_exe", python_exe);
+ } else {
+ Zotero.PaperMachines.ERROR("Python not found! Please enter the path to Python 2.7 in the Paper Machines preference window.")
+ }
+ }
+ return python_exe;
+
+ },
evtListener: function (evt) {
var node = evt.target, doc = node.ownerDocument;
@@ -1542,12 +1580,17 @@ Zotero.PaperMachines.processObserver.prototype = {
observe: function(subject, topic, data) {
switch (topic) {
case "process-failed":
- Zotero.PaperMachines.LOG("Process " + this.processName + " failed.")
+ Zotero.PaperMachines.LOG("Process " + this.processName + " failed.");
this.callback(false);
break;
case "process-finished":
- Zotero.PaperMachines.LOG("Process " + this.processName + " finished.")
- this.callback(true);
+ Zotero.PaperMachines.LOG("Process " + this.processName + " finished with exit value " + subject.exitValue);
+ if (subject.exitValue != 0) { // something went awry
+ Zotero.PaperMachines.ERROR("Process " + this.processName + " failed.");
+ this.callback(false);
+ } else {
+ this.callback(true);
+ }
break;
}
this.unregister();
@@ -0,0 +1,102 @@
+#!/usr/bin/env python2.7
+import sys, os, json, logging, urllib, urllib2, codecs, traceback
+import textprocessor
+
+
+class DBpedia(textprocessor.TextProcessor):
+ """
+ annotates texts using DBpedia Spotlight
+ """
+
+ def _basic_params(self):
+ self.name = "dbpedia"
+ self.dry_run = False
+ self.require_stopwords = False
+
+ def _get_annotated(self, text, confidence = 0.2, support = 20):
+ values = {'text': text[0:10000].encode('utf-8'),
+ 'confidence': confidence,
+ 'support': support}
+ data = urllib.urlencode(values)
+ req = urllib2.Request(self.url, data, self.headers)
+ response = urllib2.urlopen(req)
+ annotation = response.read()
+ encoding = req.headers.get('content-type', 'charset=utf8').split('charset=')[-1]
+
+ return unicode(annotation, encoding)
+
+ def process(self):
+ """
+ create JSON files with named entity recognition by DBpedia
+ """
+
+ logging.info("beginning annotation")
+
+ self.url = "http://spotlight.dbpedia.org/rest/annotate"
+ self.headers = {'Accept': 'application/json', 'content-type': 'application/x-www-form-urlencoded'}
+
+ annotated = {}
+ if not self.dry_run:
+ for filename in self.files:
+ logging.info("processing " + filename)
+ self.update_progress()
+ try:
+ annotated_filename = filename.replace(".txt", "_dbpedia.json")
+ if os.path.exists(annotated_filename):
+ annotated[annotated_filename] = filename
+ else:
+ with codecs.open(filename, 'r', encoding='utf-8') as f:
+ annotation = self._get_annotated(f.read())
+ if len(annotation) > 0:
+ annotated[annotated_filename] = filename
+ with codecs.open(annotated_filename, 'w', encoding='utf-8') as out:
+ out.write(annotation)
+ except (KeyboardInterrupt, SystemExit):
+ raise
+ except:
+ logging.error(traceback.format_exc())
+ else:
+ for filename in self.files:
+ annotated_filename = filename.replace(".txt", "_dbpedia.json")
+ if os.path.exists(annotated_filename):
+ annotated[annotated_filename] = filename
+
+ uris_to_docs = {}
+ for json_annotation, filename in annotated.iteritems():
+ itemID = self.metadata[filename]["itemID"]
+ notes = json.load(file(json_annotation))
+ entities = notes.get("Resources", [])
+ for entity in entities:
+ uri = entity.get("@URI", "http://dbpedia.org/resource/")
+ if not uri in uris_to_docs:
+ uris_to_docs[uri] = {}
+ if not itemID in uris_to_docs[uri]:
+ uris_to_docs[uri][itemID] = 0
+ uris_to_docs[uri][itemID] += 1
+
+ filtered_uris = {}
+ weights = []
+ for uri, items in uris_to_docs.iteritems():
+ weights.append(sum(items.values()))
+ weights.sort()
+ min_weight = weights[max(-100, -len(weights))]
+
+ for uri, items in uris_to_docs.iteritems():
+ if sum(items.values()) > min_weight:
+ filtered_uris[uri] = items
+
+
+
+ # params = {"DATA": json.dumps(uris_to_docs)}
+ params = {"URIS_TO_DOCS": json.dumps(filtered_uris)}
+ self.write_html(params)
+
+ logging.info("finished")
+
+
+if __name__ == "__main__":
+ try:
+ processor = DBpedia(track_progress=True)
+ processor.process()
+ except:
+ logging.error(traceback.format_exc())
Oops, something went wrong.

0 comments on commit cca635b

Please sign in to comment.