Skip to content

Commit

Permalink
add a search for Python executable
Browse files Browse the repository at this point in the history
  • Loading branch information
corajr committed Oct 11, 2012
1 parent 3887bff commit cca635b
Show file tree
Hide file tree
Showing 23 changed files with 1,883 additions and 28 deletions.
8 changes: 4 additions & 4 deletions README.md
Expand Up @@ -6,15 +6,15 @@ Paper Machines is an open-source extension for the [Zotero](http://www.zotero.or

## Prerequisites

In order to run Paper Machines, you will need the following (note that Python and Java are installed automatically on Mac OS X):
In order to run Paper Machines, you will need the following (Python and Java are installed automatically on Mac OS X 10.7 and above):

* [Zotero](http://www.zotero.org/) with PDF indexing tools installed (see the Search pane of Zotero's Preferences)
* a corpus of documents with high-quality metadata (recommended: at least 1,000 for topic modeling purposes)
* Python ([download for Windows](http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi))
* Java ([download for Windows/Mac/Linux/etc.](http://java.com/en/download/index.jsp))
* Python 2.7 ([download page](http://www.python.org/download/releases/2.7.3)) \[N.B. Mac OS X 10.6 users must download this version of Python\]
* Java ([download page](http://java.com/en/download/index.jsp))

## Installation
Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.4.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.5.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.

## Usage
To begin, right-click (control-click for Mac) on the collection you wish to analyze and select "Extract Texts for Paper Machines." Once the extraction process is complete, this right-click menu will offer several different processes that may be run on a collection, each with an accompanying visualization. Once these processes have been run, selecting "Export Output of Paper Machines..." will allow you to choose which visualizations to export.
Expand Down
5 changes: 5 additions & 0 deletions chrome/content/papermachines/options.xul
Expand Up @@ -18,6 +18,7 @@
<preference id="pref_extract_txt" name="extensions.papermachines.general.extract_txt" type="bool"/>
<preference id="pref_extract_notes" name="extensions.papermachines.general.extract_notes" type="bool"/>
<preference id="pref_extract_tags" name="extensions.papermachines.general.extract_tags" type="bool"/>
<preference id="pref_python_exe" name="extensions.papermachines.general.python_exe" type="unichar"/>
</preferences>

<vbox>
Expand All @@ -40,6 +41,10 @@
<checkbox preference="pref_extract_notes" label="&papermachines.prefs.general.extract_notes;" id="extract_notes"/>
<checkbox preference="pref_extract_tags" label="&papermachines.prefs.general.extract_tags;" id="extract_tags"/>
</groupbox>
<hbox align="center">
<!-- Path to the Python interpreter. No maxlength is set: the value is a
     filesystem path, and a length cap would prevent entering a longer path. -->
<label control="python_exe" value="&papermachines.prefs.general.python_exe;"/>
<textbox preference="pref_python_exe" id="python_exe"/>
</hbox>
<separator class="groove-thin"/>
<label value="&papermachines.prefs.after_close;"/>
</vbox>
Expand Down
87 changes: 65 additions & 22 deletions chrome/content/papermachines/papermachines.js
Expand Up @@ -18,6 +18,7 @@ Zotero.PaperMachines = {
install_dir: null,
tagCloudReplace: true,
processors_dir: null,
python_exe: null,
processors: ["wordcloud", "phrasenet", "mallet", "mallet_classify", "geoparse", "dbpedia", "export-output"],
processNames: null, // see locale files
prompts: null,
Expand Down Expand Up @@ -231,9 +232,11 @@ Zotero.PaperMachines = {
this.log_dir = this._getOrCreateDir("logs", this.out_dir);
this.args_dir = this._getOrCreateDir("args");


Components.utils.import("chrome://papermachines/content/Preferences.js");
Components.utils.import("chrome://papermachines/content/strptime.js");

this.python_exe = this.findPythonExecutable();

var stoplist_lang = Preferences.get("extensions.papermachines.general.lang") || "en";

Expand All @@ -252,7 +255,7 @@ Zotero.PaperMachines = {
Components.utils.import("resource://gre/modules/AddonManager.jsm");
AddonManager.getAddonByID("papermachines@chrisjr.org",
function(addon) {
Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI().QueryInterface(Components.interfaces.nsIFileURL).file);
Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI("").QueryInterface(Components.interfaces.nsIFileURL).file);
});

// Connect to (and create, if necessary) papermachines.sqlite in the Zotero directory
Expand Down Expand Up @@ -389,8 +392,8 @@ Zotero.PaperMachines = {
return;
}

var proc_file = Zotero.PaperMachines.processors_dir.clone();
proc_file.append(processor + ".pyw");
var processor_file = Zotero.PaperMachines.processors_dir.clone();
processor_file.append(processor + ".py");

var proc = Components.classes["@mozilla.org/process/util;1"]
.createInstance(Components.interfaces.nsIProcess);
Expand All @@ -413,7 +416,7 @@ Zotero.PaperMachines = {
var argFile = Zotero.PaperMachines._getOrCreateFile(argsHashFilename, Zotero.PaperMachines.args_dir);
Zotero.File.putContents(argFile, args_str);

var procArgs = [argFile.path];
var procArgs = [processor_file.path, argFile.path];

outFile.append(processor + thisID + "-" + args_hash + ".html");

Expand All @@ -431,9 +434,13 @@ Zotero.PaperMachines = {
}
};

var observer = new this.processObserver(processor, processPath, callback);
var observer = new Zotero.PaperMachines.processObserver(processor, processPath, callback);

var python_exe_file = Zotero.PaperMachines._getLocalFile(Zotero.PaperMachines.python_exe);

Zotero.PaperMachines.LOG("running " + python_exe_file.leafName + " " + procArgs.join(" "));

proc.init(proc_file);
proc.init(python_exe_file);
proc.runAsync(procArgs, procArgs.length, observer);
},
replaceTagsBoxWithWordCloud: function (uri) {
Expand Down Expand Up @@ -678,7 +685,7 @@ Zotero.PaperMachines = {
},
traverseItemGroup: function (itemGroup) {
var itemGroups = [];
if ("isLibrary" in itemGroup && itemGroup.isLibrary()) {
if (typeof itemGroup.isLibrary == "function" && itemGroup.isLibrary()) {
if (itemGroup.id == "L") {
itemGroups.push(ZoteroPane.collectionsView._dataItems[0][0]);
var collectionKeys = Zotero.DB.columnQuery("SELECT key from collections WHERE libraryID IS NULL;");
Expand All @@ -687,7 +694,7 @@ Zotero.PaperMachines = {
}
}
} else {
if ("isCollection" in itemGroup && itemGroup.isCollection()) {
if (typeof itemGroup.isCollection == "function" && itemGroup.isCollection()) {
itemGroups.push(itemGroup);
var currentCollection = ("ref" in itemGroup) ? itemGroup.ref : itemGroup;
if (currentCollection.hasChildCollections()) {
Expand All @@ -696,7 +703,7 @@ Zotero.PaperMachines = {
itemGroups.push(Zotero.PaperMachines.traverseItemGroup(children[i]));
}
}
} else if ("isGroup" in itemGroup && itemGroup.isGroup()) {
} else if (typeof itemGroup.isGroup == "function" && itemGroup.isGroup()) {
if (itemGroup.ref.hasCollections()) {
var children = itemGroup.ref.getCollections();
for (var i in children) {
Expand Down Expand Up @@ -922,7 +929,7 @@ Zotero.PaperMachines = {
Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [tagsFile.path, item.id, tagsFile.path.replace("_tags.txt", ".txt"), dir.leafName]);
},
_updateBundledFilesCallback: function (installLocation) {
this.install_dir = installLocation;
Zotero.PaperMachines.install_dir = installLocation;
var xpiZipReader, isUnpacked = installLocation.isDirectory();
if(!isUnpacked) {
xpiZipReader = Components.classes["@mozilla.org/libjar/zip-reader;1"]
Expand All @@ -941,12 +948,12 @@ Zotero.PaperMachines = {
procs_dir.append("papermachines");
procs_dir.append("processors");

this._copyAllFiles(procs_dir, this.processors_dir);
this._copyAllFiles(procs_dir, Zotero.PaperMachines.processors_dir);
}
this.aux_dir = this._getOrCreateDir("support", this.processors_dir);
Zotero.PaperMachines.aux_dir = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.processors_dir);

var new_aux = this._getOrCreateDir("support", this.out_dir);
this._copyAllFiles(this.aux_dir, new_aux);
var new_aux = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.out_dir);
Zotero.PaperMachines._copyAllFiles(Zotero.PaperMachines.aux_dir, new_aux);
},
_copyOrMoveAllFiles: function (copy_or_move, source, target, recursive) {
var files = source.directoryEntries;
Expand All @@ -960,10 +967,6 @@ Zotero.PaperMachines = {
}
if (copy_or_move) {
f.copyTo(target, f.leafName);
if (f.leafName.indexOf(".pyw") != -1) {
var regpy = f.leafName.replace(".pyw", ".py");
f.copyTo(target, regpy);
}
} else {
f.moveTo(target, f.leafName);
}
Expand Down Expand Up @@ -1500,7 +1503,7 @@ Zotero.PaperMachines = {
win.gBrowser.selectedTab = win.gBrowser.addTab(url);
}
},
openPreferences : function() {
openPreferences: function() {
if (!this._preferencesWindow || this._preferencesWindow.closed) {
var instantApply = Application.prefs.get("browser.preferences.instantApply");
var features = "chrome,titlebar,toolbar,centerscreen" +
Expand All @@ -1512,6 +1515,41 @@ Zotero.PaperMachines = {

this._preferencesWindow.focus();
},
findPythonExecutable: function () {
var python_exe = Preferences.get("extensions.papermachines.general.python_exe");
if (!python_exe) {
var environment = Components.classes["@mozilla.org/process/environment;1"]
.getService(Components.interfaces.nsIEnvironment);
var path = environment.get("PATH"),
python_name = "pythonw",
directories = [];

if (Zotero.platform == "Win32") {
python_name += ".exe";
directories = ["C:\\Python27\\"];
} else {
python_name += "2.7";
directories = ["/usr/bin", "/usr/local/bin", "/sw/bin", "/opt/local/bin"];
}

for (var i = 0, n = directories.length; i < n; i++) {
var executable = Zotero.PaperMachines._getLocalFile(directories[i]);
executable.append(python_name);
if (executable.exists()) {
python_exe = executable.path;
break;
}
}

if (python_exe) {
Preferences.set("extensions.papermachines.general.python_exe", python_exe);
} else {
Zotero.PaperMachines.ERROR("Python not found! Please enter the path to Python 2.7 in the Paper Machines preference window.")
}
}
return python_exe;

},
evtListener: function (evt) {
var node = evt.target, doc = node.ownerDocument;

Expand Down Expand Up @@ -1542,12 +1580,17 @@ Zotero.PaperMachines.processObserver.prototype = {
observe: function(subject, topic, data) {
switch (topic) {
case "process-failed":
Zotero.PaperMachines.LOG("Process " + this.processName + " failed.")
Zotero.PaperMachines.LOG("Process " + this.processName + " failed.");
this.callback(false);
break;
case "process-finished":
Zotero.PaperMachines.LOG("Process " + this.processName + " finished.")
this.callback(true);
Zotero.PaperMachines.LOG("Process " + this.processName + " finished with exit value " + subject.exitValue);
if (subject.exitValue != 0) { // something went awry
Zotero.PaperMachines.ERROR("Process " + this.processName + " failed.");
this.callback(false);
} else {
this.callback(true);
}
break;
}
this.unregister();
Expand Down
102 changes: 102 additions & 0 deletions chrome/content/papermachines/processors/dbpedia.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python2.7
import sys, os, json, logging, urllib, urllib2, codecs, traceback
import textprocessor


class DBpedia(textprocessor.TextProcessor):
    """
    annotates texts using DBpedia Spotlight

    Each input text is sent to the Spotlight REST endpoint; the JSON reply is
    cached next to the text as ``<name>_dbpedia.json`` and reused on later
    runs. The cached annotations are then aggregated into a mapping of
    entity URI -> {itemID: occurrence count} and handed to ``write_html``.
    """

    def _basic_params(self):
        """Set the processor name and its default flags."""
        self.name = "dbpedia"
        self.dry_run = False
        self.require_stopwords = False

    def _annotated_path(self, filename):
        """Return the path of the cached annotation file for ``filename``."""
        return filename.replace(".txt", "_dbpedia.json")

    def _get_annotated(self, text, confidence = 0.2, support = 20):
        """
        POST ``text`` to ``self.url`` and return the response body as unicode.

        Only the first 10,000 characters of ``text`` are sent. ``confidence``
        and ``support`` are passed through as form fields of the same name.
        Errors raised by urllib2 propagate to the caller.
        """
        values = {'text': text[0:10000].encode('utf-8'),
                  'confidence': confidence,
                  'support': support}
        data = urllib.urlencode(values)
        req = urllib2.Request(self.url, data, self.headers)
        response = urllib2.urlopen(req)
        try:
            annotation = response.read()
            # Take the charset from the *response* Content-Type header (the
            # request headers say nothing about the reply); default to UTF-8
            # when the server does not declare one.
            encoding = response.info().getparam('charset') or 'utf-8'
        finally:
            response.close()

        return unicode(annotation, encoding)

    def process(self):
        """
        create JSON files with named entity recognition by DBpedia
        """

        logging.info("beginning annotation")

        self.url = "http://spotlight.dbpedia.org/rest/annotate"
        self.headers = {'Accept': 'application/json', 'content-type': 'application/x-www-form-urlencoded'}

        # Maps annotation file path -> source text path.
        annotated = {}
        if not self.dry_run:
            for filename in self.files:
                logging.info("processing " + filename)
                self.update_progress()
                try:
                    annotated_filename = self._annotated_path(filename)
                    if os.path.exists(annotated_filename):
                        # Already annotated on an earlier run; reuse the cache.
                        annotated[annotated_filename] = filename
                    else:
                        with codecs.open(filename, 'r', encoding='utf-8') as f:
                            text = f.read()
                        annotation = self._get_annotated(text)
                        if len(annotation) > 0:
                            annotated[annotated_filename] = filename
                            with codecs.open(annotated_filename, 'w', encoding='utf-8') as out:
                                out.write(annotation)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception:
                    # Best effort: a failure on one file is logged and the
                    # remaining files are still processed.
                    logging.error(traceback.format_exc())
        else:
            # Dry run: no requests are made, only existing caches are used.
            for filename in self.files:
                annotated_filename = self._annotated_path(filename)
                if os.path.exists(annotated_filename):
                    annotated[annotated_filename] = filename

        uris_to_docs = {}
        for json_annotation, filename in annotated.iteritems():
            itemID = self.metadata[filename]["itemID"]
            # Close the handle deterministically instead of leaking it.
            with open(json_annotation) as annotation_file:
                notes = json.load(annotation_file)
            entities = notes.get("Resources", [])
            for entity in entities:
                uri = entity.get("@URI", "http://dbpedia.org/resource/")
                if not uri in uris_to_docs:
                    uris_to_docs[uri] = {}
                if not itemID in uris_to_docs[uri]:
                    uris_to_docs[uri][itemID] = 0
                uris_to_docs[uri][itemID] += 1

        filtered_uris = {}
        weights = sorted(sum(items.values()) for items in uris_to_docs.itervalues())

        # With no annotations there is nothing to rank; indexing the empty
        # list would raise IndexError, so the filter is skipped entirely.
        if weights:
            # Weight of the 100th-heaviest URI (or of the lightest one when
            # there are fewer than 100).
            min_weight = weights[max(-100, -len(weights))]

            # NOTE(review): the comparison is strict, so URIs tied with the
            # threshold (including every URI when all weights are equal) are
            # dropped -- confirm this is intended.
            for uri, items in uris_to_docs.iteritems():
                if sum(items.values()) > min_weight:
                    filtered_uris[uri] = items

        # params = {"DATA": json.dumps(uris_to_docs)}
        params = {"URIS_TO_DOCS": json.dumps(filtered_uris)}
        self.write_html(params)

        logging.info("finished")


if __name__ == "__main__":
    # Run the processor as a script. Any failure is logged and reported to
    # the launching process through a non-zero exit status, so a caller that
    # inspects the exit code does not mistake a crash for success.
    try:
        processor = DBpedia(track_progress=True)
        processor.process()
    except Exception:
        logging.error(traceback.format_exc())
        sys.exit(1)

0 comments on commit cca635b

Please sign in to comment.