Permalink
Browse files

0.2.6 -- 0.2.5 introduced a bad bug

wherein Python processes never registered as "finished," disabling the
program entirely. Regression testing is, at this point, indispensable.
  • Loading branch information...
1 parent e4af6e3 commit 8ae15f923b44bf3eb6d44ab0d7b7f6f1e70cad24 Chris Johnson-Roberson committed Oct 13, 2012
View
2 README.md
@@ -14,7 +14,7 @@ In order to run Paper Machines, you will need the following (Python and Java are
* Java ([download page](http://java.com/en/download/index.jsp))
## Installation
-Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.5.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
+Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the <a href="https://github.com/downloads/chrisjr/papermachines/papermachines-0.2.6.xpi">XPI file</a>. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
## Usage
To begin, right-click (control-click on a Mac) on the collection you wish to analyze and select "Extract Texts for Paper Machines." Once the extraction process is complete, the same right-click menu will offer several different processes that may be run on a collection, each with an accompanying visualization. Once these processes have been run, selecting "Export Output of Paper Machines..." will allow you to choose which visualizations to export.
View
7 chrome/content/papermachines/options.xul
@@ -7,8 +7,6 @@
title="Paper Machines Options"
height="400"
xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
-
-<script src="chrome://papermachines/content/options.js"/>
<prefpane id="papermachines-general-pane" label="&papermachines.prefs.general;">
<preferences>
@@ -18,6 +16,7 @@
<preference id="pref_extract_txt" name="extensions.papermachines.general.extract_txt" type="bool"/>
<preference id="pref_extract_notes" name="extensions.papermachines.general.extract_notes" type="bool"/>
<preference id="pref_extract_tags" name="extensions.papermachines.general.extract_tags" type="bool"/>
+ <preference id="pref_extract_livepages" name="extensions.papermachines.general.extract_livepages" type="bool"/>
<preference id="pref_python_exe" name="extensions.papermachines.general.python_exe" type="unichar"/>
</preferences>
@@ -40,10 +39,11 @@
<checkbox preference="pref_extract_txt" label="&papermachines.prefs.general.extract_txt;" id="extract_txt"/>
<checkbox preference="pref_extract_notes" label="&papermachines.prefs.general.extract_notes;" id="extract_notes"/>
<checkbox preference="pref_extract_tags" label="&papermachines.prefs.general.extract_tags;" id="extract_tags"/>
+ <!-- <checkbox preference="pref_extract_livepages" label="&papermachines.prefs.general.extract_livepages;" id="extract_livepages"/> -->
</groupbox>
<hbox align="center">
<label control="python_exe" value="&papermachines.prefs.general.python_exe;"/>
- <textbox preference="pref_python_exe" id="python_exe" maxlength="64"/>
+ <textbox preference="pref_python_exe" id="python_exe" maxlength="100"/>
</hbox>
<separator class="groove-thin"/>
<label value="&papermachines.prefs.after_close;"/>
@@ -151,5 +151,6 @@
</groupbox>
</prefpane>
+<script src="chrome://papermachines/content/options.js"/>
</prefwindow>
View
22 chrome/content/papermachines/papermachines.js
@@ -232,7 +232,6 @@ Zotero.PaperMachines = {
this.log_dir = this._getOrCreateDir("logs", this.out_dir);
this.args_dir = this._getOrCreateDir("args");
-
Components.utils.import("chrome://papermachines/content/Preferences.js");
Components.utils.import("chrome://papermachines/content/strptime.js");
@@ -284,10 +283,10 @@ Zotero.PaperMachines = {
pdftotext.append(Zotero.Fulltext.pdfConverterFileName);
var path = "zotero://papermachines/extract/" + Zotero.PaperMachines.getItemGroupID(itemGroup) + "/" + encodeURIComponent(pdftotext.path);
- this.DB.beginTransaction();
+ // this.DB.beginTransaction();
this.DB.query("UPDATE processed_collections SET status = 'failed' WHERE processor='extract' AND collection = ?;", [id]);
this.DB.query("DELETE FROM collection_docs WHERE collection = ? OR collection IN (SELECT child FROM collections WHERE parent = ?);", [id, id]);
- this.DB.commitTransaction();
+ // this.DB.commitTransaction();
var queue = new Zotero.PaperMachines._Sequence(function() {
Zotero.UnresponsiveScriptIndicator.enable();
@@ -1584,13 +1583,16 @@ Zotero.PaperMachines.processObserver.prototype = {
this.callback(false);
break;
case "process-finished":
- Zotero.PaperMachines.LOG("Process " + this.processName + " finished with exit value " + subject.exitValue);
- if (subject.exitValue != 0) { // something went awry
- Zotero.PaperMachines.ERROR("Process " + this.processName + " failed.");
- this.callback(false);
- } else {
- this.callback(true);
- }
+ var exitValue = subject.QueryInterface(Components.interfaces.nsIProcess).exitValue;
+ if (typeof exitValue == "number") {
+ if (exitValue == 0) { //success
+ Zotero.PaperMachines.LOG("Process " + this.processName + " finished successfully.");
+ this.callback(true);
+ } else {
+ Zotero.PaperMachines.ERROR("Process " + this.processName + " failed with exit value " + exitValue);
+ this.callback(false);
+ }
+ }
break;
}
this.unregister();
View
112 chrome/content/papermachines/processors/extract.py
@@ -12,65 +12,71 @@ def handle_data(self, d):
def get_data(self):
return u''.join(self.fed)
-def strip_tags(html):
- s = MLStripper()
- s.feed(html)
- return s.get_data()
+def strip_tags(filename):
+ try:
+ html = codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()
+ s = MLStripper()
+ s.feed(html)
+ return s.get_data()
+ except:
+ logging.error("Non-fatal HTML error on {:} -- continuing".format(os.path.basename(filename)))
+# logging.error(traceback.format_exc())
+ return ""
class Extract(textprocessor.TextProcessor):
- """
- Extract text from PDF or HTML files
- """
+ """
+ Extract text from PDF or HTML files
+ """
- def _basic_params(self):
- self.name = "extract"
- self.pdftotext = self.extra_args[0]
+ def _basic_params(self):
+ self.name = "extract"
+ self.pdftotext = self.extra_args[0]
- def process(self):
- logging.info("starting to process")
+ def process(self):
+ logging.info("starting to process")
- itemIDs = {}
- for filename in self.files:
- id = self.metadata[filename]["itemID"]
- if id not in itemIDs:
- itemIDs[id] = []
- itemIDs[id].append(filename)
+ itemIDs = {}
+ for filename in self.files:
+ id = self.metadata[filename]["itemID"]
+ if id not in itemIDs:
+ itemIDs[id] = []
+ itemIDs[id].append(filename)
- saved = []
- for itemID, filenames in itemIDs.iteritems():
- try:
- out_file = self.metadata[filenames[0]]["outfile"]
- out_dir = os.path.dirname(out_file)
- if not os.path.exists(out_dir):
- os.makedirs(out_dir)
- text = u''
- for filename in filenames:
- if filename.lower().endswith(".txt"):
- text += codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()
- elif filename.lower().endswith(".html"):
- text += strip_tags(codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read())
- elif filename.lower().endswith(".pdf"):
- import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
- import_proc = subprocess.Popen(import_args, stdout = subprocess.PIPE)
- text += import_proc.communicate()[0].decode('utf-8')
- with codecs.open(out_file, 'w', encoding="utf-8") as f:
- f.write(text)
- saved.append({"itemID": itemID, "collection": self.metadata[filename]["collection"], "filename": out_file})
- self.update_progress()
- except:
- logging.error(traceback.format_exc())
- if self.progress_initialized:
- self.progress_file.write('<1000>\n')
- json_out = os.path.join(self.out_dir, self.name + self.collection + ".json")
- with codecs.open(json_out, 'wb', encoding='utf-8') as f:
- json.dump(saved, f)
- params = {"SUCCEEDED": str(len(saved)), "TOTAL": str(len(itemIDs.keys()))}
- self.write_html(params)
+ saved = []
+ for itemID, filenames in itemIDs.iteritems():
+ try:
+ out_file = self.metadata[filenames[0]]["outfile"]
+ out_dir = os.path.dirname(out_file)
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir)
+ text = u''
+ for filename in filenames:
+ if filename.lower().endswith(".txt"):
+ text += codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()
+ elif filename.lower().endswith(".html"):
+ text += strip_tags(filename)
+ elif filename.lower().endswith(".pdf"):
+ import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
+ import_proc = subprocess.Popen(import_args, stdout = subprocess.PIPE)
+ text += import_proc.communicate()[0].decode('utf-8')
+ with codecs.open(out_file, 'w', encoding="utf-8") as f:
+ f.write(text)
+ saved.append({"itemID": itemID, "collection": self.metadata[filename]["collection"], "filename": out_file})
+ self.update_progress()
+ except:
+ logging.error(traceback.format_exc())
+ if self.progress_initialized:
+ self.progress_file.write('<1000>\n')
+ json_out = os.path.join(self.out_dir, self.name + self.collection + ".json")
+ with codecs.open(json_out, 'wb', encoding='utf-8') as f:
+ json.dump(saved, f)
+ params = {"SUCCEEDED": str(len(saved)), "TOTAL": str(len(itemIDs.keys()))}
+ self.write_html(params)
if __name__ == "__main__":
- try:
- processor = Extract(track_progress=True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
+ try:
+ processor = Extract(track_progress=True)
+ processor.process()
+ except:
+ logging.error(traceback.format_exc())
View
1 chrome/locale/en-US/papermachines/papermachines.dtd
@@ -61,6 +61,7 @@
<!ENTITY papermachines.prefs.general.extract_txt "Plain Text">
<!ENTITY papermachines.prefs.general.extract_html "Web Snapshots (HTML)">
<!ENTITY papermachines.prefs.general.extract_pdf "PDFs with OCR text">
+<!ENTITY papermachines.prefs.general.extract_livepages "Live Web Pages">
<!ENTITY papermachines.prefs.general.extract_notes "Notes">
<!ENTITY papermachines.prefs.general.extract_tags "Tags">
<!ENTITY papermachines.prefs.general.python_exe "Path to Python executable: ">
View
1 defaults/preferences/defaults.js
@@ -5,6 +5,7 @@ pref("extensions.papermachines.general.extract_pdf", true);
pref("extensions.papermachines.general.extract_html", true);
pref("extensions.papermachines.general.extract_notes", true);
pref("extensions.papermachines.general.extract_tags", true);
+pref("extensions.papermachines.general.extract_livepages", false);
pref("extensions.papermachines.general.python_exe", "");
View
2 install.rdf
@@ -5,7 +5,7 @@
<Description about="urn:mozilla:install-manifest">
<em:id>papermachines@chrisjr.org</em:id>
<em:name>Paper Machines</em:name>
- <em:version>0.2.5</em:version>
+ <em:version>0.2.6</em:version>
<em:description>A Zotero extension for analysis and visualization in the digital humanities.</em:description>
<em:creator>Chris Johnson-Roberson</em:creator>
<em:homepageURL>http://chrisjr.github.com/papermachines/</em:homepageURL>

0 comments on commit 8ae15f9

Please sign in to comment.