Browse files

extract notes and tags

  • Loading branch information...
1 parent 01ccea9 commit 67f7695f0c21967843d947deb271e30cef49e415 Chris Johnson-Roberson committed Sep 28, 2012
View
39 chrome/content/papermachines/options.xul
@@ -13,21 +13,34 @@
<prefpane id="papermachines-general-pane" label="&papermachines.prefs.general;">
<preferences>
<preference id="pref_lang" name="extensions.papermachines.general.lang" type="unichar"/>
+ <preference id="pref_extract_pdf" name="extensions.papermachines.general.extract_pdf" type="bool"/>
+ <preference id="pref_extract_html" name="extensions.papermachines.general.extract_html" type="bool"/>
+ <preference id="pref_extract_notes" name="extensions.papermachines.general.extract_notes" type="bool"/>
+ <preference id="pref_extract_tags" name="extensions.papermachines.general.extract_tags" type="bool"/>
</preferences>
- <groupbox orient="vertical">
- <vbox>
- <label control="lang" value="&papermachines.prefs.general.lang;"/>
- <menulist id="lang" preference="pref_lang">
- <menupopup>
- <menuitem label="English" value="en"/>
- <menuitem label="Portuguese" value="pt"/>
- </menupopup>
- </menulist>
- <separator class="groove-thin"/>
- <caption label="&papermachines.prefs.after_close;"/>
- </vbox>
- </groupbox>
+ <vbox>
+ <groupbox orient="vertical">
+ <caption label="&papermachines.prefs.general;"/>
+ <!-- Language selector; the chosen value is stored through the pref_lang preference. -->
+ <label control="lang" value="&papermachines.prefs.general.lang;"/>
+ <menulist id="lang" preference="pref_lang">
+ <menupopup>
+ <menuitem label="English" value="en"/>
+ <menuitem label="Portuguese" value="pt"/>
+ </menupopup>
+ </menulist>
+ </groupbox>
+ <separator class="groove-thin"/>
+ <groupbox orient="horizontal">
+ <caption label="&papermachines.prefs.general.extract;"/>
+ <checkbox preference="pref_extract_pdf" label="&papermachines.prefs.general.extract_pdf;" id="extract_pdf"/>
+ <checkbox preference="pref_extract_html" label="&papermachines.prefs.general.extract_html;" id="extract_html"/>
+ <checkbox preference="pref_extract_notes" label="&papermachines.prefs.general.extract_notes;" id="extract_notes"/>
+ <checkbox preference="pref_extract_tags" label="&papermachines.prefs.general.extract_tags;" id="extract_tags"/>
+ </groupbox>
+ <separator class="groove-thin"/>
+ <label value="&papermachines.prefs.after_close;"/>
+ </vbox>
</prefpane>
<prefpane id="papermachines-lda-pane" label="&papermachines.collectionmenu.topicmodeling;">
View
78 chrome/content/papermachines/papermachines.js
@@ -746,7 +746,7 @@ Zotero.PaperMachines = {
var text = Zotero.PaperMachines._getLocalFile(filename);
existent = text.exists();
if (!existent) {
- this.DB.query("DELETE FROM doc_files WHERE filename = ?;", [docs[i]["filename"]]);
+ this.DB.query("DELETE FROM doc_files WHERE filename = ?;", [filename]);
}
}
return existent ? filename : false;
@@ -841,32 +841,86 @@ Zotero.PaperMachines = {
processItem: function(itemGroupName, item, dir, i, queue) {
var percentDone = (parseInt(i)+queue.runningTotal)*100.0/queue.grandTotal;
Zotero.updateZoteroPaneProgressMeter(percentDone);
+ var gettingNotes = Preferences.get("extensions.papermachines.general.extract_notes");
+ var gettingTags = Preferences.get("extensions.papermachines.general.extract_tags");
+ var gettingPDF = Preferences.get("extensions.papermachines.general.extract_pdf");
+ var gettingHTML = Preferences.get("extensions.papermachines.general.extract_html");
+
var outFile = dir.clone();
outFile.append(Zotero.PaperMachines.getFilenameForItem(item));
- if (outFile.exists()) {
- Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO collection_docs (collection,itemID) VALUES (?,?)", [dir.leafName, item.id]);
- queue.runningTotal += 1;
- queue.next();
- return;
- }
+ var notesFile = dir.clone();
+ notesFile.append(outFile.leafName.replace(".txt", "_notes.html"));
+
+ var notes_str = "<html><head></head><body>";
+
+ var tagsFile = dir.clone();
+ tagsFile.append(outFile.leafName.replace(".txt", "_tags.txt"));
+
+ // if (outFile.exists()) {
+ // Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO collection_docs (collection,itemID) VALUES (?,?)", [dir.leafName, item.id]);
+ // if (gettingNotes) {
+ // Zotero.PaperMachines._extractNotes(item, notesFile, notes_str, outFile.path, dir.leafName);
+ // }
+
+ // if (gettingTags) {
+ // Zotero.PaperMachines._extractTags(item, tagsFile, tags_str, outFile.path, dir.leafName);
+ // }
+
+ // queue.runningTotal += 1;
+ // queue.next();
+ // return;
+ // }
var attachments = item.getAttachments(false);
var recognizedAttachments = false;
- for (a in attachments) {
+ for (var a in attachments) {
var a_item = Zotero.Items.get(attachments[a]);
- if (a_item.attachmentMIMEType == 'application/pdf'
- || a_item.attachmentMIMEType == 'text/html') {
+ if ((a_item.attachmentMIMEType == 'application/pdf' && gettingPDF)
+ || (a_item.attachmentMIMEType == 'text/html' && gettingHTML)
+ || a_item.attachmentMIMEType == 'text/plain') {
recognizedAttachments = true;
var orig_file = a_item.getFile().path;
if (orig_file) {
Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [orig_file, item.id, outFile.path, dir.leafName]);
}
}
+
+ if (gettingNotes && "hasNote" in a_item && a_item.hasNote()) {
+ notes_str += a_item.getNote() + "\n---\n";
+ }
}
+
+ if (gettingNotes) {
+ Zotero.PaperMachines._extractNotes(item, notesFile, notes_str, outFile.path, dir.leafName);
+ }
+
+ if (gettingTags) {
+ Zotero.PaperMachines._extractTags(item, tagsFile, tags_str, outFile.path, dir.leafName);
+ }
+
queue.runningTotal += 1;
queue.next();
},
+ _extractNotes: function (item, notesFile, notes_str, outFile_path, dir_leafName) {
+ var notes = item.getNotes(false);
+ for (var b in notes) {
+ var note = Zotero.Items.get(notes[b]);
+ notes_str += note.getNote() + "\n---\n";
+ }
+
+ notes_str += "</body></html>";
+
+ Zotero.File.putContents(notesFile, notes_str);
+ Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [notesFile.path, item.id, outFile_path, dir_leafName]);
+ },
+ _extractTags: function () {
+ var tags = item.getTags(false);
+ var tags_str = tags.map(function (d) { return d.name}).join(", ");
+
+ Zotero.File.putContents(tagsFile, tags_str);
+ Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [tagsFile.path, item.id, outFile_path, dir_leafName]);
+ },
_updateBundledFilesCallback: function (installLocation) {
this.install_dir = installLocation;
var xpiZipReader, isUnpacked = installLocation.isDirectory();
@@ -1309,9 +1363,7 @@ Zotero.PaperMachines = {
var ris_file = Zotero.PaperMachines._getOrCreateFile(import_dir.leafName + ".ris", import_dir);
Zotero.File.putContents(ris_file, ris_str);
- // Zotero.UnresponsiveScriptIndicator.disable();
- // Zotero_File_Interface.importFile(ris_file);
- // Zotero.UnresponsiveScriptIndicator.enable();
+ Zotero_File_Interface.importFile(ris_file);
}
}
},
View
7 chrome/content/papermachines/processors/extract.pyw
@@ -19,8 +19,7 @@ def strip_tags(html):
class Extract(textprocessor.TextProcessor):
"""
- Generate phrase net
- cf. http://www-958.ibm.com/software/data/cognos/manyeyes/page/Phrase_Net.html
+ Extract text from PDF or HTML files
"""
def _basic_params(self):
@@ -51,8 +50,10 @@ class Extract(textprocessor.TextProcessor):
import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
import_proc = subprocess.Popen(import_args, stdout = subprocess.PIPE)
text += import_proc.communicate()[0].decode('utf-8')
- if filename.lower().endswith(".html"):
+ elif filename.lower().endswith(".html"):
text += strip_tags(codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read())
+ elif filename.lower().endswith(".txt"):
+ text += codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()
with codecs.open(out_file, 'w', encoding="utf-8") as f:
f.write(text)
saved.append({"itemID": itemID, "collection": self.metadata[filename]["collection"], "filename": out_file})
View
6 chrome/locale/en-US/papermachines/papermachines.dtd
@@ -56,6 +56,12 @@
<!ENTITY papermachines.prefs.import.startingoffset "Starting Offset">
<!ENTITY papermachines.prefs.general.lang "Stoplist Language: ">
+<!ENTITY papermachines.prefs.general.extract "Data to Extract">
+<!ENTITY papermachines.prefs.general.extract_html "Web Snapshots">
+<!ENTITY papermachines.prefs.general.extract_pdf "PDFs with OCR text">
+<!ENTITY papermachines.prefs.general.extract_notes "Notes">
+<!ENTITY papermachines.prefs.general.extract_tags "Tags">
+
<!ENTITY papermachines.prefs.lda.topics "Number of topics: ">
<!ENTITY papermachines.prefs.lda.iterations "Number of iterations: ">
<!ENTITY papermachines.prefs.lda.burn_in "Burn-in (iterations before estimating hyperparameters): ">
View
5 defaults/preferences/defaults.js
@@ -1,5 +1,10 @@
pref("extensions.papermachines.general.lang", "en");
+pref("extensions.papermachines.general.extract_pdf", true);
+pref("extensions.papermachines.general.extract_html", true);
+pref("extensions.papermachines.general.extract_notes", true);
+pref("extensions.papermachines.general.extract_tags", true);
+
pref("extensions.papermachines.import.title", "Issue");
pref("extensions.papermachines.import.pubtitle", "The Daily News");
pref("extensions.papermachines.import.guessdate", false);

0 comments on commit 67f7695

Please sign in to comment.