Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

add support for segmentation by paragraph

  • Loading branch information...
commit 331a311279b6feda5902863da1db1ad0bd952be7 1 parent b26e129
@corajr corajr authored
View
2  chrome/content/papermachines/options.xul
@@ -73,6 +73,7 @@
<preference id="pref_lda_optimize_interval" name="extensions.papermachines.lda.optimize_interval" type="int"/>
<preference id="pref_lda_symmetric_alpha" name="extensions.papermachines.lda.symmetric_alpha" type="bool"/>
<preference id="pref_lda_stemming" name="extensions.papermachines.lda.stemming" type="bool"/>
+ <preference id="pref_lda_segmentation" name="extensions.papermachines.lda.segmentation" type="bool"/>
<preference id="pref_lda_tfidf" name="extensions.papermachines.lda.tfidf" type="bool"/>
</preferences>
@@ -112,6 +113,7 @@
<checkbox preference="pref_lda_symmetric_alpha" label="&papermachines.prefs.lda.symmetric_alpha;" id="lda_symmetric_alpha"/>
<checkbox preference="pref_lda_stemming" label="&papermachines.prefs.lda.stemming;" id="lda_stemming"/>
<checkbox preference="pref_lda_tfidf" label="&papermachines.prefs.lda.tfidf;" id="lda_tfidf"/>
+ <checkbox preference="pref_lda_segmentation" label="&papermachines.prefs.lda.segmentation;" id="lda_segmentation"/>
</groupbox>
<spacer flex="1"/>
View
8 chrome/content/papermachines/papermachines.js
@@ -488,9 +488,16 @@ Zotero.PaperMachines = {
procArgs = ["-jar", this.jython_path].concat(procArgs);
+ if (Zotero.PaperMachines.memoryIntensive(processor)) {
+ procArgs = ["-Xmx1g"].concat(procArgs);
+ }
+
proc.init(java_exe_file);
proc.runAsync(procArgs, procArgs.length, observer);
},
+ memoryIntensive: function (processor) {
+ return processor.indexOf("mallet") != -1;
+ },
replaceTagsBoxWithWordCloud: function (uri) {
const XUL_NS = "http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul";
var iframe = document.createElementNS(XUL_NS, "iframe");
@@ -1372,6 +1379,7 @@ Zotero.PaperMachines = {
"mallet_lda": [{"name": "topics", "type": "text", "pref": "extensions.papermachines.lda.topics"},
{"name": "iterations", "type": "text", "pref": "extensions.papermachines.lda.iterations", "advanced": true},
{"name": "stemming", "type": "check", "pref": "extensions.papermachines.lda.stemming"},
+ {"name": "segmentation", "type": "check", "pref": "extensions.papermachines.lda.segmentation", "advanced": true},
{"name": "tfidf", "type": "check", "pref": "extensions.papermachines.lda.tfidf"},
{"name": "min_df", "type": "text", "pref": "extensions.papermachines.lda.min_df", "advanced": true},
{"name": "alpha", "type": "text", "pref": "extensions.papermachines.lda.alpha", "advanced": true},
View
34 chrome/content/papermachines/processors/mallet.py
@@ -48,24 +48,36 @@ def _import_dfr(self, dfr_dir):
logging.error(doi)
logging.error(traceback.format_exc())
+ def _output_text(self, text, f, filename):
+ text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
+ if self.stemming:
+ newtext = u''
+ for word in text.split():
+ if word not in self.stemmed:
+ self.stemmed[word] = stem(self, word)
+ newtext += self.stemmed[word] + u' '
+ text = newtext
+ f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
+ self.docs.append(filename)
+
def _import_files(self):
if self.stemming:
self.stemmed = {}
self.docs = []
+ self.segmentation = getattr(self, "segmentation", False)
+
with codecs.open(self.texts_file, 'w', encoding='utf-8') as f:
for filename in self.files:
with codecs.open(filename, 'r', encoding='utf-8') as input_file:
text = input_file.read()
- text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
- if self.stemming:
- newtext = u''
- for word in text.split():
- if word not in self.stemmed:
- self.stemmed[word] = stem(self, word)
- newtext += self.stemmed[word] + u' '
- text = newtext
- f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
- self.docs.append(filename)
+ if self.segmentation:
+ segments = filter(lambda x: x.count(' ') > 5, text.split("\n\n"))
+ for i, text_seg in enumerate(segments):
+ seg_filename = filename + "#" + str(i)
+ self.metadata[seg_filename] = self.metadata[filename]
+ self._output_text(text_seg, f, seg_filename)
+ else:
+ self._output_text(text, f, filename)
if self.dfr:
for doi, text in self._import_dfr(self.dfr_dir):
f.write(u'\t'.join([doi, self.metadata[doi]["label"], text]) + u'\n')
@@ -224,7 +236,7 @@ def _setup_mallet_instances(self, sequence=True, tfidf = False, stemming = True)
"--stoplist-file", self.stoplist,
"--input", self.texts_file,
"--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$",
- "--token-regex", '[\p{L}\p{M}]+',
+ "--token-regex", "\S+" if tfidf else "[\p{L}\p{M}]+",
"--output", self.instance_file]
if sequence:
import_args.append("--keep-sequence")
View
3  chrome/content/papermachines/processors/mallet_lda.py
@@ -85,6 +85,7 @@ def process(self):
self.optimize_interval = self.named_args["optimize_interval"]
self.burn_in = int(self.named_args["burn_in"])
self.lang = self.named_args["lang"]
+ self.segmentation = self.named_args["segmentation"]
else:
self.tfidf = True
self.min_df = 5
@@ -96,9 +97,9 @@ def process(self):
self.burn_in = 200
self.symmetric_alpha = "false"
self.optimize_interval = 0
+ self.segmentation = False
self.lang = "en"
-
self._setup_mallet_instances(sequence=True, tfidf=self.tfidf, stemming=self.stemming)
self.mallet_files = {'state': os.path.join(self.mallet_out_dir, "topic-state.gz"),
View
4 chrome/locale/en-US/papermachines/papermachines.dtd
@@ -82,7 +82,6 @@
<!ENTITY papermachines.prefs.general.experimental "Enable Experimental Features">
-
<!ENTITY papermachines.prefs.lda.topics "Number of topics: ">
<!ENTITY papermachines.prefs.lda.iterations "Number of iterations: ">
<!ENTITY papermachines.prefs.lda.burn_in "Burn-in (iterations before estimating hyperparameters): ">
@@ -95,5 +94,6 @@
<!ENTITY papermachines.prefs.lda.stemming "Porter stemming">
<!ENTITY papermachines.prefs.lda.tfidf "tf*idf filtering">
<!ENTITY papermachines.prefs.lda.min_df "min doc frequency for tf*idf">
-
+<!ENTITY papermachines.prefs.lda.segmentation "Segment by paragraph
+">
View
1  chrome/locale/en-US/papermachines/papermachines.properties
@@ -42,6 +42,7 @@ paramLabels.mallet_dmr.min_df = minimum doc frequency (for tf*idf)
paramLabels.mallet_dmr.features = comma-separated list of features (e.g. decade, place, label)
paramLabels.mallet_lda.topics = Number of topics
paramLabels.mallet_lda.stemming = Stemming
+paramLabels.mallet_lda.segmentation = Segment by paragraph
paramLabels.mallet_lda.lang = Language
paramLabels.mallet_dmr.lang = Language
paramLabels.mallet_lda.tfidf = tf*idf filtering
View
1  defaults/preferences/defaults.js
@@ -33,5 +33,6 @@ pref("extensions.papermachines.lda.optimize_interval", 0);
pref("extensions.papermachines.lda.symmetric_alpha", false);
pref("extensions.papermachines.lda.stemming", true);
+pref("extensions.papermachines.lda.segmentation", false);
pref("extensions.papermachines.lda.tfidf", true);
pref("extensions.papermachines.lda.min_df", 3);
Please sign in to comment.
Something went wrong with that request. Please try again.