diff --git a/README.md b/README.md
index cb68e38..37ba1cd 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,15 @@ Paper Machines is an open-source extension for the [Zotero](http://www.zotero.or
 ## Prerequisites
-In order to run Paper Machines, you will need the following (note that Python and Java are installed automatically on Mac OS X):
+In order to run Paper Machines, you will need the following (Python and Java are installed automatically on Mac OS X 10.7 and above):
 * [Zotero](http://www.zotero.org/) with PDF indexing tools installed (see the Search pane of Zotero's Preferences)
 * a corpus of documents with high-quality metadata (recommended: at least 1,000 for topic modeling purposes)
-* Python ([download for Windows](http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi))
-* Java ([download for Windows/Mac/Linux/etc.](http://java.com/en/download/index.jsp))
+* Python 2.7 ([download page](http://www.python.org/download/releases/2.7.3)) \[N.B. Mac OS 10.6 users must download this version of Python\]
+* Java ([download page](http://java.com/en/download/index.jsp))
 ## Installation
-Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the XPI file. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
+Paper Machines should work either in Zotero for Firefox or Zotero Standalone. To install, you must download the XPI file. If you wish to use the extension in the Standalone version, right-click on the link and save the XPI file in your Downloads folder. Then, in Zotero Standalone, go to the Tools menu -> Add-Ons. Select the gear icon at the right, then "Install Add-On From File." Navigate to your Downloads folder (or wherever you have saved the XPI file) and open it.
 ## Usage
 To begin, right-click (control-click for Mac) on the collection you wish to analyze and select "Extract Texts for Paper Machines." Once the extraction process is complete, this right-click menu will offer several different processes that may be run on a collection, each with an accompanying visualization. Once these processes have been run, selecting "Export Output of Paper Machines..." will allow you to choose which visualizations to export.
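Since the README change above pins the Python requirement to 2.7, a quick way for a user to confirm that both prerequisites are actually reachable is a short self-check. The snippet below is only an illustrative sketch and is not part of Paper Machines (the `check_prerequisites` helper name is invented); it assumes `java` is on the PATH, as a normal Java install arranges.

```python
#!/usr/bin/env python2.7
# Illustrative sketch only (not shipped with Paper Machines): confirm that the
# prerequisites listed above -- Python 2.7 and a Java runtime -- are available.
import os
import subprocess
import sys


def check_prerequisites():
    ok = True
    if sys.version_info[:2] != (2, 7):
        print "Python 2.7 required, found %d.%d" % sys.version_info[:2]
        ok = False
    try:
        with open(os.devnull, 'w') as devnull:
            # `java -version` exits 0 (and prints to stderr) when Java is installed
            if subprocess.call(["java", "-version"], stdout=devnull, stderr=devnull) != 0:
                ok = False
    except OSError:
        print "Java was not found on the PATH"
        ok = False
    return ok


if __name__ == "__main__":
    sys.exit(0 if check_prerequisites() else 1)
```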
diff --git a/chrome/content/papermachines/options.xul b/chrome/content/papermachines/options.xul index 85e29e9..e522e60 100644 --- a/chrome/content/papermachines/options.xul +++ b/chrome/content/papermachines/options.xul @@ -18,6 +18,7 @@ + @@ -40,6 +41,10 @@ + + diff --git a/chrome/content/papermachines/papermachines.js b/chrome/content/papermachines/papermachines.js index 4ca75b2..2b19370 100755 --- a/chrome/content/papermachines/papermachines.js +++ b/chrome/content/papermachines/papermachines.js @@ -18,6 +18,7 @@ Zotero.PaperMachines = { install_dir: null, tagCloudReplace: true, processors_dir: null, + python_exe: null, processors: ["wordcloud", "phrasenet", "mallet", "mallet_classify", "geoparse", "dbpedia", "export-output"], processNames: null, // see locale files prompts: null, @@ -231,9 +232,11 @@ Zotero.PaperMachines = { this.log_dir = this._getOrCreateDir("logs", this.out_dir); this.args_dir = this._getOrCreateDir("args"); + Components.utils.import("chrome://papermachines/content/Preferences.js"); Components.utils.import("chrome://papermachines/content/strptime.js"); + this.python_exe = this.findPythonExecutable(); var stoplist_lang = Preferences.get("extensions.papermachines.general.lang") || "en"; @@ -252,7 +255,7 @@ Zotero.PaperMachines = { Components.utils.import("resource://gre/modules/AddonManager.jsm"); AddonManager.getAddonByID("papermachines@chrisjr.org", function(addon) { - Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI().QueryInterface(Components.interfaces.nsIFileURL).file); + Zotero.PaperMachines._updateBundledFilesCallback(addon.getResourceURI("").QueryInterface(Components.interfaces.nsIFileURL).file); }); // Connect to (and create, if necessary) papermachines.sqlite in the Zotero directory @@ -389,8 +392,8 @@ Zotero.PaperMachines = { return; } - var proc_file = Zotero.PaperMachines.processors_dir.clone(); - proc_file.append(processor + ".pyw"); + var processor_file = Zotero.PaperMachines.processors_dir.clone(); + processor_file.append(processor + ".py"); var proc = Components.classes["@mozilla.org/process/util;1"] .createInstance(Components.interfaces.nsIProcess); @@ -413,7 +416,7 @@ Zotero.PaperMachines = { var argFile = Zotero.PaperMachines._getOrCreateFile(argsHashFilename, Zotero.PaperMachines.args_dir); Zotero.File.putContents(argFile, args_str); - var procArgs = [argFile.path]; + var procArgs = [processor_file.path, argFile.path]; outFile.append(processor + thisID + "-" + args_hash + ".html"); @@ -431,9 +434,13 @@ Zotero.PaperMachines = { } }; - var observer = new this.processObserver(processor, processPath, callback); + var observer = new Zotero.PaperMachines.processObserver(processor, processPath, callback); + + var python_exe_file = Zotero.PaperMachines._getLocalFile(Zotero.PaperMachines.python_exe); + + Zotero.PaperMachines.LOG("running " + python_exe_file.leafName + " " + procArgs.join(" ")); - proc.init(proc_file); + proc.init(python_exe_file); proc.runAsync(procArgs, procArgs.length, observer); }, replaceTagsBoxWithWordCloud: function (uri) { @@ -678,7 +685,7 @@ Zotero.PaperMachines = { }, traverseItemGroup: function (itemGroup) { var itemGroups = []; - if ("isLibrary" in itemGroup && itemGroup.isLibrary()) { + if (typeof itemGroup.isLibrary == "function" && itemGroup.isLibrary()) { if (itemGroup.id == "L") { itemGroups.push(ZoteroPane.collectionsView._dataItems[0][0]); var collectionKeys = Zotero.DB.columnQuery("SELECT key from collections WHERE libraryID IS NULL;"); @@ -687,7 +694,7 @@ Zotero.PaperMachines = { } } } else { - if 
("isCollection" in itemGroup && itemGroup.isCollection()) { + if (typeof itemGroup.isCollection == "function" && itemGroup.isCollection()) { itemGroups.push(itemGroup); var currentCollection = ("ref" in itemGroup) ? itemGroup.ref : itemGroup; if (currentCollection.hasChildCollections()) { @@ -696,7 +703,7 @@ Zotero.PaperMachines = { itemGroups.push(Zotero.PaperMachines.traverseItemGroup(children[i])); } } - } else if ("isGroup" in itemGroup && itemGroup.isGroup()) { + } else if (typeof itemGroup.isGroup == "function" && itemGroup.isGroup()) { if (itemGroup.ref.hasCollections()) { var children = itemGroup.ref.getCollections(); for (var i in children) { @@ -922,7 +929,7 @@ Zotero.PaperMachines = { Zotero.PaperMachines.DB.query("INSERT OR IGNORE INTO files_to_extract (filename, itemID, outfile, collection) VALUES (?,?,?,?)", [tagsFile.path, item.id, tagsFile.path.replace("_tags.txt", ".txt"), dir.leafName]); }, _updateBundledFilesCallback: function (installLocation) { - this.install_dir = installLocation; + Zotero.PaperMachines.install_dir = installLocation; var xpiZipReader, isUnpacked = installLocation.isDirectory(); if(!isUnpacked) { xpiZipReader = Components.classes["@mozilla.org/libjar/zip-reader;1"] @@ -941,12 +948,12 @@ Zotero.PaperMachines = { procs_dir.append("papermachines"); procs_dir.append("processors"); - this._copyAllFiles(procs_dir, this.processors_dir); + this._copyAllFiles(procs_dir, Zotero.PaperMachines.processors_dir); } - this.aux_dir = this._getOrCreateDir("support", this.processors_dir); + Zotero.PaperMachines.aux_dir = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.processors_dir); - var new_aux = this._getOrCreateDir("support", this.out_dir); - this._copyAllFiles(this.aux_dir, new_aux); + var new_aux = Zotero.PaperMachines._getOrCreateDir("support", Zotero.PaperMachines.out_dir); + Zotero.PaperMachines._copyAllFiles(Zotero.PaperMachines.aux_dir, new_aux); }, _copyOrMoveAllFiles: function (copy_or_move, source, target, recursive) { var files = source.directoryEntries; @@ -960,10 +967,6 @@ Zotero.PaperMachines = { } if (copy_or_move) { f.copyTo(target, f.leafName); - if (f.leafName.indexOf(".pyw") != -1) { - var regpy = f.leafName.replace(".pyw", ".py"); - f.copyTo(target, regpy); - } } else { f.moveTo(target, f.leafName); } @@ -1500,7 +1503,7 @@ Zotero.PaperMachines = { win.gBrowser.selectedTab = win.gBrowser.addTab(url); } }, - openPreferences : function() { + openPreferences: function() { if (!this._preferencesWindow || this._preferencesWindow.closed) { var instantApply = Application.prefs.get("browser.preferences.instantApply"); var features = "chrome,titlebar,toolbar,centerscreen" + @@ -1512,6 +1515,41 @@ Zotero.PaperMachines = { this._preferencesWindow.focus(); }, + findPythonExecutable: function () { + var python_exe = Preferences.get("extensions.papermachines.general.python_exe"); + if (!python_exe) { + var environment = Components.classes["@mozilla.org/process/environment;1"] + .getService(Components.interfaces.nsIEnvironment); + var path = environment.get("PATH"), + python_name = "pythonw", + directories = []; + + if (Zotero.platform == "Win32") { + python_name += ".exe"; + directories = ["C:\\Python27\\"]; + } else { + python_name += "2.7"; + directories = ["/usr/bin", "/usr/local/bin", "/sw/bin", "/opt/local/bin"]; + } + + for (var i = 0, n = directories.length; i < n; i++) { + var executable = Zotero.PaperMachines._getLocalFile(directories[i]); + executable.append(python_name); + if (executable.exists()) { + python_exe = 
executable.path; + break; + } + } + + if (python_exe) { + Preferences.set("extensions.papermachines.general.python_exe", python_exe); + } else { + Zotero.PaperMachines.ERROR("Python not found! Please enter the path to Python 2.7 in the Paper Machines preference window.") + } + } + return python_exe; + + }, evtListener: function (evt) { var node = evt.target, doc = node.ownerDocument; @@ -1542,12 +1580,17 @@ Zotero.PaperMachines.processObserver.prototype = { observe: function(subject, topic, data) { switch (topic) { case "process-failed": - Zotero.PaperMachines.LOG("Process " + this.processName + " failed.") + Zotero.PaperMachines.LOG("Process " + this.processName + " failed."); this.callback(false); break; case "process-finished": - Zotero.PaperMachines.LOG("Process " + this.processName + " finished.") - this.callback(true); + Zotero.PaperMachines.LOG("Process " + this.processName + " finished with exit value " + subject.exitValue); + if (subject.exitValue != 0) { // something went awry + Zotero.PaperMachines.ERROR("Process " + this.processName + " failed."); + this.callback(false); + } else { + this.callback(true); + } break; } this.unregister(); diff --git a/chrome/content/papermachines/processors/dbpedia.py b/chrome/content/papermachines/processors/dbpedia.py new file mode 100755 index 0000000..5645f2f --- /dev/null +++ b/chrome/content/papermachines/processors/dbpedia.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python2.7 +import sys, os, json, logging, urllib, urllib2, codecs, traceback +import textprocessor + + +class DBpedia(textprocessor.TextProcessor): + """ + annotates texts using DBpedia Spotlight + """ + + def _basic_params(self): + self.name = "dbpedia" + self.dry_run = False + self.require_stopwords = False + + def _get_annotated(self, text, confidence = 0.2, support = 20): + values = {'text': text[0:10000].encode('utf-8'), + 'confidence': confidence, + 'support': support} + data = urllib.urlencode(values) + req = urllib2.Request(self.url, data, self.headers) + response = urllib2.urlopen(req) + annotation = response.read() + encoding = req.headers.get('content-type', 'charset=utf8').split('charset=')[-1] + + return unicode(annotation, encoding) + + def process(self): + """ + create JSON files with named entity recognition by DBpedia + """ + + logging.info("beginning annotation") + + self.url = "http://spotlight.dbpedia.org/rest/annotate" + self.headers = {'Accept': 'application/json', 'content-type': 'application/x-www-form-urlencoded'} + + annotated = {} + if not self.dry_run: + for filename in self.files: + logging.info("processing " + filename) + self.update_progress() + try: + annotated_filename = filename.replace(".txt", "_dbpedia.json") + if os.path.exists(annotated_filename): + annotated[annotated_filename] = filename + else: + with codecs.open(filename, 'r', encoding='utf-8') as f: + annotation = self._get_annotated(f.read()) + if len(annotation) > 0: + annotated[annotated_filename] = filename + with codecs.open(annotated_filename, 'w', encoding='utf-8') as out: + out.write(annotation) + except (KeyboardInterrupt, SystemExit): + raise + except: + logging.error(traceback.format_exc()) + else: + for filename in self.files: + annotated_filename = filename.replace(".txt", "_dbpedia.json") + if os.path.exists(annotated_filename): + annotated[annotated_filename] = filename + + uris_to_docs = {} + for json_annotation, filename in annotated.iteritems(): + itemID = self.metadata[filename]["itemID"] + notes = json.load(file(json_annotation)) + entities = notes.get("Resources", []) + 
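Stepping back from the individual processors for a moment: the papermachines.js changes above replace the old self-executing `.pyw` scripts with an explicit interpreter launch, so the detected Python 2.7 executable is run with the processor script and a JSON arguments file as its two arguments. The sketch below only illustrates that contract under stated assumptions (all paths and the collection name are made up); the layout of the arguments file follows the `TextProcessor.__init__` parsing that appears later in this diff.

```python
# Sketch of the new launch contract: python_exe <processor>.py <args.json>.
# Paths and names here are hypothetical; the argument order mirrors
# textprocessor.py (cwd, metadata CSV, output dir, collection name, extras).
import json
import subprocess
import tempfile

args = [
    "/path/to/processors",         # cwd: directory holding the processor scripts
    "/path/to/output/MYCOLL.csv",  # CSV listing extracted text files and metadata
    "/path/to/output",             # out_dir for the generated HTML/JSON
    "My Collection",               # human-readable collection name
    # optional extra arguments follow; named options may be appended as "json", "{...}"
]

with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
    json.dump(args, f)
    args_path = f.name

# Equivalent of proc.init(python_exe_file); proc.runAsync([processor_file.path, argFile.path], ...)
subprocess.call(["pythonw2.7", "/path/to/processors/wordcloud.py", args_path])
```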
for entity in entities: + uri = entity.get("@URI", "http://dbpedia.org/resource/") + if not uri in uris_to_docs: + uris_to_docs[uri] = {} + if not itemID in uris_to_docs[uri]: + uris_to_docs[uri][itemID] = 0 + uris_to_docs[uri][itemID] += 1 + + filtered_uris = {} + weights = [] + for uri, items in uris_to_docs.iteritems(): + weights.append(sum(items.values())) + weights.sort() + min_weight = weights[max(-100, -len(weights))] + + for uri, items in uris_to_docs.iteritems(): + if sum(items.values()) > min_weight: + filtered_uris[uri] = items + + + + # params = {"DATA": json.dumps(uris_to_docs)} + params = {"URIS_TO_DOCS": json.dumps(filtered_uris)} + self.write_html(params) + + logging.info("finished") + + +if __name__ == "__main__": + try: + processor = DBpedia(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/extract.py b/chrome/content/papermachines/processors/extract.py new file mode 100755 index 0000000..5dedf1a --- /dev/null +++ b/chrome/content/papermachines/processors/extract.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python2.7 +import sys, os, json, re, cStringIO, logging, traceback, codecs, urllib, subprocess +from HTMLParser import HTMLParser +import textprocessor + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return u''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + +class Extract(textprocessor.TextProcessor): + """ + Extract text from PDF or HTML files + """ + + def _basic_params(self): + self.name = "extract" + self.pdftotext = self.extra_args[0] + + + def process(self): + logging.info("starting to process") + + itemIDs = {} + for filename in self.files: + id = self.metadata[filename]["itemID"] + if id not in itemIDs: + itemIDs[id] = [] + itemIDs[id].append(filename) + + saved = [] + for itemID, filenames in itemIDs.iteritems(): + try: + out_file = self.metadata[filenames[0]]["outfile"] + out_dir = os.path.dirname(out_file) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + text = u'' + for filename in filenames: + if filename.lower().endswith(".txt"): + text += codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read() + elif filename.lower().endswith(".html"): + text += strip_tags(codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()) + elif filename.lower().endswith(".pdf"): + import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-'] + import_proc = subprocess.Popen(import_args, stdout = subprocess.PIPE) + text += import_proc.communicate()[0].decode('utf-8') + with codecs.open(out_file, 'w', encoding="utf-8") as f: + f.write(text) + saved.append({"itemID": itemID, "collection": self.metadata[filename]["collection"], "filename": out_file}) + self.update_progress() + except: + logging.error(traceback.format_exc()) + if self.progress_initialized: + self.progress_file.write('<1000>\n') + json_out = os.path.join(self.out_dir, self.name + self.collection + ".json") + with codecs.open(json_out, 'wb', encoding='utf-8') as f: + json.dump(saved, f) + params = {"SUCCEEDED": str(len(saved)), "TOTAL": str(len(itemIDs.keys()))} + self.write_html(params) + +if __name__ == "__main__": + try: + processor = Extract(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git 
a/chrome/content/papermachines/processors/geoparse.py b/chrome/content/papermachines/processors/geoparse.py new file mode 100755 index 0000000..f5733f5 --- /dev/null +++ b/chrome/content/papermachines/processors/geoparse.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python2.7 +import sys, os, json, logging, traceback, base64, time, codecs +import cPickle as pickle +from lib.placemaker import placemaker +from lib.placemaker.placemaker_api import placemaker_api_key +import textprocessor + + +class Geoparse(textprocessor.TextProcessor): + """ + Geoparsing using Yahoo! Placemaker + """ + + def _basic_params(self): + self.name = "geoparse" + self.dry_run = False + self.require_stopwords = False + + def process(self): + """ + create a JSON file with geographical data extracted from texts + """ + + self.name = "geoparse" + + p = placemaker(base64.b64decode(placemaker_api_key)) + + geo_parsed = {} + places_by_woeid = {} + + for filename in self.files: + logging.info("processing " + filename) + self.update_progress() + + file_geoparsed = filename.replace(".txt", "_geoparse.json") + + if os.path.exists(file_geoparsed): + geoparse_obj = json.load(file(file_geoparsed)) + elif not self.dry_run: + geoparse_obj = {'places_by_woeid': {}, 'references': {}} + try: + # id = self.metadata[filename]['itemID'] + str_to_parse = self.metadata[filename]['place'] + last_index = len(str_to_parse) + str_to_parse += codecs.open(filename, 'r', encoding='utf8').read()[0:(48000 - last_index)] #50k characters, shortened by initial place string + + city = None + places = [] + + p.find_places(str_to_parse.encode('utf8', 'ignore')) + for woeid, referenced_place in p.referencedPlaces.iteritems(): + place = referenced_place["place"] + geoparse_obj['places_by_woeid'][woeid] = {'name': place.name, 'type': place.placetype, 'coordinates': [place.centroid.longitude, place.centroid.latitude]} + + for reference in referenced_place["references"]: + if reference.start < last_index: + city = woeid + else: + places.append(woeid) + if not woeid in geoparse_obj['references']: + geoparse_obj['references'][woeid] = [] + geoparse_obj['references'][woeid].append((reference.start - last_index, reference.end - last_index)) + + geoparse_obj['places'] = places + geoparse_obj['city'] = city + json.dump(geoparse_obj, file(file_geoparsed, 'w')) + time.sleep(0.2) + except (KeyboardInterrupt, SystemExit): + raise + except: + logging.error(traceback.format_exc()) + + geo_parsed[filename] = geoparse_obj.get('places', []) + self.metadata[filename]['city'] = geoparse_obj.get('city') + for woeid, data in geoparse_obj.get('places_by_woeid', {}).iteritems(): + places_by_woeid[int(woeid)] = data + + places = {} + for filename, woeids in geo_parsed.iteritems(): + year = self.metadata[filename]["year"] + for woeid in woeids: + if woeid in places_by_woeid: + if woeid not in places: + places[woeid] = {} + places[woeid]["name"] = places_by_woeid[woeid]["name"] + places[woeid]["type"] = places_by_woeid[woeid]["type"] + places[woeid]["coordinates"] = places_by_woeid[woeid]["coordinates"] + places[woeid]["weight"] = {year: 1} + else: + if year not in places[woeid]["weight"]: + places[woeid]["weight"][year] = 1 + else: + places[woeid]["weight"][year] += 1 + + self.places_by_woeid = places_by_woeid + max_country_weight = 0 + + for place in sorted(places.keys()): + if places[place]["type"] == "Country": + country_sum = sum(places[place]["weight"].values()) + if country_sum > max_country_weight: + max_country_weight = country_sum + + placeIDsToNames = {k: v["name"] for k, v in 
places_by_woeid.iteritems()} + placeIDsToCoords = {k: v["coordinates"] for k, v in places_by_woeid.iteritems()} + + linksByYear = {} + sources = {} + + for filename in self.files: + if self.metadata[filename].get('city') is None or len(geo_parsed[filename]) < 2: + continue + try: + title = os.path.basename(filename) + itemID = self.metadata[filename]['itemID'] + year = self.metadata[filename]['year'] + if year not in linksByYear: + linksByYear[year] = {} + source = self.metadata[filename]['city'] + if source != None: + if source not in sources: + sources[source] = {} + if year not in sources[source]: + sources[source][year] = 0 + sources[source][year] += 1 + targets = geo_parsed[filename] + for target in targets: + edge = str(source) + ',' + str(target) + if edge not in linksByYear[year]: + linksByYear[year][edge] = 0 + linksByYear[year][edge] += 1 + except: + logging.info(traceback.format_exc()) + + years = sorted(linksByYear.keys()) + groupedLinksByYear = [] + + for year in years: + groupedLinksByYear.append([]) + for edge in linksByYear[year]: + weight = linksByYear[year][edge] + source, target = [int(x) for x in edge.split(',')] + groupedLinksByYear[-1].append({'source': source, 'target': target, 'year': year, 'weight': weight}) + + + params = {"PLACEIDSTOCOORDS": json.dumps(placeIDsToCoords), + "PLACEIDSTONAMES": json.dumps(placeIDsToNames), + "PLACESMENTIONED": json.dumps({k : v["weight"] for k, v in places.iteritems() if v["type"] != "Country"}), + "TEXTSFROMPLACE": json.dumps(sources), + "COUNTRIES": json.dumps({v["name"] : v["weight"] for k, v in places.iteritems() if v["type"] == "Country"}), + "MAX_COUNTRY_WEIGHT": str(max_country_weight), + "STARTDATE": str(min([int(x["year"]) for x in self.metadata.values() if x["year"].isdigit() and x["year"] != "0000"])), + "ENDDATE": str(max([int(x["year"]) for x in self.metadata.values() if x["year"].isdigit()])), + "LINKS_BY_YEAR": json.dumps(groupedLinksByYear) + } + self.write_html(params) + + logging.info("finished") + + +if __name__ == "__main__": + try: + processor = Geoparse(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/lib/merge_jstor.py b/chrome/content/papermachines/processors/lib/merge_jstor.py index f400cca..9f18770 100755 --- a/chrome/content/papermachines/processors/lib/merge_jstor.py +++ b/chrome/content/papermachines/processors/lib/merge_jstor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2.7 import csv, sys, os, shutil, logging diff --git a/chrome/content/papermachines/processors/mallet.py b/chrome/content/papermachines/processors/mallet.py new file mode 100755 index 0000000..3eb9a53 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python2.7 +import sys, os, shutil, logging, tempfile, time, subprocess, math, re, urllib, json, codecs, csv, traceback, platform +import xml.etree.ElementTree as et +from lib.porter2 import stem +import copy +import textprocessor + +class Mallet(textprocessor.TextProcessor): + """ + Base class for MALLET functionality + """ + + def _basic_params(self): + self.name = "mallet" + + def _import_dfr_metadata(self, dfr_dir): + citation_file = os.path.join(dfr_dir, "citations.CSV") + citations = {} + for rowdict in self.parse_csv(citation_file): + doi = rowdict.pop("id") + citations[doi] = rowdict + self.metadata[doi] = {'title': citations[doi].get("title", ""), 'year': 
citations[doi].get('pubdate','')[0:4], 'label': "jstor", 'itemID': doi} + return citations + + def _import_dfr(self, dfr_dir): + citations = self._import_dfr_metadata(dfr_dir) + + wordcounts_dir = os.path.join(dfr_dir, "wordcounts") + for doi in citations.keys(): + try: + this_text = '' + for rowdict in self.parse_csv(os.path.join(wordcounts_dir, "wordcounts_" + doi.replace('/','_') + ".CSV")): + word = rowdict["WORDCOUNTS"] + if word in self.stopwords: + continue + if self.stemming: + prestem = word + if word not in self.stemmed: + self.stemmed[prestem] = stem(prestem) + word = self.stemmed[prestem] + count = int(rowdict["WEIGHT"]) + + this_text += (word + u' ') * count + if len(this_text) < 20: + continue + yield doi, this_text + except: + logging.error(doi) + logging.error(traceback.format_exc()) + + def _import_files(self): + if self.stemming: + self.stemmed = {} + self.docs = [] + with codecs.open(self.texts_file, 'w', encoding='utf-8') as f: + for filename in self.files: + with codecs.open(filename, 'r', encoding='utf-8') as input_file: + text = input_file.read() + text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE) + if self.stemming: + newtext = u'' + for word in text.split(): + if word not in self.stemmed: + self.stemmed[word] = stem(word) + newtext += self.stemmed[word] + u' ' + text = newtext + f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n') + self.docs.append(filename) + if self.dfr: + for doi, text in self._import_dfr(self.dfr_dir): + f.write(u'\t'.join([doi, self.metadata[doi]["label"], text]) + u'\n') + self.docs.append(doi) + with codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'w', encoding='utf-8') as dmap: + dmap.writelines([x + u'\n' for x in self.docs]) + self.doc_count = len(self.docs) + + def _tfidf_filter(self, top_terms = None): + min_df = getattr(self, "min_df", 5) + vocab = {} + inverse_vocab = {} + df = {} + tf = {} + tf_all_docs = {} + tfidf = {} + self.index = {} + + i = 0 + with codecs.open(self.texts_file, 'r', encoding='utf-8') as f: + for line in f: + j = 0 + filename = "" + for part in line.split(u'\t'): + if j == 0: + filename = part + elif j == 2: + tf_for_doc = {} + flen = 0 + for word in part.split(): + if len(word) < 3: + continue + flen += 1 + if word not in vocab: + vocab[word] = i + tf_for_doc[i] = 1 + tf[i] = 0 + df[i] = 1 + i += 1 + else: + index = vocab[word] + if index not in tf_for_doc: + tf_for_doc[index] = 0 + df[index] += 1 + tf_for_doc[index] += 1 + tf_all_docs[filename] = copy.deepcopy(tf_for_doc) + for word_index in tf_for_doc.keys(): + tf_val = float(tf_for_doc[word_index])/flen + if tf_val > tf[word_index]: + tf[word_index] = tf_val + j += 1 + self.tf_all_docs = tf_all_docs + for index in vocab.values(): + tfidf[index] = tf[index] * math.log10(float(self.doc_count)/df[index]) + tfidf_values = tfidf.values() + + if top_terms is None: + top_terms = min(int(len(vocab.keys()) * 0.7), 5000) + min_score = sorted(tfidf_values, reverse=True)[min(top_terms, len(tfidf_values) - 1)] + + os.rename(self.texts_file, self.texts_file + '-pre_tf-idf') + inverse_vocab = {v : k for k, v in vocab.iteritems()} + new_vocab = {} + + with codecs.open(self.texts_file, 'w', encoding='utf-8') as f: + for filename, freqs in tf_all_docs.iteritems(): + text = u'' + flen = 0 + thisfile_vocab = [] + for index, count in freqs.iteritems(): + if tfidf[index] < min_score or df[index] < min_df: + continue + word = inverse_vocab[index] + if word in self.stopwords: + continue + if word not in new_vocab: + new_vocab[word] = 
0 + new_vocab[word] += count + thisfile_vocab.append(word) + text += (word + u' ') * count + flen += count + if flen > 25: + f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n') + for word in thisfile_vocab: + if word not in self.index: + self.index[word] = [] + self.index[word].append(self.metadata[filename]["itemID"]) + else: + self.docs.remove(filename) + with codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'w', encoding='utf-8') as dmap: + dmap.writelines([x + u'\n' for x in self.docs]) + logging.info("tf-idf complete; retained {:} of {:} words; minimum tf-idf score: {:}".format(len(new_vocab.keys()), len(vocab.keys()), min_score)) + + def _setup_mallet_command(self): + self.mallet_cp_dir = os.path.join(self.cwd, "lib", "mallet-2.0.7", "dist") + if self.sys == "Windows": + classpath_sep = u';' + else: + classpath_sep = u':' + + self.mallet_classpath = os.path.join(self.mallet_cp_dir, "mallet.jar") + classpath_sep + os.path.join(self.mallet_cp_dir, "mallet-deps.jar") + + self.mallet = "java -Xmx1g -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8".split(' ') + self.mallet += ["-classpath", self.mallet_classpath] + + self.mallet_out_dir = os.path.join(self.out_dir, self.name + self.collection) + + if not self.dry_run: + if os.path.exists(self.mallet_out_dir): + shutil.rmtree(self.mallet_out_dir) + os.makedirs(self.mallet_out_dir) + + self.progress_filename = os.path.join(self.out_dir, self.name + self.collection + "progress.txt") + self.progress_file = file(self.progress_filename, 'w') + + def _import_texts(self): + + logging.info("copying texts into single file") + self.texts_file = os.path.join(self.mallet_out_dir, self.collection + ".txt") + + if not os.path.exists(self.texts_file): + if not self.dry_run: + self._import_files() + else: + if len(self.extra_args) > 0 and self.dfr: + self._import_dfr_metadata(self.dfr_dir) + self.docs = [] + self.index = {} + with codecs.open(self.texts_file, 'r', 'utf-8') as f: + for line in f: + fields = line.split(u'\t') + filename = fields[0] + self.docs.append(filename) + this_vocab = set() + for word in fields[2].split(): + this_vocab.add(word) + for word in this_vocab: + if word not in self.index: + self.index[word] = [] + self.index[word].append(self.metadata[filename]["itemID"]) + self.doc_count = len(self.docs) + + def _setup_mallet_instances(self, sequence=True, tfidf = False, stemming = True): + self.stemming = stemming + + self._setup_mallet_command() + self._import_texts() + + self.instance_file = os.path.join(self.mallet_out_dir, self.collection + ".mallet") + + logging.info("beginning text import") + + if tfidf and not self.dry_run: + self._tfidf_filter() + + with codecs.open(os.path.join(self.mallet_out_dir, "metadata.json"), 'w', encoding='utf-8') as meta_file: + json.dump(self.metadata, meta_file) + + import_args = self.mallet + ["cc.mallet.classify.tui.Csv2Vectors", + "--remove-stopwords", + "--stoplist-file", self.stoplist, + "--input", self.texts_file, + "--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$", + "--token-regex", '[\p{L}\p{M}]+', + "--output", self.instance_file] + if sequence: + import_args.append("--keep-sequence") + + if not self.dry_run and not os.path.exists(self.instance_file): + import_return = subprocess.call(import_args, stdout=self.progress_file) + + def process(self): + """ + Should be redefined! 
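For reference, the `_tfidf_filter` above scores each vocabulary term by its highest per-document relative frequency times an inverse document frequency, then keeps roughly the top 5,000 terms (or 70% of the vocabulary, whichever is smaller) that also occur in at least `min_df` documents. Written out, the score it computes is the following; this is a reading of the code above, not a separate specification.

```latex
% tf-idf score as implemented in _tfidf_filter: f_{t,d} is the count of term t
% in document d, |d| the token count of d, N the number of documents, and
% df(t) the number of documents containing t.
\[
\operatorname{tf}(t) = \max_{d} \frac{f_{t,d}}{|d|}, \qquad
\operatorname{idf}(t) = \log_{10} \frac{N}{\operatorname{df}(t)}, \qquad
\operatorname{tfidf}(t) = \operatorname{tf}(t) \cdot \operatorname{idf}(t)
\]
```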
+ """ + pass + +if __name__ == "__main__": + try: + processor = Mallet(track_progress = False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_classify-file.py b/chrome/content/papermachines/processors/mallet_classify-file.py new file mode 100755 index 0000000..6321d10 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_classify-file.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback, time, subprocess, codecs, json +import mallet + +class MalletClassifierTest(mallet.Mallet): + """ + Train a classifier + """ + def _basic_params(self): + self.dry_run = False + self.name = "mallet_classify-file" + self.mallet_classifier = self.extra_args[0] + self.dfr = len(self.extra_args) > 1 + if self.dfr: + self.dfr_dir = self.extra_args[1] + self.stemming = True + + def process(self): + + self._setup_mallet_command() + self._import_texts() + + self.classified_filename = os.path.join(self.mallet_out_dir, "classified") + + process_args = self.mallet + ["cc.mallet.classify.tui.Csv2Classify", + "--input", self.texts_file, + "--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$", + "--classifier", self.mallet_classifier, + "--output", self.classified_filename] + + logging.info("begin classifying texts") + + start_time = time.time() +# if not self.dry_run: + classifier_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file) + + finished = "Classifier finished in " + str(time.time() - start_time) + " seconds" + logging.info(finished) + + classifications = {} + for line in codecs.open(self.classified_filename, 'r', encoding='utf-8'): + try: + line_parts = line.split('\t') + filename = line_parts.pop(0) + probs = {y[0]: float(y[1]) for y in self.xpartition(line_parts)} + classifications[filename] = self.argmax(probs) + except: + logging.error(traceback.format_exc()) + + outfile_name = os.path.join(self.out_dir, "mallet_classify-file" + self.collection + ".json") + + with codecs.open(outfile_name, 'w', encoding='utf-8') as f: + json.dump(classifications, f) + + params = {'CLASSIFIED': json.dumps(classifications)} + + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletClassifierTest(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda.py b/chrome/content/papermachines/processors/mallet_lda.py new file mode 100755 index 0000000..257072a --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, tempfile, time, subprocess, math, re, urllib, json, codecs, csv, traceback +import xml.etree.ElementTree as et +from itertools import izip +import mallet + +class MalletLDA(mallet.Mallet): + """ + Perform LDA using MALLET + """ + + def _basic_params(self): + self.categorical = False + self.template_name = "mallet_lda" + self.name = "mallet_lda" + self.topics = 50 + self.dry_run = False + self.dfr = len(self.extra_args) > 0 + if self.dfr: + self.dfr_dir = self.extra_args[0] + + def _stdev(self, X): + n = float(len(X)) + xbar = float(sum(X)) / n + variances = [math.pow(float(x) - xbar, 2.0) for x in X] + return math.sqrt((1.0 / (n - 1.0)) * sum(variances)) + + def _cov(self, X, Y): + n = float(len(X)) + xbar = sum(X) / n + ybar = sum(Y) / n + return (1.0/(n-1.0)) * sum([((x-xbar) * (y-ybar)) for 
x, y in zip(X, Y)]) + + def _find_proportions(self, topics): + self.proportions = {} + for i in range(len(topics)): + self.proportions[i] = float(sum(topics[i])) / len(topics[i]) + + def _find_stdevs(self, topics): + self.stdevs = {} + for i in range(len(topics)): + self.stdevs[i] = self._stdev(topics[i]) + + def _find_correlations(self, topics): + self.correlations = {} + for i in range(len(topics)): + for j in range(i+1,len(topics)): + self.correlations[str(i) + ',' + str(j)] = self._cov(topics[i], topics[j]) / (self.stdevs[i] * self.stdevs[j]) + + def _sort_into_intervals(self): + years = set() + fname_to_year = {} + + fnames = self.metadata.keys() + for filename in fnames: + x = self.metadata[filename] + if x['year'].isdigit() and x['year'] != '0000': + year = int(x['year']) + else: + year = 2012 + years.add(year) + fname_to_year[filename] = year + + years = sorted(years) + self.intervals = years + self.fname_to_interval = fname_to_year + self.fname_to_index = {fname: years.index(year) for fname, year in fname_to_year.iteritems()} + + def process(self): + """ + run LDA, creating an output file divided by time + """ + + if self.named_args is not None: + self.tfidf = self.named_args["tfidf"] + self.min_df = int(self.named_args["min_df"]) + self.stemming = self.named_args["stemming"] + self.topics = int(self.named_args["topics"]) + self.iterations = int(self.named_args["iterations"]) + self.alpha = self.named_args["alpha"] + self.beta = self.named_args["beta"] + self.symmetric_alpha = str(self.named_args["symmetric_alpha"]).lower() + self.optimize_interval = self.named_args["optimize_interval"] + self.burn_in = int(self.named_args["burn_in"]) + else: + self.tfidf = True + self.min_df = 5 + self.topics = 50 + self.stemming = True + self.iterations = 1000 + self.alpha = "50.0" + self.beta = "0.01" + self.burn_in = 200 + self.symmetric_alpha = "true" + self.optimize_interval = 0 + + + self._setup_mallet_instances(sequence=True, tfidf=self.tfidf, stemming=self.stemming) + + self.mallet_files = {'state': os.path.join(self.mallet_out_dir, "topic-state.gz"), + 'doc-topics': os.path.join(self.mallet_out_dir, "doc-topics.txt"), + 'topic-keys': os.path.join(self.mallet_out_dir, "topic-keys.txt"), + 'word-topics': os.path.join(self.mallet_out_dir, "word-topics.txt"), + 'diagnostics-file': os.path.join(self.mallet_out_dir, "diagnostics-file.txt")} + process_args = self.mallet + ["cc.mallet.topics.tui.TopicTrainer", + "--input", self.instance_file, + "--num-topics", str(self.topics), + "--num-iterations", str(self.iterations), + "--optimize-interval", str(self.optimize_interval), + "--optimize-burn-in", str(self.burn_in), + "--use-symmetric-alpha", self.symmetric_alpha, + "--alpha", self.alpha, + "--beta", self.beta, + "--output-state", self.mallet_files['state'], + "--output-doc-topics", self.mallet_files['doc-topics'], + "--output-topic-keys", self.mallet_files['topic-keys'], + "--diagnostics-file", self.mallet_files['diagnostics-file'], + "--word-topic-counts-file", self.mallet_files['word-topics']] + + logging.info("begin LDA") + + start_time = time.time() + if not self.dry_run: + lda_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file) + + logging.info("LDA complete in " + str(time.time() - start_time) + " seconds") + + coherence = {} + wordProbs = {} + allocationRatios = {} + with file(self.mallet_files['diagnostics-file']) as diagnostics: + tree = et.parse(diagnostics) + for elem in tree.iter("topic"): + topic = elem.get("id") + coherence[topic] = 
float(elem.get("coherence")) + allocationRatios[topic] = float(elem.get("allocation_ratio")) + wordProbs[topic] = [] + for word in elem.iter("word"): + wordProbs[topic].append({'text': word.text, 'prob': word.get("prob")}) + + labels = {x[0]: {"label": x[2:5], "fulltopic": wordProbs[x[0]], "allocation_ratio": allocationRatios[x[0]]} for x in [y.split() for y in file(self.mallet_files['topic-keys']).readlines()]} + + weights_by_topic = [] + doc_metadata = {} + + self._sort_into_intervals() + + for i in range(self.topics): + weights_by_topic.append([{'x': str(j), 'y': [], 'topic': i} for j in self.intervals]) + + for line in file(self.mallet_files['doc-topics']): + try: + values = line.split('\t') + + id = values.pop(0) + if id.startswith("#doc"): + continue + filename = self.docs[int(id)] + del values[0] + + itemid = self.metadata[filename]["itemID"] + + doc_metadata[itemid] = {"label": self.metadata[filename]["label"], "title": self.metadata[filename]["title"]} + + freqs = {int(y[0]): float(y[1]) for y in self.xpartition(values)} + main_topic = None + topic_max = 0.0 + for i in freqs.keys(): + weights_by_topic[i][self.fname_to_index[filename]]['y'].append({"itemID": itemid, "ratio": freqs[i]}) + if freqs[i] > topic_max: + main_topic = i + topic_max = freqs[i] + doc_metadata[itemid]["main_topic"] = main_topic + except KeyboardInterrupt: + sys.exit(1) + except: + logging.error(traceback.format_exc()) + + topics_by_year = [] + for topic in weights_by_topic: + topic_sums = [] + for year in topic: + sum = 0.0 + if len(year['y']) != 0: + for doc in year['y']: + sum += doc['ratio'] + topic_sums.append(sum / float(len(year['y']))) + else: + topic_sums.append(0) + topics_by_year.append(topic_sums) + + self.topics_by_year = topics_by_year + self._find_proportions(topics_by_year) + try: + self._find_stdevs(topics_by_year) + self._find_correlations(topics_by_year) + except: + self.stdevs = {} + self.correlations = {} + + self.template_filename = os.path.join(self.cwd, "templates", self.template_name + ".html") + + params = {"CATEGORICAL": "true" if self.categorical else "false", + "TOPICS_DOCS": json.dumps(weights_by_topic, separators=(',',':')), + "DOC_METADATA": json.dumps(doc_metadata, separators=(',',':')), + "TOPIC_LABELS": json.dumps(labels, separators=(',',':')), + "TOPIC_COHERENCE": json.dumps(coherence, separators=(',',':')), + "TOPIC_PROPORTIONS": json.dumps(self.proportions, separators=(',',':')), + "TOPIC_STDEVS": json.dumps(self.stdevs, separators=(',',':')), + "TOPIC_CORRELATIONS": json.dumps(self.correlations, separators=(',',':')) + } + + index = getattr(self, "index", "{}") + params["###INDEX###"] = json.dumps(index, separators=(',',':')) + + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletLDA(track_progress = False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda_MI.py b/chrome/content/papermachines/processors/mallet_lda_MI.py new file mode 100755 index 0000000..550a465 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda_MI.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python2.7 +import sys, os, codecs, logging, traceback, json, math +import mallet_lda + +class MalletLDAMutualInformation(mallet_lda.MalletLDA): + """ + Calculate mutual information for groups of topics + """ + def _basic_params(self): + self.name = "mallet_lda_MI" + self.categorical = True + self.template_name = "mallet_lda_MI" + self.dry_run = False + 
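The `mallet_lda_MI` processor introduced here compares pairs of subcollections: in the `_mutualInformation` method that follows, each time interval contributes one observation per group (the dominant topic for that group in that interval), and the script estimates the mutual information between the two groups' dominant-topic distributions. The quantity it approximates is the usual discrete mutual information, stated here for orientation; the code below remains the authoritative version.

```latex
% Discrete mutual information over dominant topics x, y of two subcollections,
% with p(x,y) the fraction of shared intervals producing that pair and
% p(x), p(y) the corresponding marginals (base-2 log, as in the code).
\[
I(X;Y) \;=\; \sum_{x} \sum_{y} p(x,y) \, \log_{2} \frac{p(x,y)}{p(x)\,p(y)}
\]
```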
self.mallet_out_dir = self.extra_args[0] + + def _mutualInformation(self, X, Y): + probs = {} + marginal_x = {} + marginal_y = {} + + n = 0 + for interval, x_topic_vals in X.iteritems(): + if not interval in Y: + continue + y_topic_vals = Y[interval] + + if len(x_topic_vals.keys()) == 0 or len(y_topic_vals.keys()) == 0: + continue + + # what is being most discussed in each group? + x = self.argmax(x_topic_vals) + y = self.argmax(y_topic_vals) + + if not x in marginal_x: + marginal_x[x] = 0 + marginal_x[x] += 1 + if not y in marginal_y: + marginal_y[y] = 0 + marginal_y[y] += 1 + + if not x in probs: + probs[x] = {} + if not y in probs[x]: + probs[x][y] = 0 + probs[x][y] += 1 + n += 1 + + n_x = float(sum(marginal_x.values())) + for x in marginal_x.keys(): + marginal_x[x] /= n_x + + n_y = float(sum(marginal_y.values())) + for y in marginal_y.keys(): + marginal_y[y] /= n_y + + for x, y_probs in probs.iteritems(): + for y in y_probs.keys(): + probs[x][y] /= float(n) + + mi = 0.0 + for x, y_probs in probs.iteritems(): + for y in y_probs.keys(): + mi += (probs[x][y] * math.log(probs[x][y] / (marginal_x[x] * marginal_y[y]), 2)) + return mi + + def process(self): + self.metadata = json.load(codecs.open(os.path.join(self.mallet_out_dir, "metadata.json"), 'r', encoding='utf-8')) + self.files = self.metadata.keys() + + self.classify_file = os.path.join(self.out_dir, "mallet_classify-file" + self.collection + ".json") + if os.path.exists(self.classify_file): + with codecs.open(self.classify_file, 'r', encoding='utf-8') as f: + self.classified = json.load(f) + for filename in self.files: + label = self.classified.get(filename) + if label is not None: + self.metadata[filename]["label"] = label + + self.labels = set([x["label"] for x in self.metadata.values()]) + + self.doc_topics = os.path.join(self.mallet_out_dir, "doc-topics.txt") + self.docs = [x.strip() for x in codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'r', encoding='utf-8')] + + self._sort_into_intervals() + self.labels_years_topics = {} + + for label in self.labels: + self.labels_years_topics[label] = {i: {} for i in self.intervals} + + for line in file(self.doc_topics): + try: + values = line.split('\t') + + id = values.pop(0) + if id.startswith("#doc"): + continue + filename = self.docs[int(id)] + del values[0] + + itemid = self.metadata[filename]["itemID"] + + label = self.metadata[filename]["label"] + + freqs = {int(y[0]): float(y[1]) for y in self.xpartition(values)} + main_topic = None + topic_max = 0.0 + for i in freqs.keys(): + if freqs[i] > topic_max: + main_topic = i + topic_max = freqs[i] + if main_topic is None: + continue + if not main_topic in self.labels_years_topics[label][self.fname_to_interval[filename]]: + self.labels_years_topics[label][self.fname_to_interval[filename]][main_topic] = 0 + self.labels_years_topics[label][self.fname_to_interval[filename]][main_topic] += 1 + except KeyboardInterrupt: + sys.exit(1) + except: + logging.error(traceback.format_exc()) + + self.MIs = {} + labels = sorted(self.labels) + n = len(labels) + for i in range(n): + for j in range(i+1,n): + X = self.labels_years_topics[labels[i]] + Y = self.labels_years_topics[labels[j]] + + # all_topics = [] + + # for A in [X,Y]: + # this_set = set() + # for interval, topic_vals in A.iteritems(): + # this_set.update([topic for topic, val in topic_vals.iteritems() if val > 0]) + # all_topics.append(this_set) + + # topics_of_interest = all_topics[0].intersection(all_topics[1]) + + result = self._mutualInformation(X, Y) + self.MIs[str(i) + ',' + 
str(j)] = result + + self.nodes = [] + self.edges = [] + node_index = {} + + for key, mi in self.MIs.iteritems(): + a, b = [int(x) for x in key.split(',')] + for i in [a,b]: + if i not in node_index: + node_index[i] = len(self.nodes) + self.nodes.append(labels[i]) + edge = {"source": node_index[a], "target": node_index[b], "mi": mi} + self.edges.append(edge) + + params = {"NODES": json.dumps(self.nodes), "EDGES": json.dumps(self.edges)} + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletLDAMutualInformation(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda_categorical.py b/chrome/content/papermachines/processors/mallet_lda_categorical.py new file mode 100755 index 0000000..1a18502 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda_categorical.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback +import mallet_lda + +class MalletSubcollections(mallet_lda.MalletLDA): + """ + Set topic modeling to categorical view by default + """ + def _basic_params(self): + self.name = "mallet_lda_categorical" + self.categorical = True + self.template_name = "mallet_lda" + self.dry_run = False + self.topics = 50 + self.dfr = len(self.extra_args) > 0 + if self.dfr: + self.dfr_dir = self.extra_args[0] + +if __name__ == "__main__": + try: + processor = MalletSubcollections(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_lda_jstor.py b/chrome/content/papermachines/processors/mallet_lda_jstor.py new file mode 100755 index 0000000..86d74b3 --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_lda_jstor.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback +from zipfile import ZipFile +from lib.merge_jstor import merge_dfr_dirs +import mallet_lda + +class MalletJSTOR(mallet_lda.MalletLDA): + """ + Alias to distinguish mallet queries with JSTOR attached + """ + def _extractAll(self, zipName, dest): + z = ZipFile(zipName) + z.extractall(dest, filter(lambda f: not f.endswith('/'), z.namelist())) + + def _basic_params(self): + self.name = "mallet_lda_jstor" + self.categorical = False + self.template_name = "mallet_lda" + self.dry_run = False + self.topics = 50 + self.dfr = True + dfr_dirs = [] + for dfr_path in self.extra_args: + if dfr_path.lower().endswith(".zip"): + dfr_dir = os.path.basename(dfr_path).replace(".zip","") + this_dfr_dir = os.path.join(self.out_dir, dfr_dir) + self._extractAll(dfr_path, this_dfr_dir) + dfr_dirs.append(this_dfr_dir) + if len(dfr_dirs) > 1: + self.dfr_dir = merge_dfr_dirs(dfr_dirs) + else: + self.dfr_dir = dfr_dirs[0] + + +if __name__ == "__main__": + try: + processor = MalletJSTOR(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/mallet_train-classifier.py b/chrome/content/papermachines/processors/mallet_train-classifier.py new file mode 100755 index 0000000..5bc139a --- /dev/null +++ b/chrome/content/papermachines/processors/mallet_train-classifier.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python2.7 +import sys, os, logging, traceback, time, subprocess +import mallet + +class MalletClassifier(mallet.Mallet): + """ + Train a classifier + """ + def _basic_params(self): + self.dry_run 
= False + self.name = "mallet_train-classifier" + self.dfr = False + + def process(self): + self._setup_mallet_instances(sequence=False) + + self.mallet_output = os.path.join(self.mallet_out_dir, "trained.classifier") + process_args = self.mallet + ["cc.mallet.classify.tui.Vectors2Classify", + "--input", self.instance_file, + "--output-classifier", self.mallet_output, + "--trainer", "NaiveBayes", + "--noOverwriteProgressMessages", "true"] + + logging.info("begin training classifier") + + start_time = time.time() + if not self.dry_run: + classifier_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file) + + finished = "Classifier trained in " + str(time.time() - start_time) + " seconds" + logging.info(finished) + + params = {'DONE': finished} + + self.write_html(params) + +if __name__ == "__main__": + try: + processor = MalletClassifier(track_progress=False) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/phrasenet.py b/chrome/content/papermachines/processors/phrasenet.py new file mode 100755 index 0000000..a5aefd5 --- /dev/null +++ b/chrome/content/papermachines/processors/phrasenet.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python2.7 +import sys, os, json, re, tempfile, cStringIO, logging, traceback, codecs +import textprocessor + +class PhraseNet(textprocessor.TextProcessor): + """ + Generate phrase net + cf. http://www-958.ibm.com/software/data/cognos/manyeyes/page/Phrase_Net.html + """ + + def _basic_params(self): + self.name = "phrasenet" + + def _findPhrases(self, pattern): + self.nodes = {} + self.edges = {} + for filename in self.files: + self.update_progress() + with codecs.open(filename, 'r', encoding='utf8') as f: + logging.info("processing " + filename) + for re_match in pattern.finditer(f.read()): + match = [w.lower() for w in re_match.groups()] + if any([word in self.stopwords for word in match]): + continue + + for word in match: + if not word in self.nodes: + self.nodes[word] = 1 + else: + self.nodes[word] += 1 + + edge = match[0] + self.edgesep + match[1] + if not edge in self.edges: + self.edges[edge] = 1 + else: + self.edges[edge] += 1 + + def process(self): + logging.info("starting to process") + + stopfile = os.path.join(self.cwd, "stopwords.txt") + logging.info("reading stopwords from " + stopfile) + self.stopwords = [line.strip() for line in file(stopfile)] + + self.edgesep = ',' + + wordregex = "(\w+)" + + if len(self.extra_args) > 0: + pattern_str = self.extra_args[0] + else: + pattern_str = "x and y" + + if pattern_str.count('x') == 1 and pattern_str.count('y') == 1: + pattern = pattern_str.replace('x', wordregex) + pattern = pattern.replace('y', wordregex) + else: + pattern = pattern_str + + logging.info("extracting phrases according to pattern "+ repr(pattern)) + + self._findPhrases(re.compile(pattern)) + + logging.info("generating JSON") + + used_nodes = set() + + jsondata = {'nodes': [], 'edges': []} + + top_edges = self.edges.keys() + top_edges.sort(key=lambda x: self.edges[x]) + top_edges.reverse() + top_edges = top_edges[:50] + + for edge in top_edges: + words = edge.split(',') + used_nodes.update(words) + + nodeindex = dict(zip(used_nodes, range(len(used_nodes)))) + + for edge in top_edges: + weight = self.edges[edge] + words = edge.split(',') + jsondata['edges'].append({'source': nodeindex[words[0]], 'target': nodeindex[words[1]], 'weight': weight}) + + for node in used_nodes: + jsondata['nodes'].append({'index': 
nodeindex[node], 'name': node, 'freq': self.nodes[node]}) + + params = {"DATA": json.dumps(jsondata), "PATTERN": json.dumps(pattern_str)} + self.write_html(params) + +if __name__ == "__main__": + try: + processor = PhraseNet(track_progress=True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/textprocessor.py b/chrome/content/papermachines/processors/textprocessor.py new file mode 100755 index 0000000..5ca1cf2 --- /dev/null +++ b/chrome/content/papermachines/processors/textprocessor.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python2.7 +import sys, os, csv, logging, tempfile, traceback, urllib, codecs, json, operator, platform +from itertools import izip + +class TextProcessor: + """ + Base class for text processing in Paper Machines + """ + + def __init__(self, track_progress=True): + self.sys = platform.system() + + # take in command line options + + self.args_filename = sys.argv[1] + self.args_basename = os.path.basename(self.args_filename).replace(".json", "") + + with codecs.open(self.args_filename, 'r', encoding='utf-8') as args_file: + args = json.load(args_file) + + self.cwd = args[0] + csv_file = args[1] + self.out_dir = args[2] + self.collection_name = args[3] + self.extra_args = args[4:] + + if "json" in self.extra_args: + json_starts_at = self.extra_args.index("json") + self.named_args = json.loads(self.extra_args[json_starts_at + 1]) + self.extra_args = self.extra_args[:json_starts_at] + else: + self.named_args = None + + self.collection = os.path.basename(csv_file).replace(".csv","") + + self.require_stopwords = True # load stopwords by default + + # call a function to set processor name, etc. + self._basic_params() + + if self.require_stopwords: + self.stoplist = os.path.join(self.cwd, "stopwords.txt") + self.stopwords = [x.strip() for x in codecs.open(self.stoplist, 'r', encoding='utf-8').readlines()] + + self.out_filename = os.path.join(self.out_dir, self.name + self.collection + "-" + self.args_basename + ".html") + + # logging.basicConfig(filename=os.path.join(self.out_dir, "logs", self.name + ".log"), level=logging.INFO) + logging.basicConfig(filename=self.out_filename.replace(".html", ".log"), filemode='w', level=logging.INFO) + + fh = logging.FileHandler(os.path.join(self.out_dir, "logs", self.name + ".log")) + formatter = logging.Formatter('%(name)s: %(levelname)-8s %(message)s') + fh.setFormatter(formatter) + + logging.getLogger('').addHandler(fh) + + logging.info("command: " + ' '.join([x.replace(' ','''\ ''') for x in sys.argv])) + + self.metadata = {} + + for rowdict in self.parse_csv(csv_file): + filename = rowdict.pop("filename") + self.metadata[filename] = rowdict + + self.files = self.metadata.keys() + if track_progress: + self.track_progress = True + self.progress_initialized = False + + def _basic_params(self): + self.name = "textprocessor" + + def parse_csv(self, filename, dialect=csv.excel, **kwargs): + with file(filename, 'rb') as f: + csv_rows = self.unicode_csv_reader(f, dialect=dialect, **kwargs) + header = csv_rows.next() + for row in csv_rows: + if len(row) > 0: + rowdict = dict(zip(header, row)) + yield rowdict + + def unicode_csv_reader(self, utf8_data, dialect=csv.excel, **kwargs): + csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs) + for row in csv_reader: + yield [unicode(cell, 'utf-8') for cell in row] + + def update_progress(self): + if self.track_progress: + if not self.progress_initialized: + self.progress_filename = 
os.path.join(self.out_dir, self.name + self.collection + "progress.txt") + self.progress_file = file(self.progress_filename, 'w') + self.count = 0 + self.total = len(self.files) + self.progress_initialized = True + + self.count += 1 + self.progress_file.write('<' + str(int(self.count*1000.0/float(self.total))) + '>\n') + self.progress_file.flush() + + def xpartition(self, seq, n=2): + return izip(*(iter(seq),) * n) + + def argmax(self, obj): + if hasattr(obj, "index"): + return obj.index(max(obj)) + elif hasattr(obj, "iteritems"): + return max(obj.iteritems(), key=operator.itemgetter(1))[0] + + def write_html(self, user_params): + logging.info("writing HTML") + params = {"COLLECTION_NAME": self.collection_name, "DOC_METADATA": json.dumps({v["itemID"]: v for k, v in self.metadata.iteritems()})} + params.update(user_params) + try: + template_filename = getattr(self, "template_filename", os.path.join(self.cwd, "templates", self.name + ".html")) + + with codecs.open(self.out_filename, 'w', encoding='utf-8') as outfile: + with codecs.open(template_filename, 'r', encoding='utf-8') as template: + template_str = template.read() + for k, v in params.iteritems(): + template_str = template_str.replace(k, v) + outfile.write(template_str) + except: + logging.error(traceback.format_exc()) + + def process(self): + """ + Example process -- should be overridden + """ + output = file(os.path.join(self.out_dir, self.name + '.txt'), 'w') + for filename in self.files: + output.write(' '.join([filename, self.metadata[filename]]) + '\n') + output.close() + +if __name__ == "__main__": + try: + processor = TextProcessor(track_progress = True) + processor.process() + except: + logging.error(traceback.format_exc()) \ No newline at end of file diff --git a/chrome/content/papermachines/processors/wordcloud.py b/chrome/content/papermachines/processors/wordcloud.py new file mode 100755 index 0000000..163e2a3 --- /dev/null +++ b/chrome/content/papermachines/processors/wordcloud.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python2.7 +import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math +import textprocessor +from lib.porter2 import stem + +class WordCloud(textprocessor.TextProcessor): + """ + Generate word cloud + """ + def _basic_params(self): + self.name = "wordcloud" + self.width = "300" + self.height = "150" + self.fontsize = "[10,32]" + self.n = 50 + self.tfidf_scoring = False + + def _findTfIdfScores(self, scale=True): + self.freqs = {} + self.tf_by_doc = {} + self.max_tf = {} + self.df = {} + for filename in self.files: + with codecs.open(filename, 'r', encoding = 'utf8') as f: + logging.info("processing " + filename) + flen = 0 + self.tf_by_doc[filename] = {} + for line in f: + for stem in self._tokenizeAndStem(line): + flen += 1 + if stem not in self.tf_by_doc[filename]: + self.tf_by_doc[filename][stem] = 0 + if stem not in self.df: + self.df[stem] = 0 + self.df[stem] += 1 + self.tf_by_doc[filename][stem] += 1 + # max_tf_d = max(self.tf_by_doc[filename].values()) + for stem in self.tf_by_doc[filename].keys(): + if stem not in self.freqs: + self.freqs[stem] = 0 + self.freqs[stem] += self.tf_by_doc[filename][stem] + if scale: + self.tf_by_doc[filename][stem] /= float(flen) #max_tf_d + this_tf = self.tf_by_doc[filename][stem] + else: + this_tf = self.tf_by_doc[filename][stem] / float(flen) + + if stem not in self.max_tf or self.max_tf[stem] < this_tf: + self.max_tf[stem] = this_tf + self.update_progress() + n = float(len(self.files)) + self.idf = {term: math.log10(n/df) for term, df in 
diff --git a/chrome/content/papermachines/processors/wordcloud_chronological.py b/chrome/content/papermachines/processors/wordcloud_chronological.py
new file mode 100755
index 0000000..c9d987c
--- /dev/null
+++ b/chrome/content/papermachines/processors/wordcloud_chronological.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python2.7
+import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
+from datetime import datetime, timedelta
+import wordcloud_multiple
+
+class WordCloudChronological(wordcloud_multiple.MultipleWordClouds):
+    """
+    Generate word clouds based on time interval
+    """
+    def _basic_params(self):
+        self.name = "wordcloud_chronological"
+        self.template_filename = os.path.join(self.cwd, "templates", "wordcloud_multiple.html")
+        self.width = "483"
+        self.height = "300"
+        self.fontsize = "[10,32]"
+        self.n = 100
+        self.tfidf_scoring = False
+        self.MWW = False
+        self.dunning = False
+        if len(self.extra_args) == 1:
+            self.interval = self.extra_args[0]
+        elif len(self.extra_args) > 1:
+            if self.extra_args[0] == "tfidf":
+                self.tfidf_scoring = True
+            elif self.extra_args[0] == "mww":
+                self.tfidf_scoring = True
+                self.MWW = True
+            elif self.extra_args[0] == "dunning":
+                self.tfidf_scoring = True
+                self.dunning = True
+            self.interval = self.extra_args[1]
+        else:
+            self.interval = "90"
+
+    def _split_into_labels(self):
+        datestr_to_datetime = {}
+        for filename in self.metadata.keys():
+            date_str = self.metadata[filename]["date"]
+            cleaned_date = date_str[0:10]
+            if "-00" in cleaned_date:
+                cleaned_date = cleaned_date[0:4] + "-01-01"
+            datestr_to_datetime[date_str] = datetime.strptime(cleaned_date, "%Y-%m-%d")
+        datetimes = sorted(datestr_to_datetime.values())
+        start_date = datetimes[0]
+        end_date = datetimes[-1]
+
+        if self.interval.isdigit():
+            interval = timedelta(int(self.interval))
+        else:
+            interval = timedelta(90)
+
+        intervals = []
+        interval_names = []
+        start = end = start_date
+        while end <= end_date:
+            end += interval
+            intervals.append((start,end))
+            interval_names.append(start.isoformat()[0:10].replace('-','/') + '-' + end.isoformat()[0:10].replace('-','/'))
+            start = end
+
+        for filename, metadata in self.metadata.iteritems():
+            label = ""
+            for i in range(len(intervals)):
+                interval = intervals[i]
+                if interval[0] <= datestr_to_datetime[metadata["date"]] < interval[1]:
+                    label = interval_names[i]
+                    break
+            if label not in self.labels:
+                self.labels[label] = set()
+            self.labels[label].add(filename)
+
+if __name__ == "__main__":
+    try:
+        processor = WordCloudChronological(track_progress = True)
+        processor.process()
+    except:
+        logging.error(traceback.format_exc())
\ No newline at end of file
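`_split_into_labels` buckets documents into fixed-width windows (default 90 days) between the earliest and latest dates, normalizing partial dates such as `1851-00-00` to January 1. A simplified sketch of the same bucketing, not part of the patch and with invented function names:

```python
# Illustrative sketch of the date bucketing in
# WordCloudChronological._split_into_labels: contiguous fixed-width windows
# from the earliest to the latest document date.
from datetime import datetime, timedelta

def _clean(date_str):
    d = date_str[:10]
    return d[:4] + "-01-01" if "-00" in d else d

def bucket_by_interval(dates, days=90):
    """dates: iterable of 'YYYY-MM-DD...' strings -> {window label: [date strings]}"""
    parsed = {d: datetime.strptime(_clean(d), "%Y-%m-%d") for d in dates}
    ordered = sorted(parsed.values())
    step = timedelta(days)
    windows, lo = [], ordered[0]
    while lo <= ordered[-1]:
        windows.append((lo, lo + step))
        lo += step
    labels = {}
    for d, dt in parsed.iteritems():
        for lo, hi in windows:
            if lo <= dt < hi:
                labels.setdefault("%s/%s" % (lo.date(), hi.date()), []).append(d)
                break
    return labels
```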
diff --git a/chrome/content/papermachines/processors/wordcloud_large.py b/chrome/content/papermachines/processors/wordcloud_large.py
new file mode 100755
index 0000000..3300d33
--- /dev/null
+++ b/chrome/content/papermachines/processors/wordcloud_large.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python2.7
+import sys, os, logging, traceback, codecs
+import wordcloud
+
+class LargeWordCloud(wordcloud.WordCloud):
+    """
+    Generate large word cloud
+    """
+    def _basic_params(self):
+        self.width = "960"
+        self.height = "500"
+        self.fontsize = "[10,72]"
+        self.name = "wordcloud_large"
+        self.n = 150
+        self.tfidf_scoring = len(self.extra_args) > 0
+
+if __name__ == "__main__":
+    try:
+        processor = LargeWordCloud(track_progress=True)
+        processor.process()
+    except:
+        logging.error(traceback.format_exc())
\ No newline at end of file
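Each processor is launched with one argument, the path to a JSON args file laid out as `[cwd, csv_file, out_dir, collection_name, extra_args...]` (see `TextProcessor.__init__` above). A hypothetical manual invocation, with every path below made up:

```python
# Hypothetical example of driving a processor by hand; the paths are invented.
# The JSON layout mirrors what TextProcessor.__init__ expects.
import json, subprocess, tempfile

args = ["/path/to/processors",              # cwd: where stopwords.txt and templates/ live
        "/path/to/out/my_collection.csv",   # metadata CSV with a 'filename' column
        "/path/to/out",                     # output directory (expects a logs/ subdirectory)
        "My Collection",                    # human-readable collection name
        "tfidf"]                            # any extra arg switches LargeWordCloud to tf-idf scoring

with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
    json.dump(args, f)

subprocess.call(["python2.7", "wordcloud_large.py", f.name])
```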
diff --git a/chrome/content/papermachines/processors/wordcloud_multiple.py b/chrome/content/papermachines/processors/wordcloud_multiple.py
new file mode 100755
index 0000000..e7119a8
--- /dev/null
+++ b/chrome/content/papermachines/processors/wordcloud_multiple.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python2.7
+import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
+import wordcloud
+
+class MultipleWordClouds(wordcloud.WordCloud):
+    """
+    Generate word clouds based on labels
+    """
+    def _basic_params(self):
+        self.name = "wordcloud_multiple"
+        self.width = "300"
+        self.height = "150"
+        self.fontsize = "[10,32]"
+        self.n = 50
+        self.tfidf_scoring = False
+        self.MWW = False
+        self.dunning = False
+        if len(self.extra_args) > 0:
+            if self.extra_args[0] == "tfidf":
+                self.tfidf_scoring = True
+            elif self.extra_args[0] == "mww":
+                self.tfidf_scoring = True
+                self.MWW = True
+            elif self.extra_args[0] == "dunning":
+                self.tfidf_scoring = True
+                self.dunning = True
+
+    def _rank_simple(self, vector):
+        return sorted(range(len(vector)), key=vector.__getitem__)
+
+    def _rank(self, seq):
+        n = len(seq)
+        ivec = self._rank_simple(seq)
+        svec = [seq[rank] for rank in ivec]
+        last_obs = svec[0]
+        new_vec = [1]*n
+        dupe_indices = set()
+
+        for i in xrange(1, n):
+            if svec[i] == last_obs:
+                dupe_indices.add(i-1)
+                dupe_indices.add(i)
+            else:
+                if len(dupe_indices) > 0:
+                    averank = (sum(dupe_indices) / float(len(dupe_indices))) + 1
+                    for j in dupe_indices:
+                        new_vec[j] = averank
+                    new_vec[i] = i + 1
+                    dupe_indices = set()
+                else:
+                    new_vec[i] = i + 1
+            last_obs = svec[i]
+        ranks = {svec[i]: rank for i, rank in enumerate(new_vec)}
+        return ranks
+
+    def _mannWhitney(self, A, B):
+        all_obs = A + B
+        n_a = len(A)
+        n_b = len(B)
+        n_ab = len(all_obs)
+
+        ranks = self._rank(all_obs)
+        t_a = sum([ranks[obs] for obs in A])
+        mu_a = float(n_a * (n_ab + 1)) / 2
+        t_a_max = (n_a * n_b) + (n_a * (len(A) + 1))/2
+        u_a = t_a_max - t_a
+        s = math.sqrt(float(n_a * n_b * (n_ab + 1))/12)
+        if t_a > mu_a:
+            z_a = (t_a - mu_a - 0.5)/ s
+        else:
+            z_a = (t_a - mu_a + 0.5)/ s
+        rho = u_a / (n_a*n_b)
+        return rho
+
+    def _dunning_held_out(self, word, label_set, other_set):
+        sets = [label_set, other_set]
+        count_total = [0.0, 0.0, 0.0, 0.0]
+        for i in range(len(sets)):
+            for filename in sets[i]:
+                if word in self.tf_by_doc[filename]:
+                    count_total[i] += self.tf_by_doc[filename][word]
+                count_total[i + 2] += sum(self.tf_by_doc[filename].values())
+        # count_total[i] = sum([word_weights[word] for filename, word_weights in self.tf_by_doc.iteritems() if filename in sets[i] and word in word_weights])
+        # count_total[i + 2] = sum([sum(word_weights.values()) for filename, word_weights in self.tf_by_doc.iteritems() if filename in sets[i]])
+        a, b, c, d = [float(x) for x in count_total]
+        if any([x == 0 for x in count_total]):
+            return 0
+        E1 = c*((a+b)/(c+d))
+        E2 = d*((a+b)/(c+d))
+        G2 = 2.0*((a*math.log(a/E1)) + (b*math.log(b/E2)))
+        return G2
+
+    def _dunning(self, word, label_set):
+        count_total = [0.0, self.freqs[word], 0.0, self.total_word_count]
+        for filename in label_set:
+            if word in self.tf_by_doc[filename]:
+                count_total[0] += self.tf_by_doc[filename][word]
+            count_total[2] += sum(self.tf_by_doc[filename].values())
+        a, b, c, d = [float(x) for x in count_total]
+        if any([x == 0 for x in count_total]):
+            return 0
+        E1 = c*((a+b)/(c+d))
+        E2 = d*((a+b)/(c+d))
+        G2 = 2.0*((a*math.log(a/E1)) + (b*math.log(b/E2)))
+        return G2
+
+    def _held_out(self, word, label_set, other_set):
+        ranks_by_set = [[],[]]
+        sets = [label_set, other_set]
+        appears_in_label_set = False
+        for i in range(len(sets)):
+            for filename in sets[i]:
+                if word in self.tf_by_doc[filename]:
+                    ranks_by_set[i].append(self.tf_by_doc[filename][word])
+                    if i == 0:
+                        appears_in_label_set = True
+                    # ranks_by_set[i].append(self.tf_by_doc[filename][word] * self.idf[word])
+                else:
+                    ranks_by_set[i].append(0)
+        if not appears_in_label_set:
+            return 0.0
+        else:
+            return self._mannWhitney(ranks_by_set[0], ranks_by_set[1])
+
+    def _split_into_labels(self):
+        for filename, data in self.metadata.iteritems():
+            if data["label"] not in self.labels:
+                self.labels[data["label"]] = set()
+            self.labels[data["label"]].add(filename)
+
+    def process(self):
+        logging.info("splitting into labeled sets")
+        self.labels = {}
+        self._split_into_labels()
+
+        clouds = {}
+
+        all_files = set(self.files)
+        if self.tfidf_scoring:
+            if self.dunning:
+                self._findTfIdfScores(scale=False)
+            else:
+                self._findTfIdfScores()
+            # self.top_tfidf_words = [item["text"] for item in self._topN(self.filtered_freqs, 150)]
+            self.top_tfidf_words = self.filtered_freqs.keys()
+
+        self.label_order = sorted(self.labels.keys())
+        for label in self.label_order:
+            filenames = self.labels[label]
+            logging.info("finding word frequencies for " + str(label))
+            if self.tfidf_scoring and self.MWW:
+                label_set = set(filenames)
+                other_set = all_files - label_set
+                word_rho = {}
+                for word in self.top_tfidf_words:
+                    word_rho[word] = self._held_out(word, label_set, other_set)
+                clouds[label] = self._topN(word_rho)
+            elif self.tfidf_scoring and self.dunning:
+                label_set = set(filenames)
+                other_set = all_files - label_set
+                word_G2 = {}
+                self.total_word_count = sum(self.freqs.values())
+                for word in self.top_tfidf_words:
+                    G2 = self._dunning_held_out(word, label_set, other_set)
+                    # G2 = self._dunning(word, label_set)
+                    if G2 > 15.13: # critical value for p < 0.001
+                        word_G2[word] = G2
+                clouds[label] = self._topN(word_G2)
+
+            elif self.tfidf_scoring:
+                tf_maxes = {}
+                for filename in filenames:
+                    for term, weight in self.tf_by_doc[filename].iteritems():
+                        if term not in tf_maxes:
+                            tf_maxes[term] = weight
+                        else:
+                            if weight > tf_maxes[term]:
+                                tf_maxes[term] = weight
+                tfidf_for_labelset = {term: weight * self.idf[term] for term, weight in tf_maxes.iteritems()}
+                filtered_freqs_for_labelset = {term: freq for term, freq in self.filtered_freqs.iteritems() if term in tfidf_for_labelset}
+                clouds[label] = self._topN(filtered_freqs_for_labelset)
+            else:
+                clouds[label] = self._findWordFreqs(filenames)
+
+        params = {"CLOUDS": json.dumps(clouds),
+                  "ORDER": json.dumps(self.label_order),
+                  "WIDTH": self.width,
+                  "HEIGHT": self.height,
+                  "FONTSIZE": self.fontsize
+        }
+
+        self.write_html(params)
+
+
+if __name__ == "__main__":
+    try:
+        processor = MultipleWordClouds(track_progress = True)
+        processor.process()
+    except:
+        logging.error(traceback.format_exc())
\ No newline at end of file
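In the Dunning branch, `a` and `b` are a term's counts inside and outside the labeled set, `c` and `d` are the total token counts of each side, and a term is kept when G² exceeds 15.13, the critical value for p < 0.001. A standalone restatement of the two-term G² used in `_dunning_held_out` above (not part of the patch; the function name and the example numbers are invented):

```python
# Standalone restatement of the G2 computation in MultipleWordClouds._dunning_held_out:
# a, b = term counts inside / outside the labeled set; c, d = total tokens on each side.
import math

def dunning_g2(a, b, c, d):
    a, b, c, d = float(a), float(b), float(c), float(d)
    if 0 in (a, b, c, d):
        return 0.0
    e1 = c * (a + b) / (c + d)   # expected count inside the labeled set
    e2 = d * (a + b) / (c + d)   # expected count outside it
    return 2.0 * (a * math.log(a / e1) + b * math.log(b / e2))

# e.g. a word seen 40 times in 10,000 tokens of one label but only 5 times in
# 20,000 tokens elsewhere scores well above the 15.13 cutoff:
# dunning_g2(40, 5, 10000, 20000) is roughly 60.5
```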
diff --git a/chrome/locale/en-US/papermachines/papermachines.dtd b/chrome/locale/en-US/papermachines/papermachines.dtd
index 22d7dcc..88e787b 100755
--- a/chrome/locale/en-US/papermachines/papermachines.dtd
+++ b/chrome/locale/en-US/papermachines/papermachines.dtd
@@ -63,6 +63,8 @@
+
+
diff --git a/defaults/preferences/defaults.js b/defaults/preferences/defaults.js
index 1832f34..cbd78ad 100644
--- a/defaults/preferences/defaults.js
+++ b/defaults/preferences/defaults.js
@@ -6,6 +6,8 @@
 pref("extensions.papermachines.general.extract_html", true);
 pref("extensions.papermachines.general.extract_notes", true);
 pref("extensions.papermachines.general.extract_tags", true);
+pref("extensions.papermachines.general.python_exe", "");
+
 pref("extensions.papermachines.import.title", "Issue");
 pref("extensions.papermachines.import.pubtitle", "The Daily News");
 pref("extensions.papermachines.import.guessdate", false);
diff --git a/install.rdf b/install.rdf
index becbabd..d740a31 100755
--- a/install.rdf
+++ b/install.rdf
@@ -5,7 +5,7 @@
     papermachines@chrisjr.org
     Paper Machines
-    0.2.4
+    0.2.5
     A Zotero extension for analysis and visualization in the digital humanities.
     Chris Johnson-Roberson
     http://chrisjr.github.com/papermachines/
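The new `extensions.papermachines.general.python_exe` preference defaults to an empty string and is meant to hold the path of the Python 2.7 interpreter used to run the processors above. A quick sanity check one could run against a configured value (illustrative only, not part of the patch; the function name is invented):

```python
# Illustrative check that a configured python_exe value actually points at a
# Python 2.7 interpreter, which is what the .py processors above require.
import subprocess

def is_python27(python_exe):
    try:
        # Python 2 prints its version to stderr, so fold it into stdout.
        out = subprocess.check_output([python_exe, "--version"], stderr=subprocess.STDOUT)
    except (OSError, subprocess.CalledProcessError):
        return False
    return out.strip().startswith("Python 2.7")
```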