
get rid of .pyw files

commit e4af6e310be114b8c2710aa4fabe4bc65a501eb0 (1 parent: cca635b), authored by @chrisjr
chrome/content/papermachines/processors/dbpedia.pyw (102 lines deleted)
@@ -1,102 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, logging, urllib, urllib2, codecs, traceback
-import textprocessor
-
-
-class DBpedia(textprocessor.TextProcessor):
- """
- annotates texts using DBpedia Spotlight
- """
-
- def _basic_params(self):
- self.name = "dbpedia"
- self.dry_run = False
- self.require_stopwords = False
-
- def _get_annotated(self, text, confidence = 0.2, support = 20):
- values = {'text': text[0:10000].encode('utf-8'),
- 'confidence': confidence,
- 'support': support}
- data = urllib.urlencode(values)
- req = urllib2.Request(self.url, data, self.headers)
- response = urllib2.urlopen(req)
- annotation = response.read()
- encoding = req.headers.get('content-type', 'charset=utf8').split('charset=')[-1]
-
- return unicode(annotation, encoding)
-
- def process(self):
- """
- create JSON files with named entity recognition by DBpedia
- """
-
- logging.info("beginning annotation")
-
- self.url = "http://spotlight.dbpedia.org/rest/annotate"
- self.headers = {'Accept': 'application/json', 'content-type': 'application/x-www-form-urlencoded'}
-
- annotated = {}
- if not self.dry_run:
- for filename in self.files:
- logging.info("processing " + filename)
- self.update_progress()
- try:
- annotated_filename = filename.replace(".txt", "_dbpedia.json")
- if os.path.exists(annotated_filename):
- annotated[annotated_filename] = filename
- else:
- with codecs.open(filename, 'r', encoding='utf-8') as f:
- annotation = self._get_annotated(f.read())
- if len(annotation) > 0:
- annotated[annotated_filename] = filename
- with codecs.open(annotated_filename, 'w', encoding='utf-8') as out:
- out.write(annotation)
- except (KeyboardInterrupt, SystemExit):
- raise
- except:
- logging.error(traceback.format_exc())
- else:
- for filename in self.files:
- annotated_filename = filename.replace(".txt", "_dbpedia.json")
- if os.path.exists(annotated_filename):
- annotated[annotated_filename] = filename
-
- uris_to_docs = {}
- for json_annotation, filename in annotated.iteritems():
- itemID = self.metadata[filename]["itemID"]
- notes = json.load(file(json_annotation))
- entities = notes.get("Resources", [])
- for entity in entities:
- uri = entity.get("@URI", "http://dbpedia.org/resource/")
- if not uri in uris_to_docs:
- uris_to_docs[uri] = {}
- if not itemID in uris_to_docs[uri]:
- uris_to_docs[uri][itemID] = 0
- uris_to_docs[uri][itemID] += 1
-
- filtered_uris = {}
- weights = []
- for uri, items in uris_to_docs.iteritems():
- weights.append(sum(items.values()))
- weights.sort()
- min_weight = weights[max(-100, -len(weights))]
-
- for uri, items in uris_to_docs.iteritems():
- if sum(items.values()) > min_weight:
- filtered_uris[uri] = items
-
-
-
- # params = {"DATA": json.dumps(uris_to_docs)}
- params = {"URIS_TO_DOCS": json.dumps(filtered_uris)}
- self.write_html(params)
-
- logging.info("finished")
-
-
-if __name__ == "__main__":
- try:
- processor = DBpedia(track_progress=True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
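
Note: the whole of _get_annotated above reduces to one HTTP POST against the Spotlight REST endpoint with the text (capped at 10,000 characters), confidence, and support parameters. A minimal Python 3 sketch of that same call, assuming the third-party requests package and that http://spotlight.dbpedia.org/rest/annotate (or a self-hosted equivalent) is reachable; the function and variable names here are illustrative, not from the project:

import requests

SPOTLIGHT_URL = "http://spotlight.dbpedia.org/rest/annotate"  # endpoint used by the processor above

def get_annotated(text, confidence=0.2, support=20):
    """POST an excerpt to DBpedia Spotlight and return the parsed JSON annotation."""
    data = {
        "text": text[:10000],  # same 10,000-character cap as the original
        "confidence": confidence,
        "support": support,
    }
    response = requests.post(SPOTLIGHT_URL, data=data,
                             headers={"Accept": "application/json"}, timeout=30)
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    # Collect entity URIs, mirroring the "Resources" / "@URI" handling above.
    annotation = get_annotated("Boston merchants traded with Lisbon.")
    for resource in annotation.get("Resources", []):
        print(resource.get("@URI"))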
chrome/content/papermachines/processors/extract.pyw (76 lines deleted)
@@ -1,76 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, re, cStringIO, logging, traceback, codecs, urllib, subprocess
-from HTMLParser import HTMLParser
-import textprocessor
-
-class MLStripper(HTMLParser):
- def __init__(self):
- self.reset()
- self.fed = []
- def handle_data(self, d):
- self.fed.append(d)
- def get_data(self):
- return u''.join(self.fed)
-
-def strip_tags(html):
- s = MLStripper()
- s.feed(html)
- return s.get_data()
-
-class Extract(textprocessor.TextProcessor):
- """
- Extract text from PDF or HTML files
- """
-
- def _basic_params(self):
- self.name = "extract"
- self.pdftotext = self.extra_args[0]
-
-
- def process(self):
- logging.info("starting to process")
-
- itemIDs = {}
- for filename in self.files:
- id = self.metadata[filename]["itemID"]
- if id not in itemIDs:
- itemIDs[id] = []
- itemIDs[id].append(filename)
-
- saved = []
- for itemID, filenames in itemIDs.iteritems():
- try:
- out_file = self.metadata[filenames[0]]["outfile"]
- out_dir = os.path.dirname(out_file)
- if not os.path.exists(out_dir):
- os.makedirs(out_dir)
- text = u''
- for filename in filenames:
- if filename.lower().endswith(".txt"):
- text += codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read()
- elif filename.lower().endswith(".html"):
- text += strip_tags(codecs.open(filename, 'r', encoding='utf-8', errors='ignore').read())
- elif filename.lower().endswith(".pdf"):
- import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
- import_proc = subprocess.Popen(import_args, stdout = subprocess.PIPE)
- text += import_proc.communicate()[0].decode('utf-8')
- with codecs.open(out_file, 'w', encoding="utf-8") as f:
- f.write(text)
- saved.append({"itemID": itemID, "collection": self.metadata[filename]["collection"], "filename": out_file})
- self.update_progress()
- except:
- logging.error(traceback.format_exc())
- if self.progress_initialized:
- self.progress_file.write('<1000>\n')
- json_out = os.path.join(self.out_dir, self.name + self.collection + ".json")
- with codecs.open(json_out, 'wb', encoding='utf-8') as f:
- json.dump(saved, f)
- params = {"SUCCEEDED": str(len(saved)), "TOTAL": str(len(itemIDs.keys()))}
- self.write_html(params)
-
-if __name__ == "__main__":
- try:
- processor = Extract(track_progress=True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
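
Note: the MLStripper/strip_tags pair above is the standard HTMLParser recipe for keeping only the text nodes of a document. The same idea in Python 3, where the class lives in html.parser (a sketch, not the project's code):

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """Accumulate only the text content of an HTML document."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.fed = []

    def handle_data(self, data):
        self.fed.append(data)

    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()

print(strip_tags("<p>Selected <b>letters</b>, 1848</p>"))  # -> "Selected letters, 1848"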
chrome/content/papermachines/processors/geoparse.pyw (166 lines deleted)
@@ -1,166 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, logging, traceback, base64, time, codecs
-import cPickle as pickle
-from lib.placemaker import placemaker
-from lib.placemaker.placemaker_api import placemaker_api_key
-import textprocessor
-
-
-class Geoparse(textprocessor.TextProcessor):
- """
- Geoparsing using Yahoo! Placemaker
- """
-
- def _basic_params(self):
- self.name = "geoparse"
- self.dry_run = False
- self.require_stopwords = False
-
- def process(self):
- """
- create a JSON file with geographical data extracted from texts
- """
-
- self.name = "geoparse"
-
- p = placemaker(base64.b64decode(placemaker_api_key))
-
- geo_parsed = {}
- places_by_woeid = {}
-
- for filename in self.files:
- logging.info("processing " + filename)
- self.update_progress()
-
- file_geoparsed = filename.replace(".txt", "_geoparse.json")
-
- if os.path.exists(file_geoparsed):
- geoparse_obj = json.load(file(file_geoparsed))
- elif not self.dry_run:
- geoparse_obj = {'places_by_woeid': {}, 'references': {}}
- try:
- # id = self.metadata[filename]['itemID']
- str_to_parse = self.metadata[filename]['place']
- last_index = len(str_to_parse)
- str_to_parse += codecs.open(filename, 'r', encoding='utf8').read()[0:(48000 - last_index)] #50k characters, shortened by initial place string
-
- city = None
- places = []
-
- p.find_places(str_to_parse.encode('utf8', 'ignore'))
- for woeid, referenced_place in p.referencedPlaces.iteritems():
- place = referenced_place["place"]
- geoparse_obj['places_by_woeid'][woeid] = {'name': place.name, 'type': place.placetype, 'coordinates': [place.centroid.longitude, place.centroid.latitude]}
-
- for reference in referenced_place["references"]:
- if reference.start < last_index:
- city = woeid
- else:
- places.append(woeid)
- if not woeid in geoparse_obj['references']:
- geoparse_obj['references'][woeid] = []
- geoparse_obj['references'][woeid].append((reference.start - last_index, reference.end - last_index))
-
- geoparse_obj['places'] = places
- geoparse_obj['city'] = city
- json.dump(geoparse_obj, file(file_geoparsed, 'w'))
- time.sleep(0.2)
- except (KeyboardInterrupt, SystemExit):
- raise
- except:
- logging.error(traceback.format_exc())
-
- geo_parsed[filename] = geoparse_obj.get('places', [])
- self.metadata[filename]['city'] = geoparse_obj.get('city')
- for woeid, data in geoparse_obj.get('places_by_woeid', {}).iteritems():
- places_by_woeid[int(woeid)] = data
-
- places = {}
- for filename, woeids in geo_parsed.iteritems():
- year = self.metadata[filename]["year"]
- for woeid in woeids:
- if woeid in places_by_woeid:
- if woeid not in places:
- places[woeid] = {}
- places[woeid]["name"] = places_by_woeid[woeid]["name"]
- places[woeid]["type"] = places_by_woeid[woeid]["type"]
- places[woeid]["coordinates"] = places_by_woeid[woeid]["coordinates"]
- places[woeid]["weight"] = {year: 1}
- else:
- if year not in places[woeid]["weight"]:
- places[woeid]["weight"][year] = 1
- else:
- places[woeid]["weight"][year] += 1
-
- self.places_by_woeid = places_by_woeid
- max_country_weight = 0
-
- for place in sorted(places.keys()):
- if places[place]["type"] == "Country":
- country_sum = sum(places[place]["weight"].values())
- if country_sum > max_country_weight:
- max_country_weight = country_sum
-
- placeIDsToNames = {k: v["name"] for k, v in places_by_woeid.iteritems()}
- placeIDsToCoords = {k: v["coordinates"] for k, v in places_by_woeid.iteritems()}
-
- linksByYear = {}
- sources = {}
-
- for filename in self.files:
- if self.metadata[filename].get('city') is None or len(geo_parsed[filename]) < 2:
- continue
- try:
- title = os.path.basename(filename)
- itemID = self.metadata[filename]['itemID']
- year = self.metadata[filename]['year']
- if year not in linksByYear:
- linksByYear[year] = {}
- source = self.metadata[filename]['city']
- if source != None:
- if source not in sources:
- sources[source] = {}
- if year not in sources[source]:
- sources[source][year] = 0
- sources[source][year] += 1
- targets = geo_parsed[filename]
- for target in targets:
- edge = str(source) + ',' + str(target)
- if edge not in linksByYear[year]:
- linksByYear[year][edge] = 0
- linksByYear[year][edge] += 1
- except:
- logging.info(traceback.format_exc())
-
- years = sorted(linksByYear.keys())
- groupedLinksByYear = []
-
- for year in years:
- groupedLinksByYear.append([])
- for edge in linksByYear[year]:
- weight = linksByYear[year][edge]
- source, target = [int(x) for x in edge.split(',')]
- groupedLinksByYear[-1].append({'source': source, 'target': target, 'year': year, 'weight': weight})
-
-
- params = {"PLACEIDSTOCOORDS": json.dumps(placeIDsToCoords),
- "PLACEIDSTONAMES": json.dumps(placeIDsToNames),
- "PLACESMENTIONED": json.dumps({k : v["weight"] for k, v in places.iteritems() if v["type"] != "Country"}),
- "TEXTSFROMPLACE": json.dumps(sources),
- "COUNTRIES": json.dumps({v["name"] : v["weight"] for k, v in places.iteritems() if v["type"] == "Country"}),
- "MAX_COUNTRY_WEIGHT": str(max_country_weight),
- "STARTDATE": str(min([int(x["year"]) for x in self.metadata.values() if x["year"].isdigit() and x["year"] != "0000"])),
- "ENDDATE": str(max([int(x["year"]) for x in self.metadata.values() if x["year"].isdigit()])),
- "LINKS_BY_YEAR": json.dumps(groupedLinksByYear)
- }
- self.write_html(params)
-
- logging.info("finished")
-
-
-if __name__ == "__main__":
- try:
- processor = Geoparse(track_progress=True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
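
Note: Yahoo! Placemaker has since been retired, but the aggregation the processor performs — counting, for each place, how many mentions fall in each year — is independent of the geoparsing service. A compact sketch of that bookkeeping, with invented filenames, WOEIDs, and years (roughly the per-place weight structure serialized above):

from collections import defaultdict

geo_parsed = {"a.txt": [44418, 2459115], "b.txt": [44418]}   # filename -> WOEIDs found
year_of = {"a.txt": "1848", "b.txt": "1849"}                  # filename -> publication year

# woeid -> {year: mention count}
weights = defaultdict(lambda: defaultdict(int))
for filename, woeids in geo_parsed.items():
    for woeid in woeids:
        weights[woeid][year_of[filename]] += 1

print({woeid: dict(counts) for woeid, counts in weights.items()})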
chrome/content/papermachines/processors/mallet.pyw (251 lines deleted)
@@ -1,251 +0,0 @@
-#!/usr/bin/env python
-import sys, os, shutil, logging, tempfile, time, subprocess, math, re, urllib, json, codecs, csv, traceback, platform
-import xml.etree.ElementTree as et
-from lib.porter2 import stem
-import copy
-import textprocessor
-
-class Mallet(textprocessor.TextProcessor):
- """
- Base class for MALLET functionality
- """
-
- def _basic_params(self):
- self.name = "mallet"
-
- def _import_dfr_metadata(self, dfr_dir):
- citation_file = os.path.join(dfr_dir, "citations.CSV")
- citations = {}
- for rowdict in self.parse_csv(citation_file):
- doi = rowdict.pop("id")
- citations[doi] = rowdict
- self.metadata[doi] = {'title': citations[doi].get("title", ""), 'year': citations[doi].get('pubdate','')[0:4], 'label': "jstor", 'itemID': doi}
- return citations
-
- def _import_dfr(self, dfr_dir):
- citations = self._import_dfr_metadata(dfr_dir)
-
- wordcounts_dir = os.path.join(dfr_dir, "wordcounts")
- for doi in citations.keys():
- try:
- this_text = ''
- for rowdict in self.parse_csv(os.path.join(wordcounts_dir, "wordcounts_" + doi.replace('/','_') + ".CSV")):
- word = rowdict["WORDCOUNTS"]
- if word in self.stopwords:
- continue
- if self.stemming:
- prestem = word
- if word not in self.stemmed:
- self.stemmed[prestem] = stem(prestem)
- word = self.stemmed[prestem]
- count = int(rowdict["WEIGHT"])
-
- this_text += (word + u' ') * count
- if len(this_text) < 20:
- continue
- yield doi, this_text
- except:
- logging.error(doi)
- logging.error(traceback.format_exc())
-
- def _import_files(self):
- if self.stemming:
- self.stemmed = {}
- self.docs = []
- with codecs.open(self.texts_file, 'w', encoding='utf-8') as f:
- for filename in self.files:
- with codecs.open(filename, 'r', encoding='utf-8') as input_file:
- text = input_file.read()
- text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
- if self.stemming:
- newtext = u''
- for word in text.split():
- if word not in self.stemmed:
- self.stemmed[word] = stem(word)
- newtext += self.stemmed[word] + u' '
- text = newtext
- f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
- self.docs.append(filename)
- if self.dfr:
- for doi, text in self._import_dfr(self.dfr_dir):
- f.write(u'\t'.join([doi, self.metadata[doi]["label"], text]) + u'\n')
- self.docs.append(doi)
- with codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'w', encoding='utf-8') as dmap:
- dmap.writelines([x + u'\n' for x in self.docs])
- self.doc_count = len(self.docs)
-
- def _tfidf_filter(self, top_terms = None):
- min_df = getattr(self, "min_df", 5)
- vocab = {}
- inverse_vocab = {}
- df = {}
- tf = {}
- tf_all_docs = {}
- tfidf = {}
- self.index = {}
-
- i = 0
- with codecs.open(self.texts_file, 'r', encoding='utf-8') as f:
- for line in f:
- j = 0
- filename = ""
- for part in line.split(u'\t'):
- if j == 0:
- filename = part
- elif j == 2:
- tf_for_doc = {}
- flen = 0
- for word in part.split():
- if len(word) < 3:
- continue
- flen += 1
- if word not in vocab:
- vocab[word] = i
- tf_for_doc[i] = 1
- tf[i] = 0
- df[i] = 1
- i += 1
- else:
- index = vocab[word]
- if index not in tf_for_doc:
- tf_for_doc[index] = 0
- df[index] += 1
- tf_for_doc[index] += 1
- tf_all_docs[filename] = copy.deepcopy(tf_for_doc)
- for word_index in tf_for_doc.keys():
- tf_val = float(tf_for_doc[word_index])/flen
- if tf_val > tf[word_index]:
- tf[word_index] = tf_val
- j += 1
- self.tf_all_docs = tf_all_docs
- for index in vocab.values():
- tfidf[index] = tf[index] * math.log10(float(self.doc_count)/df[index])
- tfidf_values = tfidf.values()
-
- if top_terms is None:
- top_terms = min(int(len(vocab.keys()) * 0.7), 5000)
- min_score = sorted(tfidf_values, reverse=True)[min(top_terms, len(tfidf_values) - 1)]
-
- os.rename(self.texts_file, self.texts_file + '-pre_tf-idf')
- inverse_vocab = {v : k for k, v in vocab.iteritems()}
- new_vocab = {}
-
- with codecs.open(self.texts_file, 'w', encoding='utf-8') as f:
- for filename, freqs in tf_all_docs.iteritems():
- text = u''
- flen = 0
- thisfile_vocab = []
- for index, count in freqs.iteritems():
- if tfidf[index] < min_score or df[index] < min_df:
- continue
- word = inverse_vocab[index]
- if word in self.stopwords:
- continue
- if word not in new_vocab:
- new_vocab[word] = 0
- new_vocab[word] += count
- thisfile_vocab.append(word)
- text += (word + u' ') * count
- flen += count
- if flen > 25:
- f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
- for word in thisfile_vocab:
- if word not in self.index:
- self.index[word] = []
- self.index[word].append(self.metadata[filename]["itemID"])
- else:
- self.docs.remove(filename)
- with codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'w', encoding='utf-8') as dmap:
- dmap.writelines([x + u'\n' for x in self.docs])
- logging.info("tf-idf complete; retained {:} of {:} words; minimum tf-idf score: {:}".format(len(new_vocab.keys()), len(vocab.keys()), min_score))
-
- def _setup_mallet_command(self):
- self.mallet_cp_dir = os.path.join(self.cwd, "lib", "mallet-2.0.7", "dist")
- if self.sys == "Windows":
- classpath_sep = u';'
- else:
- classpath_sep = u':'
-
- self.mallet_classpath = os.path.join(self.mallet_cp_dir, "mallet.jar") + classpath_sep + os.path.join(self.mallet_cp_dir, "mallet-deps.jar")
-
- self.mallet = "java -Xmx1g -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8".split(' ')
- self.mallet += ["-classpath", self.mallet_classpath]
-
- self.mallet_out_dir = os.path.join(self.out_dir, self.name + self.collection)
-
- if not self.dry_run:
- if os.path.exists(self.mallet_out_dir):
- shutil.rmtree(self.mallet_out_dir)
- os.makedirs(self.mallet_out_dir)
-
- self.progress_filename = os.path.join(self.out_dir, self.name + self.collection + "progress.txt")
- self.progress_file = file(self.progress_filename, 'w')
-
- def _import_texts(self):
-
- logging.info("copying texts into single file")
- self.texts_file = os.path.join(self.mallet_out_dir, self.collection + ".txt")
-
- if not os.path.exists(self.texts_file):
- if not self.dry_run:
- self._import_files()
- else:
- if len(self.extra_args) > 0 and self.dfr:
- self._import_dfr_metadata(self.dfr_dir)
- self.docs = []
- self.index = {}
- with codecs.open(self.texts_file, 'r', 'utf-8') as f:
- for line in f:
- fields = line.split(u'\t')
- filename = fields[0]
- self.docs.append(filename)
- this_vocab = set()
- for word in fields[2].split():
- this_vocab.add(word)
- for word in this_vocab:
- if word not in self.index:
- self.index[word] = []
- self.index[word].append(self.metadata[filename]["itemID"])
- self.doc_count = len(self.docs)
-
- def _setup_mallet_instances(self, sequence=True, tfidf = False, stemming = True):
- self.stemming = stemming
-
- self._setup_mallet_command()
- self._import_texts()
-
- self.instance_file = os.path.join(self.mallet_out_dir, self.collection + ".mallet")
-
- logging.info("beginning text import")
-
- if tfidf and not self.dry_run:
- self._tfidf_filter()
-
- with codecs.open(os.path.join(self.mallet_out_dir, "metadata.json"), 'w', encoding='utf-8') as meta_file:
- json.dump(self.metadata, meta_file)
-
- import_args = self.mallet + ["cc.mallet.classify.tui.Csv2Vectors",
- "--remove-stopwords",
- "--stoplist-file", self.stoplist,
- "--input", self.texts_file,
- "--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$",
- "--token-regex", '[\p{L}\p{M}]+',
- "--output", self.instance_file]
- if sequence:
- import_args.append("--keep-sequence")
-
- if not self.dry_run and not os.path.exists(self.instance_file):
- import_return = subprocess.call(import_args, stdout=self.progress_file)
-
- def process(self):
- """
- Should be redefined!
- """
- pass
-
-if __name__ == "__main__":
- try:
- processor = Mallet(track_progress = False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
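
Note: the core of _tfidf_filter above is a max-tf × idf score — each term's highest relative frequency in any single document, times log10(N / document frequency) — with low scorers and rare terms dropped. A self-contained sketch of that scoring on pre-tokenized documents (the data is illustrative):

import math

def tfidf_scores(docs):
    """docs: list of token lists. Return max-tf * idf per term, as in _tfidf_filter."""
    max_tf, df = {}, {}
    for tokens in docs:
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        for token, count in counts.items():
            tf = count / float(len(tokens))          # relative frequency in this document
            max_tf[token] = max(max_tf.get(token, 0.0), tf)
            df[token] = df.get(token, 0) + 1         # document frequency
    n_docs = len(docs)
    return {t: max_tf[t] * math.log10(n_docs / float(df[t])) for t in max_tf}

scores = tfidf_scores([["trade", "sugar", "trade"], ["sugar", "tea"], ["tea", "tea"]])
print(sorted(scores, key=scores.get, reverse=True))  # most distinctive terms first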
chrome/content/papermachines/processors/mallet_classify-file.pyw (64 lines deleted)
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-import sys, os, logging, traceback, time, subprocess, codecs, json
-import mallet
-
-class MalletClassifierTest(mallet.Mallet):
- """
- Train a classifier
- """
- def _basic_params(self):
- self.dry_run = False
- self.name = "mallet_classify-file"
- self.mallet_classifier = self.extra_args[0]
- self.dfr = len(self.extra_args) > 1
- if self.dfr:
- self.dfr_dir = self.extra_args[1]
- self.stemming = True
-
- def process(self):
-
- self._setup_mallet_command()
- self._import_texts()
-
- self.classified_filename = os.path.join(self.mallet_out_dir, "classified")
-
- process_args = self.mallet + ["cc.mallet.classify.tui.Csv2Classify",
- "--input", self.texts_file,
- "--line-regex", "^([^\\t]*)[\\t]([^\\t]*)[\\t](.*)$",
- "--classifier", self.mallet_classifier,
- "--output", self.classified_filename]
-
- logging.info("begin classifying texts")
-
- start_time = time.time()
-# if not self.dry_run:
- classifier_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file)
-
- finished = "Classifier finished in " + str(time.time() - start_time) + " seconds"
- logging.info(finished)
-
- classifications = {}
- for line in codecs.open(self.classified_filename, 'r', encoding='utf-8'):
- try:
- line_parts = line.split('\t')
- filename = line_parts.pop(0)
- probs = {y[0]: float(y[1]) for y in self.xpartition(line_parts)}
- classifications[filename] = self.argmax(probs)
- except:
- logging.error(traceback.format_exc())
-
- outfile_name = os.path.join(self.out_dir, "mallet_classify-file" + self.collection + ".json")
-
- with codecs.open(outfile_name, 'w', encoding='utf-8') as f:
- json.dump(classifications, f)
-
- params = {'CLASSIFIED': json.dumps(classifications)}
-
- self.write_html(params)
-
-if __name__ == "__main__":
- try:
- processor = MalletClassifierTest(track_progress=False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
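
Note: each line of the Csv2Classify output read above is a document name followed by alternating label/probability fields, and the processor keeps the argmax label. A small sketch of that parsing step (the sample line is invented):

def parse_classification(line):
    """Return (document name, most probable label) for one line of classifier output."""
    parts = line.strip().split('\t')
    name, fields = parts[0], parts[1:]
    probs = {fields[i]: float(fields[i + 1]) for i in range(0, len(fields), 2)}
    return name, max(probs, key=probs.get)

print(parse_classification("letters_1848.txt\tpersonal\t0.91\tofficial\t0.09"))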
chrome/content/papermachines/processors/mallet_lda.pyw (223 lines deleted)
@@ -1,223 +0,0 @@
-#!/usr/bin/env python
-import sys, os, logging, tempfile, time, subprocess, math, re, urllib, json, codecs, csv, traceback
-import xml.etree.ElementTree as et
-from itertools import izip
-import mallet
-
-class MalletLDA(mallet.Mallet):
- """
- Perform LDA using MALLET
- """
-
- def _basic_params(self):
- self.categorical = False
- self.template_name = "mallet_lda"
- self.name = "mallet_lda"
- self.topics = 50
- self.dry_run = False
- self.dfr = len(self.extra_args) > 0
- if self.dfr:
- self.dfr_dir = self.extra_args[0]
-
- def _stdev(self, X):
- n = float(len(X))
- xbar = float(sum(X)) / n
- variances = [math.pow(float(x) - xbar, 2.0) for x in X]
- return math.sqrt((1.0 / (n - 1.0)) * sum(variances))
-
- def _cov(self, X, Y):
- n = float(len(X))
- xbar = sum(X) / n
- ybar = sum(Y) / n
- return (1.0/(n-1.0)) * sum([((x-xbar) * (y-ybar)) for x, y in zip(X, Y)])
-
- def _find_proportions(self, topics):
- self.proportions = {}
- for i in range(len(topics)):
- self.proportions[i] = float(sum(topics[i])) / len(topics[i])
-
- def _find_stdevs(self, topics):
- self.stdevs = {}
- for i in range(len(topics)):
- self.stdevs[i] = self._stdev(topics[i])
-
- def _find_correlations(self, topics):
- self.correlations = {}
- for i in range(len(topics)):
- for j in range(i+1,len(topics)):
- self.correlations[str(i) + ',' + str(j)] = self._cov(topics[i], topics[j]) / (self.stdevs[i] * self.stdevs[j])
-
- def _sort_into_intervals(self):
- years = set()
- fname_to_year = {}
-
- fnames = self.metadata.keys()
- for filename in fnames:
- x = self.metadata[filename]
- if x['year'].isdigit() and x['year'] != '0000':
- year = int(x['year'])
- else:
- year = 2012
- years.add(year)
- fname_to_year[filename] = year
-
- years = sorted(years)
- self.intervals = years
- self.fname_to_interval = fname_to_year
- self.fname_to_index = {fname: years.index(year) for fname, year in fname_to_year.iteritems()}
-
- def process(self):
- """
- run LDA, creating an output file divided by time
- """
-
- if self.named_args is not None:
- self.tfidf = self.named_args["tfidf"]
- self.min_df = int(self.named_args["min_df"])
- self.stemming = self.named_args["stemming"]
- self.topics = int(self.named_args["topics"])
- self.iterations = int(self.named_args["iterations"])
- self.alpha = self.named_args["alpha"]
- self.beta = self.named_args["beta"]
- self.symmetric_alpha = str(self.named_args["symmetric_alpha"]).lower()
- self.optimize_interval = self.named_args["optimize_interval"]
- self.burn_in = int(self.named_args["burn_in"])
- else:
- self.tfidf = True
- self.min_df = 5
- self.topics = 50
- self.stemming = True
- self.iterations = 1000
- self.alpha = "50.0"
- self.beta = "0.01"
- self.burn_in = 200
- self.symmetric_alpha = "true"
- self.optimize_interval = 0
-
-
- self._setup_mallet_instances(sequence=True, tfidf=self.tfidf, stemming=self.stemming)
-
- self.mallet_files = {'state': os.path.join(self.mallet_out_dir, "topic-state.gz"),
- 'doc-topics': os.path.join(self.mallet_out_dir, "doc-topics.txt"),
- 'topic-keys': os.path.join(self.mallet_out_dir, "topic-keys.txt"),
- 'word-topics': os.path.join(self.mallet_out_dir, "word-topics.txt"),
- 'diagnostics-file': os.path.join(self.mallet_out_dir, "diagnostics-file.txt")}
- process_args = self.mallet + ["cc.mallet.topics.tui.TopicTrainer",
- "--input", self.instance_file,
- "--num-topics", str(self.topics),
- "--num-iterations", str(self.iterations),
- "--optimize-interval", str(self.optimize_interval),
- "--optimize-burn-in", str(self.burn_in),
- "--use-symmetric-alpha", self.symmetric_alpha,
- "--alpha", self.alpha,
- "--beta", self.beta,
- "--output-state", self.mallet_files['state'],
- "--output-doc-topics", self.mallet_files['doc-topics'],
- "--output-topic-keys", self.mallet_files['topic-keys'],
- "--diagnostics-file", self.mallet_files['diagnostics-file'],
- "--word-topic-counts-file", self.mallet_files['word-topics']]
-
- logging.info("begin LDA")
-
- start_time = time.time()
- if not self.dry_run:
- lda_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file)
-
- logging.info("LDA complete in " + str(time.time() - start_time) + " seconds")
-
- coherence = {}
- wordProbs = {}
- allocationRatios = {}
- with file(self.mallet_files['diagnostics-file']) as diagnostics:
- tree = et.parse(diagnostics)
- for elem in tree.iter("topic"):
- topic = elem.get("id")
- coherence[topic] = float(elem.get("coherence"))
- allocationRatios[topic] = float(elem.get("allocation_ratio"))
- wordProbs[topic] = []
- for word in elem.iter("word"):
- wordProbs[topic].append({'text': word.text, 'prob': word.get("prob")})
-
- labels = {x[0]: {"label": x[2:5], "fulltopic": wordProbs[x[0]], "allocation_ratio": allocationRatios[x[0]]} for x in [y.split() for y in file(self.mallet_files['topic-keys']).readlines()]}
-
- weights_by_topic = []
- doc_metadata = {}
-
- self._sort_into_intervals()
-
- for i in range(self.topics):
- weights_by_topic.append([{'x': str(j), 'y': [], 'topic': i} for j in self.intervals])
-
- for line in file(self.mallet_files['doc-topics']):
- try:
- values = line.split('\t')
-
- id = values.pop(0)
- if id.startswith("#doc"):
- continue
- filename = self.docs[int(id)]
- del values[0]
-
- itemid = self.metadata[filename]["itemID"]
-
- doc_metadata[itemid] = {"label": self.metadata[filename]["label"], "title": self.metadata[filename]["title"]}
-
- freqs = {int(y[0]): float(y[1]) for y in self.xpartition(values)}
- main_topic = None
- topic_max = 0.0
- for i in freqs.keys():
- weights_by_topic[i][self.fname_to_index[filename]]['y'].append({"itemID": itemid, "ratio": freqs[i]})
- if freqs[i] > topic_max:
- main_topic = i
- topic_max = freqs[i]
- doc_metadata[itemid]["main_topic"] = main_topic
- except KeyboardInterrupt:
- sys.exit(1)
- except:
- logging.error(traceback.format_exc())
-
- topics_by_year = []
- for topic in weights_by_topic:
- topic_sums = []
- for year in topic:
- sum = 0.0
- if len(year['y']) != 0:
- for doc in year['y']:
- sum += doc['ratio']
- topic_sums.append(sum / float(len(year['y'])))
- else:
- topic_sums.append(0)
- topics_by_year.append(topic_sums)
-
- self.topics_by_year = topics_by_year
- self._find_proportions(topics_by_year)
- try:
- self._find_stdevs(topics_by_year)
- self._find_correlations(topics_by_year)
- except:
- self.stdevs = {}
- self.correlations = {}
-
- self.template_filename = os.path.join(self.cwd, "templates", self.template_name + ".html")
-
- params = {"CATEGORICAL": "true" if self.categorical else "false",
- "TOPICS_DOCS": json.dumps(weights_by_topic, separators=(',',':')),
- "DOC_METADATA": json.dumps(doc_metadata, separators=(',',':')),
- "TOPIC_LABELS": json.dumps(labels, separators=(',',':')),
- "TOPIC_COHERENCE": json.dumps(coherence, separators=(',',':')),
- "TOPIC_PROPORTIONS": json.dumps(self.proportions, separators=(',',':')),
- "TOPIC_STDEVS": json.dumps(self.stdevs, separators=(',',':')),
- "TOPIC_CORRELATIONS": json.dumps(self.correlations, separators=(',',':'))
- }
-
- index = getattr(self, "index", "{}")
- params["###INDEX###"] = json.dumps(index, separators=(',',':'))
-
- self.write_html(params)
-
-if __name__ == "__main__":
- try:
- processor = MalletLDA(track_progress = False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
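
Note: _find_stdevs and _find_correlations above amount to a Pearson correlation between the per-interval weight series of two topics, built from sample standard deviation and covariance. The same arithmetic in isolation (a sketch):

import math

def stdev(xs):
    mean = sum(xs) / float(len(xs))
    return math.sqrt(sum((x - mean) ** 2 for x in xs) / (len(xs) - 1.0))

def cov(xs, ys):
    xbar, ybar = sum(xs) / float(len(xs)), sum(ys) / float(len(ys))
    return sum((x - xbar) * (y - ybar) for x, y in zip(xs, ys)) / (len(xs) - 1.0)

def correlation(xs, ys):
    """Pearson correlation, as stored in self.correlations above."""
    return cov(xs, ys) / (stdev(xs) * stdev(ys))

print(round(correlation([0.1, 0.2, 0.3, 0.4], [0.2, 0.4, 0.6, 0.8]), 3))  # proportional series -> 1.0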
chrome/content/papermachines/processors/mallet_lda_MI.pyw (163 lines deleted)
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-import sys, os, codecs, logging, traceback, json, math
-import mallet_lda
-
-class MalletLDAMutualInformation(mallet_lda.MalletLDA):
- """
- Calculate mutual information for groups of topics
- """
- def _basic_params(self):
- self.name = "mallet_lda_MI"
- self.categorical = True
- self.template_name = "mallet_lda_MI"
- self.dry_run = False
- self.mallet_out_dir = self.extra_args[0]
-
- def _mutualInformation(self, X, Y):
- probs = {}
- marginal_x = {}
- marginal_y = {}
-
- n = 0
- for interval, x_topic_vals in X.iteritems():
- if not interval in Y:
- continue
- y_topic_vals = Y[interval]
-
- if len(x_topic_vals.keys()) == 0 or len(y_topic_vals.keys()) == 0:
- continue
-
- # what is being most discussed in each group?
- x = self.argmax(x_topic_vals)
- y = self.argmax(y_topic_vals)
-
- if not x in marginal_x:
- marginal_x[x] = 0
- marginal_x[x] += 1
- if not y in marginal_y:
- marginal_y[y] = 0
- marginal_y[y] += 1
-
- if not x in probs:
- probs[x] = {}
- if not y in probs[x]:
- probs[x][y] = 0
- probs[x][y] += 1
- n += 1
-
- n_x = float(sum(marginal_x.values()))
- for x in marginal_x.keys():
- marginal_x[x] /= n_x
-
- n_y = float(sum(marginal_y.values()))
- for y in marginal_y.keys():
- marginal_y[y] /= n_y
-
- for x, y_probs in probs.iteritems():
- for y in y_probs.keys():
- probs[x][y] /= float(n)
-
- mi = 0.0
- for x, y_probs in probs.iteritems():
- for y in y_probs.keys():
- mi += (probs[x][y] * math.log(probs[x][y] / (marginal_x[x] * marginal_y[y]), 2))
- return mi
-
- def process(self):
- self.metadata = json.load(codecs.open(os.path.join(self.mallet_out_dir, "metadata.json"), 'r', encoding='utf-8'))
- self.files = self.metadata.keys()
-
- self.classify_file = os.path.join(self.out_dir, "mallet_classify-file" + self.collection + ".json")
- if os.path.exists(self.classify_file):
- with codecs.open(self.classify_file, 'r', encoding='utf-8') as f:
- self.classified = json.load(f)
- for filename in self.files:
- label = self.classified.get(filename)
- if label is not None:
- self.metadata[filename]["label"] = label
-
- self.labels = set([x["label"] for x in self.metadata.values()])
-
- self.doc_topics = os.path.join(self.mallet_out_dir, "doc-topics.txt")
- self.docs = [x.strip() for x in codecs.open(os.path.join(self.mallet_out_dir, "dmap"), 'r', encoding='utf-8')]
-
- self._sort_into_intervals()
- self.labels_years_topics = {}
-
- for label in self.labels:
- self.labels_years_topics[label] = {i: {} for i in self.intervals}
-
- for line in file(self.doc_topics):
- try:
- values = line.split('\t')
-
- id = values.pop(0)
- if id.startswith("#doc"):
- continue
- filename = self.docs[int(id)]
- del values[0]
-
- itemid = self.metadata[filename]["itemID"]
-
- label = self.metadata[filename]["label"]
-
- freqs = {int(y[0]): float(y[1]) for y in self.xpartition(values)}
- main_topic = None
- topic_max = 0.0
- for i in freqs.keys():
- if freqs[i] > topic_max:
- main_topic = i
- topic_max = freqs[i]
- if main_topic is None:
- continue
- if not main_topic in self.labels_years_topics[label][self.fname_to_interval[filename]]:
- self.labels_years_topics[label][self.fname_to_interval[filename]][main_topic] = 0
- self.labels_years_topics[label][self.fname_to_interval[filename]][main_topic] += 1
- except KeyboardInterrupt:
- sys.exit(1)
- except:
- logging.error(traceback.format_exc())
-
- self.MIs = {}
- labels = sorted(self.labels)
- n = len(labels)
- for i in range(n):
- for j in range(i+1,n):
- X = self.labels_years_topics[labels[i]]
- Y = self.labels_years_topics[labels[j]]
-
- # all_topics = []
-
- # for A in [X,Y]:
- # this_set = set()
- # for interval, topic_vals in A.iteritems():
- # this_set.update([topic for topic, val in topic_vals.iteritems() if val > 0])
- # all_topics.append(this_set)
-
- # topics_of_interest = all_topics[0].intersection(all_topics[1])
-
- result = self._mutualInformation(X, Y)
- self.MIs[str(i) + ',' + str(j)] = result
-
- self.nodes = []
- self.edges = []
- node_index = {}
-
- for key, mi in self.MIs.iteritems():
- a, b = [int(x) for x in key.split(',')]
- for i in [a,b]:
- if i not in node_index:
- node_index[i] = len(self.nodes)
- self.nodes.append(labels[i])
- edge = {"source": node_index[a], "target": node_index[b], "mi": mi}
- self.edges.append(edge)
-
- params = {"NODES": json.dumps(self.nodes), "EDGES": json.dumps(self.edges)}
- self.write_html(params)
-
-if __name__ == "__main__":
- try:
- processor = MalletLDAMutualInformation(track_progress=False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
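
Note: _mutualInformation above estimates MI between two subcollections from the joint distribution of their dominant topics per interval. Stripped of the bookkeeping, it is the usual plug-in estimator; a sketch over paired observations (the data is invented):

import math

def mutual_information(pairs):
    """Mutual information, in bits, of the empirical joint distribution of (x, y) pairs."""
    n = float(len(pairs))
    joint, px, py = {}, {}, {}
    for x, y in pairs:
        joint[(x, y)] = joint.get((x, y), 0) + 1
        px[x] = px.get(x, 0) + 1
        py[y] = py.get(y, 0) + 1
    mi = 0.0
    for (x, y), count in joint.items():
        p_xy = count / n
        mi += p_xy * math.log(p_xy / ((px[x] / n) * (py[y] / n)), 2)
    return mi

print(mutual_information([(0, 0), (0, 0), (1, 1), (1, 1)]))  # identical sequences -> 1.0 bit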
chrome/content/papermachines/processors/mallet_lda_categorical.pyw (24 lines deleted)
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-import sys, os, logging, traceback
-import mallet_lda
-
-class MalletSubcollections(mallet_lda.MalletLDA):
- """
- Set topic modeling to categorical view by default
- """
- def _basic_params(self):
- self.name = "mallet_lda_categorical"
- self.categorical = True
- self.template_name = "mallet_lda"
- self.dry_run = False
- self.topics = 50
- self.dfr = len(self.extra_args) > 0
- if self.dfr:
- self.dfr_dir = self.extra_args[0]
-
-if __name__ == "__main__":
- try:
- processor = MalletSubcollections(track_progress=False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
chrome/content/papermachines/processors/mallet_lda_jstor.pyw (40 lines deleted)
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-import sys, os, logging, traceback
-from zipfile import ZipFile
-from lib.merge_jstor import merge_dfr_dirs
-import mallet_lda
-
-class MalletJSTOR(mallet_lda.MalletLDA):
- """
- Alias to distinguish mallet queries with JSTOR attached
- """
- def _extractAll(self, zipName, dest):
- z = ZipFile(zipName)
- z.extractall(dest, filter(lambda f: not f.endswith('/'), z.namelist()))
-
- def _basic_params(self):
- self.name = "mallet_lda_jstor"
- self.categorical = False
- self.template_name = "mallet_lda"
- self.dry_run = False
- self.topics = 50
- self.dfr = True
- dfr_dirs = []
- for dfr_path in self.extra_args:
- if dfr_path.lower().endswith(".zip"):
- dfr_dir = os.path.basename(dfr_path).replace(".zip","")
- this_dfr_dir = os.path.join(self.out_dir, dfr_dir)
- self._extractAll(dfr_path, this_dfr_dir)
- dfr_dirs.append(this_dfr_dir)
- if len(dfr_dirs) > 1:
- self.dfr_dir = merge_dfr_dirs(dfr_dirs)
- else:
- self.dfr_dir = dfr_dirs[0]
-
-
-if __name__ == "__main__":
- try:
- processor = MalletJSTOR(track_progress=False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
chrome/content/papermachines/processors/mallet_train-classifier.pyw (42 lines deleted)
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-import sys, os, logging, traceback, time, subprocess
-import mallet
-
-class MalletClassifier(mallet.Mallet):
- """
- Train a classifier
- """
- def _basic_params(self):
- self.dry_run = False
- self.name = "mallet_train-classifier"
- self.dfr = False
-
- def process(self):
- self._setup_mallet_instances(sequence=False)
-
- self.mallet_output = os.path.join(self.mallet_out_dir, "trained.classifier")
- process_args = self.mallet + ["cc.mallet.classify.tui.Vectors2Classify",
- "--input", self.instance_file,
- "--output-classifier", self.mallet_output,
- "--trainer", "NaiveBayes",
- "--noOverwriteProgressMessages", "true"]
-
- logging.info("begin training classifier")
-
- start_time = time.time()
- if not self.dry_run:
- classifier_return = subprocess.call(process_args, stdout=self.progress_file, stderr=self.progress_file)
-
- finished = "Classifier trained in " + str(time.time() - start_time) + " seconds"
- logging.info(finished)
-
- params = {'DONE': finished}
-
- self.write_html(params)
-
-if __name__ == "__main__":
- try:
- processor = MalletClassifier(track_progress=False)
- processor.process()
- except:
- logging.error(traceback.format_exc())
chrome/content/papermachines/processors/phrasenet.pyw (97 lines deleted)
@@ -1,97 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, re, tempfile, cStringIO, logging, traceback, codecs
-import textprocessor
-
-class PhraseNet(textprocessor.TextProcessor):
- """
- Generate phrase net
- cf. http://www-958.ibm.com/software/data/cognos/manyeyes/page/Phrase_Net.html
- """
-
- def _basic_params(self):
- self.name = "phrasenet"
-
- def _findPhrases(self, pattern):
- self.nodes = {}
- self.edges = {}
- for filename in self.files:
- self.update_progress()
- with codecs.open(filename, 'r', encoding='utf8') as f:
- logging.info("processing " + filename)
- for re_match in pattern.finditer(f.read()):
- match = [w.lower() for w in re_match.groups()]
- if any([word in self.stopwords for word in match]):
- continue
-
- for word in match:
- if not word in self.nodes:
- self.nodes[word] = 1
- else:
- self.nodes[word] += 1
-
- edge = match[0] + self.edgesep + match[1]
- if not edge in self.edges:
- self.edges[edge] = 1
- else:
- self.edges[edge] += 1
-
- def process(self):
- logging.info("starting to process")
-
- stopfile = os.path.join(self.cwd, "stopwords.txt")
- logging.info("reading stopwords from " + stopfile)
- self.stopwords = [line.strip() for line in file(stopfile)]
-
- self.edgesep = ','
-
- wordregex = "(\w+)"
-
- if len(self.extra_args) > 0:
- pattern_str = self.extra_args[0]
- else:
- pattern_str = "x and y"
-
- if pattern_str.count('x') == 1 and pattern_str.count('y') == 1:
- pattern = pattern_str.replace('x', wordregex)
- pattern = pattern.replace('y', wordregex)
- else:
- pattern = pattern_str
-
- logging.info("extracting phrases according to pattern "+ repr(pattern))
-
- self._findPhrases(re.compile(pattern))
-
- logging.info("generating JSON")
-
- used_nodes = set()
-
- jsondata = {'nodes': [], 'edges': []}
-
- top_edges = self.edges.keys()
- top_edges.sort(key=lambda x: self.edges[x])
- top_edges.reverse()
- top_edges = top_edges[:50]
-
- for edge in top_edges:
- words = edge.split(',')
- used_nodes.update(words)
-
- nodeindex = dict(zip(used_nodes, range(len(used_nodes))))
-
- for edge in top_edges:
- weight = self.edges[edge]
- words = edge.split(',')
- jsondata['edges'].append({'source': nodeindex[words[0]], 'target': nodeindex[words[1]], 'weight': weight})
-
- for node in used_nodes:
- jsondata['nodes'].append({'index': nodeindex[node], 'name': node, 'freq': self.nodes[node]})
-
- params = {"DATA": json.dumps(jsondata), "PATTERN": json.dumps(pattern_str)}
- self.write_html(params)
-
-if __name__ == "__main__":
- try:
- processor = PhraseNet(track_progress=True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
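
Note: with the default pattern, "x and y" becomes the regex (\w+) and (\w+), and the processor simply counts how often each lower-cased word pair matches. A condensed sketch of that extraction, with stopword filtering omitted for brevity (the sample text is invented):

import re
from collections import Counter

pattern = re.compile(r"(\w+) and (\w+)")   # the default "x and y" pattern above
text = "Bread and butter; thunder and lightning; bread and butter."

edges = Counter((a.lower(), b.lower()) for a, b in pattern.findall(text))
print(edges.most_common())   # [(('bread', 'butter'), 2), (('thunder', 'lightning'), 1)]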
chrome/content/papermachines/processors/textprocessor.pyw (138 lines deleted)
@@ -1,138 +0,0 @@
-#!/usr/bin/env python
-import sys, os, csv, logging, tempfile, traceback, urllib, codecs, json, operator, platform
-from itertools import izip
-
-class TextProcessor:
- """
- Base class for text processing in Paper Machines
- """
-
- def __init__(self, track_progress=True):
- self.sys = platform.system()
-
- # take in command line options
-
- self.args_filename = sys.argv[1]
- self.args_basename = os.path.basename(self.args_filename).replace(".json", "")
-
- with codecs.open(self.args_filename, 'r', encoding='utf-8') as args_file:
- args = json.load(args_file)
-
- self.cwd = args[0]
- csv_file = args[1]
- self.out_dir = args[2]
- self.collection_name = args[3]
- self.extra_args = args[4:]
-
- if "json" in self.extra_args:
- json_starts_at = self.extra_args.index("json")
- self.named_args = json.loads(self.extra_args[json_starts_at + 1])
- self.extra_args = self.extra_args[:json_starts_at]
- else:
- self.named_args = None
-
- self.collection = os.path.basename(csv_file).replace(".csv","")
-
- self.require_stopwords = True # load stopwords by default
-
- # call a function to set processor name, etc.
- self._basic_params()
-
- if self.require_stopwords:
- self.stoplist = os.path.join(self.cwd, "stopwords.txt")
- self.stopwords = [x.strip() for x in codecs.open(self.stoplist, 'r', encoding='utf-8').readlines()]
-
- self.out_filename = os.path.join(self.out_dir, self.name + self.collection + "-" + self.args_basename + ".html")
-
- # logging.basicConfig(filename=os.path.join(self.out_dir, "logs", self.name + ".log"), level=logging.INFO)
- logging.basicConfig(filename=self.out_filename.replace(".html", ".log"), filemode='w', level=logging.INFO)
-
- fh = logging.FileHandler(os.path.join(self.out_dir, "logs", self.name + ".log"))
- formatter = logging.Formatter('%(name)s: %(levelname)-8s %(message)s')
- fh.setFormatter(formatter)
-
- logging.getLogger('').addHandler(fh)
-
- logging.info("command: " + ' '.join([x.replace(' ','''\ ''') for x in sys.argv]))
-
- self.metadata = {}
-
- for rowdict in self.parse_csv(csv_file):
- filename = rowdict.pop("filename")
- self.metadata[filename] = rowdict
-
- self.files = self.metadata.keys()
- if track_progress:
- self.track_progress = True
- self.progress_initialized = False
-
- def _basic_params(self):
- self.name = "textprocessor"
-
- def parse_csv(self, filename, dialect=csv.excel, **kwargs):
- with file(filename, 'rb') as f:
- csv_rows = self.unicode_csv_reader(f, dialect=dialect, **kwargs)
- header = csv_rows.next()
- for row in csv_rows:
- if len(row) > 0:
- rowdict = dict(zip(header, row))
- yield rowdict
-
- def unicode_csv_reader(self, utf8_data, dialect=csv.excel, **kwargs):
- csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
- for row in csv_reader:
- yield [unicode(cell, 'utf-8') for cell in row]
-
- def update_progress(self):
- if self.track_progress:
- if not self.progress_initialized:
- self.progress_filename = os.path.join(self.out_dir, self.name + self.collection + "progress.txt")
- self.progress_file = file(self.progress_filename, 'w')
- self.count = 0
- self.total = len(self.files)
- self.progress_initialized = True
-
- self.count += 1
- self.progress_file.write('<' + str(int(self.count*1000.0/float(self.total))) + '>\n')
- self.progress_file.flush()
-
- def xpartition(self, seq, n=2):
- return izip(*(iter(seq),) * n)
-
- def argmax(self, obj):
- if hasattr(obj, "index"):
- return obj.index(max(obj))
- elif hasattr(obj, "iteritems"):
- return max(obj.iteritems(), key=operator.itemgetter(1))[0]
-
- def write_html(self, user_params):
- logging.info("writing HTML")
- params = {"COLLECTION_NAME": self.collection_name, "DOC_METADATA": json.dumps({v["itemID"]: v for k, v in self.metadata.iteritems()})}
- params.update(user_params)
- try:
- template_filename = getattr(self, "template_filename", os.path.join(self.cwd, "templates", self.name + ".html"))
-
- with codecs.open(self.out_filename, 'w', encoding='utf-8') as outfile:
- with codecs.open(template_filename, 'r', encoding='utf-8') as template:
- template_str = template.read()
- for k, v in params.iteritems():
- template_str = template_str.replace(k, v)
- outfile.write(template_str)
- except:
- logging.error(traceback.format_exc())
-
- def process(self):
- """
- Example process -- should be overridden
- """
- output = file(os.path.join(self.out_dir, self.name + '.txt'), 'w')
- for filename in self.files:
- output.write(' '.join([filename, self.metadata[filename]]) + '\n')
- output.close()
-
-if __name__ == "__main__":
- try:
- processor = TextProcessor(track_progress = True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
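
Note: TextProcessor expects sys.argv[1] to name a JSON file whose array holds the working directory, the metadata CSV, the output directory, the collection name, and any extra arguments; subclasses only override _basic_params and process. A toy subclass following that contract — the class name and behaviour are illustrative, not part of the project:

#!/usr/bin/env python
import codecs, logging
import textprocessor

class WordCount(textprocessor.TextProcessor):
    """Toy processor: count the words in each input file."""

    def _basic_params(self):
        self.name = "wordcount"

    def process(self):
        counts = {}
        for filename in self.files:
            with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                counts[filename] = len(f.read().split())
            self.update_progress()
        logging.info("counted words in %d files" % len(counts))

if __name__ == "__main__":
    WordCount(track_progress=True).process()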
chrome/content/papermachines/processors/wordcloud.pyw (118 lines deleted)
@@ -1,118 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
-import textprocessor
-from lib.porter2 import stem
-
-class WordCloud(textprocessor.TextProcessor):
- """
- Generate word cloud
- """
- def _basic_params(self):
- self.name = "wordcloud"
- self.width = "300"
- self.height = "150"
- self.fontsize = "[10,32]"
- self.n = 50
- self.tfidf_scoring = False
-
- def _findTfIdfScores(self, scale=True):
- self.freqs = {}
- self.tf_by_doc = {}
- self.max_tf = {}
- self.df = {}
- for filename in self.files:
- with codecs.open(filename, 'r', encoding = 'utf8') as f:
- logging.info("processing " + filename)
- flen = 0
- self.tf_by_doc[filename] = {}
- for line in f:
- for stem in self._tokenizeAndStem(line):
- flen += 1
- if stem not in self.tf_by_doc[filename]:
- self.tf_by_doc[filename][stem] = 0
- if stem not in self.df:
- self.df[stem] = 0
- self.df[stem] += 1
- self.tf_by_doc[filename][stem] += 1
- # max_tf_d = max(self.tf_by_doc[filename].values())
- for stem in self.tf_by_doc[filename].keys():
- if stem not in self.freqs:
- self.freqs[stem] = 0
- self.freqs[stem] += self.tf_by_doc[filename][stem]
- if scale:
- self.tf_by_doc[filename][stem] /= float(flen) #max_tf_d
- this_tf = self.tf_by_doc[filename][stem]
- else:
- this_tf = self.tf_by_doc[filename][stem] / float(flen)
-
- if stem not in self.max_tf or self.max_tf[stem] < this_tf:
- self.max_tf[stem] = this_tf
- self.update_progress()
- n = float(len(self.files))
- self.idf = {term: math.log10(n/df) for term, df in self.df.iteritems()}
- self.tfidf = {term: self.max_tf[term] * self.idf[term] for term in self.max_tf.keys()}
- tfidf_values = self.tfidf.values()
- top_terms = min(int(len(self.freqs.keys()) * 0.7), 5000)
- min_score = sorted(tfidf_values, reverse=True)[min(top_terms, len(tfidf_values) - 1)]
- self.filtered_freqs = {term: freq for term, freq in self.freqs.iteritems() if self.tfidf[term] > min_score and self.df[term] > 3}
-
- def _topN(self, freqs, n = None):
- if n is None:
- n = self.n
- final_freqs = []
- top_freqs = sorted(freqs.values())
- if len(top_freqs) == 0:
- return []
- min_freq = top_freqs[-min(n,len(top_freqs))] # find nth frequency from end, or start of list
- for word, freq in freqs.iteritems():
- if freq >= min_freq:
- final_freqs.append({'text': word, 'value': freq})
- return final_freqs
-
- def _findWordFreqs(self, filenames):
- freqs = {}
- for filename in filenames:
- with codecs.open(filename, 'r', encoding = 'utf8') as f:
- logging.info("processing " + filename)
- for line in f:
- for stem in self._tokenizeAndStem(line):
- if stem not in freqs:
- freqs[stem] = 1
- else:
- freqs[stem] += 1
- self.update_progress()
- return self._topN(freqs)
-
- def _tokenizeAndStem(self, line):
- # uncomment for Porter stemming (slower, but groups words with their plurals, etc.)
- # return [stem(word.strip('.,')) for word in line.split() if word.lower() not in self.stopwords and len(word) > 3]
- return [word.lower() for word in line.split() if word.lower() not in self.stopwords and word.isalpha() and len(word) >= 3]
-
- def process(self):
- logging.info("starting to process")
-
- self.template_filename = os.path.join(self.cwd, "templates", "wordcloud.html")
-
- logging.info("finding word frequencies")
-
- if self.tfidf_scoring:
- self._findTfIdfScores()
- freqs = self._topN(self.filtered_freqs)
- else:
- freqs = self._findWordFreqs(self.files)
-
- params = {"DATA": json.dumps(freqs),
- "WIDTH": self.width,
- "HEIGHT": self.height,
- "FONTSIZE": self.fontsize
- }
-
- self.write_html(params)
-
-
-if __name__ == "__main__":
- try:
- processor = WordCloud(track_progress = True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
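
Note: _topN above keeps every term whose frequency reaches the n-th highest value and emits {'text', 'value'} records for the template. A simplified sketch (a plain top-n slice, ignoring the tie handling in the original):

def top_n(freqs, n=50):
    """Return the n most frequent terms in the {'text': ..., 'value': ...} shape used above."""
    ranked = sorted(freqs.items(), key=lambda item: item[1], reverse=True)[:n]
    return [{"text": word, "value": count} for word, count in ranked]

print(top_n({"the": 90, "of": 80, "liberty": 12, "commerce": 7}, n=2))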
chrome/content/papermachines/processors/wordcloud_chronological.pyw (77 lines deleted)
@@ -1,77 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
-from datetime import datetime, timedelta
-import wordcloud_multiple
-
-class WordCloudChronological(wordcloud_multiple.MultipleWordClouds):
- """
- Generate word clouds based on time interval
- """
- def _basic_params(self):
- self.name = "wordcloud_chronological"
- self.template_filename = os.path.join(self.cwd, "templates", "wordcloud_multiple.html")
- self.width = "483"
- self.height = "300"
- self.fontsize = "[10,32]"
- self.n = 100
- self.tfidf_scoring = False
- self.MWW = False
- self.dunning = False
- if len(self.extra_args) == 1:
- self.interval = self.extra_args[0]
- elif len(self.extra_args) > 1:
- if self.extra_args[0] == "tfidf":
- self.tfidf_scoring = True
- elif self.extra_args[0] == "mww":
- self.tfidf_scoring = True
- self.MWW = True
- elif self.extra_args[0] == "dunning":
- self.tfidf_scoring = True
- self.dunning = True
- self.interval = self.extra_args[1]
- else:
- self.interval = "90"
-
- def _split_into_labels(self):
- datestr_to_datetime = {}
- for filename in self.metadata.keys():
- date_str = self.metadata[filename]["date"]
- cleaned_date = date_str[0:10]
- if "-00" in cleaned_date:
- cleaned_date = cleaned_date[0:4] + "-01-01"
- datestr_to_datetime[date_str] = datetime.strptime(cleaned_date, "%Y-%m-%d")
- datetimes = sorted(datestr_to_datetime.values())
- start_date = datetimes[0]
- end_date = datetimes[-1]
-
- if self.interval.isdigit():
- interval = timedelta(int(self.interval))
- else:
- interval = timedelta(90)
-
- intervals = []
- interval_names = []
- start = end = start_date
- while end <= end_date:
- end += interval
- intervals.append((start,end))
- interval_names.append(start.isoformat()[0:10].replace('-','/') + '-' + end.isoformat()[0:10].replace('-','/'))
- start = end
-
- for filename, metadata in self.metadata.iteritems():
- label = ""
- for i in range(len(intervals)):
- interval = intervals[i]
- if interval[0] <= datestr_to_datetime[metadata["date"]] < interval[1]:
- label = interval_names[i]
- break
- if label not in self.labels:
- self.labels[label] = set()
- self.labels[label].add(filename)
-
-if __name__ == "__main__":
- try:
- processor = WordCloudChronological(track_progress = True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
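
Note: _split_into_labels above buckets documents into consecutive fixed-width windows between the earliest and latest dates (90 days unless an interval is passed in). The windowing on its own looks roughly like this (dates invented):

from datetime import datetime, timedelta

def intervals(date_strings, days=90):
    """Consecutive fixed-width windows covering the given ISO dates."""
    dates = sorted(datetime.strptime(d, "%Y-%m-%d") for d in date_strings)
    start, last = dates[0], dates[-1]
    step = timedelta(days=days)
    windows = []
    while start <= last:
        windows.append((start, start + step))
        start += step
    return windows

for begin, end in intervals(["1848-01-01", "1848-06-15", "1848-12-31"]):
    print(begin.date(), "to", end.date())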
chrome/content/papermachines/processors/wordcloud_large.pyw (22 lines deleted)
@@ -1,22 +0,0 @@
-#!/usr/bin/env python
-import sys, os, logging, traceback, codecs
-import wordcloud
-
-class LargeWordCloud(wordcloud.WordCloud):
- """
- Generate large word cloud
- """
- def _basic_params(self):
- self.width = "960"
- self.height = "500"
- self.fontsize = "[10,72]"
- self.name = "wordcloud_large"
- self.n = 150
- self.tfidf_scoring = len(self.extra_args) > 0
-
-if __name__ == "__main__":
- try:
- processor = LargeWordCloud(track_progress=True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
chrome/content/papermachines/processors/wordcloud_multiple.pyw (200 lines deleted)
@@ -1,200 +0,0 @@
-#!/usr/bin/env python
-import sys, os, json, cStringIO, tempfile, logging, traceback, codecs, math
-import wordcloud
-
-class MultipleWordClouds(wordcloud.WordCloud):
- """
- Generate word clouds based on labels
- """
- def _basic_params(self):
- self.name = "wordcloud_multiple"
- self.width = "300"
- self.height = "150"
- self.fontsize = "[10,32]"
- self.n = 50
- self.tfidf_scoring = False
- self.MWW = False
- self.dunning = False
- if len(self.extra_args) > 0:
- if self.extra_args[0] == "tfidf":
- self.tfidf_scoring = True
- elif self.extra_args[0] == "mww":
- self.tfidf_scoring = True
- self.MWW = True
- elif self.extra_args[0] == "dunning":
- self.tfidf_scoring = True
- self.dunning = True
-
- def _rank_simple(self, vector):
- return sorted(range(len(vector)), key=vector.__getitem__)
-
- def _rank(self, seq):
- n = len(seq)
- ivec = self._rank_simple(seq)
- svec = [seq[rank] for rank in ivec]
- last_obs = svec[0]
- new_vec = [1]*n
- dupe_indices = set()
-
- for i in xrange(1, n):
- if svec[i] == last_obs:
- dupe_indices.add(i-1)
- dupe_indices.add(i)
- else:
- if len(dupe_indices) > 0:
- averank = (sum(dupe_indices) / float(len(dupe_indices))) + 1
- for j in dupe_indices:
- new_vec[j] = averank
- new_vec[i] = i + 1
- dupe_indices = set()
- else:
- new_vec[i] = i + 1
- last_obs = svec[i]
- ranks = {svec[i]: rank for i, rank in enumerate(new_vec)}
- return ranks
-
- def _mannWhitney(self, A, B):
- all_obs = A + B
- n_a = len(A)
- n_b = len(B)
- n_ab = len(all_obs)
-
- ranks = self._rank(all_obs)
- t_a = sum([ranks[obs] for obs in A])
- mu_a = float(n_a * (n_ab + 1)) / 2
- t_a_max = (n_a * n_b) + (n_a * (len(A) + 1))/2
- u_a = t_a_max - t_a
- s = math.sqrt(float(n_a * n_b * (n_ab + 1))/12)
- if t_a > mu_a:
- z_a = (t_a - mu_a - 0.5)/ s
- else:
- z_a = (t_a - mu_a + 0.5)/ s
- rho = u_a / (n_a*n_b)
- return rho
-
- def _dunning_held_out(self, word, label_set, other_set):
- sets = [label_set, other_set]
- count_total = [0.0, 0.0, 0.0, 0.0]
- for i in range(len(sets)):
- for filename in sets[i]:
- if word in self.tf_by_doc[filename]:
- count_total[i] += self.tf_by_doc[filename][word]
- count_total[i + 2] += sum(self.tf_by_doc[filename].values())
- # count_total[i] = sum([word_weights[word] for filename, word_weights in self.tf_by_doc.iteritems() if filename in sets[i] and word in word_weights])
- # count_total[i + 2] = sum([sum(word_weights.values()) for filename, word_weights in self.tf_by_doc.iteritems() if filename in sets[i]])
- a, b, c, d = [float(x) for x in count_total]
- if any([x == 0 for x in count_total]):
- return 0
- E1 = c*((a+b)/(c+d))
- E2 = d*((a+b)/(c+d))
- G2 = 2.0*((a*math.log(a/E1)) + (b*math.log(b/E2)))
- return G2
-
- def _dunning(self, word, label_set):
- count_total = [0.0, self.freqs[word], 0.0, self.total_word_count]
- for filename in label_set:
- if word in self.tf_by_doc[filename]:
- count_total[0] += self.tf_by_doc[filename][word]
- count_total[2] += sum(self.tf_by_doc[filename].values())
- a, b, c, d = [float(x) for x in count_total]
- if any([x == 0 for x in count_total]):
- return 0
- E1 = c*((a+b)/(c+d))
- E2 = d*((a+b)/(c+d))
- G2 = 2.0*((a*math.log(a/E1)) + (b*math.log(b/E2)))
- return G2
-
- def _held_out(self, word, label_set, other_set):
- ranks_by_set = [[],[]]
- sets = [label_set, other_set]
- appears_in_label_set = False
- for i in range(len(sets)):
- for filename in sets[i]:
- if word in self.tf_by_doc[filename]:
- ranks_by_set[i].append(self.tf_by_doc[filename][word])
- if i == 0:
- appears_in_label_set = True
- # ranks_by_set[i].append(self.tf_by_doc[filename][word] * self.idf[word])
- else:
- ranks_by_set[i].append(0)
- if not appears_in_label_set:
- return 0.0
- else:
- return self._mannWhitney(ranks_by_set[0], ranks_by_set[1])
-
- def _split_into_labels(self):
- for filename, data in self.metadata.iteritems():
- if data["label"] not in self.labels:
- self.labels[data["label"]] = set()
- self.labels[data["label"]].add(filename)
-
- def process(self):
- logging.info("splitting into labeled sets")
- self.labels = {}
- self._split_into_labels()
-
- clouds = {}
-
- all_files = set(self.files)
- if self.tfidf_scoring:
- if self.dunning:
- self._findTfIdfScores(scale=False)
- else:
- self._findTfIdfScores()
- # self.top_tfidf_words = [item["text"] for item in self._topN(self.filtered_freqs, 150)]
- self.top_tfidf_words = self.filtered_freqs.keys()
-
- self.label_order = sorted(self.labels.keys())
- for label in self.label_order:
- filenames = self.labels[label]
- logging.info("finding word frequencies for " + str(label))
- if self.tfidf_scoring and self.MWW:
- label_set = set(filenames)
- other_set = all_files - label_set
- word_rho = {}
- for word in self.top_tfidf_words:
- word_rho[word] = self._held_out(word, label_set, other_set)
- clouds[label] = self._topN(word_rho)
- elif self.tfidf_scoring and self.dunning:
- label_set = set(filenames)
- other_set = all_files - label_set
- word_G2 = {}
- self.total_word_count = sum(self.freqs.values())
- for word in self.top_tfidf_words:
- G2 = self._dunning_held_out(word, label_set, other_set)
- # G2 = self._dunning(word, label_set)
- if G2 > 15.13: # critical value for p < 0.001
- word_G2[word] = G2
- clouds[label] = self._topN(word_G2)
-
- elif self.tfidf_scoring:
- tf_maxes = {}
- for filename in filenames:
- for term, weight in self.tf_by_doc[filename].iteritems():
- if term not in tf_maxes:
- tf_maxes[term] = weight
- else:
- if weight > tf_maxes[term]:
- tf_maxes[term] = weight
- tfidf_for_labelset = {term: weight * self.idf[term] for term, weight in tf_maxes.iteritems()}
- filtered_freqs_for_labelset = {term: freq for term, freq in self.filtered_freqs.iteritems() if term in tfidf_for_labelset}
- clouds[label] = self._topN(filtered_freqs_for_labelset)
- else:
- clouds[label] = self._findWordFreqs(filenames)
-
- params = {"CLOUDS": json.dumps(clouds),
- "ORDER": json.dumps(self.label_order),
- "WIDTH": self.width,
- "HEIGHT": self.height,
- "FONTSIZE": self.fontsize
- }
-
- self.write_html(params)
-
-
-if __name__ == "__main__":
- try:
- processor = MultipleWordClouds(track_progress = True)
- processor.process()
- except:
- logging.error(traceback.format_exc())
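
Note: the Dunning option scores each candidate word with the log-likelihood statistic G2 computed in _dunning_held_out: a and b are the word's counts in the label set and in the rest of the corpus, c and d the corresponding total word counts, and only words whose G2 exceeds the 15.13 cutoff used above are kept. The statistic in isolation (sample counts invented):

import math

def dunning_g2(a, b, c, d):
    """Dunning log-likelihood: a, b = word counts in the two corpora; c, d = their sizes."""
    if 0 in (a, b, c, d):
        return 0.0
    e1 = c * (a + b) / float(c + d)   # expected count in corpus 1
    e2 = d * (a + b) / float(c + d)   # expected count in corpus 2
    return 2.0 * (a * math.log(a / e1) + b * math.log(b / e2))

print(round(dunning_g2(120, 30, 10000, 12000), 2))   # well above the 15.13 cutoff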