Skip to content

Commit

Permalink
refactored data conversion for external models to use luigi
Browse files Browse the repository at this point in the history
  • Loading branch information
macks22 committed Feb 13, 2015
1 parent 2481ce2 commit ffa4f2d
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 32 deletions.
102 changes: 75 additions & 27 deletions pipeline/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,88 @@
"""

import os

import gensim
import luigi

import util
import build_graphs
import repdocs
import config


class WriteCodaFiles(build_graphs.YearFilterableTask):
    """CODA requires only a tsv edgelist."""

    def requires(self):
        # Upstream task that produces the space-delimited LCC edgelist.
        return build_graphs.AuthorCitationGraphLCCEdgelist(self.start, self.end)

    @property
    def base_paths(self):
        # Output filename; YearFilterableTask resolves it to a full path.
        return 'lcc-author-citation-graph.edgelist.tsv'

    def run(self):
        # Convert the space-delimited edgelist to the tab-delimited form
        # CODA expects; luigi targets are passed straight through since
        # util.swap_file_delim accepts file-like objects.
        util.swap_file_delim(self.input(), ' ', self.output(), '\t')


class WriteTermIdMap(repdocs.YearFilterableTask):
    """Use the paper repdoc dictionary to write a term_id to term mapping."""

    def requires(self):
        # The gensim Dictionary built from the paper repdocs.
        return repdocs.BuildPaperRepdocDictionary(self.start, self.end)

    @property
    def base_paths(self):
        # Output filename; YearFilterableTask resolves it to a full path.
        return 'lcc-repdoc-corpus-term-id-map.tsv'

    def run(self):
        # Write one `term_id \t term` line per vocabulary entry.
        # Renamed local from `dict` to avoid shadowing the builtin.
        dictionary = gensim.corpora.Dictionary.load(self.input().path)
        lines = ('\t'.join([unicode(term_id), term]).encode('utf-8')
                 for term, term_id in dictionary.token2id.items())
        with self.output().open('w') as outfile:
            outfile.write('\n'.join(lines))


def write_cesna_files(
term_idmap_fname, tf_corpus_fname,
feat_outfile='lcc-repdoc-corpus-author-term-presence.tsv'):
class WriteLCCAuthorBinaryTerms(repdocs.YearFilterableTask):
    """Write SNAP-formatted feature file: binary terms per node (author)."""

    def requires(self):
        # Term-frequency corpus (MM format) over author repdocs in the LCC.
        return repdocs.BuildLCCAuthorRepdocCorpusTf(self.start, self.end)

    @property
    def base_paths(self):
        # Output filename; YearFilterableTask resolves it to a full path.
        return 'lcc-repdoc-corpus-author-term-presence.tsv'

    def run(self):
        """Emit one `docnum \t term_id` line for every term present in each
        author's repdoc (presence only -- term frequencies are discarded).
        """
        corpus = gensim.corpora.MmCorpus(self.input().path)
        with self.output().open('w') as wf:
            for docnum, offset in enumerate(corpus.index):
                doc = corpus.docbyoffset(offset)
                # BUG FIX: the original did `wf.write('\n'.join(doc))` where
                # `doc` was already a joined string, which inserted a newline
                # between every single character. Write each pair on its own
                # line instead.
                for term_id, _ in doc:
                    wf.write('%s\t%s\n' % (docnum, term_id))


class WriteCesnaFiles(repdocs.YearFilterableTask):
    """CESNA requires the same edgelist as CODA, but also requires (1) (node_id
    \t term_id) pairs for all term features (2) (term_id \t term) pairs for all
    terms in the corpus. Note that because this is mm format, we need to subtract
    1 from all ids.
    """

    def requires(self):
        # Aggregator task: each dependency writes one of the files CESNA
        # needs; this task itself has no output of its own.
        yield WriteCodaFiles(self.start, self.end)
        yield WriteTermIdMap(self.start, self.end)
        yield WriteLCCAuthorBinaryTerms(self.start, self.end)


if __name__ == "__main__":
    # Entry point: hand off to luigi's command-line runner to schedule and
    # execute the tasks defined above.
    luigi.run()
28 changes: 23 additions & 5 deletions pipeline/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,30 @@ def yield_csv_records(csv_file):
f.close()


def smart_open(file_like, mode='r'):
    """Open `file_like` by whatever means works.

    Accepts a filename (opened with the builtin `open`), an object exposing
    an `open(mode)` method (e.g. a luigi target), or an already-open
    file-like object, which is returned unchanged.
    """
    try:
        return open(file_like, mode)
    except TypeError:
        pass  # not a path -- fall through to the open() method / passthrough
    try:
        return file_like.open(mode)
    except (TypeError, AttributeError):
        return file_like


def swap_file_delim(infile, indelim, outfile, outdelim):
    """Swap out every instance of `indelim` in the input file for `outdelim`
    and write the result to `outfile`.

    Both `infile` and `outfile` may be a filename, an object with an
    `open(mode)` method (e.g. a luigi target), or an open file object
    (see `smart_open`).
    """
    rf = smart_open(infile, 'r')
    try:
        wf = smart_open(outfile, 'w')
        try:
            # Lazily re-delimit each stripped line; join keeps the original
            # "no trailing newline" behavior.
            out_lines = (outdelim.join(l.strip().split(indelim)) for l in rf)
            wf.write('\n'.join(out_lines))
        finally:
            # FIX: close handles even if reading/writing raises, so the
            # function no longer leaks file descriptors on error.
            wf.close()
    finally:
        rf.close()


def build_and_save_idmap(graph, outfile, idname='author'):
Expand Down

0 comments on commit ffa4f2d

Please sign in to comment.