Skip to content

Commit

Permalink
refactored data conversion for external models to use luigi
Browse files Browse the repository at this point in the history
  • Loading branch information
macks22 committed Feb 13, 2015
1 parent 2481ce2 commit ffa4f2d
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 32 deletions.
102 changes: 75 additions & 27 deletions pipeline/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,88 @@
"""

import os

import gensim
import luigi

import util
import build_graphs
import repdocs
import config


class WriteCodaFiles(build_graphs.YearFilterableTask):
    """CODA requires only a tsv edgelist."""

    def requires(self):
        # Upstream task that produces the space-delimited LCC edgelist.
        return build_graphs.AuthorCitationGraphLCCEdgelist(self.start, self.end)

    @property
    def base_paths(self):
        # Output filename; YearFilterableTask resolves it to a full path.
        return 'lcc-author-citation-graph.edgelist.tsv'

    def run(self):
        # Convert the space-delimited edgelist to the tab-delimited form
        # CODA expects; luigi targets are passed straight through since
        # util.swap_file_delim accepts file-like objects.
        util.swap_file_delim(self.input(), ' ', self.output(), '\t')


class WriteTermIdMap(repdocs.YearFilterableTask):
    """Use the paper repdoc dictionary to write a term_id to term mapping."""

    def requires(self):
        # The gensim Dictionary built from the paper repdocs.
        return repdocs.BuildPaperRepdocDictionary(self.start, self.end)

    @property
    def base_paths(self):
        # Output filename; YearFilterableTask resolves it to a full path.
        return 'lcc-repdoc-corpus-term-id-map.tsv'

    def run(self):
        # Write one `term_id \t term` line per vocabulary entry.
        # Renamed local from `dict` to avoid shadowing the builtin.
        dictionary = gensim.corpora.Dictionary.load(self.input().path)
        lines = ('\t'.join([unicode(term_id), term]).encode('utf-8')
                 for term, term_id in dictionary.token2id.items())
        with self.output().open('w') as outfile:
            outfile.write('\n'.join(lines))


def write_cesna_files(
term_idmap_fname, tf_corpus_fname,
feat_outfile='lcc-repdoc-corpus-author-term-presence.tsv'):
class WriteLCCAuthorBinaryTerms(repdocs.YearFilterableTask):
    """Write SNAP-formatted feature file: binary terms per node (author)."""

    def requires(self):
        # Term-frequency corpus (MM format) over author repdocs in the LCC.
        return repdocs.BuildLCCAuthorRepdocCorpusTf(self.start, self.end)

    @property
    def base_paths(self):
        # Output filename; YearFilterableTask resolves it to a full path.
        return 'lcc-repdoc-corpus-author-term-presence.tsv'

    def run(self):
        """Emit one `docnum \t term_id` line for every term present in each
        author's repdoc (presence only -- term frequencies are discarded).
        """
        corpus = gensim.corpora.MmCorpus(self.input().path)
        with self.output().open('w') as wf:
            for docnum, offset in enumerate(corpus.index):
                doc = corpus.docbyoffset(offset)
                # BUG FIX: the original did `wf.write('\n'.join(doc))` where
                # `doc` was already a joined string, which inserted a newline
                # between every single character. Write each pair on its own
                # line instead.
                for term_id, _ in doc:
                    wf.write('%s\t%s\n' % (docnum, term_id))


class WriteCesnaFiles(repdocs.YearFilterableTask):
    """CESNA requires the same edgelist as CODA, but also requires (1) (node_id
    \t term_id) pairs for all term features (2) (term_id \t term) pairs for all
    terms in the corpus. Note that because this is mm format, we need to subtract
    1 from all ids.
    """

    def requires(self):
        # Aggregator task: each dependency writes one of the files CESNA
        # needs; this task itself has no output of its own.
        yield WriteCodaFiles(self.start, self.end)
        yield WriteTermIdMap(self.start, self.end)
        yield WriteLCCAuthorBinaryTerms(self.start, self.end)


if __name__ == "__main__":
    # Entry point: hand off to luigi's command-line runner to schedule and
    # execute the tasks defined above.
    luigi.run()
28 changes: 23 additions & 5 deletions pipeline/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,30 @@ def yield_csv_records(csv_file):
f.close()


def smart_open(file_like, mode='r'):
    """Open `file_like` by whatever means works.

    Accepts a filename (opened with the builtin `open`), an object exposing
    an `open(mode)` method (e.g. a luigi target), or an already-open
    file-like object, which is returned unchanged.
    """
    try:
        return open(file_like, mode)
    except TypeError:
        pass  # not a path -- fall through to the open() method / passthrough
    try:
        return file_like.open(mode)
    except (TypeError, AttributeError):
        return file_like


def swap_file_delim(infile, indelim, outfile, outdelim):
    """Swap out every instance of `indelim` in the input file for `outdelim`
    and write the result to `outfile`.

    Both `infile` and `outfile` may be a filename, an object with an
    `open(mode)` method (e.g. a luigi target), or an open file object
    (see `smart_open`).
    """
    rf = smart_open(infile, 'r')
    try:
        wf = smart_open(outfile, 'w')
        try:
            # Lazily re-delimit each stripped line; join keeps the original
            # "no trailing newline" behavior.
            out_lines = (outdelim.join(l.strip().split(indelim)) for l in rf)
            wf.write('\n'.join(out_lines))
        finally:
            # FIX: close handles even if reading/writing raises, so the
            # function no longer leaks file descriptors on error.
            wf.close()
    finally:
        rf.close()


def build_and_save_idmap(graph, outfile, idname='author'):
Expand Down

0 comments on commit ffa4f2d

Please sign in to comment.