Skip to content

Commit

Permalink
Merge pull request #1 from piskvorky/buma
Browse files Browse the repository at this point in the history
make len(Dictionary.from_corpus) consistent with its content
  • Loading branch information
buma committed Sep 7, 2012
2 parents af9c594 + a61e537 commit bb1124a
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 17 deletions.
31 changes: 15 additions & 16 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,30 +247,29 @@ def load_from_text(fname):
@staticmethod
def from_corpus(corpus):
    """
    Create Dictionary from an existing corpus. This can be useful if you only
    have a term-document BOW matrix (represented by `corpus`), but not the
    original text corpus.

    This will scan the term-document count matrix for all word ids that
    appear in it, then construct and return a Dictionary which maps each
    `word_id -> str(word_id)`.

    `corpus` is iterated once; each document must support `len()` and yield
    `(word_id, word_frequency)` pairs.

    The returned Dictionary has a `token2id` entry for every id from 0 up to
    the highest word id seen (including ids that never occur in the corpus),
    so its mapping size equals `max_id + 1`. Document frequencies (`dfs`)
    are recorded only for ids that actually occur. An empty corpus yields an
    empty mapping.
    """
    result = Dictionary()
    max_id = -1  # stays at -1 for an empty corpus, so no ids get registered below
    for docno, document in enumerate(corpus):
        if docno % 10000 == 0:
            logger.info("adding document #%i to %s", docno, result)
        result.num_docs += 1
        result.num_nnz += len(document)
        for wordid, word_freq in document:
            max_id = max(wordid, max_id)
            result.num_pos += word_freq
            # incremented once per (document, word id) pair yielded by the corpus
            result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

    # Register every id in [0, max_id], not only the ids that were seen, so
    # that the size of the mapping is always max_id + 1.
    # NOTE(review): `xrange` is kept because this file is Python 2 code;
    # switch to `range` if the module is ported to Python 3.
    for i in xrange(max_id + 1):
        result.token2id[str(i)] = i

    logger.info("built %s from %i documents (total %i corpus positions)",
                result, result.num_docs, result.num_pos)
    return result
Expand Down
3 changes: 2 additions & 1 deletion gensim/test/test_corpora_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def testFilter(self):
self.assertEqual(d.dfs, expected)

def test_saveAsText_and_loadFromText(self):
""" `Dictionary` can be saved as textfile and loaded again from textfile. """
"""`Dictionary` can be saved as textfile and loaded again from textfile. """
tmpf = get_tmpfile('dict_test.txt')
d = Dictionary(self.texts)
d.save_as_text(tmpf)
Expand All @@ -115,6 +115,7 @@ def test_saveAsText_and_loadFromText(self):
self.assertEqual(d_loaded.token2id, d.token2id)

def test_from_corpus(self):
"""build `Dictionary` from an existing corpus"""

documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
Expand Down

0 comments on commit bb1124a

Please sign in to comment.