Merge pull request #1 from piskvorky/buma

Make len() of the Dictionary returned by Dictionary.from_corpus consistent with its content
piskvorky · Sep 7, 2012 · bb1124a
2 parents af9c594 + a61e537
commit bb1124a
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 17 deletions.
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
@@ -247,30 +247,29 @@ def load_from_text(fname):
     @staticmethod
     def from_corpus(corpus):
         """
-        Scan document term word count matrix for all word ids that appear
-        in it, then construct and return Dictionary with a mapping  which
-        maps each ``wordId -> str(wordId)``.
+        Create Dictionary from an existing corpus. This can be useful if you only
+        have a term-document BOW matrix (represented by `corpus`), but not the
+        original text corpus.
 
-        The resulting mapping only covers words actually used in the corpus,
-        up to the highest wordId found.
-
-        Usefull only if you do not have text corpus.
+        This will scan the term-document count matrix for all word ids that
+        appear in it, then construct and return Dictionary which maps each
+        `word_id -> str(word_id)`.
         """
         result = Dictionary()
+        max_id = -1
         for docno, document in enumerate(corpus):
             if docno % 10000 == 0:
                 logger.info("adding document #%i to %s" % (docno, result))
             result.num_docs += 1
             result.num_nnz += len(document)
-            for wordid, word in document:
-                result.num_pos += word
-                str_wordid = str(wordid)
-                tokenid = result.token2id.get(str_wordid, None)
-                if tokenid is None:
-                    result.dfs[wordid] = 1
-                    result.token2id[str_wordid] = wordid
-                else:
-                    result.dfs[wordid] += 1
+            for wordid, word_freq in document:
+                max_id = max(wordid, max_id)
+                result.num_pos += word_freq
+                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1
+        # now make sure length(result) == get_max_id(corpus) + 1
+        for i in xrange(max_id + 1):
+            result.token2id[str(i)] = i
+
         logger.info("built %s from %i documents (total %i corpus positions)" %
                      (result, result.num_docs, result.num_pos))
         return result

diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py
@@ -103,7 +103,7 @@ def testFilter(self):
         self.assertEqual(d.dfs, expected)
 
     def test_saveAsText_and_loadFromText(self):
-        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
+        """`Dictionary` can be saved as textfile and loaded again from textfile. """
         tmpf = get_tmpfile('dict_test.txt')
         d = Dictionary(self.texts)
         d.save_as_text(tmpf)
@@ -115,6 +115,7 @@ def test_saveAsText_and_loadFromText(self):
         self.assertEqual(d_loaded.token2id, d.token2id)
 
     def test_from_corpus(self):
+        """build `Dictionary` from an existing corpus"""
 
         documents = ["Human machine interface for lab abc computer applications",
                 "A survey of user opinion of computer system response time",