Merge branch 'release-0.12.4' with #596

piskvorky · Jan 31, 2016 · 3ade404 · 3ade404
2 parents b61287a + 4d8dd20
commit 3ade404
Show file tree

Hide file tree

Showing 5 changed files with 743 additions and 723 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -5,7 +5,7 @@ Changes
 
 * Word2vec in line with original word2vec.c (Andrey Kutuzov, #538) 
 - Same default values. See diff https://github.com/akutuzov/gensim/commit/6456cbcd75e6f8720451766ba31cc046b4463ae2
- - Standalone script with command line arguments matching those of original C tool.
+- Standalone script with command line arguments matching those of original C tool.
    Usage ./word2vec_standalone.py -train data.txt -output trained_vec.txt -size 200 -window 2 -sample 1e-4
 * load_word2vec_format() performance (@svenkreiss, #555)
   - Remove `init_sims()` call for performance improvements when normalized vectors are not needed.

diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
@@ -187,17 +187,19 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
         **Note**: Due to the gap shrinking, the same word may have a different
         word id before and after the call to this function!
         """
-        no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
+        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
 
         # determine which tokens to keep
-        good_ids = (v for v in itervalues(self.token2id)
-                      if no_below <= self.dfs.get(v, 0) <= no_above_abs)
+        good_ids = (
+            v for v in itervalues(self.token2id)
+            if no_below <= self.dfs.get(v, 0) <= no_above_abs)
         good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
         if keep_n is not None:
             good_ids = good_ids[:keep_n]
         bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
         logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
-        logger.info("keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
+        logger.info(
+            "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
             len(good_ids), no_below, no_above_abs, 100.0 * no_above)
 
         # do the actual filtering, then rebuild dictionary to remove gaps in ids
@@ -256,7 +258,7 @@ def save_as_text(self, fname, sort_by_word=True):
         Note: text format should be used for corpus inspection. Use `save`/`load`
         to store in binary format (pickle) for improved performance.
         """
-        logger.info("saving dictionary mapping to %s" % fname)
+        logger.info("saving dictionary mapping to %s", fname)
         with utils.smart_open(fname, 'wb') as fout:
             if sort_by_word:
                 for token, tokenid in sorted(iteritems(self.token2id)):
@@ -354,7 +356,7 @@ def from_corpus(corpus, id2word=None):
         max_id = -1
         for docno, document in enumerate(corpus):
             if docno % 10000 == 0:
-                logger.info("adding document #%i to %s" % (docno, result))
+                logger.info("adding document #%i to %s", docno, result)
             result.num_docs += 1
             result.num_nnz += len(document)
             for wordid, word_freq in document:
@@ -372,6 +374,7 @@ def from_corpus(corpus, id2word=None):
             # make sure all token ids have a valid `dfs` entry
             result.dfs[id] = result.dfs.get(id, 0)
 
-        logger.info("built %s from %i documents (total %i corpus positions)" %
-                     (result, result.num_docs, result.num_pos))
+        logger.info(
+            "built %s from %i documents (total %i corpus positions)",
+            result, result.num_docs, result.num_pos)
         return result