Skip to content

Commit

Permalink
Merge branch 'release-0.12.4' with #596
Browse files Browse the repository at this point in the history
  • Loading branch information
tmylk committed Jan 31, 2016
2 parents b61287a + 4d8dd20 commit 3ade404
Show file tree
Hide file tree
Showing 5 changed files with 743 additions and 723 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Changes

* Word2vec in line with original word2vec.c (Andrey Kutuzov, #538)
- Same default values. See diff https://github.com/akutuzov/gensim/commit/6456cbcd75e6f8720451766ba31cc046b4463ae2
- Standalone script with command line arguments matching those of original C tool.
- Standalone script with command line arguments matching those of original C tool.
Usage: ./word2vec_standalone.py -train data.txt -output trained_vec.txt -size 200 -window 2 -sample 1e-4
* load_word2vec_format() performance (@svenkreiss, #555)
- Remove `init_sims()` call for performance improvements when normalized vectors are not needed.
Expand Down
19 changes: 11 additions & 8 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,17 +187,19 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
**Note**: Due to the gap shrinking, the same word may have a different
word id before and after the call to this function!
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold

# determine which tokens to keep
good_ids = (v for v in itervalues(self.token2id)
if no_below <= self.dfs.get(v, 0) <= no_above_abs)
good_ids = (
v for v in itervalues(self.token2id)
if no_below <= self.dfs.get(v, 0) <= no_above_abs)
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
if keep_n is not None:
good_ids = good_ids[:keep_n]
bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
logger.info("keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
logger.info(
"keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
len(good_ids), no_below, no_above_abs, 100.0 * no_above)

# do the actual filtering, then rebuild dictionary to remove gaps in ids
Expand Down Expand Up @@ -256,7 +258,7 @@ def save_as_text(self, fname, sort_by_word=True):
Note: text format should be used for corpus inspection. Use `save`/`load`
to store in binary format (pickle) for improved performance.
"""
logger.info("saving dictionary mapping to %s" % fname)
logger.info("saving dictionary mapping to %s", fname)
with utils.smart_open(fname, 'wb') as fout:
if sort_by_word:
for token, tokenid in sorted(iteritems(self.token2id)):
Expand Down Expand Up @@ -354,7 +356,7 @@ def from_corpus(corpus, id2word=None):
max_id = -1
for docno, document in enumerate(corpus):
if docno % 10000 == 0:
logger.info("adding document #%i to %s" % (docno, result))
logger.info("adding document #%i to %s", docno, result)
result.num_docs += 1
result.num_nnz += len(document)
for wordid, word_freq in document:
Expand All @@ -372,6 +374,7 @@ def from_corpus(corpus, id2word=None):
# make sure all token ids have a valid `dfs` entry
result.dfs[id] = result.dfs.get(id, 0)

logger.info("built %s from %i documents (total %i corpus positions)" %
(result, result.num_docs, result.num_pos))
logger.info(
"built %s from %i documents (total %i corpus positions)",
result, result.num_docs, result.num_pos)
return result

0 comments on commit 3ade404

Please sign in to comment.