
Commit

Merge pull request #593 from piskvorky/pr_538_tests
PR #538 Word2vec defaults changed + a stub for test
tmylk committed Jan 28, 2016
2 parents ef9a9d9 + 055cd7d commit 8b26889
Showing 4 changed files with 206 additions and 27 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.txt
@@ -3,6 +3,13 @@ Changes

0.12.4, XX/XX/XXXX

* Word2vec hyperparameter defaults are now identical to those of the original C tool, generally improving performance (Andrey Kutuzov, #538)
- By default, use CBOW with negative sampling and the mean of context vectors, instead of skip-gram with hierarchical softmax.
- By default, use 3 workers instead of 1.
- By default, use 5 iterations instead of 1.
- By default, use downsampling with the value 1e-3.
- Additionally, word2vec.py can now be used as a standalone script with command-line arguments matching those of the original C tool (see the illustrative Python call after this excerpt).
(for example, ./word2vec.py -train data.txt -output trained_vec.txt -size 200 -window 2 -sample 1e-4 -negative 10 -hs 0 -binary 0 -iter 3)
* load_word2vec_format() performance (@svenkreiss, #555)
- Remove `init_sims()` call for performance improvements when normalized vectors are not needed.
- Remove `norm_only` parameter (API change). Call `init_sims(replace=True)` after the `load_word2vec_format()` call for the old `norm_only=True` behavior.
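
Illustrative only, not part of this diff: the command-line example above maps roughly onto the following Python call; 'data.txt' and 'trained_vec.txt' are the placeholder file names from that example.

    from gensim.models.word2vec import Word2Vec, LineSentence

    corpus = LineSentence('data.txt')  # placeholder corpus, one sentence per line
    # sg=0 (CBOW), hs=0, negative sampling and cbow_mean=1 are now the defaults
    model = Word2Vec(corpus, size=200, window=2, sample=1e-4, negative=10, hs=0, iter=3)
    model.save_word2vec_format('trained_vec.txt', binary=False)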
70 changes: 51 additions & 19 deletions gensim/models/word2vec.py
@@ -342,8 +342,8 @@ class Word2Vec(utils.SaveLoad):
"""
def __init__(
self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001,
sg=1, hs=1, negative=0, cbow_mean=0, hashfxn=hash, iter=1, null_word=0,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
@@ -357,8 +357,8 @@ def __init__(
If you don't supply `sentences`, the model is left uninitialized -- use if
you plan to initialize it in some other way.
`sg` defines the training algorithm. By default (`sg=1`), skip-gram is used.
Otherwise, `cbow` is employed.
`sg` defines the training algorithm. By default (`sg=0`), CBOW is used.
Otherwise (`sg=1`), skip-gram is employed.
`size` is the dimensionality of the feature vectors.
@@ -376,16 +376,18 @@ def __init__(
need about 1GB of RAM. Set to `None` for no limit (default).
`sample` = threshold for configuring which higher-frequency words are randomly downsampled;
default is 0 (off), useful value is 1e-5.
default is 1e-3, useful range is (0, 1e-5).
`workers` = use this many worker threads to train the model (=faster training with multicore machines).
`hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).
`hs` = if 1, hierarchical softmax will be used for model training.
If set to 0 (default), and `negative` is non-zero, negative sampling will be used.
`negative` = if > 0, negative sampling will be used, the int for negative
specifies how many "noise words" should be drawn (usually between 5-20).
Default is 5. If set to 0, no negative sampling is used.
`cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
`cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean.
Only applies when cbow is used.
`hashfxn` = hash function to use to randomly initialize weights, for increased
@@ -1600,35 +1602,65 @@ def __iter__(self):
i += self.max_sentence_length


# Example: ./word2vec.py ~/workspace/word2vec/text8 ~/workspace/word2vec/questions-words.txt ./text8
# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
if __name__ == "__main__":
import argparse
logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))
logger.info("using optimization %s", FAST_VERSION)
logging.info("running %s", " ".join(sys.argv))
logging.info("using optimization %s", FAST_VERSION)


# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)
infile = sys.argv[1]

from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle

seterr(all='raise') # don't ignore numpy errors

# model = Word2Vec(LineSentence(infile), size=200, min_count=5, workers=4)
model = Word2Vec(Text8Corpus(infile, 10), size=256, min_count=5, workers=4, sg=0, hs=0, cbow_mean=1, negative=5)

if len(sys.argv) > 3:
outfile = sys.argv[3]
parser = argparse.ArgumentParser()
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12)
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

args = parser.parse_args()

if args.cbow == 0:
skipgram = 1
else:
skipgram = 0

corpus = LineSentence(args.train)

model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
outfile = args.output
model.save_word2vec_format(outfile, binary=args.binary)
else:
outfile = args.train
model.save(outfile + '.model')
if args.binary == 1:
model.save_word2vec_format(outfile + '.model.bin', binary=True)
else:
model.save_word2vec_format(outfile + '.model.txt', binary=False)

if len(sys.argv) > 2:
questions_file = sys.argv[2]
model.accuracy(sys.argv[2])
if args.accuracy:
questions_file = args.accuracy
model.accuracy(questions_file)

logger.info("finished running %s", program)
129 changes: 129 additions & 0 deletions gensim/scripts/word2vec_standalone.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE
Trains a neural embedding model on text file CORPUS.
Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).
Parameters for training:
-train <file>
Use text data from <file> to train the model
-output <file>
Use <file> to save the resulting word vectors / word clusters
-size <int>
Set size of word vectors; default is 100
-window <int>
Set max skip length between words; default is 5
-sample <float>
Set threshold for occurrence of words. Those that appear with higher frequency in the training data
will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
-hs <int>
Use Hierarchical Softmax; default is 0 (not used)
-negative <int>
Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
-threads <int>
Use <int> threads (default 3)
-iter <int>
Run more training iterations (default 5)
-min_count <int>
This will discard words that appear less than <int> times; default is 5
-alpha <float>
Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
-binary <int>
Save the resulting vectors in binary mode; default is 0 (off)
-cbow <int>
Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
-accuracy <file>
Compute the accuracy of the resulting model's analogical inference on the questions file <file>
See an example of a questions file at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt
Example: python -m gensim.scripts.word2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
"""


import logging
import os.path
import sys
import argparse
from numpy import seterr

logger = logging.getLogger(__name__)

from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle


if __name__ == "__main__":

logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))


# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)



seterr(all='raise') # don't ignore numpy errors

parser = argparse.ArgumentParser()
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
parser.add_argument("-sample", help="""Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled;
default is 1e-3, useful range is (0, 1e-5)""", type=float, default=1e-3)
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float)
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

args = parser.parse_args()

if args.cbow == 0:
skipgram = 1
if not args.alpha:
args.alpha = 0.025
else:
skipgram = 0
if not args.alpha:
args.alpha = 0.05

corpus = LineSentence(args.train)

model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,
sample=args.sample, alpha=args.alpha, sg=skipgram, hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
outfile = args.output
model.save_word2vec_format(outfile, binary=args.binary)
else:
outfile = args.train.split('.')[0]
model.save(outfile + '.model')
if args.binary == 1:
model.save_word2vec_format(outfile + '.model.bin', binary=True)
else:
model.save_word2vec_format(outfile + '.model.txt', binary=False)

if args.accuracy:
questions_file = args.accuracy
model.accuracy(questions_file)

logger.info("finished running %s", program)
27 changes: 19 additions & 8 deletions gensim/test/test_word2vec.py
@@ -19,6 +19,8 @@
import numpy

from gensim import utils, matutils
from gensim.utils import check_output
from subprocess import PIPE
from gensim.models import word2vec

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
@@ -154,7 +156,7 @@ def testVocab(self):
total_words = sum(len(sentence) for sentence in corpus)

# try vocab building explicitly, using all words
model = word2vec.Word2Vec(min_count=1)
model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
model.build_vocab(corpus)
self.assertTrue(len(model.vocab) == 6981)
# with min_count=1, we're not throwing away anything, so make sure the word counts add up to be the entire corpus
@@ -163,7 +165,7 @@ def testVocab(self):
numpy.allclose(model.vocab['the'].code, [1, 1, 0, 0])

# test building vocab with default params
model = word2vec.Word2Vec()
model = word2vec.Word2Vec(hs=1, negative=0)
model.build_vocab(corpus)
self.assertTrue(len(model.vocab) == 1750)
numpy.allclose(model.vocab['the'].code, [1, 1, 1, 0])
@@ -177,7 +179,7 @@ def testVocab(self):
def testTraining(self):
"""Test word2vec training."""
# build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1)
model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0)
model.build_vocab(sentences)

self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
Expand All @@ -194,12 +196,12 @@ def testTraining(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1)
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
self.models_equal(model, model2)

def testScoring(self):
"""Test word2vec scoring."""
model = word2vec.Word2Vec(sentences, size=2, min_count=1)
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)

# just score and make sure they exist
scores = model.score(sentences, len(sentences))
@@ -259,14 +261,14 @@ def test_cbow_hs(self):
def test_cbow_neg(self):
"""Test CBOW w/ negative sampling"""
model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
min_count=5, iter=10, workers=2)
min_count=5, iter=10, workers=2, sample=0)
self.model_sanity(model)

def testTrainingCbow(self):
"""Test CBOW word2vec training."""
# to test training, make the corpus larger by repeating its sentences over and over
# build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1, sg=0)
model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0)
model.build_vocab(sentences)
self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
self.assertTrue(model.syn1.shape == (len(model.vocab), 2))
@@ -282,7 +284,7 @@ def testTrainingCbow(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0)
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0)
self.models_equal(model, model2)

def testTrainingSgNegative(self):
@@ -405,6 +407,15 @@ def testLineSentenceWorksWithNormalFile(self):
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators

# TODO: get correct path to Python binary
# class TestWord2VecScripts(unittest.TestCase):
# def testWord2VecStandAloneScript(self):
# """Does Word2Vec script launch standalone?"""
# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1'
# output = check_output(cmd, stderr=PIPE)
# self.assertEqual(output, '0')
# #endclass TestWord2VecScripts
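
Not part of the diff — one way the TODO above could be resolved is to locate the interpreter via sys.executable and pass an argument list instead of a single command string; this is only a sketch of such a test, not the adopted implementation (datapath is the helper already defined in this test module).

    # hypothetical variant of the commented-out test above
    import subprocess
    import sys

    cmd = [sys.executable, '-m', 'gensim.scripts.word2vec_standalone',
           '-train', datapath('testcorpus.txt'), '-output', 'vec.txt',
           '-size', '200', '-sample', '1e-4', '-binary', '0',
           '-iter', '3', '-min_count', '1']
    subprocess.check_call(cmd)  # raises CalledProcessError on a non-zero exit code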


if not hasattr(TestWord2VecModel, 'assertLess'):
# workaround for python 2.6
