
Commit

Merge pull request #593 from piskvorky/pr_538_tests
PR #538 Word2vec defaults changed + a stub for test
tmylk committed Jan 28, 2016
2 parents ef9a9d9 + 055cd7d commit 8b26889
Showing 4 changed files with 206 additions and 27 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.txt
@@ -3,6 +3,13 @@ Changes

0.12.4, XX/XX/XXXX

* Word2vec hyperparameter defaults are now identical to those of the original C tool, generally improving performance (Andrey Kutuzov, #538)
- By default, use CBOW with negative sampling and the mean of context vectors, instead of skip-gram with hierarchical softmax.
- By default, use 3 workers instead of 1.
- By default, use 5 iterations instead of 1.
- By default, use downsampling with the value 1e-3.
- Additionally, word2vec.py can now be used as a standalone script with command-line arguments matching those of the original C tool (see the illustrative Python call after this excerpt).
(for example, ./word2vec.py -train data.txt -output trained_vec.txt -size 200 -window 2 -sample 1e-4 -negative 10 -hs 0 -binary 0 -iter 3)
* load_word2vec_format() performance (@svenkreiss, #555)
- Remove `init_sims()` call for performance improvements when normalized vectors are not needed.
- Remove `norm_only` parameter (API change). Call `init_sims(replace=True)` after the `load_word2vec_format()` call for the old `norm_only=True` behavior.
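
Illustrative only, not part of this diff: the command-line example above maps roughly onto the following Python call; 'data.txt' and 'trained_vec.txt' are the placeholder file names from that example.

    from gensim.models.word2vec import Word2Vec, LineSentence

    corpus = LineSentence('data.txt')  # placeholder corpus, one sentence per line
    # sg=0 (CBOW), hs=0, negative sampling and cbow_mean=1 are now the defaults
    model = Word2Vec(corpus, size=200, window=2, sample=1e-4, negative=10, hs=0, iter=3)
    model.save_word2vec_format('trained_vec.txt', binary=False)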
70 changes: 51 additions & 19 deletions gensim/models/word2vec.py
@@ -342,8 +342,8 @@ class Word2Vec(utils.SaveLoad):
"""
def __init__(
self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001,
sg=1, hs=1, negative=0, cbow_mean=0, hashfxn=hash, iter=1, null_word=0,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
@@ -357,8 +357,8 @@ def __init__(
If you don't supply `sentences`, the model is left uninitialized -- use if
you plan to initialize it in some other way.
`sg` defines the training algorithm. By default (`sg=1`), skip-gram is used.
Otherwise, `cbow` is employed.
`sg` defines the training algorithm. By default (`sg=0`), CBOW is used.
Otherwise (`sg=1`), skip-gram is employed.
`size` is the dimensionality of the feature vectors.
@@ -376,16 +376,18 @@ def __init__(
need about 1GB of RAM. Set to `None` for no limit (default).
`sample` = threshold for configuring which higher-frequency words are randomly downsampled;
default is 0 (off), useful value is 1e-5.
default is 1e-3, useful range is (0, 1e-5).
`workers` = use this many worker threads to train the model (=faster training with multicore machines).
`hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).
`hs` = if 1, hierarchical softmax will be used for model training.
If set to 0 (default), and `negative` is non-zero, negative sampling will be used.
`negative` = if > 0, negative sampling will be used, the int for negative
specifies how many "noise words" should be drawn (usually between 5-20).
Default is 5. If set to 0, no negative sampling is used.
`cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
`cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean.
Only applies when cbow is used.
`hashfxn` = hash function to use to randomly initialize weights, for increased
@@ -1600,35 +1602,65 @@ def __iter__(self):
i += self.max_sentence_length


# Example: ./word2vec.py ~/workspace/word2vec/text8 ~/workspace/word2vec/questions-words.txt ./text8
# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
if __name__ == "__main__":
import argparse
logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))
logger.info("using optimization %s", FAST_VERSION)
logging.info("running %s", " ".join(sys.argv))
logging.info("using optimization %s", FAST_VERSION)


# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)
infile = sys.argv[1]

from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle

seterr(all='raise') # don't ignore numpy errors

# model = Word2Vec(LineSentence(infile), size=200, min_count=5, workers=4)
model = Word2Vec(Text8Corpus(infile, 10), size=256, min_count=5, workers=4, sg=0, hs=0, cbow_mean=1, negative=5)

if len(sys.argv) > 3:
outfile = sys.argv[3]
parser = argparse.ArgumentParser()
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12)
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

args = parser.parse_args()

if args.cbow == 0:
skipgram = 1
else:
skipgram = 0

corpus = LineSentence(args.train)

model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
outfile = args.output
model.save_word2vec_format(outfile, binary=args.binary)
else:
outfile = args.train
model.save(outfile + '.model')
if args.binary == 1:
model.save_word2vec_format(outfile + '.model.bin', binary=True)
else:
model.save_word2vec_format(outfile + '.model.txt', binary=False)

if len(sys.argv) > 2:
questions_file = sys.argv[2]
model.accuracy(sys.argv[2])
if args.accuracy:
questions_file = args.accuracy
model.accuracy(questions_file)

logger.info("finished running %s", program)
129 changes: 129 additions & 0 deletions gensim/scripts/word2vec_standalone.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE
Trains a neural embedding model on text file CORPUS.
Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).
Parameters for training:
-train <file>
Use text data from <file> to train the model
-output <file>
Use <file> to save the resulting word vectors / word clusters
-size <int>
Set size of word vectors; default is 100
-window <int>
Set max skip length between words; default is 5
-sample <float>
Set threshold for occurrence of words. Those that appear with higher frequency in the training data
will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
-hs <int>
Use Hierarchical Softmax; default is 0 (not used)
-negative <int>
Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
-threads <int>
Use <int> threads (default 3)
-iter <int>
Run more training iterations (default 5)
-min_count <int>
This will discard words that appear less than <int> times; default is 5
-alpha <float>
Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
-binary <int>
Save the resulting vectors in binary mode; default is 0 (off)
-cbow <int>
Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
-accuracy <file>
Compute the accuracy of the resulting model's analogical inference on the questions file <file>
See an example of a questions file at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt
Example: python -m gensim.scripts.word2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
"""


import logging
import os.path
import sys
import argparse
from numpy import seterr

logger = logging.getLogger(__name__)

from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle


if __name__ == "__main__":

logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))


# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)



seterr(all='raise') # don't ignore numpy errors

parser = argparse.ArgumentParser()
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
parser.add_argument("-sample", help="""Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled;
default is 1e-3, useful range is (0, 1e-5)""", type=float, default=1e-3)
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float)
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

args = parser.parse_args()

if args.cbow == 0:
skipgram = 1
if not args.alpha:
args.alpha = 0.025
else:
skipgram = 0
if not args.alpha:
args.alpha = 0.05

corpus = LineSentence(args.train)

model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,
sample=args.sample, alpha=args.alpha, sg=skipgram, hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
outfile = args.output
model.save_word2vec_format(outfile, binary=args.binary)
else:
outfile = args.train.split('.')[0]
model.save(outfile + '.model')
if args.binary == 1:
model.save_word2vec_format(outfile + '.model.bin', binary=True)
else:
model.save_word2vec_format(outfile + '.model.txt', binary=False)

if args.accuracy:
questions_file = args.accuracy
model.accuracy(questions_file)

logger.info("finished running %s", program)
27 changes: 19 additions & 8 deletions gensim/test/test_word2vec.py
@@ -19,6 +19,8 @@
import numpy

from gensim import utils, matutils
from gensim.utils import check_output
from subprocess import PIPE
from gensim.models import word2vec

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
@@ -154,7 +156,7 @@ def testVocab(self):
total_words = sum(len(sentence) for sentence in corpus)

# try vocab building explicitly, using all words
model = word2vec.Word2Vec(min_count=1)
model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
model.build_vocab(corpus)
self.assertTrue(len(model.vocab) == 6981)
# with min_count=1, we're not throwing away anything, so make sure the word counts add up to be the entire corpus
@@ -163,7 +165,7 @@ def testVocab(self):
numpy.allclose(model.vocab['the'].code, [1, 1, 0, 0])

# test building vocab with default params
model = word2vec.Word2Vec()
model = word2vec.Word2Vec(hs=1, negative=0)
model.build_vocab(corpus)
self.assertTrue(len(model.vocab) == 1750)
numpy.allclose(model.vocab['the'].code, [1, 1, 1, 0])
@@ -177,7 +179,7 @@ def testVocab(self):
def testTraining(self):
"""Test word2vec training."""
# build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1)
model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0)
model.build_vocab(sentences)

self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
Expand All @@ -194,12 +196,12 @@ def testTraining(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1)
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
self.models_equal(model, model2)

def testScoring(self):
"""Test word2vec scoring."""
model = word2vec.Word2Vec(sentences, size=2, min_count=1)
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)

# just score and make sure they exist
scores = model.score(sentences, len(sentences))
@@ -259,14 +261,14 @@ def test_cbow_hs(self):
def test_cbow_neg(self):
"""Test CBOW w/ negative sampling"""
model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
min_count=5, iter=10, workers=2)
min_count=5, iter=10, workers=2, sample=0)
self.model_sanity(model)

def testTrainingCbow(self):
"""Test CBOW word2vec training."""
# to test training, make the corpus larger by repeating its sentences over and over
# build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1, sg=0)
model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0)
model.build_vocab(sentences)
self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
self.assertTrue(model.syn1.shape == (len(model.vocab), 2))
@@ -282,7 +284,7 @@ def testTrainingCbow(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0)
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0)
self.models_equal(model, model2)

def testTrainingSgNegative(self):
@@ -405,6 +407,15 @@ def testLineSentenceWorksWithNormalFile(self):
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators

# TODO: get correct path to Python binary
# class TestWord2VecScripts(unittest.TestCase):
# def testWord2VecStandAloneScript(self):
# """Does Word2Vec script launch standalone?"""
# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1'
# output = check_output(cmd, stderr=PIPE)
# self.assertEqual(output, '0')
# #endclass TestWord2VecScripts
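
Not part of the diff — one way the TODO above could be resolved is to locate the interpreter via sys.executable and pass an argument list instead of a single command string; this is only a sketch of such a test, not the adopted implementation (datapath is the helper already defined in this test module).

    # hypothetical variant of the commented-out test above
    import subprocess
    import sys

    cmd = [sys.executable, '-m', 'gensim.scripts.word2vec_standalone',
           '-train', datapath('testcorpus.txt'), '-output', 'vec.txt',
           '-size', '200', '-sample', '1e-4', '-binary', '0',
           '-iter', '3', '-min_count', '1']
    subprocess.check_call(cmd)  # raises CalledProcessError on a non-zero exit code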


if not hasattr(TestWord2VecModel, 'assertLess'):
# workaround for python 2.6
