-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
cbow_mean default changed from 0 to 1. #538
Changes from 17 commits
1c63c9a
280a488
ddeb002
f2ac3a9
cf09e8c
b8b8f57
6456cbc
966a4b0
d9ec7e4
76d2df7
0b6f45b
7fb5f18
bc7a447
e689b4f
a5274ab
5c32ca8
ac889b3
92087c0
06785b5
3ac5fd4
e0ac3d2
0aad977
1db616b
e4eb8ba
ab25344
6b3f01d
2bf45d3
1a579ec
78372bf
0c10fa6
8a3d58b
c5249b9
a40e624
dbd0eab
b61287a
3ade404
9e6522e
87c4e9c
9c74b40
7b30025
de79c8e
d4f9cc5
e0627c6
b8b30c2
f3f2a52
873f184
68a3e86
498474d
ce64d5a
0936971
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -342,8 +342,8 @@ class Word2Vec(utils.SaveLoad): | |
""" | ||
def __init__( | ||
self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, | ||
max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001, | ||
sg=1, hs=1, negative=0, cbow_mean=0, hashfxn=hash, iter=1, null_word=0, | ||
max_vocab_size=None, sample=1e-3, seed=1, workers=12, min_alpha=0.0001, | ||
sg=0, hs=1, negative=0, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Word2Vec's switch from |
||
trim_rule=None, sorted_vocab=1): | ||
""" | ||
Initialize the model from an iterable of `sentences`. Each sentence is a | ||
|
@@ -357,8 +357,8 @@ def __init__( | |
If you don't supply `sentences`, the model is left uninitialized -- use if | ||
you plan to initialize it in some other way. | ||
|
||
`sg` defines the training algorithm. By default (`sg=1`), skip-gram is used. | ||
Otherwise, `cbow` is employed. | ||
`sg` defines the training algorithm. By default (`sg=0`), CBOW is used. | ||
Otherwise (`sg=1`), SkipGram is employed. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To match style previous and elsewhere, 'skip-gram' > 'SkipGram'. |
||
|
||
`size` is the dimensionality of the feature vectors. | ||
|
||
|
@@ -376,16 +376,17 @@ def __init__( | |
need about 1GB of RAM. Set to `None` for no limit (default). | ||
|
||
`sample` = threshold for configuring which higher-frequency words are randomly downsampled; | ||
default is 0 (off), useful value is 1e-5. | ||
default is 1e-3, useful value is 1e-5, 0 stands for off. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should match word2vec.c recommendation – 'useful range is 0 to 1e-5' – rather than imply that the default is somehow not as useful as some other value. |
||
|
||
`workers` = use this many worker threads to train the model (=faster training with multicore machines). | ||
|
||
`hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0). | ||
`hs` = if 1 (default), hierarchical sampling will be used for model training (if set to 0, negative sampling will be used). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Technically |
||
|
||
`negative` = if > 0, negative sampling will be used, the int for negative | ||
specifies how many "noise words" should be drawn (usually between 5-20). | ||
Default is 0, thus hierarchical softmax is used. | ||
|
||
`cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean. | ||
`cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean. | ||
Only applies when cbow is used. | ||
|
||
`hashfxn` = hash function to use to randomly initialize weights, for increased | ||
|
@@ -402,7 +403,6 @@ def __init__( | |
|
||
`sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before | ||
assigning word indexes. | ||
|
||
""" | ||
self.vocab = {} # mapping from a word (string) to a Vocab object | ||
self.index2word = [] # map from a word's matrix index (int) to word (string) | ||
|
@@ -1595,35 +1595,65 @@ def __iter__(self): | |
i += self.max_sentence_length | ||
|
||
|
||
# Example: ./word2vec.py ~/workspace/word2vec/text8 ~/workspace/word2vec/questions-words.txt ./text8 | ||
# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 | ||
if __name__ == "__main__": | ||
import argparse | ||
logging.basicConfig( | ||
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', | ||
level=logging.INFO) | ||
logger.info("running %s", " ".join(sys.argv)) | ||
logger.info("using optimization %s", FAST_VERSION) | ||
logging.info("running %s", " ".join(sys.argv)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this change? Better use |
||
logging.info("using optimization %s", FAST_VERSION) | ||
|
||
|
||
# check and process cmdline input | ||
program = os.path.basename(sys.argv[0]) | ||
if len(sys.argv) < 2: | ||
print(globals()['__doc__'] % locals()) | ||
sys.exit(1) | ||
infile = sys.argv[1] | ||
|
||
from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle | ||
|
||
seterr(all='raise') # don't ignore numpy errors | ||
|
||
# model = Word2Vec(LineSentence(infile), size=200, min_count=5, workers=4) | ||
model = Word2Vec(Text8Corpus(infile, 10), size=256, min_count=5, workers=4, sg=0, hs=0, cbow_mean=1, negative=5) | ||
|
||
if len(sys.argv) > 3: | ||
outfile = sys.argv[3] | ||
|
||
parser = argparse.ArgumentParser() | ||
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) | ||
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") | ||
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) | ||
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) | ||
parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) | ||
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) | ||
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) | ||
parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) | ||
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) | ||
parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) | ||
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) | ||
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) | ||
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") | ||
|
||
args = parser.parse_args() | ||
|
||
if args.cbow == 0: | ||
skipgram = 1 | ||
else: | ||
skipgram = 0 | ||
|
||
corpus = LineSentence(args.train) | ||
|
||
model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,sample=args.sample,sg=skipgram,hs=args.hs,negative=args.negative,cbow_mean=1,iter=args.iter) | ||
|
||
if args.output: | ||
outfile = args.output | ||
model.save_word2vec_format(outfile, binary=args.binary) | ||
else: | ||
outfile = args.train | ||
model.save(outfile + '.model') | ||
if args.binary == 1: | ||
model.save_word2vec_format(outfile + '.model.bin', binary=True) | ||
else: | ||
model.save_word2vec_format(outfile + '.model.txt', binary=False) | ||
|
||
if args.accuracy: | ||
questions_file = args.accuracy | ||
model.accuracy(questions_file) | ||
|
||
if len(sys.argv) > 2: | ||
questions_file = sys.argv[2] | ||
model.accuracy(sys.argv[2]) | ||
|
||
logger.info("finished running %s", program) | ||
logging.info("finished running %s", program) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since our multithreading doesn't parallelize as effectively as the C code (and in fact hits a point of diminishing returns even before reaching the CPU-core count), the `workers` parameter is one exception where we should pick a different value. I suggest 3 as a value likely to help without causing issues.