From 92e6e22694835b22aaf47ce7f80930a24fe42cb1 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 4 Jun 2018 16:20:58 +0300
Subject: [PATCH 01/41] multistream scan vocab for doc2vec, word2vec & fastText

---
 gensim/models/base_any2vec.py |  42 ++++++++----
 gensim/models/doc2vec.py      | 125 ++++++++++++++++++++++++++++------
 gensim/models/word2vec.py     |  94 +++++++++++++++++++++++--
 gensim/utils.py               |  22 ++++++
 4 files changed, 242 insertions(+), 41 deletions(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index e6a31263ec..c8fa2dd865 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -17,6 +17,7 @@
 from types import GeneratorType
 from gensim.utils import deprecated
 import warnings
+import itertools
 
 try:
     from queue import Queue
@@ -200,7 +201,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
             self.total_train_time += elapsed
         return trained_word_count, raw_word_count, job_tally
 
-    def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,
+    def _train_epoch(self, data_iterable, multistream=False, cur_epoch=0, total_examples=None,
                      total_words=None, queue_factor=2, report_delay=1.0):
         """Train one epoch."""
         job_queue = Queue(maxsize=queue_factor * self.workers)
@@ -213,6 +214,9 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,
             for _ in xrange(self.workers)
         ]
 
+        # Chain all input streams into one, because multistream training is not supported yet.
+        if multistream:
+            data_iterable = itertools.chain(*data_iterable)
         workers.append(threading.Thread(
             target=self._job_producer,
             args=(data_iterable, job_queue),
@@ -228,7 +232,7 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,
 
         return trained_word_count, raw_word_count, job_tally
 
-    def train(self, data_iterable, epochs=None, total_examples=None,
+    def train(self, data_iterable, multistream=False, epochs=None, total_examples=None,
               total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
         """Handle multi-worker training."""
         self._set_train_params(**kwargs)
@@ -253,8 +257,8 @@ def train(self, data_iterable, epochs=None, total_examples=None,
                 callback.on_epoch_begin(self)
 
             trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
-                data_iterable, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
-                queue_factor=queue_factor, report_delay=report_delay)
+                data_iterable, multistream=multistream, cur_epoch=cur_epoch, total_examples=total_examples,
+                total_words=total_words, queue_factor=queue_factor, report_delay=report_delay)
             trained_word_count += trained_word_count_epoch
             raw_word_count += raw_word_count_epoch
             job_tally += job_tally_epoch
@@ -297,8 +301,8 @@ def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
     def _set_train_params(self, **kwargs):
         raise NotImplementedError()
 
-    def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000,
-                 trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1,
+    def __init__(self, sentences=None, multistream=False, workers=3, vector_size=100, epochs=5, callbacks=(),
+                 batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1,
                  min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
         self.sg = int(sg)
         if vector_size % 4 != 0:
@@ -330,12 +334,14 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac
             self.neg_labels[0] = 1.
 
         if sentences is not None:
-            if isinstance(sentences, GeneratorType):
+            if multistream and not isinstance(sentences, (tuple, list)):
+                raise TypeError("If multistream=True, you must pass tuple or list as the sentences argument.")
+            if not multistream and isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
-            self.build_vocab(sentences, trim_rule=trim_rule)
+            self.build_vocab(sentences, multistream=multistream, trim_rule=trim_rule)
             self.train(
-                sentences, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
-                end_alpha=self.min_alpha, compute_loss=compute_loss)
+                sentences, total_examples=self.corpus_count, epochs=self.epochs, multistream=multistream,
+                start_alpha=self.alpha, end_alpha=self.min_alpha, compute_loss=compute_loss)
         else:
             if trim_rule is not None:
                 logger.warning(
@@ -459,17 +465,23 @@ def __str__(self):
             self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha
         )
 
-    def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
+    def build_vocab(self, sentences, multistream=False, update=False, progress_per=10000,
+                    keep_raw_vocab=False, trim_rule=None, **kwargs):
         """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence is a iterable of iterables (can simply be a list of unicode strings too).
 
         Parameters
         ----------
-        sentences : iterable of iterables
+        sentences : {iterable of iterables, list or tuple of iterable of iterables}
             The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            If `multistream=True`, `sentences` must be list or tuple of input streams described above.
+        multistream : bool
+            If True, use `sentences` as list of input streams and speed up vocab building by parallelization
+            with `min(len(sentences), self.workers)` processes. This option can lead to up to a 2.5x reduction
+            in vocabulary building time.
         update : bool
             If true, the new words in `sentences` will be added to model's vocab.
         progress_per : int
@@ -476,7 +488,7 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca
             Indicates how many words to process before showing/updating the progress.
 
         """
         total_words, corpus_count = self.vocabulary.scan_vocab(
-            sentences, progress_per=progress_per, trim_rule=trim_rule)
+            sentences, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule)
         self.corpus_count = corpus_count
         report_values = self.vocabulary.prepare_vocab(
             self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
@@ -555,7 +567,7 @@ def estimate_memory(self, vocab_size=None, report=None):
         )
         return report
 
-    def train(self, sentences, total_examples=None, total_words=None,
+    def train(self, sentences, multistream=False, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None, word_count=0,
               queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
 
@@ -564,7 +576,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         self.compute_loss = compute_loss
         self.running_training_loss = 0.0
         return super(BaseWordEmbeddingsModel, self).train(
-            sentences, total_examples=total_examples, total_words=total_words,
+            sentences, multistream=multistream, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index f57694273d..a5fa4974c8 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -48,6 +48,7 @@
 import logging
 import os
 import warnings
+import multiprocessing
 
 try:
     from queue import Queue
@@ -699,7 +700,8 @@ def estimate_memory(self, vocab_size=None, report=None):
         report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
         return super(Doc2Vec, self).estimate_memory(vocab_size, report=report)
 
-    def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
+    def build_vocab(self, documents, multistream=False, update=False,
+                    progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
         """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence is a iterable of iterables (can simply be a list of unicode strings too).
 
@@ -710,6 +712,10 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca
             consider an iterable that streams the documents directly from disk/network.
             See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
             in :mod:`~gensim.models.doc2vec` module for such examples.
+        multistream : bool
+            If True, use `documents` as list of input streams and speed up vocab building by parallelization
+            with `min(len(documents), self.workers)` processes. This option can lead to up to a 2.5x reduction
+            in vocabulary building time.
         keep_raw_vocab : bool
             If not true, delete the raw vocabulary after the scaling is done and free up RAM.
         trim_rule : function
@@ -726,7 +732,7 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca
             If true, the new words in `sentences` will be added to model's vocab.
         """
         total_words, corpus_count = self.vocabulary.scan_vocab(
-            documents, self.docvecs, progress_per=progress_per, trim_rule=trim_rule)
+            documents, self.docvecs, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule)
         self.corpus_count = corpus_count
         report_values = self.vocabulary.prepare_vocab(
             self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
@@ -789,14 +795,96 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
             self.hs, self.negative, self.wv, self.docvecs, update=update)
 
 
+def _note_doctag(key, document_no, document_length, docvecs):
+    """Note a document tag during initial corpus scan, for structure sizing."""
+    if isinstance(key, integer_types + (integer,)):
+        docvecs.max_rawint = max(docvecs.max_rawint, key)
+    else:
+        if key in docvecs.doctags:
+            docvecs.doctags[key] = docvecs.doctags[key].repeat(document_length)
+        else:
+            docvecs.doctags[key] = Doctag(len(docvecs.offset2doctag), document_length, 1)
+            docvecs.offset2doctag.append(key)
+    docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag)
+
+
+def _scan_vocab_worker(stream, docvecs, progress_queue, max_vocab_size, trim_rule):
+    min_reduce = 1
+    vocab = defaultdict(int)
+    checked_string_types = 0
+    document_no = -1
+    total_words = 0
+    for document_no, document in enumerate(stream):
+        if not checked_string_types:
+            if isinstance(document.words, string_types):
+                log_msg = "Each 'words' should be a list of words (usually unicode strings). " \
+                          "First 'words' here is instead plain %s." % type(document.words)
+                progress_queue.put(log_msg)
+
+            checked_string_types += 1
+
+        document_length = len(document.words)
+
+        for tag in document.tags:
+            _note_doctag(tag, document_no, document_length, docvecs)
+
+        for word in document.words:
+            vocab[word] += 1
+        total_words += len(document.words)
+
+        if max_vocab_size and len(vocab) > max_vocab_size:
+            utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
+            min_reduce += 1
+
+    progress_queue.put((total_words, document_no + 1))
+    progress_queue.put(None)
+    return vocab
+
+
 class Doc2VecVocab(Word2VecVocab):
     def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
         super(Doc2VecVocab, self).__init__(
             max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
             sorted_vocab=sorted_vocab, null_word=null_word)
 
-    def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
-        logger.info("collecting all words and their counts")
+    def _scan_vocab_multistream(self, input_streams, docvecs, progress_per, workers, trim_rule):
+        manager = multiprocessing.Manager()
+        progress_queue = manager.Queue()
+
+        logger.info("Scanning vocab in %i processes.", min(workers, len(input_streams)))
+        pool = multiprocessing.Pool(processes=min(workers, len(input_streams)))
+
+        results = [
+            pool.apply_async(_scan_vocab_worker,
+                             (stream, docvecs, progress_queue, self.max_vocab_size, trim_rule)
+                             ) for stream in input_streams
+        ]
+        pool.close()
+
+        unfinished_tasks = len(results)
+        total_words = 0
+        document_no = -1
+        while unfinished_tasks > 0:
+            report = progress_queue.get()
+            if report is None:
+                unfinished_tasks -= 1
+                logger.info("scan vocab task finished, processed %i documents and %i words;"
+                            " awaiting finish of %i more tasks", document_no + 1, total_words, unfinished_tasks)
+            elif isinstance(report, string_types):
+                logger.warning(report)
+            else:
+                num_words, num_documents = report
+                total_words += num_words
+                document_no += num_documents
+
+                if document_no % progress_per == 0:
+                    logger.info("PROGRESS: at sentence #%i, processed %i words", document_no, total_words)
+
+        corpus_count = document_no + 1
+        self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results])
+        return total_words, corpus_count
+
+    def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):
         document_no = -1
         total_words = 0
         min_reduce = 1
@@ -824,7 +912,7 @@ def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
             document_length = len(document.words)
 
             for tag in document.tags:
-                self.note_doctag(tag, document_no, document_length, docvecs)
+                _note_doctag(tag, document_no, document_length, docvecs)
 
             for word in document.words:
                 vocab[word] += 1
@@ -834,25 +922,24 @@ def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
                 min_reduce += 1
 
-        logger.info(
-            "collected %i word types and %i unique tags from a corpus of %i examples and %i words",
-            len(vocab), docvecs.count, document_no + 1, total_words
-        )
        corpus_count = document_no + 1
         self.raw_vocab = vocab
         return total_words, corpus_count
 
-    def note_doctag(self, key, document_no, document_length, docvecs):
-        """Note a document tag during initial corpus scan, for structure sizing."""
-        if isinstance(key, integer_types + (integer,)):
-            docvecs.max_rawint = max(docvecs.max_rawint, key)
+    def scan_vocab(self, documents, docvecs, multistream=False, progress_per=10000, workers=1, trim_rule=None):
+        logger.info("collecting all words and their counts")
+        if not multistream:
+            total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule)
         else:
-            if key in docvecs.doctags:
-                docvecs.doctags[key] = docvecs.doctags[key].repeat(document_length)
-            else:
-                docvecs.doctags[key] = Doctag(len(docvecs.offset2doctag), document_length, 1)
-                docvecs.offset2doctag.append(key)
-        docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag)
+            total_words, corpus_count = self._scan_vocab_multistream(documents, docvecs, progress_per, workers,
+                                                                     trim_rule)
+
+        logger.info(
+            "collected %i word types and %i unique tags from a corpus of %i examples and %i words",
+            len(self.raw_vocab), docvecs.count, corpus_count, total_words
+        )
+
+        return total_words, corpus_count
 
     def indexed_doctags(self, doctag_tokens, docvecs):
         """Return indexes and backing-arrays used in training examples."""

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 9539aa8d2c..5ad9d49fe2 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -110,6 +110,7 @@
 from copy import deepcopy
 from collections import defaultdict
 import threading
+import multiprocessing
 import itertools
 import warnings
 
@@ -1144,9 +1145,43 @@ def __iter__(self):
                     i += self.max_sentence_length
 
 
+def _scan_vocab_worker(stream, progress_queue, max_vocab_size=None, trim_rule=None):
+    """Do an initial scan of all words appearing in stream.
+
+    Note: This function can not be Word2VecVocab's method because
+    of multiprocessing synchronization specifics in Python.
+    """
+    min_reduce = 1
+    vocab = defaultdict(int)
+    checked_string_types = 0
+    sentence_no = -1
+    total_words = 0
+    for sentence_no, sentence in enumerate(stream):
+        if not checked_string_types:
+            if isinstance(sentence, string_types):
+                log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \
+                          "First item here is instead plain %s." % type(sentence)
+                progress_queue.put(log_msg)
+
+            checked_string_types += 1
+
+        for word in sentence:
+            vocab[word] += 1
+
+        if max_vocab_size and len(vocab) > max_vocab_size:
+            utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
+            min_reduce += 1
+
+        total_words += len(sentence)
+
+    progress_queue.put((total_words, sentence_no + 1))
+    progress_queue.put(None)
+    return vocab
+
+
 class Word2VecVocab(utils.SaveLoad):
     def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0,
-                max_final_vocab=None):
+                 max_final_vocab=None):
         self.max_vocab_size = max_vocab_size
         self.min_count = min_count
         self.sample = sample
@@ -1156,9 +1191,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
         self.raw_vocab = None
         self.max_final_vocab = max_final_vocab
 
-    def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
-        """Do an initial scan of all words appearing in sentences."""
-        logger.info("collecting all words and their counts")
+    def _scan_vocab_singlestream(self, sentences, progress_per, trim_rule):
         sentence_no = -1
         total_words = 0
         min_reduce = 1
@@ -1186,12 +1219,59 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
                 utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
                 min_reduce += 1
 
+        corpus_count = sentence_no + 1
+        self.raw_vocab = vocab
+        return total_words, corpus_count
+
+    def _scan_vocab_multistream(self, input_streams, progress_per, workers, trim_rule):
+        manager = multiprocessing.Manager()
+        progress_queue = manager.Queue()
+
+        logger.info("Scanning vocab in %i processes.", min(workers, len(input_streams)))
+        pool = multiprocessing.Pool(processes=min(workers, len(input_streams)))
+
+        results = [
+            pool.apply_async(_scan_vocab_worker,
+                             (stream, progress_queue, progress_per, self.max_vocab_size, trim_rule)
+                             ) for stream in input_streams
+        ]
+        pool.close()
+
+        unfinished_tasks = len(results)
+        total_words = 0
+        sentence_no = -1
+        while unfinished_tasks > 0:
+            report = progress_queue.get()
+            if report is None:
+                unfinished_tasks -= 1
+                logger.info("scan vocab task finished, processed %i sentences and %i words;"
+                            " awaiting finish of %i more tasks", sentence_no + 1, total_words, unfinished_tasks)
+            elif isinstance(report, string_types):
+                logger.warning(report)
+            else:
+                num_words, num_sentences = report
+                total_words += num_words
+                sentence_no += num_sentences
+
+                if sentence_no % progress_per == 0:
+                    logger.info("PROGRESS: at sentence #%i, processed %i words", sentence_no, total_words)
+
+        corpus_count = sentence_no + 1
+        self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results])
+        return total_words, corpus_count
+
+    def scan_vocab(self, sentences, multistream=False, progress_per=10000, workers=1, trim_rule=None):
+        logger.info("collecting all words and their counts")
+        if not multistream:
+            total_words, corpus_count = self._scan_vocab_singlestream(sentences, progress_per, trim_rule)
+        else:
+            total_words, corpus_count = self._scan_vocab_multistream(sentences, progress_per, workers, trim_rule)
+
         logger.info(
             "collected %i word types from a corpus of %i raw words and %i sentences",
-            len(vocab), total_words, sentence_no + 1
+            len(self.raw_vocab), total_words, corpus_count
         )
-        corpus_count = sentence_no + 1
-        self.raw_vocab = vocab
+
         return total_words, corpus_count
 
     def sort_vocab(self, wv):

diff --git a/gensim/utils.py b/gensim/utils.py
index 6d2823c652..1571a87a1e 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -1709,6 +1709,28 @@ def prune_vocab(vocab, min_reduce, trim_rule=None):
     return result
 
 
+def merge_dicts(dict1, dict2):
+    """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2).
+    Parameters
+    ----------
+    dict1 : dict
+        First dictionary.
+    dict2 : dict
+        Second dictionary.
+    Returns
+    -------
+    result : dict
+        Merged dictionary with sum of frequencies as values.
+    """
+    for word, freq in dict2.iteritems():
+        if word in dict1:
+            dict1[word] += freq
+        else:
+            dict1[word] = freq
+
+    return dict1
+
+
 def qsize(queue):
     """Get the (approximate) queue size where available.
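
The patch above threads the new flag through the shared base class only. Below is a minimal sketch of the call shape the series is building toward, usable once the concrete entry points also accept the flag in PATCH 04 and PATCH 07 below; the corpus file names are hypothetical, and any list or tuple of restartable sentence streams works the same way:

    from gensim.models.word2vec import Word2Vec, LineSentence

    # One stream per on-disk shard; the list itself is what multistream=True expects.
    input_streams = [LineSentence('shard_0.txt'), LineSentence('shard_1.txt')]

    model = Word2Vec(size=100, min_count=5, workers=4)
    model.build_vocab(input_streams, multistream=True)  # vocab scan fans out over the streams
    model.train(input_streams, multistream=True,
                total_examples=model.corpus_count, epochs=model.epochs)
    # For training itself the streams are simply chained into one iterable,
    # because multistream training is not supported yet (see _train_epoch above).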

From 2618a2ea49a05efd30242d022b3b35c27e65250b Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 4 Jun 2018 17:25:18 +0300
Subject: [PATCH 02/41] fixes

---
 gensim/models/base_any2vec.py     |  9 +++--
 gensim/models/doc2vec.py          | 18 +++++-----
 gensim/models/word2vec.py         | 13 +++----
 gensim/scripts/benchmark_vocab.py | 59 +++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 19 deletions(-)
 create mode 100644 gensim/scripts/benchmark_vocab.py

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index c8fa2dd865..47d0f6f6d5 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -465,7 +465,7 @@ def __str__(self):
             self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha
         )
 
-    def build_vocab(self, sentences, multistream=False, update=False, progress_per=10000,
+    def build_vocab(self, sentences, multistream=False, workers=None, update=False, progress_per=10000,
                     keep_raw_vocab=False, trim_rule=None, **kwargs):
         """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence is a iterable of iterables (can simply be a list of unicode strings too).
@@ -482,14 +482,19 @@ def build_vocab(self, sentences, multistream=False, update=False, progress_per=1
             If True, use `sentences` as list of input streams and speed up vocab building by parallelization
             with `min(len(sentences), self.workers)` processes. This option can lead to up to a 2.5x reduction
             in vocabulary building time.
+        workers : int
+            Used if `multistream=True`. Determines how many processes to use for vocab building.
         update : bool
             If true, the new words in `sentences` will be added to model's vocab.
         progress_per : int
             Indicates how many words to process before showing/updating the progress.
 
         """
+        if workers is None:
+            workers = self.workers
+
         total_words, corpus_count = self.vocabulary.scan_vocab(
-            sentences, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule)
+            sentences, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule, workers=workers)
         self.corpus_count = corpus_count
         report_values = self.vocabulary.prepare_vocab(
             self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index a5fa4974c8..8f3eb3fdcd 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -700,7 +700,7 @@ def estimate_memory(self, vocab_size=None, report=None):
         report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
         return super(Doc2Vec, self).estimate_memory(vocab_size, report=report)
 
-    def build_vocab(self, documents, multistream=False, update=False,
+    def build_vocab(self, documents, multistream=False, workers=None, update=False,
                     progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
         """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence is a iterable of iterables (can simply be a list of unicode strings too).
@@ -716,6 +716,8 @@ def build_vocab(self, documents, multistream=False, update=False,
             If True, use `documents` as list of input streams and speed up vocab building by parallelization
             with `min(len(documents), self.workers)` processes. This option can lead to up to a 2.5x reduction
             in vocabulary building time.
+        workers : int
+            Used if `multistream=True`. Determines how many processes to use for vocab building.
         keep_raw_vocab : bool
             If not true, delete the raw vocabulary after the scaling is done and free up RAM.
         trim_rule : function
@@ -732,7 +734,9 @@ def build_vocab(self, documents, multistream=False, update=False,
             If true, the new words in `sentences` will be added to model's vocab.
         """
         total_words, corpus_count = self.vocabulary.scan_vocab(
-            documents, self.docvecs, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule)
+            documents, self.docvecs, multistream=multistream,
+            progress_per=progress_per, trim_rule=trim_rule, workers=workers
+        )
         self.corpus_count = corpus_count
         report_values = self.vocabulary.prepare_vocab(
             self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
@@ -847,7 +851,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
             max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
             sorted_vocab=sorted_vocab, null_word=null_word)
 
-    def _scan_vocab_multistream(self, input_streams, docvecs, progress_per, workers, trim_rule):
+    def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
         manager = multiprocessing.Manager()
         progress_queue = manager.Queue()
 
@@ -877,9 +881,6 @@ def _scan_vocab_multistream(self, input_streams, docvecs, progress_per, workers,
                 total_words += num_words
                 document_no += num_documents
 
-                if document_no % progress_per == 0:
-                    logger.info("PROGRESS: at sentence #%i, processed %i words", document_no, total_words)
-
         corpus_count = document_no + 1
         self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results])
         return total_words, corpus_count
@@ -926,13 +927,12 @@ def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):
         self.raw_vocab = vocab
         return total_words, corpus_count
 
-    def scan_vocab(self, documents, docvecs, multistream=False, progress_per=10000, workers=1, trim_rule=None):
+    def scan_vocab(self, documents, docvecs, multistream=False, progress_per=10000, workers=None, trim_rule=None):
         logger.info("collecting all words and their counts")
         if not multistream:
             total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule)
         else:
-            total_words, corpus_count = self._scan_vocab_multistream(documents, docvecs, progress_per, workers,
-                                                                     trim_rule)
+            total_words, corpus_count = self._scan_vocab_multistream(documents, docvecs, workers, trim_rule)
 
         logger.info(
             "collected %i word types and %i unique tags from a corpus of %i examples and %i words",

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 5ad9d49fe2..54918049d9 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1223,7 +1223,7 @@ def _scan_vocab_singlestream(self, sentences, progress_per, trim_rule):
         self.raw_vocab = vocab
         return total_words, corpus_count
 
-    def _scan_vocab_multistream(self, input_streams, progress_per, workers, trim_rule):
+    def _scan_vocab_multistream(self, input_streams, workers, trim_rule):
         manager = multiprocessing.Manager()
         progress_queue = manager.Queue()
 
@@ -1232,7 +1232,7 @@ def _scan_vocab_multistream(self, input_streams, progress_per, workers, trim_rul
 
         results = [
             pool.apply_async(_scan_vocab_worker,
-                             (stream, progress_queue, progress_per, self.max_vocab_size, trim_rule)
+                             (stream, progress_queue, self.max_vocab_size, trim_rule)
                              ) for stream in input_streams
         ]
         pool.close()
@@ -1253,19 +1253,16 @@ def _scan_vocab_multistream(self, input_streams, progress_per, workers, trim_rul
                 total_words += num_words
                 sentence_no += num_sentences
 
-                if sentence_no % progress_per == 0:
-                    logger.info("PROGRESS: at sentence #%i, processed %i words", sentence_no, total_words)
-
         corpus_count = sentence_no + 1
         self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results])
         return total_words, corpus_count
 
-    def scan_vocab(self, sentences, multistream=False, progress_per=10000, workers=1, trim_rule=None):
+    def scan_vocab(self, sentences, multistream=False, progress_per=10000, workers=None, trim_rule=None):
         logger.info("collecting all words and their counts")
         if not multistream:
             total_words, corpus_count = self._scan_vocab_singlestream(sentences, progress_per, trim_rule)
         else:
-            total_words, corpus_count = self._scan_vocab_multistream(sentences, progress_per, workers, trim_rule)
+            total_words, corpus_count = self._scan_vocab_multistream(sentences, workers, trim_rule)
 
         logger.info(
             "collected %i word types from a corpus of %i raw words and %i sentences",
@@ -1283,7 +1280,7 @@ def sort_vocab(self, wv):
             wv.vocab[word].index = i
 
     def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None,
-                      min_count=None, sample=None, dry_run=False):
+                      min_count=None, sample=None, dry_run=False, **kwargs):
         """Apply vocabulary settings for `min_count` (discarding less-frequent words)
         and `sample` (controlling the downsampling of more-frequent words).

diff --git a/gensim/scripts/benchmark_vocab.py b/gensim/scripts/benchmark_vocab.py
new file mode 100644
index 0000000000..b9ab5cedda
--- /dev/null
+++ b/gensim/scripts/benchmark_vocab.py
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import logging
+import argparse
+import time
+import os
+import glob
+import itertools
+
+from gensim.models.word2vec import Word2Vec, LineSentence
+from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
+from gensim.models.fasttext import FastText
+
+
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='GSOC Multistream-API: evaluate vocab performance '
+                                                 'for word2vec')
+    parser.add_argument('--input', type=str, help='Input file or regexp for multistream.')
+    parser.add_argument('--size', type=int, default=300)
+    parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 2, 3, 4, 5, 8, 10])
+    parser.add_argument('--model', type=str, default='word2vec')
+    parser.add_argument('--label', type=str, default='untitled')
+
+    args = parser.parse_args()
+
+    input_ = os.path.expanduser(args.input)
+    input_files = glob.glob(input_)
+    logger.info('Glob found {} input files. List: {}'.format(len(input_files), input_files))
+
+    for workers in args.workers_grid:
+        if args.model == 'word2vec':
+            input_streams = [LineSentence(_) for _ in input_files]
+            model = Word2Vec()
+        elif args.model == 'doc2vec':
+            input_streams = [TaggedLineDocument(_) for _ in input_files]
+            model = Doc2Vec()
+        elif args.model == 'fasttext':
+            input_streams = [LineSentence(_) for _ in input_files]
+            model = FastText()
+        else:
+            raise NotImplementedError("Model '{}' is not supported".format(args.model))
+
+        if workers == 1:
+            multistream = False
+            input_streams = itertools.chain(*input_streams)
+        else:
+            multistream = True
+
+        logger.info('Start building vocab with model={}, workers={}, multistream={}'.format(args.model, workers, multistream))
+        start_time = time.time()
+        model.build_vocab(input_streams, workers=workers, multistream=multistream)
+        end_time = time.time()
+        logger.info('Model = {}\tWorkers = {}\tVocab time = {:.2f} secs'.format(args.model, workers, end_time - start_time))
\ No newline at end of file
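
The map/reduce shape of `_scan_vocab_multistream` above can be hard to see through the queue plumbing. Here is a standalone sketch of the same scatter/gather idea, with plain word counting standing in for the real worker and an inline fold with the same contract as `gensim.utils.merge_dicts`:

    import multiprocessing
    from collections import defaultdict
    from functools import reduce

    def count_words(stream):
        # Worker side: build a partial vocabulary for one stream only.
        vocab = defaultdict(int)
        for sentence in stream:
            for word in sentence:
                vocab[word] += 1
        return dict(vocab)

    def merge_dicts(dict1, dict2):
        # Same contract as gensim.utils.merge_dicts: fold dict2 into dict1 and return it.
        for word, freq in dict2.items():
            dict1[word] = dict1.get(word, 0) + freq
        return dict1

    if __name__ == '__main__':
        input_streams = [
            [['human', 'interface'], ['graph', 'trees']],
            [['graph', 'minors'], ['human', 'graph']],
        ]
        pool = multiprocessing.Pool(processes=min(2, len(input_streams)))
        partial_vocabs = pool.map(count_words, input_streams)
        pool.close()
        pool.join()
        raw_vocab = reduce(merge_dicts, partial_vocabs)
        print(raw_vocab)  # {'human': 2, 'interface': 1, 'graph': 3, 'trees': 1, 'minors': 1}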

From 7960af833812a22cb72b9eaf61f4344121f930c8 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 4 Jun 2018 18:19:11 +0300
Subject: [PATCH 03/41] fix tags for doc2vec

---
 gensim/models/doc2vec.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 8f3eb3fdcd..95615d9ced 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -733,6 +733,9 @@ def build_vocab(self, documents, multistream=False, workers=None, update=False,
         update : bool
             If true, the new words in `sentences` will be added to model's vocab.
         """
+        if workers is None:
+            workers = self.workers
+
         total_words, corpus_count = self.vocabulary.scan_vocab(
             documents, self.docvecs, multistream=multistream,
             progress_per=progress_per, trim_rule=trim_rule, workers=workers
@@ -799,7 +802,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
             self.hs, self.negative, self.wv, self.docvecs, update=update)
 
 
-def _note_doctag(key, document_no, document_length, docvecs):
+def _note_doctag(key, document_length, docvecs):
     """Note a document tag during initial corpus scan, for structure sizing."""
     if isinstance(key, integer_types + (integer,)):
         docvecs.max_rawint = max(docvecs.max_rawint, key)
@@ -812,9 +815,10 @@ def _note_doctag(key, document_no, document_length, docvecs):
     docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag)
 
 
-def _scan_vocab_worker(stream, docvecs, progress_queue, max_vocab_size, trim_rule):
+def _scan_vocab_worker(stream, progress_queue, max_vocab_size, trim_rule):
     min_reduce = 1
     vocab = defaultdict(int)
+    doclen2tags = defaultdict(list)
     checked_string_types = 0
     document_no = -1
     total_words = 0
@@ -830,7 +834,7 @@ def _scan_vocab_worker(stream, docvecs, progress_queue, max_vocab_size, trim_rul
         document_length = len(document.words)
 
         for tag in document.tags:
-            _note_doctag(tag, document_no, document_length, docvecs)
+            doclen2tags[document_length].append(tag)
 
         for word in document.words:
             vocab[word] += 1
@@ -842,7 +846,7 @@ def _scan_vocab_worker(stream, docvecs, progress_queue, max_vocab_size, trim_rul
 
     progress_queue.put((total_words, document_no + 1))
     progress_queue.put(None)
-    return vocab
+    return vocab, doclen2tags
 
 
 class Doc2VecVocab(Word2VecVocab):
@@ -860,7 +864,7 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
 
         results = [
             pool.apply_async(_scan_vocab_worker,
-                             (stream, docvecs, progress_queue, self.max_vocab_size, trim_rule)
+                             (stream, progress_queue, self.max_vocab_size, trim_rule)
                              ) for stream in input_streams
         ]
         pool.close()
@@ -882,7 +886,14 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
                 document_no += num_documents
 
         corpus_count = document_no + 1
-        self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results])
+        results = [res.get() for res in results]  # pairs (vocab, doclen2tags)
+        self.raw_vocab = reduce(utils.merge_dicts, [r[0] for r in results])
+
+        for (_, doclen2tags) in results:
+            for document_length, tags in doclen2tags.iteritems():
+                for tag in tags:
+                    _note_doctag(tag, document_length, docvecs)
+
         return total_words, corpus_count
 
     def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):
@@ -913,7 +924,7 @@ def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):
             document_length = len(document.words)
 
             for tag in document.tags:
-                _note_doctag(tag, document_no, document_length, docvecs)
+                _note_doctag(tag, document_length, docvecs)
 
             for word in document.words:
                 vocab[word] += 1

From b8da97af3f7e0d24ccefc486fd65b769108f55d2 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 4 Jun 2018 18:58:54 +0300
Subject: [PATCH 04/41] fix tests

---
 gensim/models/doc2vec.py  | 4 ++--
 gensim/models/fasttext.py | 4 ++--
 gensim/models/word2vec.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 95615d9ced..9eb5fa9b18 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -463,7 +463,7 @@ def _do_train_job(self, job, alpha, inits):
         )
         return tally, self._raw_word_count(job)
 
-    def train(self, documents, total_examples=None, total_words=None,
+    def train(self, documents, multistream=False, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None,
               word_count=0, queue_factor=2, report_delay=1.0, callbacks=()):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
@@ -508,7 +508,7 @@ def train(self, documents, total_examples=None, total_words=None,
             List of callbacks that need to be executed/run at specific stages during training.
         """
         super(Doc2Vec, self).train(
-            documents, total_examples=total_examples, total_words=total_words,
+            documents, multistream=False, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 46e68b44ec..23bc9ff55b 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -457,7 +457,7 @@ def _do_train_job(self, sentences, alpha, inits):
 
         return tally, self._raw_word_count(sentences)
 
-    def train(self, sentences, total_examples=None, total_words=None,
+    def train(self, sentences, multistream=False, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None,
               word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
@@ -513,7 +513,7 @@ def train(self, sentences, total_examples=None, total_words=None,
 
         """
         super(FastText, self).train(
-            sentences, total_examples=total_examples, total_words=total_words,
+            sentences, multistream=multistream, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
         self.trainables.get_vocab_word_vecs(self.wv)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 54918049d9..22bb9cc94e 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -556,7 +556,7 @@ def _set_train_params(self, **kwargs):
             self.compute_loss = kwargs['compute_loss']
         self.running_training_loss = 0
 
-    def train(self, sentences, total_examples=None, total_words=None,
+    def train(self, sentences, multistream=False, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None, word_count=0,
               queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
@@ -614,7 +614,7 @@ def train(self, sentences, total_examples=None, total_words=None,
 
         """
         return super(Word2Vec, self).train(
            sentences, multistream=multistream, total_examples=total_examples, total_words=total_words,
            epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
            queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)
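
Why PATCH 03 routes tags through `doclen2tags`: a forked worker that mutated `docvecs` would only update its own process-local copy, so each worker ships back a picklable `{document_length: [tags]}` map and the parent replays `_note_doctag` locally. A toy version of that round trip, with a plain dict standing in for the real `docvecs` bookkeeping:

    from collections import defaultdict

    def scan_stream(stream):
        # Worker side: record (document length, tags) pairs; touch no shared state.
        doclen2tags = defaultdict(list)
        for words, tags in stream:
            for tag in tags:
                doclen2tags[len(words)].append(tag)
        return doclen2tags

    input_streams = [
        [(['human', 'interface'], ['doc0'])],
        [(['graph', 'minors', 'trees'], ['doc1']), (['graph', 'trees'], ['doc2'])],
    ]
    doctags = {}  # tag -> (doc_count, word_count), standing in for docvecs.doctags
    for doclen2tags in map(scan_stream, input_streams):  # a multiprocessing.Pool in the real code
        for document_length, tags in doclen2tags.items():
            for tag in tags:
                doc_count, word_count = doctags.get(tag, (0, 0))
                doctags[tag] = (doc_count + 1, word_count + document_length)
    print(doctags)  # {'doc0': (1, 2), 'doc1': (1, 3), 'doc2': (1, 2)}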

From 16be7160dc58922288018c67b074dc5b8157df4a Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 4 Jun 2018 20:36:43 +0300
Subject: [PATCH 05/41] removed benchmark vocab

---
 gensim/scripts/benchmark_vocab.py | 59 ------------------------------
 1 file changed, 59 deletions(-)
 delete mode 100644 gensim/scripts/benchmark_vocab.py

diff --git a/gensim/scripts/benchmark_vocab.py b/gensim/scripts/benchmark_vocab.py
deleted file mode 100644
index b9ab5cedda..0000000000
--- a/gensim/scripts/benchmark_vocab.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import unicode_literals
-from __future__ import print_function
-
-import logging
-import argparse
-import time
-import os
-import glob
-import itertools
-
-from gensim.models.word2vec import Word2Vec, LineSentence
-from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
-from gensim.models.fasttext import FastText
-
-
-logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
-
-logger = logging.getLogger(__name__)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='GSOC Multistream-API: evaluate vocab performance '
-                                                 'for word2vec')
-    parser.add_argument('--input', type=str, help='Input file or regexp for multistream.')
-    parser.add_argument('--size', type=int, default=300)
-    parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 2, 3, 4, 5, 8, 10])
-    parser.add_argument('--model', type=str, default='word2vec')
-    parser.add_argument('--label', type=str, default='untitled')
-
-    args = parser.parse_args()
-
-    input_ = os.path.expanduser(args.input)
-    input_files = glob.glob(input_)
-    logger.info('Glob found {} input files. List: {}'.format(len(input_files), input_files))
-
-    for workers in args.workers_grid:
-        if args.model == 'word2vec':
-            input_streams = [LineSentence(_) for _ in input_files]
-            model = Word2Vec()
-        elif args.model == 'doc2vec':
-            input_streams = [TaggedLineDocument(_) for _ in input_files]
-            model = Doc2Vec()
-        elif args.model == 'fasttext':
-            input_streams = [LineSentence(_) for _ in input_files]
-            model = FastText()
-        else:
-            raise NotImplementedError("Model '{}' is not supported".format(args.model))
-
-        if workers == 1:
-            multistream = False
-            input_streams = itertools.chain(*input_streams)
-        else:
-            multistream = True
-
-        logger.info('Start building vocab with model={}, workers={}, multistream={}'.format(args.model, workers, multistream))
-        start_time = time.time()
-        model.build_vocab(input_streams, workers=workers, multistream=multistream)
-        end_time = time.time()
-        logger.info('Model = {}\tWorkers = {}\tVocab time = {:.2f} secs'.format(args.model, workers, end_time - start_time))
\ No newline at end of file

From c2d674a3a8e22f4ede9e64270ed38fe772e334db Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Thu, 7 Jun 2018 13:53:22 +0300
Subject: [PATCH 06/41] addressing comments

---
 gensim/models/base_any2vec.py | 12 +++++++-----
 gensim/models/doc2vec.py      |  6 +++---
 gensim/utils.py               |  4 ++--
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index 47d0f6f6d5..9365d6f4c2 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -334,10 +334,14 @@ def __init__(self, sentences=None, multistream=False, workers=3, vector_size=100
             self.neg_labels[0] = 1.
 
         if sentences is not None:
-            if multistream and not isinstance(sentences, (tuple, list)):
-                raise TypeError("If multistream=True, you must pass tuple or list as the sentences argument.")
+            if multistream:
+                if not isinstance(sentences, (tuple, list)):
+                    raise TypeError("If multistream=True, you must pass tuple or list as the sentences argument.")
+                if any(isinstance(stream, GeneratorType) for stream in sentences):
+                    raise TypeError("You can't pass generators as input streams. Try an iterator.")
             if not multistream and isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
+
             self.build_vocab(sentences, multistream=multistream, trim_rule=trim_rule)
             self.train(
                 sentences, total_examples=self.corpus_count, epochs=self.epochs, multistream=multistream,
@@ -490,9 +494,7 @@ def build_vocab(self, sentences, multistream=False, workers=None, update=False,
             Indicates how many words to process before showing/updating the progress.
 
         """
-        if workers is None:
-            workers = self.workers
-
+        workers = workers or self.workers
         total_words, corpus_count = self.vocabulary.scan_vocab(
             sentences, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule, workers=workers)
         self.corpus_count = corpus_count

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 9eb5fa9b18..99ad549b08 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -57,6 +57,7 @@
 
 from collections import namedtuple, defaultdict
 from timeit import default_timer
+from functools import reduce
 
 from numpy import zeros, float32 as REAL, empty, ones, \
     memmap as np_memmap, vstack, integer, dtype, sum as np_sum, add as np_add, repeat as np_repeat, concatenate
@@ -733,9 +734,7 @@ def build_vocab(self, documents, multistream=False, workers=None, update=False,
         update : bool
             If true, the new words in `sentences` will be added to model's vocab.
         """
-        if workers is None:
-            workers = self.workers
-
+        workers = workers or self.workers
         total_words, corpus_count = self.vocabulary.scan_vocab(
             documents, self.docvecs, multistream=multistream,
             progress_per=progress_per, trim_rule=trim_rule, workers=workers
@@ -889,6 +888,7 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
         results = [res.get() for res in results]  # pairs (vocab, doclen2tags)
         self.raw_vocab = reduce(utils.merge_dicts, [r[0] for r in results])
 
+        # Update `docvecs` with document tags information.
         for (_, doclen2tags) in results:
             for document_length, tags in doclen2tags.iteritems():
                 for tag in tags:

diff --git a/gensim/utils.py b/gensim/utils.py
index 1571a87a1e..a8dd0d51e8 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -1713,9 +1713,9 @@ def merge_dicts(dict1, dict2):
     """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2).
     Parameters
     ----------
-    dict1 : dict
+    dict1 : dict of (str, int)
         First dictionary.
-    dict2 : dict
+    dict2 : dict of (str, int)
         Second dictionary.
     Returns
     -------

From 85e689c9076b80a7c2006d51c34b99e28723ba41 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Thu, 7 Jun 2018 21:20:26 +0300
Subject: [PATCH 07/41] make interfaces and documentation more pretty

---
 gensim/models/base_any2vec.py |  2 +-
 gensim/models/doc2vec.py      | 43 +++++++++++++++++++----------------
 gensim/models/word2vec.py     | 22 +++++++++++-------
 gensim/test/test_utils.py     | 10 ++++++++
 4 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index 9365d6f4c2..cc44430d8e 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -481,7 +481,7 @@ def build_vocab(self, sentences, multistream=False, workers=None, update=False,
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
-            If `multistream=True`, `sentences` must be list or tuple of input streams described above.
+            If `multistream=True`, `sentences` must be a list or tuple of iterables described above.
         multistream : bool
             If True, use `sentences` as list of input streams and speed up vocab building by parallelization
             with `min(len(sentences), self.workers)` processes. This option can lead to up to a 2.5x reduction
             in vocabulary building time.

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 99ad549b08..b7e421bebc 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -276,8 +276,8 @@ def repeat(self, word_count):
 class Doc2Vec(BaseWordEmbeddingsModel):
     """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf"""
 
-    def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
-                 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs):
+    def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None,
+                 docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), multistream=False, **kwargs):
         """Initialize the model from an iterable of `documents`.
         Each document is a TaggedDocument object that will be used for training.
 
@@ -351,6 +351,8 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
             of the model.
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
             List of callbacks that need to be executed/run at specific stages during training.
+        multistream : bool
+            If True, use `documents` as list of input streams and speed up IO by parallelization.
 
         """
 
@@ -402,10 +404,10 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
         if documents is not None:
             if isinstance(documents, GeneratorType):
                 raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
-            self.build_vocab(documents, trim_rule=trim_rule)
+            self.build_vocab(documents, trim_rule=trim_rule, multistream=multistream, workers=self.workers)
             self.train(
                 documents, total_examples=self.corpus_count, epochs=self.epochs,
-                start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks)
+                start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks, multistream=multistream)
 
     @property
     def dm(self):
@@ -464,9 +466,9 @@ def _do_train_job(self, job, alpha, inits):
         )
         return tally, self._raw_word_count(job)
 
-    def train(self, documents, multistream=False, total_examples=None, total_words=None,
+    def train(self, documents, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None,
-              word_count=0, queue_factor=2, report_delay=1.0, callbacks=()):
+              word_count=0, queue_factor=2, report_delay=1.0, multistream=False, callbacks=()):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
 
         The `documents` iterable can be simply a list of TaggedDocument elements.
@@ -483,9 +485,10 @@ def train(self, documents, total_examples=None, total_words=None,
 
         Parameters
         ----------
-        documents : iterable of iterables
+        documents : {iterable of iterables, list or tuple of iterable of iterables}
             The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
             consider an iterable that streams the documents directly from disk/network.
+            If `multistream=True`, `documents` must be a list or tuple of iterables described above.
             See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
             in :mod:`~gensim.models.doc2vec` module for such examples.
         total_examples : int
@@ -507,6 +510,8 @@ def train(self, documents, total_examples=None, total_words=None,
             Seconds to wait before reporting progress.
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
             List of callbacks that need to be executed/run at specific stages during training.
+        multistream : bool
+            If True, use `documents` as list of input streams and speed up IO by parallelization.
         """
         super(Doc2Vec, self).train(
             documents, multistream=False, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
@@ -701,8 +706,8 @@ def estimate_memory(self, vocab_size=None, report=None):
         report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
         return super(Doc2Vec, self).estimate_memory(vocab_size, report=report)
 
-    def build_vocab(self, documents, multistream=False, workers=None, update=False,
-                    progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
+    def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False,
+                    trim_rule=None, multistream=False, workers=None, **kwargs):
         """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence is a iterable of iterables (can simply be a list of unicode strings too).
 
@@ -712,13 +717,11 @@ def build_vocab(self, documents, multistream=False, workers=None, update=False,
             The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
             consider an iterable that streams the documents directly from disk/network.
             See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
             in :mod:`~gensim.models.doc2vec` module for such examples.
+        progress_per : int
+            Indicates how many words to process before showing/updating the progress.
+        update : bool
+            If true, the new words in `sentences` will be added to model's vocab.
         keep_raw_vocab : bool
             If not true, delete the raw vocabulary after the scaling is done and free up RAM.
         trim_rule : function
@@ -729,10 +732,12 @@ def build_vocab(self, documents, multistream=False, workers=None, update=False,
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
             Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
-        progress_per : int
-            Indicates how many words to process before showing/updating the progress.
-        update : bool
-            If true, the new words in `sentences` will be added to model's vocab.
+        multistream : bool
+            If True, use `documents` as list of input streams and speed up vocab building by parallelization
+            with `min(len(documents), self.workers)` processes. This option can lead to up to a 2.5x reduction
+            in vocabulary building time.
+        workers : int
+            Used if `multistream=True`. Determines how many processes to use for vocab building.
         """
         workers = workers or self.workers
         total_words, corpus_count = self.vocabulary.scan_vocab(

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 22bb9cc94e..edf9620f36 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -428,18 +428,19 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                  trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
-                 max_final_vocab=None):
+                 max_final_vocab=None, multistream=False):
         """
         Initialize the model from an iterable of `sentences`. Each sentence is a
         list of words (unicode strings) that will be used for training.
 
         Parameters
         ----------
-        sentences : iterable of iterables
+        sentences : {iterable of iterables, list or tuple of iterable of iterables}
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            If `multistream=True`, `sentences` must be a list or tuple of iterables described above.
             If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
             in some other way.
 
@@ -505,6 +506,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
             If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`.
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
             List of callbacks that need to be executed/run at specific stages during training.
+        multistream : bool
+            If True, use `sentences` as list of input streams and speed up IO by parallelization.
 
         Examples
         --------
@@ -529,9 +532,9 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
         self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)
 
         super(Word2Vec, self).__init__(
-            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
-            batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
-            hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
+            sentences=sentences, multistream=multistream, workers=workers, vector_size=size, epochs=iter,
+            callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window,
+            seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
             fast_version=FAST_VERSION)
 
     def _do_train_job(self, sentences, alpha, inits):
@@ -556,9 +559,9 @@ def _set_train_params(self, **kwargs):
             self.compute_loss = kwargs['compute_loss']
         self.running_training_loss = 0
 
-    def train(self, sentences, multistream=False, total_examples=None, total_words=None,
+    def train(self, sentences, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None, word_count=0,
-              queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
+              queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), multistream=False):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
 
         For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
@@ -578,9 +581,10 @@ def train(self, sentences, total_examples=None, total_words=None,
 
         Parameters
         ----------
-        sentences : iterable of iterables
+        sentences : {iterable of iterables, list or tuple of iterable of iterables}
             The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
+            If `multistream=True`, `sentences` must be a list or tuple of iterables described above.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
         total_examples : int
@@ -601,6 +605,8 @@ def train(self, sentences, total_examples=None, total_words=None,
             If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`.
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
             List of callbacks that need to be executed/run at specific stages during training.
+        multistream : bool
+            If True, use `sentences` as list of input streams and speed up IO by parallelization.
 
         Examples
         --------

diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py
index 0df0d6efc2..cb2a39a32b 100644
--- a/gensim/test/test_utils.py
+++ b/gensim/test/test_utils.py
@@ -120,6 +120,16 @@ def test_sample_dict(self):
         self.assertTrue(True)
 
 
+class TestMergeDicts(unittest.TestCase):
+    def test_merge_dicts(self):
+        d1 = {"word1": 5, "word2": 1, "word3": 2}
+        d2 = {"word1": 2, "word3": 3, "word4": 10}
+
+        res_dict = utils.merge_dicts(d1, d2)
+
+        expected_dict = {"word1": 7, "word2": 1, "word3": 5, "word4": 10}
+        self.assertEqual(res_dict, expected_dict)
+
+
 class TestWindowing(unittest.TestCase):
 
     arr10_5 = np.array([
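
The new `TestMergeDicts` case pins down the one subtlety of `utils.merge_dicts`: it folds the second dict into the first in place and returns the mutated first dict rather than allocating a fresh one. The same behavior in script form (on Python 3 this relies on the `iteritems` -> `items` fix in PATCH 10 below):

    from gensim import utils

    d1 = {"word1": 5, "word2": 1, "word3": 2}
    d2 = {"word1": 2, "word3": 3, "word4": 10}

    merged = utils.merge_dicts(d1, d2)
    print(merged == {"word1": 7, "word2": 1, "word3": 5, "word4": 10})  # True
    print(merged is d1)  # True: d1 was updated in place, d2 is untouched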
Try an iterator.") self.build_vocab(documents, trim_rule=trim_rule, multistream=multistream, workers=self.workers) self.train( diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 411d200676..ba818d7d4d 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -166,6 +166,20 @@ def testMaxFinalVocab(self): self.assertEqual(reported_values['num_retained_words'], 4) self.assertEqual(model.vocabulary.effective_min_count, 3) + def testMultiStreamBuildVocab(self): + # Expected vocab + model = word2vec.Word2Vec(min_count=0) + model.build_vocab(sentences) + singlestream_vocab = model.vocabulary.raw_vocab + + # Multistream vocab + model = word2vec.Word2Vec(min_count=0) + input_streams = [sentences[:len(sentences)/2], sentences[len(sentences)/2:]] + model.build_vocab(input_streams, multistream=True, workers=2) + multistream_vocab = model.vocabulary.raw_vocab + + self.assertEqual(singlestream_vocab, multistream_vocab) + def testOnlineLearning(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" @@ -480,6 +494,34 @@ def testTraining(self): model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) + def testMultistreamTraining(self): + """Test word2vec multistream training.""" + # build vocabulary, don't train yet + input_streams = [sentences[:len(sentences)/2], sentences[len(sentences)/2:]] + model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model.build_vocab(input_streams, multistream=True) + + self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) + + model.train(input_streams, total_examples=model.corpus_count, epochs=model.iter, multistream=True) + sims = model.most_similar('graph', topn=10) + # self.assertTrue(sims[0][0] == 'trees', sims) # most similar + + # test querying for "most similar" by vector + graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] + sims2 = model.most_similar(positive=[graph_vector], topn=11) + sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself + self.assertEqual(sims, sims2) + + # build vocab and train in one step; must be the same as above + model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, multistream=True) + self.models_equal(model, model2) + + # train singlestream model; must be the same as above + model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + self.models_equal(model, model3) + def testScoring(self): """Test word2vec scoring.""" model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) From df3ae5fbf0c8df3fe78929a859f18e5cb405a20a Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 10:51:17 +0300 Subject: [PATCH 09/41] fix pep8 --- gensim/test/test_word2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index ba818d7d4d..a2e24ef2c1 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -174,7 +174,7 @@ def testMultiStreamBuildVocab(self): # Multistream vocab model = word2vec.Word2Vec(min_count=0) - input_streams = [sentences[:len(sentences)/2], sentences[len(sentences)/2:]] + input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] model.build_vocab(input_streams, multistream=True, workers=2) multistream_vocab = 
model.vocabulary.raw_vocab @@ -497,7 +497,7 @@ def testTraining(self): def testMultistreamTraining(self): """Test word2vec multistream training.""" # build vocabulary, don't train yet - input_streams = [sentences[:len(sentences)/2], sentences[len(sentences)/2:]] + input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) model.build_vocab(input_streams, multistream=True) From 49357cb8bf152f8ca4eb0bd33de92ec6f6484855 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 10:53:10 +0300 Subject: [PATCH 10/41] iteritems -> items --- gensim/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/utils.py b/gensim/utils.py index a8dd0d51e8..199bc9def8 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1722,7 +1722,7 @@ def merge_dicts(dict1, dict2): result : dict Merged dictionary with sum of frequencies as values. """ - for word, freq in dict2.iteritems(): + for word, freq in dict2.items(): if word in dict1: dict1[word] += freq else: From 0365eeadb0bec05905e27c40c527a31ebe0a1e9c Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 10:59:27 +0300 Subject: [PATCH 11/41] more precise test --- gensim/test/test_word2vec.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index a2e24ef2c1..da49366a5f 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -498,7 +498,7 @@ def testMultistreamTraining(self): """Test word2vec multistream training.""" # build vocabulary, don't train yet input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=3) model.build_vocab(input_streams, multistream=True) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) @@ -515,12 +515,12 @@ def testMultistreamTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, multistream=True) + model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, multistream=True, workers=3) self.models_equal(model, model2) # train singlestream model; must be the same as above - model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) - self.models_equal(model, model3) + model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0, workers=3) + self.models_equal(model2, model3) def testScoring(self): """Test word2vec scoring.""" From 812ab8cc88111bc709a0b3b7f7b9bf1cb6d383da Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 11:53:53 +0300 Subject: [PATCH 12/41] add doc2vec tests --- gensim/models/doc2vec.py | 2 +- gensim/test/test_doc2vec.py | 32 ++++++++++++++++++++++++++++++++ gensim/test/test_word2vec.py | 6 +++--- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index cadd316234..3ce65f2082 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -522,7 +522,7 @@ def train(self, documents, total_examples=None, total_words=None, If True, use `documents` as list of input streams and speed up IO by parallelization. 
""" super(Doc2Vec, self).train( - documents, multistream=False, total_examples=total_examples, total_words=total_words, + documents, multistream=multistream, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 559e166d4f..39e4bf54a1 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -298,6 +298,38 @@ def test_training(self): model2 = doc2vec.Doc2Vec(corpus, size=100, min_count=2, iter=20, workers=1) self.models_equal(model, model2) + def test_multistream_training(self): + """Test doc2vec multistream training.""" + + print "----------- BEFORE TRAINING MODEL 1" + model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1) + model.build_vocab(list_corpus) + self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100)) + model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) + print "----------- AFTER TRAINING MODEL 1" + self.model_sanity(model) + + # build vocab and train in one step; must be the same as above + print "----------- BEFORE TRAINING MODEL 2" + input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] + model2 = doc2vec.Doc2Vec(input_streams, multistream=True, size=100, min_count=2, iter=20, workers=1) + print "----------- AFTER TRAINING MODEL 2" + self.models_equal(model, model2) + + def test_multistream_build_vocab(self): + # Expected vocab + model = doc2vec.Doc2Vec(min_count=0) + model.build_vocab(list_corpus) + singlestream_vocab = model.vocabulary.raw_vocab + + # Multistream vocab + model = doc2vec.Doc2Vec(min_count=0) + input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] + model.build_vocab(input_streams, multistream=True, workers=2) + multistream_vocab = model.vocabulary.raw_vocab + + self.assertEqual(singlestream_vocab, multistream_vocab) + def test_dbow_hs(self): """Test DBOW doc2vec training.""" model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=1, negative=0, min_count=2, iter=20) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index da49366a5f..8f6611b413 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -498,7 +498,7 @@ def testMultistreamTraining(self): """Test word2vec multistream training.""" # build vocabulary, don't train yet input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=3) + model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=1) model.build_vocab(input_streams, multistream=True) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) @@ -515,11 +515,11 @@ def testMultistreamTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, multistream=True, workers=3) + model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, multistream=True, workers=1) self.models_equal(model, model2) # train singlestream model; must be the same as above - model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0, workers=3) + model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0, workers=1) self.models_equal(model2, model3) def 
testScoring(self): From f11f44d559c3a8b2bf403b38e39a9057516ebd08 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 12:16:21 +0300 Subject: [PATCH 13/41] add fasttext tests --- gensim/models/doc2vec.py | 7 +++++-- gensim/models/fasttext.py | 36 +++++++++++++++++++++++++----------- gensim/test/test_fasttext.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 13 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 3ce65f2082..22fb6d8617 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -283,11 +283,12 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 Parameters ---------- - documents : iterable of iterables + documents : {iterable of iterables, list or tuple of iterable of iterables} The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. + If `multistream=True`, `documents` must be a list or tuple of iterables described above. dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. @@ -721,10 +722,11 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca Parameters ---------- - documents : iterable of iterables + documents : {iterable of iterables, list or tuple of iterable of iterables} The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` + If `multistream=True`, `documents` must be a list or tuple of iterables described above. progress_per : int Indicates how many words to process before showing/updating the progress. update : bool @@ -746,6 +748,7 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca in vocabulary building time. workers : int Used if `multistream=True`. Determines how many processes to use for vocab building. + """ workers = workers or self.workers total_words, corpus_count = self.vocabulary.scan_vocab( diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 23bc9ff55b..1277a49fd2 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -163,17 +163,18 @@ class FastText(BaseWordEmbeddingsModel): def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, - bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()): + bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), multistream=False): """Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. Parameters ---------- - sentences : iterable of iterables + sentences : {iterable of iterables, list or tuple of iterable of iterables} The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. 
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + If `multistream=True`, `sentences` must be a list or tuple of iterables described above. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. sg : int {1, 0} @@ -243,6 +244,8 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, memory usage of the model. This option specifies the number of buckets used by the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + multistream : bool + If True, use `sentences` as list of input streams and speed up IO by parallelization. Examples -------- @@ -273,9 +276,9 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, self.wv.bucket = self.bucket super(FastText, self).__init__( - sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, - batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, - hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION) + sentences=sentences, multistream=multistream, workers=workers, vector_size=size, epochs=iter, + callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, + seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION) @property @deprecated("Attribute will be removed in 4.0.0, use wv.min_n instead") @@ -327,17 +330,19 @@ def syn0_ngrams_lockf(self): def num_ngram_vectors(self): return self.wv.num_ngram_vectors - def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): + def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, + multistream=False, workers=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. Parameters ---------- - sentences : iterable of iterables + sentences : {iterable of iterables, list or tuple of iterable of iterables} The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + If `multistream=True`, `sentences` must be a list or tuple of iterables described above. keep_raw_vocab : bool If not true, delete the raw vocabulary after the scaling is done and free up RAM. trim_rule : function @@ -352,6 +357,12 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Indicates how many words to process before showing/updating the progress. update : bool If true, the new words in `sentences` will be added to model's vocab. + multistream : bool + If True, use `sentences` as list of input streams and speed up vocab building by parallelization + with `min(len(sentences), self.workers)` processes. This option can lead up to 2.5x reduction + in vocabulary building time. + workers : int + Used if `multistream=True`. 
Determines how many processes to use for vocab building. Example ------- @@ -379,7 +390,7 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca return super(FastText, self).build_vocab( sentences, update=update, progress_per=progress_per, - keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, multistream=multistream, workers=workers, **kwargs) def _set_train_params(self, **kwargs): pass @@ -457,9 +468,9 @@ def _do_train_job(self, sentences, alpha, inits): return tally, self._raw_word_count(sentences) - def train(self, sentences, multistream=False, total_examples=None, total_words=None, + def train(self, sentences, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): + word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), multistream=False, **kwargs): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For FastText, each sentence must be a list of unicode strings. @@ -476,9 +487,10 @@ def train(self, sentences, multistream=False, total_examples=None, total_words=N Parameters ---------- - sentences : iterable of iterables + sentences : {iterable of iterables, list or tuple of iterable of iterables} The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. + If `multistream=True`, `sentences` must be a list or tuple of iterables described above. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. total_examples : int @@ -500,6 +512,8 @@ def train(self, sentences, multistream=False, total_examples=None, total_words=N Seconds to wait before reporting progress. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + multistream : bool + If True, use `sentences` as list of input streams and speed up IO by parallelization. 
Examples -------- diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index a2ffcfb0fa..5c76c65377 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -80,6 +80,38 @@ def test_training(self): oov_vec = model['minor'] # oov word self.assertEqual(len(oov_vec), 10) + def test_multistream_tranining(self): + input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] + model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) + model.build_vocab(input_streams, multistream=True, workers=2) + self.model_sanity(model) + + model.train(input_streams, multistream=True, total_examples=model.corpus_count, epochs=model.iter) + sims = model.most_similar('graph', topn=10) + + self.assertEqual(model.wv.syn0.shape, (12, 10)) + self.assertEqual(len(model.wv.vocab), 12) + self.assertEqual(model.wv.syn0_vocab.shape[1], 10) + self.assertEqual(model.wv.syn0_ngrams.shape[1], 10) + self.model_sanity(model) + + # test querying for "most similar" by vector + graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] + sims2 = model.most_similar(positive=[graph_vector], topn=11) + sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself + self.assertEqual(sims, sims2) + + # build vocab and train in one step; must be the same as above + model2 = FT_gensim(input_streams, multistream=True, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) + self.models_equal(model, model2) + + # verify oov-word vector retrieval + invocab_vec = model['minors'] # invocab word + self.assertEqual(len(invocab_vec), 10) + + oov_vec = model['minor'] # oov word + self.assertEqual(len(oov_vec), 10) + def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertEqual(model.num_ngram_vectors, model2.num_ngram_vectors) From 941dfd8f47d52833f47c67b833b20168d54ebd01 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 12:57:40 +0300 Subject: [PATCH 14/41] remove prints --- gensim/test/test_doc2vec.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 39e4bf54a1..12500523e3 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -301,19 +301,15 @@ def test_training(self): def test_multistream_training(self): """Test doc2vec multistream training.""" - print "----------- BEFORE TRAINING MODEL 1" model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1) model.build_vocab(list_corpus) self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100)) model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) - print "----------- AFTER TRAINING MODEL 1" self.model_sanity(model) # build vocab and train in one step; must be the same as above - print "----------- BEFORE TRAINING MODEL 2" input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] model2 = doc2vec.Doc2Vec(input_streams, multistream=True, size=100, min_count=2, iter=20, workers=1) - print "----------- AFTER TRAINING MODEL 2" self.models_equal(model, model2) def test_multistream_build_vocab(self): From 36e7238466794d314e4559cc3823d2a6e27e9f16 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 13:02:08 +0300 Subject: [PATCH 15/41] fix seed=42 --- gensim/test/test_doc2vec.py | 4 ++-- gensim/test/test_fasttext.py | 2 +- gensim/test/test_word2vec.py | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_doc2vec.py 
b/gensim/test/test_doc2vec.py index 12500523e3..ffe3093559 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -301,7 +301,7 @@ def test_training(self): def test_multistream_training(self): """Test doc2vec multistream training.""" - model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1) + model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1, seed=42) model.build_vocab(list_corpus) self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100)) model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) @@ -309,7 +309,7 @@ def test_multistream_training(self): # build vocab and train in one step; must be the same as above input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] - model2 = doc2vec.Doc2Vec(input_streams, multistream=True, size=100, min_count=2, iter=20, workers=1) + model2 = doc2vec.Doc2Vec(input_streams, multistream=True, size=100, min_count=2, iter=20, workers=1, seed=42) self.models_equal(model, model2) def test_multistream_build_vocab(self): diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 5c76c65377..a6b2ba745e 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -80,7 +80,7 @@ def test_training(self): oov_vec = model['minor'] # oov word self.assertEqual(len(oov_vec), 10) - def test_multistream_tranining(self): + def test_multistream_training(self): input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(input_streams, multistream=True, workers=2) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 8f6611b413..8437d95f6c 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -498,7 +498,7 @@ def testMultistreamTraining(self): """Test word2vec multistream training.""" # build vocabulary, don't train yet input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=1) + model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=1, seed=42) model.build_vocab(input_streams, multistream=True) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) @@ -515,11 +515,12 @@ def testMultistreamTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, multistream=True, workers=1) + model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, + multistream=True, workers=1, seed=42) self.models_equal(model, model2) # train singlestream model; must be the same as above - model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0, workers=1) + model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0, workers=1, seed=42) self.models_equal(model2, model3) def testScoring(self): From fa57f7a7dbc8b26733c96baaef0222929b03b567 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 15:48:40 +0300 Subject: [PATCH 16/41] fixed tests --- gensim/test/test_doc2vec.py | 20 +++++++++++--------- gensim/test/test_word2vec.py | 7 ++----- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index ffe3093559..bbeb15478e 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -300,17 
+300,20 @@ def test_training(self):
 
     def test_multistream_training(self):
         """Test doc2vec multistream training."""
+        input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]]
 
-        model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1, seed=42)
-        model.build_vocab(list_corpus)
+        model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1, seed=42)
+        model.build_vocab(input_streams, multistream=True, workers=1)
         self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100))
-        model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
+        model.train(input_streams, multistream=True, total_examples=model.corpus_count, epochs=model.iter)
         self.model_sanity(model)
 
         # build vocab and train in one step; must be the same as above
-        input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]]
         model2 = doc2vec.Doc2Vec(input_streams, multistream=True, size=100, min_count=2, iter=20, workers=1, seed=42)
-        self.models_equal(model, model2)
+
+        # check resulting vectors; note that the order of words may differ
+        for word in model.wv.index2word:
+            self.assertEqual(model.wv.most_similar(word, topn=5), model2.wv.most_similar(word, topn=5))
 
     def test_multistream_build_vocab(self):
         # Expected vocab
@@ -319,10 +322,10 @@ def test_multistream_build_vocab(self):
         singlestream_vocab = model.vocabulary.raw_vocab
 
         # Multistream vocab
-        model = doc2vec.Doc2Vec(min_count=0)
+        model2 = doc2vec.Doc2Vec(min_count=0)
         input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]]
-        model.build_vocab(input_streams, multistream=True, workers=2)
-        multistream_vocab = model.vocabulary.raw_vocab
+        model2.build_vocab(input_streams, multistream=True, workers=2)
+        multistream_vocab = model2.vocabulary.raw_vocab
 
         self.assertEqual(singlestream_vocab, multistream_vocab)
 
@@ -441,7 +444,6 @@ def models_equal(self, model, model2):
         # check docvecs
         self.assertEqual(len(model.docvecs.doctags), len(model2.docvecs.doctags))
         self.assertEqual(len(model.docvecs.offset2doctag), len(model2.docvecs.offset2doctag))
-        self.assertTrue(np.allclose(model.docvecs.doctag_syn0, model2.docvecs.doctag_syn0))
 
     def test_delete_temporary_training_data(self):
         """Test doc2vec model after delete_temporary_training_data"""
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 8437d95f6c..df2e10c3d2 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -519,10 +519,6 @@ def testMultistreamTraining(self):
                                    multistream=True, workers=1, seed=42)
         self.models_equal(model, model2)
 
-        # train singlestream model; must be the same as above
-        model3 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0, workers=1, seed=42)
-        self.models_equal(model2, model3)
-
     def testScoring(self):
         """Test word2vec scoring."""
         model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
@@ -737,7 +733,8 @@ def testRNG(self):
 
     def models_equal(self, model, model2):
         self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab))
-        self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0))
+        print "word2vec models_equal, max diff {}".format(np.max(np.abs(model.wv.syn0 - model2.wv.syn0)))
+        self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0), msg=np.max(np.abs(model.wv.syn0 - model2.wv.syn0)))
         if model.hs:
             self.assertTrue(np.allclose(model.syn1, model2.syn1))
         if model.negative:
From 9ea007d5151eac3c47789bbcb6b78576dad43098 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Fri, 8 Jun 2018 15:51:15 +0300
Subject: [PATCH 17/41] add build_vocab test for fasttext --- gensim/test/test_fasttext.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index a6b2ba745e..bfa758dc31 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -112,6 +112,20 @@ def test_multistream_training(self): oov_vec = model['minor'] # oov word self.assertEqual(len(oov_vec), 10) + def test_multistream_build_vocab(self): + # Expected vocab + model = FT_gensim(min_count=0) + model.build_vocab(list_corpus) + singlestream_vocab = model.vocabulary.raw_vocab + + # Multistream vocab + model2 = FT_gensim(min_count=0) + input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] + model2.build_vocab(input_streams, multistream=True, workers=2) + multistream_vocab = model2.vocabulary.raw_vocab + + self.assertEqual(singlestream_vocab, multistream_vocab) + def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertEqual(model.num_ngram_vectors, model2.num_ngram_vectors) From aec68ea82b95cd57fb4238845f947f39955916f8 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 15:52:26 +0300 Subject: [PATCH 18/41] fix --- gensim/test/test_word2vec.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index df2e10c3d2..188368a76b 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -733,8 +733,7 @@ def testRNG(self): def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) - print "word2vec models_equal, max diff {}".format(np.max(np.abs(model.wv.syn0 - model2.wv.syn0))) - self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0), msg=np.max(np.abs(model.wv.syn0 - model2.wv.syn0))) + self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) if model.hs: self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: From 07f3fd45d875746fb382fa58e495dbd158c454f7 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 16:39:16 +0300 Subject: [PATCH 19/41] change size from 10 to 5 in fasttext test because of appveyor memory limits --- gensim/test/test_fasttext.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index bfa758dc31..5fc299ca15 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -82,17 +82,17 @@ def test_training(self): def test_multistream_training(self): input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] - model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) + model = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(input_streams, multistream=True, workers=2) self.model_sanity(model) model.train(input_streams, multistream=True, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) - self.assertEqual(model.wv.syn0.shape, (12, 10)) + self.assertEqual(model.wv.syn0.shape, (12, 5)) self.assertEqual(len(model.wv.vocab), 12) - self.assertEqual(model.wv.syn0_vocab.shape[1], 10) - self.assertEqual(model.wv.syn0_ngrams.shape[1], 10) + self.assertEqual(model.wv.syn0_vocab.shape[1], 5) + self.assertEqual(model.wv.syn0_ngrams.shape[1], 5) self.model_sanity(model) # test querying for "most similar" by vector @@ -102,15 +102,15 @@ def 
test_multistream_training(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(input_streams, multistream=True, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) + model2 = FT_gensim(input_streams, multistream=True, size=5, min_count=1, hs=1, negative=0, seed=42, workers=1) self.models_equal(model, model2) # verify oov-word vector retrieval invocab_vec = model['minors'] # invocab word - self.assertEqual(len(invocab_vec), 10) + self.assertEqual(len(invocab_vec), 5) oov_vec = model['minor'] # oov word - self.assertEqual(len(oov_vec), 10) + self.assertEqual(len(oov_vec), 5) def test_multistream_build_vocab(self): # Expected vocab From 8b49fb823925375848d5163f4392b858d241df93 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 16:58:24 +0300 Subject: [PATCH 20/41] another test with memory error --- gensim/test/test_fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 5fc299ca15..0d830d05af 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -114,12 +114,12 @@ def test_multistream_training(self): def test_multistream_build_vocab(self): # Expected vocab - model = FT_gensim(min_count=0) + model = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42) model.build_vocab(list_corpus) singlestream_vocab = model.vocabulary.raw_vocab # Multistream vocab - model2 = FT_gensim(min_count=0) + model2 = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42) input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] model2.build_vocab(input_streams, multistream=True, workers=2) multistream_vocab = model2.vocabulary.raw_vocab From d0c11d9ab2fc3a3865d265249c1db7e8b7d17a04 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 17:46:43 +0300 Subject: [PATCH 21/41] fix py3 tests --- gensim/test/test_doc2vec.py | 6 +++--- gensim/test/test_fasttext.py | 5 +++-- gensim/test/test_word2vec.py | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index bbeb15478e..b38df81e28 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -9,7 +9,7 @@ """ -from __future__ import with_statement +from __future__ import with_statement, division import logging import unittest @@ -300,7 +300,7 @@ def test_training(self): def test_multistream_training(self): """Test doc2vec multistream training.""" - input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] + input_streams = [list_corpus[:len(list_corpus) // 2], list_corpus[len(list_corpus) // 2:]] model = doc2vec.Doc2Vec(inpsize=100, min_count=2, iter=20, workers=1, seed=42) model.build_vocab(input_streams, multistream=True, workers=1) @@ -323,7 +323,7 @@ def test_multistream_build_vocab(self): # Multistream vocab model2 = doc2vec.Doc2Vec(min_count=0) - input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] + input_streams = [list_corpus[:len(list_corpus) // 2], list_corpus[len(list_corpus) // 2:]] model2.build_vocab(input_streams, multistream=True, workers=2) multistream_vocab = model2.vocabulary.raw_vocab diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 0d830d05af..288a003dee 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from __future__ import 
division import logging import unittest @@ -81,7 +82,7 @@ def test_training(self): self.assertEqual(len(oov_vec), 10) def test_multistream_training(self): - input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] + input_streams = [sentences[:len(sentences) // 2], sentences[len(sentences) // 2:]] model = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(input_streams, multistream=True, workers=2) self.model_sanity(model) @@ -120,7 +121,7 @@ def test_multistream_build_vocab(self): # Multistream vocab model2 = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42) - input_streams = [list_corpus[:len(list_corpus) / 2], list_corpus[len(list_corpus) / 2:]] + input_streams = [list_corpus[:len(list_corpus) // 2], list_corpus[len(list_corpus) // 2:]] model2.build_vocab(input_streams, multistream=True, workers=2) multistream_vocab = model2.vocabulary.raw_vocab diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 188368a76b..76c5ca37f3 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -7,7 +7,7 @@ """ Automated tests for checking transformation algorithms (the models package). """ - +from __future__ import division import logging import unittest @@ -174,7 +174,7 @@ def testMultiStreamBuildVocab(self): # Multistream vocab model = word2vec.Word2Vec(min_count=0) - input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] + input_streams = [sentences[:len(sentences) // 2], sentences[len(sentences) // 2:]] model.build_vocab(input_streams, multistream=True, workers=2) multistream_vocab = model.vocabulary.raw_vocab @@ -497,7 +497,7 @@ def testTraining(self): def testMultistreamTraining(self): """Test word2vec multistream training.""" # build vocabulary, don't train yet - input_streams = [sentences[:len(sentences) / 2], sentences[len(sentences) / 2:]] + input_streams = [sentences[:len(sentences) // 2], sentences[len(sentences) // 2:]] model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=1, seed=42) model.build_vocab(input_streams, multistream=True) From 597444839b96657c6d4eabcfd98617b432207851 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 18:29:17 +0300 Subject: [PATCH 22/41] fix iteritems for py3 --- gensim/models/doc2vec.py | 4 ++-- gensim/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 22fb6d8617..f9a11cff4f 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -68,7 +68,7 @@ from gensim.models.word2vec import Word2VecKeyedVectors, Word2VecVocab, Word2VecTrainables, train_cbow_pair,\ train_sg_pair, train_batch_sg from six.moves import xrange -from six import string_types, integer_types, itervalues +from six import string_types, integer_types, itervalues, iteritems from gensim.models.base_any2vec import BaseWordEmbeddingsModel from gensim.models.keyedvectors import Doc2VecKeyedVectors from types import GeneratorType @@ -906,7 +906,7 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule): # Update `docvecs` with document tags information. 
for (_, doclen2tags) in results: - for document_length, tags in doclen2tags.iteritems(): + for document_length, tags in iteritems(doclen2tags): for tag in tags: _note_doctag(tag, document_length, docvecs) diff --git a/gensim/utils.py b/gensim/utils.py index 199bc9def8..e14c8d4240 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1722,7 +1722,7 @@ def merge_dicts(dict1, dict2): result : dict Merged dictionary with sum of frequencies as values. """ - for word, freq in dict2.items(): + for word, freq in iteritems(dict2): if word in dict1: dict1[word] += freq else: From 14198475aec15b84af7456a955d6fd4d12220d2c Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 8 Jun 2018 19:32:05 +0300 Subject: [PATCH 23/41] fix functools reduce --- gensim/models/word2vec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index edf9620f36..8d772a3b81 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -133,6 +133,7 @@ from gensim.utils import deprecated from six import iteritems, itervalues, string_types from six.moves import xrange +from functools import reduce logger = logging.getLogger(__name__) From 280e8266e8273b3ca5ddad903b94b63e286b0942 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 12 Jun 2018 17:58:20 +0300 Subject: [PATCH 24/41] addressing comments --- gensim/models/word2vec.py | 2 +- gensim/test/test_word2vec.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 8d772a3b81..d4ab4b5ee7 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1287,7 +1287,7 @@ def sort_vocab(self, wv): wv.vocab[word].index = i def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None, - min_count=None, sample=None, dry_run=False, **kwargs): + min_count=None, sample=None, dry_run=False): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). 
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 76c5ca37f3..aaf61aacde 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -506,7 +506,6 @@ def testMultistreamTraining(self): model.train(input_streams, total_examples=model.corpus_count, epochs=model.iter, multistream=True) sims = model.most_similar('graph', topn=10) - # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] From 7d489f479962b52c9cc72bab5f8ab5f033cedcdf Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 13 Jun 2018 18:09:51 +0300 Subject: [PATCH 25/41] addressing @jayantj comments --- gensim/models/doc2vec.py | 12 +++++------- gensim/models/word2vec.py | 11 +++++------ gensim/test/test_utils.py | 2 +- gensim/utils.py | 2 +- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index f9a11cff4f..5f673271e2 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -886,31 +886,29 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule): unfinished_tasks = len(results) total_words = 0 - document_no = -1 + total_documents = 0 while unfinished_tasks > 0: report = progress_queue.get() if report is None: unfinished_tasks -= 1 logger.info("scan vocab task finished, processed %i documents and %i words;" - " awaiting finish of %i more tasks", document_no + 1, total_words, unfinished_tasks) + " awaiting finish of %i more tasks", total_documents, total_words, unfinished_tasks) elif isinstance(report, string_types): logger.warning(report) else: num_words, num_documents = report total_words += num_words - document_no += num_documents + total_documents += num_documents - corpus_count = document_no + 1 results = [res.get() for res in results] # pairs (vocab, doclen2tags) - self.raw_vocab = reduce(utils.merge_dicts, [r[0] for r in results]) + self.raw_vocab = reduce(utils.merge_counts, [r[0] for r in results]) # Update `docvecs` with document tags information. 
for (_, doclen2tags) in results: for document_length, tags in iteritems(doclen2tags): for tag in tags: _note_doctag(tag, document_length, docvecs) - - return total_words, corpus_count + return total_words, total_documents def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule): document_no = -1 diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d4ab4b5ee7..b64697acad 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1246,23 +1246,22 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule): unfinished_tasks = len(results) total_words = 0 - sentence_no = -1 + total_sentences = 0 while unfinished_tasks > 0: report = progress_queue.get() if report is None: unfinished_tasks -= 1 logger.info("scan vocab task finished, processed %i sentences and %i words;" - " awaiting finish of %i more tasks", sentence_no + 1, total_words, unfinished_tasks) + " awaiting finish of %i more tasks", total_sentences, total_words, unfinished_tasks) elif isinstance(report, string_types): logger.warning(report) else: num_words, num_sentences = report total_words += num_words - sentence_no += num_sentences + total_sentences += num_sentences - corpus_count = sentence_no + 1 - self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results]) - return total_words, corpus_count + self.raw_vocab = reduce(utils.merge_counts, [res.get() for res in results]) + return total_words, total_sentences def scan_vocab(self, sentences, multistream=False, progress_per=10000, workers=None, trim_rule=None): logger.info("collecting all words and their counts") diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index cb2a39a32b..652247ffdc 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -125,7 +125,7 @@ def test_merge_dicts(self): d1 = {"word1": 5, "word2": 1, "word3": 2} d2 = {"word1": 2, "word3": 3, "word4": 10} - res_dict = utils.merge_dicts(d1, d2) + res_dict = utils.merge_counts(d1, d2) expected_dict = {"word1": 7, "word2": 1, "word3": 5, "word4": 10} self.assertEqual(res_dict, expected_dict) diff --git a/gensim/utils.py b/gensim/utils.py index e14c8d4240..79939df983 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1709,7 +1709,7 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): return result -def merge_dicts(dict1, dict2): +def merge_counts(dict1, dict2): """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2). Parameters ---------- From 49a1ee6178b4db35d01abf16ff180a9a1d845090 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 13 Jun 2018 21:10:26 +0300 Subject: [PATCH 26/41] fix language --- gensim/models/base_any2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index cc44430d8e..4f440b02a7 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -338,7 +338,7 @@ def __init__(self, sentences=None, multistream=False, workers=3, vector_size=100 if not isinstance(sentences, (tuple, list)): raise TypeError("If multistream=True, you must pass tuple or list as the sentences argument.") if any(isinstance(stream, GeneratorType) for stream in sentences): - raise TypeError("You can't pass a generators as input streams. Try an iterator.") + raise TypeError("You can't pass a generator as input streams. 
Try an iterator.") if not multistream and isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") From 1cbad7f3c15ebd3b34112bca2397f4725739ded1 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 13 Jun 2018 23:37:09 +0300 Subject: [PATCH 27/41] add final vocab pruning in multistream modes --- gensim/models/doc2vec.py | 2 ++ gensim/models/word2vec.py | 2 ++ gensim/test/test_utils.py | 13 +++++++++++++ gensim/utils.py | 23 +++++++++++++++++++++++ 4 files changed, 40 insertions(+) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 5f673271e2..3494f27473 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -902,6 +902,8 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule): results = [res.get() for res in results] # pairs (vocab, doclen2tags) self.raw_vocab = reduce(utils.merge_counts, [r[0] for r in results]) + if self.max_vocab_size: + utils.trim_vocab_by_freq(self.raw_vocab, self.max_vocab_size, trim_rule=trim_rule) # Update `docvecs` with document tags information. for (_, doclen2tags) in results: diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b64697acad..7e2c50423d 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1261,6 +1261,8 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule): total_sentences += num_sentences self.raw_vocab = reduce(utils.merge_counts, [res.get() for res in results]) + if self.max_vocab_size: + utils.trim_vocab_by_freq(self.raw_vocab, self.max_vocab_size, trim_rule=trim_rule) return total_words, total_sentences def scan_vocab(self, sentences, multistream=False, progress_per=10000, workers=None, trim_rule=None): diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index 652247ffdc..2b9686002b 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -120,6 +120,19 @@ def test_sample_dict(self): self.assertTrue(True) +class TestTrimVocabByFreq(unittest.TestCase): + def test_trim_vocab(self): + d = {"word1": 5, "word2": 1, "word3": 2} + expected_dict = {"word1": 5, "word3": 2} + utils.trim_vocab_by_freq(d, topk=2) + self.assertEqual(d, expected_dict) + + d = {"word1": 5, "word2": 2, "word3": 2, "word4": 1} + expected_dict = {"word1": 5} + utils.trim_vocab_by_freq(d, topk=2) + self.assertEqual(d, expected_dict) + + class TestMergeDicts(unittest.TestCase): def test_merge_dicts(self): d1 = {"word1": 5, "word2": 1, "word3": 2} diff --git a/gensim/utils.py b/gensim/utils.py index 79939df983..adc2c96293 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1709,6 +1709,29 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): return result +def trim_vocab_by_freq(vocab, topk, trim_rule=None): + """Retain `topk` most frequent words in `vocab`. + If there are more words with the same frequency as `topk`-th one, they will be dropped. + Modifies `vocab` in place, returns nothing. + + Parameters + ---------- + vocab : dict + Input dictionary. + topk : int + Number of words with highest frequencies to keep. + trim_rule : function, optional + Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_count`. 
+
+    """
+    if topk >= len(vocab):
+        return
+
+    sorted_vocab = sorted(vocab.keys(), key=lambda word: vocab[word], reverse=True)
+    min_count = vocab[sorted_vocab[topk]] + 1
+    prune_vocab(vocab, min_count, trim_rule=trim_rule)
+
+
 def merge_counts(dict1, dict2):
     """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2).
     Parameters
From d024625d372c6e35561cb699013a17090394edae Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Thu, 14 Jun 2018 17:54:16 +0300
Subject: [PATCH 28/41] keys -> iterkeys

---
 gensim/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/utils.py b/gensim/utils.py
index adc2c96293..380b57ad3b 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -1727,7 +1727,7 @@ def trim_vocab_by_freq(vocab, topk, trim_rule=None):
     if topk >= len(vocab):
         return
 
-    sorted_vocab = sorted(vocab.keys(), key=lambda word: vocab[word], reverse=True)
+    sorted_vocab = sorted(iterkeys(vocab), key=lambda word: vocab[word], reverse=True)
     min_count = vocab[sorted_vocab[topk]] + 1
     prune_vocab(vocab, min_count, trim_rule=trim_rule)

From 5e4de1944d60904d821ceb667cd03175d232401f Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Fri, 15 Jun 2018 12:21:36 +0300
Subject: [PATCH 29/41] use heapq.nlargest

---
 gensim/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gensim/utils.py b/gensim/utils.py
index 380b57ad3b..b592225c40 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -33,12 +33,13 @@
 import sys
 import subprocess
 import inspect
+import heapq
 
 import numpy as np
 import numbers
 import scipy.sparse
 
-from six import iterkeys, iteritems, u, string_types, unichr
+from six import iterkeys, iteritems, itervalues, u, string_types, unichr
 from six.moves import xrange
 
 from smart_open import smart_open
@@ -1727,8 +1728,7 @@ def trim_vocab_by_freq(vocab, topk, trim_rule=None):
     if topk >= len(vocab):
         return
 
-    sorted_vocab = sorted(iterkeys(vocab), key=lambda word: vocab[word], reverse=True)
-    min_count = vocab[sorted_vocab[topk]] + 1
+    min_count = heapq.nlargest(topk, itervalues(vocab))[-1] + 1
     prune_vocab(vocab, min_count, trim_rule=trim_rule)

From 74e7b022f0ae621916f92fabd8171e16cdb4c5aa Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Fri, 15 Jun 2018 13:15:03 +0300
Subject: [PATCH 30/41] fix

---
 gensim/test/test_utils.py | 2 +-
 gensim/utils.py           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py
index 2b9686002b..5b265d0f77 100644
--- a/gensim/test/test_utils.py
+++ b/gensim/test/test_utils.py
@@ -128,7 +128,7 @@ def test_trim_vocab(self):
         self.assertEqual(d, expected_dict)
 
         d = {"word1": 5, "word2": 2, "word3": 2, "word4": 1}
-        expected_dict = {"word1": 5}
+        expected_dict = {"word1": 5, "word2": 2, "word3": 2}
         utils.trim_vocab_by_freq(d, topk=2)
         self.assertEqual(d, expected_dict)
 
diff --git a/gensim/utils.py b/gensim/utils.py
index b592225c40..68c2dbe40d 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -1712,7 +1712,7 @@ def prune_vocab(vocab, min_reduce, trim_rule=None):
 
 def trim_vocab_by_freq(vocab, topk, trim_rule=None):
     """Retain `topk` most frequent words in `vocab`.
-    If there are more words with the same frequency as `topk`-th one, they will be dropped.
+    If there are more words with the same frequency as the `topk`-th one, they will be kept.
     Modifies `vocab` in place, returns nothing.
Parameters @@ -1728,7 +1728,7 @@ def trim_vocab_by_freq(vocab, topk, trim_rule=None): if topk >= len(vocab): return - min_count = heapq.nlargest(topk, itervalues(vocab))[-1] + 1 + min_count = heapq.nlargest(topk, itervalues(vocab))[-1] prune_vocab(vocab, min_count, trim_rule=trim_rule) From 0d12d8b03b9b33b0eb1d4b5375b81f70c68943d9 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 19 Jun 2018 14:24:40 +0300 Subject: [PATCH 31/41] multistream flag to input_streams param --- gensim/models/base_any2vec.py | 60 ++++++++++++++++------------ gensim/models/doc2vec.py | 74 +++++++++++++++++------------------ gensim/models/fasttext.py | 49 ++++++++++++----------- gensim/models/word2vec.py | 43 ++++++++++---------- gensim/utils.py | 2 +- 5 files changed, 117 insertions(+), 111 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 4f440b02a7..6ab2d4bfbb 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -83,6 +83,11 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Check that the training parameters provided make sense. e.g. raise error if `epochs` not provided.""" raise NotImplementedError() + def _check_input_data_sanity(self, data_iterable=None, data_iterables=None): + """Check that only one argument is not None.""" + if not ((data_iterable is not None) ^ (data_iterables is not None)): + raise ValueError("You can't provide both singlestream and multistream arguments.") + def _worker_loop(self, job_queue, progress_queue): """Train the model, lifting lists of data from the job_queue.""" thread_private_mem = self._get_thread_working_mem() @@ -201,9 +206,10 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam self.total_train_time += elapsed return trained_word_count, raw_word_count, job_tally - def _train_epoch(self, data_iterable, multistream=False, cur_epoch=0, total_examples=None, + def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0): """Train one epoch.""" + self._check_input_data_sanity(data_iterable, data_iterables) job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) @@ -215,7 +221,7 @@ def _train_epoch(self, data_iterable, multistream=False, cur_epoch=0, total_exam ] # Chain all input streams into one, because multistream training is not supported yet. 
- if multistream: + if data_iterables is not None: data_iterable = itertools.chain(*data_iterable) workers.append(threading.Thread( target=self._job_producer, @@ -232,7 +238,7 @@ def _train_epoch(self, data_iterable, multistream=False, cur_epoch=0, total_exam return trained_word_count, raw_word_count, job_tally - def train(self, data_iterable, multistream=False, epochs=None, total_examples=None, + def train(self, data_iterable=None, data_iterables=None, epochs=None, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): """Handle multi-worker training.""" self._set_train_params(**kwargs) @@ -257,8 +263,9 @@ def train(self, data_iterable, multistream=False, epochs=None, total_examples=No callback.on_epoch_begin(self) trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch( - data_iterable, multistream=multistream, cur_epoch=cur_epoch, total_examples=total_examples, - total_words=total_words, queue_factor=queue_factor, report_delay=report_delay) + data_iterable=data_iterable, data_iterables=data_iterables, cur_epoch=cur_epoch, + total_examples=total_examples, total_words=total_words, queue_factor=queue_factor, + report_delay=report_delay) trained_word_count += trained_word_count_epoch raw_word_count += raw_word_count_epoch job_tally += job_tally_epoch @@ -301,7 +308,7 @@ def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): def _set_train_params(self, **kwargs): raise NotImplementedError() - def __init__(self, sentences=None, multistream=False, workers=3, vector_size=100, epochs=5, callbacks=(), + def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs): self.sg = int(sg) @@ -333,18 +340,19 @@ def __init__(self, sentences=None, multistream=False, workers=3, vector_size=100 self.neg_labels = zeros(self.negative + 1) self.neg_labels[0] = 1. - if sentences is not None: - if multistream: - if not isinstance(sentences, (tuple, list)): - raise TypeError("If multistream=True, you must pass tuple or list as the sentences argument.") - if any(isinstance(stream, GeneratorType) for stream in sentences): - raise TypeError("You can't pass a generator as input streams. Try an iterator.") - if not multistream and isinstance(sentences, GeneratorType): + if sentences is not None or input_streams is not None: + self._check_input_data_sanity(data_iterable=sentences, data_iterables=input_streams) + if input_streams is not None: + if not isinstance(input_streams, (tuple, list)): + raise TypeError("You must pass tuple or list as the input_streams argument.") + if any(isinstance(stream, GeneratorType) for stream in input_streams): + raise TypeError("You can't pass a generator as any of input streams. Try an iterator.") + elif isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. 
Try an iterator.") - self.build_vocab(sentences, multistream=multistream, trim_rule=trim_rule) + self.build_vocab(sentences=sentences, input_streams=input_streams, trim_rule=trim_rule) self.train( - sentences, total_examples=self.corpus_count, epochs=self.epochs, multistream=multistream, + sentences=sentences, input_streams=input_streams, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, compute_loss=compute_loss) else: if trim_rule is not None: @@ -469,25 +477,24 @@ def __str__(self): self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha ) - def build_vocab(self, sentences, multistream=False, workers=None, update=False, progress_per=10000, + def build_vocab(self, sentences=None, input_streams=None, workers=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). Parameters ---------- - sentences : {iterable of iterables, list or tuple of iterable of iterables} + sentences : iterable of iterables The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If `multistream=True`, `sentences` must be a list or tuple of iterables described above. - multistream : bool - If True, use `sentences` as list of input streams and speed up vocab building by parallelization - with `min(len(sentences), self.workers)` processes. This option can lead up to 2.5x reduction - in vocabulary building time. + input_streams : list or tuple of iterable of iterables + The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. workers : int - Used if `multistream=True`. Determines how many processes to use for vocab building. + Used if `input_streams` is passed. Determines how many processes to use for vocab building. + Actual number of workers is determined by `min(len(input_streams), workers)`. update : bool If true, the new words in `sentences` will be added to model's vocab. 
progress_per : int @@ -496,7 +503,8 @@ def build_vocab(self, sentences, multistream=False, workers=None, update=False, """ workers = workers or self.workers total_words, corpus_count = self.vocabulary.scan_vocab( - sentences, multistream=multistream, progress_per=progress_per, trim_rule=trim_rule, workers=workers) + sentences=sentences, input_streams=input_streams, progress_per=progress_per, trim_rule=trim_rule, + workers=workers) self.corpus_count = corpus_count report_values = self.vocabulary.prepare_vocab( self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, @@ -574,7 +582,7 @@ def estimate_memory(self, vocab_size=None, report=None): ) return report - def train(self, sentences, multistream=False, total_examples=None, total_words=None, + def train(self, sentences=None, input_streams=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): @@ -583,7 +591,7 @@ def train(self, sentences, multistream=False, total_examples=None, total_words=N self.compute_loss = compute_loss self.running_training_loss = 0.0 return super(BaseWordEmbeddingsModel, self).train( - sentences, multistream=multistream, total_examples=total_examples, total_words=total_words, + sentences=sentences, input_streams=input_streams, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 3494f27473..58c2480452 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -276,8 +276,9 @@ def repeat(self, word_count): class Doc2Vec(BaseWordEmbeddingsModel): """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" - def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), multistream=False, **kwargs): + def __init__(self, documents=None, input_streams=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), + **kwargs): """Initialize the model from an iterable of `documents`. Each document is a TaggedDocument object that will be used for training. @@ -288,12 +289,12 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. - If `multistream=True`, `documents` must be a list or tuple of iterables described above. - + input_streams : list or tuple of iterable of iterables + The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. - size : int Dimensionality of the feature vectors. window : int @@ -352,8 +353,6 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. 
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - multistream : bool - If True, use `sentences` as list of input streams and speed up IO by parallelization. """ @@ -402,21 +401,23 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.docvecs = docvecs or Doc2VecKeyedVectors(self.vector_size, docvecs_mapfile) self.comment = comment - if documents is not None: - if multistream: - if not isinstance(documents, (tuple, list)): - raise TypeError("If multistream=True, you must pass tuple or list as the documents argument.") - if any(isinstance(stream, GeneratorType) for stream in documents): - raise TypeError("You can't pass a generators as input streams. Try an iterator.") - if any(isinstance(stream, TaggedLineDocument) for stream in documents): + if documents is not None or input_streams is not None: + self._check_input_data_sanity(data_iterable=documents, data_iterables=input_streams) + if input_streams is not None: + if not isinstance(input_streams, (tuple, list)): + raise TypeError("You must pass tuple or list as the input_streams argument.") + if any(isinstance(stream, GeneratorType) for stream in input_streams): + raise TypeError("You can't pass a generator as any of input streams. Try an iterator.") + if any(isinstance(stream, TaggedLineDocument) for stream in input_streams): warnings.warn("Using TaggedLineDocument in multistream mode can lead to incorrect results " "because of tags collision.") - if not multistream and isinstance(documents, GeneratorType): + elif isinstance(documents, GeneratorType): raise TypeError("You can't pass a generator as the documents argument. Try an iterator.") - self.build_vocab(documents, trim_rule=trim_rule, multistream=multistream, workers=self.workers) + self.build_vocab(documents=documents, input_streams=input_streams, + trim_rule=trim_rule, workers=self.workers) self.train( - documents, total_examples=self.corpus_count, epochs=self.epochs, - start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks, multistream=multistream) + documents=documents, input_streams=input_streams, total_examples=self.corpus_count, epochs=self.epochs, + start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks) @property def dm(self): @@ -475,9 +476,9 @@ def _do_train_job(self, job, alpha, inits): ) return tally, self._raw_word_count(job) - def train(self, documents, total_examples=None, total_words=None, + def train(self, documents=None, input_streams=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0, multistream=False, callbacks=()): + word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. @@ -494,12 +495,14 @@ def train(self, documents, total_examples=None, total_words=None, Parameters ---------- - documents : {iterable of iterables, list or tuple of iterable of iterables} + documents : iterable of iterables The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. - If `multistream=True`, `documents` must be a list or tuple of iterables described above. 
See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` in :mod:`~gensim.models.doc2vec` module for such examples. + input_streams : list or tuple of iterable of iterables + The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. total_examples : int Count of sentences. total_words : int @@ -519,11 +522,9 @@ def train(self, documents, total_examples=None, total_words=None, Seconds to wait before reporting progress. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - multistream : bool - If True, use `documents` as list of input streams and speed up IO by parallelization. """ super(Doc2Vec, self).train( - documents, multistream=multistream, total_examples=total_examples, total_words=total_words, + documents=documents, input_streams=input_streams, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks) @@ -715,8 +716,8 @@ def estimate_memory(self, vocab_size=None, report=None): report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, - trim_rule=None, multistream=False, workers=None, **kwargs): + def build_vocab(self, documents=None, input_streams=None, update=False, progress_per=10000, keep_raw_vocab=False, + trim_rule=None, workers=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). @@ -725,8 +726,10 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca documents : {iterable of iterables, list or tuple of iterable of iterables} The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. - See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` - If `multistream=True`, `documents` must be a list or tuple of iterables described above. + See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument + input_streams : list or tuple of iterable of iterables + The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. progress_per : int Indicates how many words to process before showing/updating the progress. update : bool @@ -742,17 +745,14 @@ def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_voca :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. - multistream : bool - If True, use `documents` as list of input streams and speed up vocab building by parallelization - with `min(len(documents), self.workers)` processes. This option can lead up to 2.5x reduction - in vocabulary building time. 
workers : int - Used if `multistream=True`. Determines how many processes to use for vocab building. + Used if `input_streams` is passed. Determines how many processes to use for vocab building. + Actual number of workers is determined by `min(len(input_streams), workers)`. """ workers = workers or self.workers total_words, corpus_count = self.vocabulary.scan_vocab( - documents, self.docvecs, multistream=multistream, + documents=documents, input_streams=input_streams, docvecs=self.docvecs, progress_per=progress_per, trim_rule=trim_rule, workers=workers ) self.corpus_count = corpus_count @@ -954,9 +954,9 @@ def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule): self.raw_vocab = vocab return total_words, corpus_count - def scan_vocab(self, documents, docvecs, multistream=False, progress_per=10000, workers=None, trim_rule=None): + def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_per=10000, workers=None, trim_rule=None): logger.info("collecting all words and their counts") - if not multistream: + if input_streams is None: total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule) else: total_words, corpus_count = self._scan_vocab_multistream(documents, docvecs, workers, trim_rule) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 1277a49fd2..a868971de8 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -160,10 +160,10 @@ class FastText(BaseWordEmbeddingsModel): fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`. """ - def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, input_streams=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, - bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), multistream=False): + bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()): """Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -174,9 +174,11 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If `multistream=True`, `sentences` must be a list or tuple of iterables described above. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. + input_streams : list or tuple of iterable of iterables + The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. sg : int {1, 0} Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. size : int @@ -244,17 +246,15 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, memory usage of the model. This option specifies the number of buckets used by the model. 
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - multistream : bool - If True, use `sentences` as list of input streams and speed up IO by parallelization. Examples -------- Initialize and train a `FastText` model >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + >>> input_streams = [[["cat", "say", "meow"], ["dog", "say", "woof"]]] >>> - >>> model = FastText(sentences, min_count=1) + >>> model = FastText(input_streams=input_streams, min_count=1) >>> say_vector = model['say'] # get vector for word >>> of_vector = model['of'] # get vector for out-of-vocab word @@ -276,7 +276,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, self.wv.bucket = self.bucket super(FastText, self).__init__( - sentences=sentences, multistream=multistream, workers=workers, vector_size=size, epochs=iter, + sentences=sentences, input_streams=input_streams, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION) @@ -330,19 +330,21 @@ def syn0_ngrams_lockf(self): def num_ngram_vectors(self): return self.wv.num_ngram_vectors - def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, - multistream=False, workers=None, **kwargs): + def build_vocab(self, sentences=None, input_streams=None, update=False, progress_per=10000, keep_raw_vocab=False, + trim_rule=None, workers=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. Parameters ---------- - sentences : {iterable of iterables, list or tuple of iterable of iterables} + sentences : iterable of iterables The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If `multistream=True`, `sentences` must be a list or tuple of iterables described above. + input_streams : list or tuple of iterable of iterables + The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. keep_raw_vocab : bool If not true, delete the raw vocabulary after the scaling is done and free up RAM. trim_rule : function @@ -357,12 +359,9 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Indicates how many words to process before showing/updating the progress. update : bool If true, the new words in `sentences` will be added to model's vocab. - multistream : bool - If True, use `sentences` as list of input streams and speed up vocab building by parallelization - with `min(len(sentences), self.workers)` processes. This option can lead up to 2.5x reduction - in vocabulary building time. workers : int - Used if `multistream=True`. Determines how many processes to use for vocab building. + Used if `input_streams` is passed. Determines how many processes to use for vocab building. 
+ Actual number of workers is determined by `min(len(input_streams), workers)`. Example ------- @@ -389,8 +388,8 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca self.trainables.old_hash2index_len = len(self.wv.hash2index) return super(FastText, self).build_vocab( - sentences, update=update, progress_per=progress_per, - keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, multistream=multistream, workers=workers, **kwargs) + sentences=sentences, input_streams=input_streams, update=update, progress_per=progress_per, + keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, workers=workers, **kwargs) def _set_train_params(self, **kwargs): pass @@ -468,9 +467,9 @@ def _do_train_job(self, sentences, alpha, inits): return tally, self._raw_word_count(sentences) - def train(self, sentences, total_examples=None, total_words=None, + def train(self, sentences=None, input_streams=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), multistream=False, **kwargs): + word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For FastText, each sentence must be a list of unicode strings. @@ -490,9 +489,11 @@ def train(self, sentences, total_examples=None, total_words=None, sentences : {iterable of iterables, list or tuple of iterable of iterables} The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - If `multistream=True`, `sentences` must be a list or tuple of iterables described above. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + input_streams : list or tuple of iterable of iterables + The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. total_examples : int Count of sentences. total_words : int @@ -512,8 +513,6 @@ def train(self, sentences, total_examples=None, total_words=None, Seconds to wait before reporting progress. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - multistream : bool - If True, use `sentences` as list of input streams and speed up IO by parallelization. 
Examples -------- @@ -527,7 +526,7 @@ def train(self, sentences, total_examples=None, total_words=None, """ super(FastText, self).train( - sentences, multistream=multistream, total_examples=total_examples, total_words=total_words, + sentences=sentences, input_streams=input_streams, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks) self.trainables.get_vocab_word_vecs(self.wv) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 7e2c50423d..cc20240c38 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -425,26 +425,27 @@ class Word2Vec(BaseWordEmbeddingsModel): """ - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - max_final_vocab=None, multistream=False): + max_final_vocab=None): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. Parameters ---------- - sentences : {iterable of iterables, list or tuple of iterable of iterables} + sentences : iterable of iterables The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If `multistream=True`, `sentences` must be a list or tuple of iterables described above. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. - + input_streams : list or tuple of iterable of iterables + The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. sg : int {1, 0} Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used. size : int @@ -507,17 +508,15 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - multistream : bool - If True, use `sentences` as list of input streams and speed up IO by parallelization. 
Examples -------- Initialize and train a `Word2Vec` model >>> from gensim.models import Word2Vec - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + >>> input_streams = [[["cat", "say", "meow"], ["dog", "say", "woof"]]] >>> - >>> model = Word2Vec(sentences, min_count=1) + >>> model = Word2Vec(input_streams=input_streams, min_count=1) >>> say_vector = model['say'] # get vector for word """ @@ -533,7 +532,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) super(Word2Vec, self).__init__( - sentences=sentences, multistream=multistream, workers=workers, vector_size=size, epochs=iter, + sentences=sentences, input_streams=input_streams, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) @@ -560,9 +559,9 @@ def _set_train_params(self, **kwargs): self.compute_loss = kwargs['compute_loss'] self.running_training_loss = 0 - def train(self, sentences, total_examples=None, total_words=None, + def train(self, sentences=None, input_streams=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), multistream=False): + queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) @@ -582,9 +581,11 @@ def train(self, sentences, total_examples=None, total_words=None, sentences : {iterable of iterables, list or tuple of iterable of iterables} The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - If `multistream=True`, `sentences` must be a list or tuple of iterables described above. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + input_streams : list or tuple of iterable of iterables + The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible + to process streams in parallel, using `workers` parameter. total_examples : int Count of sentences. total_words : int @@ -606,22 +607,20 @@ def train(self, sentences, total_examples=None, total_words=None, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - multistream : bool - If True, use `sentences` as list of input streams and speed up IO by parallelization. 
Examples -------- >>> from gensim.models import Word2Vec - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + >>> input_streams = [[["cat", "say", "meow"], ["dog", "say", "woof"]]] >>> >>> model = Word2Vec(min_count=1) - >>> model.build_vocab(sentences) - >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + >>> model.build_vocab(input_streams=input_streams) + >>> model.train(input_streams=input_streams, total_examples=model.corpus_count, epochs=model.iter) """ return super(Word2Vec, self).train( - sentences, multistream=multistream, total_examples=total_examples, total_words=total_words, + sentences=sentences, input_streams=input_streams, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) @@ -1265,12 +1264,12 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule): utils.trim_vocab_by_freq(self.raw_vocab, self.max_vocab_size, trim_rule=trim_rule) return total_words, total_sentences - def scan_vocab(self, sentences, multistream=False, progress_per=10000, workers=None, trim_rule=None): + def scan_vocab(self, sentences=None, input_streams=None, progress_per=10000, workers=None, trim_rule=None): logger.info("collecting all words and their counts") - if not multistream: + if sentences is not None: total_words, corpus_count = self._scan_vocab_singlestream(sentences, progress_per, trim_rule) else: - total_words, corpus_count = self._scan_vocab_multistream(sentences, workers, trim_rule) + total_words, corpus_count = self._scan_vocab_multistream(input_streams, workers, trim_rule) logger.info( "collected %i word types from a corpus of %i raw words and %i sentences", diff --git a/gensim/utils.py b/gensim/utils.py index 68c2dbe40d..fea2a6731e 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1712,7 +1712,7 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): def trim_vocab_by_freq(vocab, topk, trim_rule=None): """Retain `topk` most frequent words in `vocab`. - If there are more words with the same frequency as `topk`-th one, they will be keeped. + If there are more words with the same frequency as `topk`-th one, they will be kept. Modifies `vocab` in place, returns nothing. 
Parameters From 25d00cd220c3c123b928fbc801a100b02226875e Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 19 Jun 2018 14:58:37 +0300 Subject: [PATCH 32/41] fix tests --- gensim/models/base_any2vec.py | 6 +++--- gensim/models/word2vec.py | 1 - gensim/test/test_doc2vec.py | 8 ++++---- gensim/test/test_fasttext.py | 8 ++++---- gensim/test/test_word2vec.py | 10 +++++----- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 6ab2d4bfbb..066afa064d 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -86,7 +86,7 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N def _check_input_data_sanity(self, data_iterable=None, data_iterables=None): """Check that only one argument is not None.""" if not ((data_iterable is not None) ^ (data_iterables is not None)): - raise ValueError("You can't provide both singlestream and multistream arguments.") + raise ValueError("You must provide only one of singlestream or multistream arguments.") def _worker_loop(self, job_queue, progress_queue): """Train the model, lifting lists of data from the job_queue.""" @@ -591,8 +591,8 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w self.compute_loss = compute_loss self.running_training_loss = 0.0 return super(BaseWordEmbeddingsModel, self).train( - sentences=sentences, input_streams=input_streams, total_examples=total_examples, total_words=total_words, - epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, + data_iterable=sentences, data_iterables=input_streams, total_examples=total_examples, + total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) def _get_job_params(self, cur_epoch): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index cc20240c38..70d1f02273 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -618,7 +618,6 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w >>> model.train(input_streams=input_streams, total_examples=model.corpus_count, epochs=model.iter) """ - return super(Word2Vec, self).train( sentences=sentences, input_streams=input_streams, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index b38df81e28..c2aeb71a81 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -303,13 +303,13 @@ def test_multistream_training(self): input_streams = [list_corpus[:len(list_corpus) // 2], list_corpus[len(list_corpus) // 2:]] model = doc2vec.Doc2Vec(inpsize=100, min_count=2, iter=20, workers=1, seed=42) - model.build_vocab(input_streams, multistream=True, workers=1) + model.build_vocab(input_streams=input_streams, workers=1) self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100)) - model.train(input_streams, multistream=True, total_examples=model.corpus_count, epochs=model.iter) + model.train(input_streams=input_streams, total_examples=model.corpus_count, epochs=model.iter) self.model_sanity(model) # build vocab and train in one step; must be the same as above - model2 = doc2vec.Doc2Vec(input_streams, multistream=True, size=100, min_count=2, iter=20, workers=1, seed=42) + model2 = 
doc2vec.Doc2Vec(input_streams=input_streams, size=100, min_count=2, iter=20, workers=1, seed=42) # check resulted vectors; note that order of words may be different for word in model.wv.index2word: @@ -324,7 +324,7 @@ def test_multistream_build_vocab(self): # Multistream vocab model2 = doc2vec.Doc2Vec(min_count=0) input_streams = [list_corpus[:len(list_corpus) // 2], list_corpus[len(list_corpus) // 2:]] - model2.build_vocab(input_streams, multistream=True, workers=2) + model2.build_vocab(input_streams=input_streams, workers=2) multistream_vocab = model2.vocabulary.raw_vocab self.assertEqual(singlestream_vocab, multistream_vocab) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 288a003dee..545d75b1e9 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -84,10 +84,10 @@ def test_training(self): def test_multistream_training(self): input_streams = [sentences[:len(sentences) // 2], sentences[len(sentences) // 2:]] model = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42, workers=1) - model.build_vocab(input_streams, multistream=True, workers=2) + model.build_vocab(input_streams=input_streams, workers=2) self.model_sanity(model) - model.train(input_streams, multistream=True, total_examples=model.corpus_count, epochs=model.iter) + model.train(input_streams=input_streams, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) self.assertEqual(model.wv.syn0.shape, (12, 5)) @@ -103,7 +103,7 @@ def test_multistream_training(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(input_streams, multistream=True, size=5, min_count=1, hs=1, negative=0, seed=42, workers=1) + model2 = FT_gensim(input_streams=input_streams, size=5, min_count=1, hs=1, negative=0, seed=42, workers=1) self.models_equal(model, model2) # verify oov-word vector retrieval @@ -122,7 +122,7 @@ def test_multistream_build_vocab(self): # Multistream vocab model2 = FT_gensim(size=5, min_count=1, hs=1, negative=0, seed=42) input_streams = [list_corpus[:len(list_corpus) // 2], list_corpus[len(list_corpus) // 2:]] - model2.build_vocab(input_streams, multistream=True, workers=2) + model2.build_vocab(input_streams=input_streams, workers=2) multistream_vocab = model2.vocabulary.raw_vocab self.assertEqual(singlestream_vocab, multistream_vocab) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index aaf61aacde..c2ee97062d 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -175,7 +175,7 @@ def testMultiStreamBuildVocab(self): # Multistream vocab model = word2vec.Word2Vec(min_count=0) input_streams = [sentences[:len(sentences) // 2], sentences[len(sentences) // 2:]] - model.build_vocab(input_streams, multistream=True, workers=2) + model.build_vocab(input_streams=input_streams, workers=2) multistream_vocab = model.vocabulary.raw_vocab self.assertEqual(singlestream_vocab, multistream_vocab) @@ -499,12 +499,12 @@ def testMultistreamTraining(self): # build vocabulary, don't train yet input_streams = [sentences[:len(sentences) // 2], sentences[len(sentences) // 2:]] model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0, workers=1, seed=42) - model.build_vocab(input_streams, multistream=True) + model.build_vocab(input_streams=input_streams) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) - model.train(input_streams, 
total_examples=model.corpus_count, epochs=model.iter, multistream=True) + model.train(input_streams=input_streams, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # test querying for "most similar" by vector @@ -514,8 +514,8 @@ def testMultistreamTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(input_streams, size=2, min_count=1, hs=1, negative=0, - multistream=True, workers=1, seed=42) + model2 = word2vec.Word2Vec(input_streams=input_streams, size=2, min_count=1, hs=1, negative=0, + workers=1, seed=42) self.models_equal(model, model2) def testScoring(self): From 2281265d7a46b2615d70085c78096b4c3bcf89c4 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 19 Jun 2018 15:03:38 +0300 Subject: [PATCH 33/41] fix flake 8 --- gensim/models/doc2vec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 58c2480452..3c9b4ebfc8 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -954,7 +954,8 @@ def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule): self.raw_vocab = vocab return total_words, corpus_count - def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_per=10000, workers=None, trim_rule=None): + def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_per=10000, workers=None, + trim_rule=None): logger.info("collecting all words and their counts") if input_streams is None: total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule) From 543a9e01195175eb82822f0790b593eba1beb3e7 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 19 Jun 2018 15:15:02 +0300 Subject: [PATCH 34/41] fix doc2vec docstrings --- gensim/models/doc2vec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 3c9b4ebfc8..22822d3fcd 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -726,7 +726,8 @@ def build_vocab(self, documents=None, input_streams=None, update=False, progress documents : {iterable of iterables, list or tuple of iterable of iterables} The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, consider an iterable that streams the documents directly from disk/network. - See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument + See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` + in :mod:`~gensim.models.doc2vec` module for such examples. input_streams : list or tuple of iterable of iterables The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible to process streams in parallel, using `workers` parameter. 
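
Review aid: patches 35-40 below all iterate on the same worker-pool logic inside `_scan_vocab_multistream`, so a condensed sketch of the shape they converge on may help. It is illustrative only: `scan_one_stream` is a hypothetical stand-in for gensim's `_scan_vocab_worker`, and the real code additionally reports progress over a `multiprocessing.Manager` queue and merges per-worker counts with `utils.merge_counts`.

    import multiprocessing
    from collections import Counter

    def scan_one_stream(stream, max_vocab_size=None):
        # Count tokens in a single stream, pruning whenever the local vocab
        # outgrows this worker's share of the global cap.
        vocab = Counter()
        for sentence in stream:
            vocab.update(sentence)
            if max_vocab_size and len(vocab) > max_vocab_size:
                vocab = Counter(dict(vocab.most_common(max_vocab_size)))
        return vocab

    def scan_vocab_multistream_sketch(input_streams, workers, max_vocab_size=None):
        # More processes than streams would sit idle (cf. PATCH 37/40).
        workers = min(workers, len(input_streams))
        pool = multiprocessing.Pool(processes=workers)
        # Split the global cap across workers; floor division keeps it an int
        # under Python 3 (cf. PATCH 38/39).
        worker_cap = max_vocab_size // workers if max_vocab_size else None
        results = [pool.apply_async(scan_one_stream, (stream, worker_cap)) for stream in input_streams]
        pool.close()
        pool.join()
        # Merge the per-worker counts into a single raw vocabulary.
        raw_vocab = Counter()
        for result in results:
            raw_vocab.update(result.get())
        return raw_vocab

Capping `workers` before the division also matters: a pool of, say, 4 processes over 2 streams would otherwise give each worker an unnecessarily small share of `max_vocab_size`.
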
From d520d68f497ae0f09afe439f8d860f0d52f185ae Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Tue, 19 Jun 2018 15:22:34 +0300
Subject: [PATCH 35/41] fix merging streams

---
 gensim/models/base_any2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index 066afa064d..897c92318d 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -222,7 +222,7 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
 
         # Chain all input streams into one, because multistream training is not supported yet.
         if data_iterables is not None:
-            data_iterable = itertools.chain(*data_iterable)
+            data_iterable = itertools.chain(*data_iterables)
         workers.append(threading.Thread(
             target=self._job_producer,
             args=(data_iterable, job_queue),

From d11a0b82e590f80a66bbf6cf621d2e7832c7466c Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Tue, 19 Jun 2018 15:28:29 +0300
Subject: [PATCH 36/41] fix doc2vec

---
 gensim/models/doc2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 22822d3fcd..c9c235f5c7 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -524,7 +524,7 @@ def train(self, documents=None, input_streams=None, total_examples=None, total_w
             List of callbacks that need to be executed/run at specific stages during training.
         """
         super(Doc2Vec, self).train(
-            documents=documents, input_streams=input_streams, total_examples=total_examples, total_words=total_words,
+            sentences=documents, input_streams=input_streams, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
 
@@ -961,7 +961,7 @@ def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_
         logger.info("collecting all words and their counts")
         if input_streams is None:
             total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule)
         else:
-            total_words, corpus_count = self._scan_vocab_multistream(documents, docvecs, workers, trim_rule)
+            total_words, corpus_count = self._scan_vocab_multistream(input_streams, docvecs, workers, trim_rule)

From ecd8f39e76c1964c08fbaca3bac4d1eff11416e2 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Tue, 19 Jun 2018 18:54:37 +0300
Subject: [PATCH 37/41] max_vocab_size -> max_vocab_size / workers

---
 gensim/models/doc2vec.py  | 7 ++++---
 gensim/models/word2vec.py | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index c9c235f5c7..3a385bcfa4 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -875,12 +875,13 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
         manager = multiprocessing.Manager()
         progress_queue = manager.Queue()
 
-        logger.info("Scanning vocab in %i processes.", min(workers, len(input_streams)))
-        pool = multiprocessing.Pool(processes=min(workers, len(input_streams)))
+        workers = min(workers, len(input_streams))
+        logger.info("Scanning vocab in %i processes.", workers)
+        pool = multiprocessing.Pool(processes=workers)
 
         results = [
             pool.apply_async(_scan_vocab_worker,
-                             (stream, progress_queue, self.max_vocab_size, trim_rule)
+                             (stream, progress_queue, self.max_vocab_size / workers, trim_rule)
             ) for stream in input_streams
         ]
         pool.close()
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 70d1f02273..55e71b62a3 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1233,11 +1233,13 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule):
 
         progress_queue = manager.Queue()
 
         logger.info("Scanning vocab in %i processes.", min(workers, len(input_streams)))
-        pool = multiprocessing.Pool(processes=min(workers, len(input_streams)))
+
+        workers = min(workers, input_streams)
+        pool = multiprocessing.Pool(processes=workers)
 
         results = [
             pool.apply_async(_scan_vocab_worker,
-                             (stream, progress_queue, self.max_vocab_size, trim_rule)
+                             (stream, progress_queue, self.max_vocab_size / workers, trim_rule)
             ) for stream in input_streams
         ]
         pool.close()

From a96d5a44bfbaf8366b6e65b64d3dc3776a3356f9 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Tue, 19 Jun 2018 18:58:05 +0300
Subject: [PATCH 38/41] fixed

---
 gensim/models/doc2vec.py  | 3 ++-
 gensim/models/word2vec.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 3a385bcfa4..f669d78839 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -879,9 +879,10 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
         logger.info("Scanning vocab in %i processes.", workers)
         pool = multiprocessing.Pool(processes=workers)
 
+        worker_max_vocab_size = self.max_vocab_size / workers if self.max_vocab_size else None
         results = [
             pool.apply_async(_scan_vocab_worker,
-                             (stream, progress_queue, self.max_vocab_size / workers, trim_rule)
+                             (stream, progress_queue, worker_max_vocab_size, trim_rule)
             ) for stream in input_streams
         ]
         pool.close()
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 55e71b62a3..e9609f6c6e 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1237,9 +1237,10 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule):
 
         workers = min(workers, input_streams)
         pool = multiprocessing.Pool(processes=workers)
 
+        worker_max_vocab_size = self.max_vocab_size / workers if self.max_vocab_size else None
         results = [
             pool.apply_async(_scan_vocab_worker,
                              (stream, progress_queue, worker_max_vocab_size, trim_rule)
             ) for stream in input_streams
         ]
         pool.close()

From 0a327b0cb1eb01b23cf0db17ebfdc6caa2da2119 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Tue, 19 Jun 2018 19:02:23 +0300
Subject: [PATCH 39/41] / -> // (py3 division)

---
 gensim/models/doc2vec.py  | 2 +-
 gensim/models/word2vec.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index f669d78839..05276b1e2a 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -879,7 +879,7 @@ def _scan_vocab_multistream(self, input_streams, docvecs, workers, trim_rule):
         logger.info("Scanning vocab in %i processes.", workers)
         pool = multiprocessing.Pool(processes=workers)
 
-        worker_max_vocab_size = self.max_vocab_size / workers if self.max_vocab_size else None
+        worker_max_vocab_size = self.max_vocab_size // workers if self.max_vocab_size else None
         results = [
             pool.apply_async(_scan_vocab_worker,
                              (stream, progress_queue, worker_max_vocab_size, trim_rule)
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index e9609f6c6e..529e6b7f36 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1237,7 +1237,7 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule):
 
         workers = min(workers, input_streams)
         pool = multiprocessing.Pool(processes=workers)
 
-        worker_max_vocab_size = self.max_vocab_size / workers if self.max_vocab_size else None
+        worker_max_vocab_size = self.max_vocab_size // workers if self.max_vocab_size else None
         results = [
             pool.apply_async(_scan_vocab_worker,
                              (stream, progress_queue, worker_max_vocab_size, trim_rule)

From 62873fb713f1bf444770980e91da8f5a012be699 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Wed, 20 Jun 2018 00:56:55 +0300
Subject: [PATCH 40/41] fix

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 529e6b7f36..26429269d6 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1234,7 +1234,7 @@ def _scan_vocab_multistream(self, input_streams, workers, trim_rule):
 
         logger.info("Scanning vocab in %i processes.", min(workers, len(input_streams)))
 
-        workers = min(workers, input_streams)
+        workers = min(workers, len(input_streams))
         pool = multiprocessing.Pool(processes=workers)

From c67f96477aa1da40ed0265e290c36e9609f016df Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Wed, 20 Jun 2018 14:36:12 +0300
Subject: [PATCH 41/41] fix docstring

---
 gensim/models/doc2vec.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index fb1ed4a406..5245b8d084 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -1028,6 +1028,7 @@ def build_vocab(self, documents=None, input_streams=None, update=False, progress
             :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
             The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as
             part of the model.
+
             The input parameters are of the following types:
                 * `word` (str) - the word we are examining
                 * `count` (int) - the word's frequency count in the corpus
@@ -1036,6 +1037,7 @@ def build_vocab(self, documents=None, input_streams=None, update=False, progress
             workers : int
                 Used if `input_streams` is passed. Determines how many processes to use for vocab building.
                 Actual number of workers is determined by `min(len(input_streams), workers)`.
+
             **kwargs
                 Additional key word arguments passed to the internal vocabulary construction.
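
Taken together, the series drops the boolean `multistream` flag in favour of an explicit `input_streams` parameter. A minimal end-to-end sketch of the final API, assembled from the docstrings and tests above (the stream contents are illustrative):

    >>> from gensim.models import Word2Vec
    >>>
    >>> # Each stream is a restartable iterable of token lists; generators are rejected.
    >>> input_streams = [[["cat", "say", "meow"]], [["dog", "say", "woof"]]]
    >>>
    >>> model = Word2Vec(min_count=1)
    >>> model.build_vocab(input_streams=input_streams, workers=2)
    >>> model.train(input_streams=input_streams, total_examples=model.corpus_count, epochs=model.iter)

Vocab scanning runs in `min(len(input_streams), workers)` processes, while training still chains the streams into a single iterator, since truly parallel multistream training is not supported yet. The tie-keeping behaviour that `trim_vocab_by_freq` settles on is easiest to see in the updated test from PATCH 30:

    >>> from gensim import utils
    >>> d = {"word1": 5, "word2": 2, "word3": 2, "word4": 1}
    >>> utils.trim_vocab_by_freq(d, topk=2)  # words tied with the topk-th frequency are kept
    >>> d
    {'word1': 5, 'word2': 2, 'word3': 2}
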