Commit

clean up pure-python code paths in word2vec
piskvorky committed Nov 21, 2015
1 parent a70c0c1 commit da0b269
Showing 1 changed file with 63 additions and 69 deletions.
132 changes: 63 additions & 69 deletions gensim/models/word2vec.py
@@ -105,55 +105,59 @@
FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000

-def train_sentence_sg(model, sentence, alpha, work=None):
+def train_batch_sg(model, sentences, alpha, work=None):
"""
-    Update skip-gram model by training on a single sentence.
+    Update skip-gram model by training on a sequence of sentences.
-    The sentence is a list of string tokens, which are looked up in the model's
+    Each sentence is a list of string tokens, which are looked up in the model's
vocab dictionary. Called internally from `Word2Vec.train()`.
This is the non-optimized, Python version. If you have cython installed, gensim
will use the optimized version from word2vec_inner instead.
"""
-    word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
-                   model.vocab[w].sample_int > model.random.rand() * 2**32]
-    for pos, word in enumerate(word_vocabs):
-        reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
-
-        # now go over all words from the (reduced) window, predicting each one in turn
-        start = max(0, pos - model.window + reduced_window)
-        for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
-            # don't train on the `word` itself
-            if pos2 != pos:
-                train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
-
-    return len(word_vocabs)
+    result = 0
+    for sentence in sentences:
+        word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
+                       model.vocab[w].sample_int > model.random.rand() * 2**32]
+        for pos, word in enumerate(word_vocabs):
+            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
+
+            # now go over all words from the (reduced) window, predicting each one in turn
+            start = max(0, pos - model.window + reduced_window)
+            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
+                # don't train on the `word` itself
+                if pos2 != pos:
+                    train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
+        result += len(word_vocabs)
+    return result
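The `model.vocab[w].sample_int > model.random.rand() * 2**32` test above is the frequent-word downsampling step: `sample_int` holds the word's keep-probability pre-scaled to the 32-bit integer range. A minimal standalone sketch of just that comparison, assuming a made-up keep-probability of 0.25 (in the real model the value is derived from the `sample` parameter and the word's corpus frequency):

```python
# Illustration only -- not part of the patch.
import numpy as np

random = np.random.RandomState(1)
sample_int = int(0.25 * 2**32)   # hypothetical keep-probability of 0.25, pre-scaled to 2**32
trials = 10000
kept = sum(sample_int > random.rand() * 2**32 for _ in range(trials))
print(kept / float(trials))      # ~0.25: about a quarter of the occurrences survive subsampling
```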

-def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
+def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
"""
-    Update CBOW model by training on a single sentence.
+    Update CBOW model by training on a sequence of sentences.
-    The sentence is a list of string tokens, which are looked up in the model's
+    Each sentence is a list of string tokens, which are looked up in the model's
vocab dictionary. Called internally from `Word2Vec.train()`.
This is the non-optimized, Python version. If you have cython installed, gensim
will use the optimized version from word2vec_inner instead.
"""
-    word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
-                   model.vocab[w].sample_int > model.random.rand() * 2**32]
-    for pos, word in enumerate(word_vocabs):
-        reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
-        start = max(0, pos - model.window + reduced_window)
-        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
-        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
-        l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x vector_size
-        if word2_indices and model.cbow_mean:
-            l1 /= len(word2_indices)
-        train_cbow_pair(model, word, word2_indices, l1, alpha)
-
-    return len(word_vocabs)
+    result = 0
+    for sentence in sentences:
+        word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
+                       model.vocab[w].sample_int > model.random.rand() * 2**32]
+        for pos, word in enumerate(word_vocabs):
+            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
+            start = max(0, pos - model.window + reduced_window)
+            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
+            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
+            l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x vector_size
+            if word2_indices and model.cbow_mean:
+                l1 /= len(word2_indices)
+            train_cbow_pair(model, word, word2_indices, l1, alpha)
+        result += len(word_vocabs)
+    return result
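In the CBOW path, `l1` is the sum of the context words' input vectors, averaged when `cbow_mean` is set. A toy numpy sketch of just that step, with made-up vectors standing in for rows of `model.syn0`:

```python
# Illustration only -- not part of the patch.
import numpy as np

syn0 = np.array([[0.1, 0.2, 0.3, 0.4],    # toy input vectors, vector_size=4
                 [0.5, 0.6, 0.7, 0.8],
                 [0.9, 1.0, 1.1, 1.2]], dtype=np.float32)
word2_indices = [0, 2]                     # indices of the surviving context words
cbow_mean = True

l1 = np.sum(syn0[word2_indices], axis=0)   # 1 x vector_size
if word2_indices and cbow_mean:
    l1 /= len(word2_indices)               # average instead of sum
print(l1)                                  # -> [0.5 0.6 0.7 0.8]
```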

def score_sentence_sg(model, sentence, work=None):
"""
@@ -178,9 +182,9 @@ def score_sentence_sg(model, sentence, work=None):

# now go over all words from the window, predicting each one in turn
start = max(0, pos - model.window)
-        for pos2, word2 in enumerate(sentence[start:(pos + model.window + 1)], start):
+        for pos2, word2 in enumerate(word_vocabs[start : pos + model.window + 1], start):

piskvorky (author, owner) commented on Nov 21, 2015:

@mataddy the pure-Python code paths in scoring were failing with exceptions. This was my fix -- please review (also a few code simplifications below).

# don't train on OOV words and on the `word` itself
-            if word2 and not (pos2 == pos):
+            if word2 is not None and pos2 != pos:
log_prob_sentence += score_sg_pair(model, word, word2)

return log_prob_sentence
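The review note above points at the root cause: the old loop sliced the raw `sentence` (plain string tokens), while `score_sg_pair` expects vocabulary entries with an `.index` attribute, so the pure-Python scoring path blew up; slicing `word_vocabs` fixes that. As a side note, scoring uses the full symmetric window, with no random `reduced_window` as in training. A toy sketch of that windowing, with strings standing in for vocab entries and an assumed `window` of 2:

```python
# Illustration only -- the scoring window is symmetric and not randomly shrunk.
window = 2
word_vocabs = ["graph", "minors", "survey", "trees"]   # stand-ins for Vocab entries

for pos, word in enumerate(word_vocabs):
    start = max(0, pos - window)
    context = [w for pos2, w in enumerate(word_vocabs[start:pos + window + 1], start)
               if pos2 != pos]
    print(word, "->", context)
# graph -> ['minors', 'survey']
# minors -> ['graph', 'survey', 'trees']
# survey -> ['graph', 'minors', 'trees']
# trees -> ['minors', 'survey']
```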
@@ -206,7 +210,7 @@ def score_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
continue # OOV word in the input sentence => skip

start = max(0, pos - model.window)
-        window_pos = enumerate(sentence[start:(pos + model.window + 1)], start)
+        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start)
word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
l1 = np_sum(model.syn0[word2_indices], axis=0) # 1 x layer1_size
if word2_indices and model.cbow_mean:
@@ -850,13 +854,6 @@ def job_producer():
self.clear_sims()
return trained_word_count

-    def _score_job_words(self, sentence, inits):
-        work, neu1 = inits
-        if self.sg:
-            return score_sentence_sg(self, sentence, work)
-        else:
-            return score_sentence_cbow(self, sentence, work, neu1)

# basics copied from the train() function
def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1):
"""
@@ -889,31 +886,25 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1):
if not self.hs:
raise RuntimeError("we have only implemented score for hs")

-        def worker_init():
-            work = zeros(1, dtype=REAL)  # for sg hs, we actually only need one memory loc (running sum)
-            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
-            return (work, neu1)
-
-        def worker_one_job(job, inits):
-            if job is None:  # signal to finish
-                return False
-            ns = 0
-            for (id, sentence) in job:
-                if id < total_sentences:
-                    sentence_scores[id] = self._score_job_words(sentence, inits)
-                    ns += 1
-                else:
-                    break
-            progress_queue.put(ns)  # report progress
-            return True

def worker_loop():
"""Train the model, lifting lists of sentences from the jobs queue."""
-            init = worker_init()
+            work = zeros(1, dtype=REAL)  # for sg hs, we actually only need one memory loc (running sum)
+            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
while True:
job = job_queue.get()
-                if not worker_one_job(job, init):
+                if job is None:  # signal to finish
break
+                ns = 0
+                for sentence_id, sentence in job:
+                    if sentence_id >= total_sentences:
+                        break
+                    if self.sg:
+                        score = score_sentence_sg(self, sentence, work)
+                    else:
+                        score = score_sentence_cbow(self, sentence, work, neu1)
+                    sentence_scores[sentence_id] = score
+                    ns += 1
+                progress_queue.put(ns)  # report progress

start, next_report = default_timer(), 1.0
# buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
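The reworked `worker_loop` dispatches per sentence to `score_sentence_sg` or `score_sentence_cbow` inline, replacing the removed `_score_job_words` helper. A hedged usage sketch of the public entry point; the corpus and parameters are made up, and `hs=1` is set because `score()` is only implemented for hierarchical softmax:

```python
# Usage sketch, not from the patch; corpus and parameters are illustrative.
from gensim.models import Word2Vec

sentences = [["human", "interface", "computer"],
             ["graph", "minors", "survey"]]
model = Word2Vec(sentences, min_count=1, hs=1, negative=0, sg=1)
log_probs = model.score(sentences, total_sentences=len(sentences))
print(log_probs)   # one log-probability per scored sentence
```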
@@ -936,8 +927,10 @@ def worker_loop():
while True:
try:
job_no, items = next(jobs_source)
-                if (job_no-1)*chunksize > total_sentences:
-                    logger.warning("terminating after %i sentences (set higher total_sentences if you want more)."%total_sentences)
+                if (job_no - 1) * chunksize > total_sentences:
+                    logger.warning(
+                        "terminating after %i sentences (set higher total_sentences if you want more).",
+                        total_sentences)
job_no -= 1
raise StopIteration()
logger.debug("putting job #%i in the queue", job_no)
@@ -950,15 +943,15 @@
job_queue.put(None) # give the workers heads up that they can finish -- no more work!
push_done = True
try:
-            while done_jobs < (job_no+1) or not push_done:
+            while done_jobs < (job_no + 1) or not push_done:
ns = progress_queue.get(push_done) # only block after all jobs pushed
sentence_count += ns
done_jobs += 1
elapsed = default_timer() - start
if elapsed >= next_report:
logger.info(
"PROGRESS: at %.2f%% sentences, %.0f sentences/s",
100.0 * sentence_count, sentence_count / elapsed)
"PROGRESS: at %.2f%% sentences, %.0f sentences/s",
100.0 * sentence_count, sentence_count / elapsed)
next_report = elapsed + report_delay # don't flood log, wait report_delay seconds
else:
# loop ended by job count; really done
@@ -968,8 +961,9 @@

elapsed = default_timer() - start
self.clear_sims()
logger.info("scoring %i sentences took %.1fs, %.0f sentences/s"
% (sentence_count, elapsed, sentence_count / elapsed if elapsed else 0.0))
logger.info(
"scoring %i sentences took %.1fs, %.0f sentences/s",
sentence_count, elapsed, sentence_count / elapsed if elapsed else 0.0)
return sentence_scores[:sentence_count]

def clear_sims(self):

1 comment on commit da0b269

@mataddy (Contributor) commented:

Thanks @piskvorky, this is much cleaner. I like what you're doing in this branch. I also ran my notebook and the results are unchanged, so we're good to go.
