Added phrase support #135

Closed · wants to merge 2 commits
175 changes: 138 additions & 37 deletions gensim/models/word2vec.py
@@ -10,8 +10,6 @@
The training algorithm was originally ported from the C package https://code.google.com/p/word2vec/
and extended with additional functionality.

**Install Cython with `pip install cython` before use, to get the optimized word2vec training** (70x speedup [2]_).

Initialize a model with e.g.::

>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
@@ -44,7 +42,7 @@
and so on.

.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013.
.. [2] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/

"""

import logging
@@ -60,8 +58,8 @@
from numpy import zeros_like, empty, exp, dot, outer, random, dtype, get_include,\
float32 as REAL, uint32, seterr, array, uint8, vstack, argsort, fromstring

logger = logging.getLogger("gensim.models.word2vec")

logger = logging.getLogger("gensim.models.word2vec")

from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc

@@ -131,7 +129,7 @@ class Word2Vec(utils.SaveLoad):
compatible with the original word2vec implementation via `save_word2vec_format()` and `load_word2vec_format()`.

"""
def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, seed=1, workers=1, min_alpha=0.0001):
def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, seed=1, workers=1, min_alpha=0.0001, thresholds=[100], phrase_pass=0):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (utf8 strings) that will be used for training.
@@ -146,6 +144,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
`seed` = for the random number generator.
`min_count` = ignore all words with total frequency lower than this.
`workers` = use this many worker threads to train the model (=faster training with multicore machines)
`phrase_pass` = number of passes over the sentences for phrase detection (more passes allow higher-order ngrams); see the usage sketch right after this docstring.
`thresholds` = list of score thresholds for forming phrases, one per pass (higher means fewer phrases).

"""
self.vocab = {} # mapping from a word (string) to a Vocab object
@@ -157,11 +157,86 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
self.min_count = min_count
self.workers = workers
self.min_alpha = min_alpha
self.phrase_pass = int(phrase_pass)
self.thresholds = thresholds

if sentences is not None:
if phrase_pass > 0:
sentences = self.build_phrases(sentences)
self.build_vocab(sentences)
self.train(sentences)


def build_phrases(self, sentences):
"""
Generate phrases for the given sentences by merging frequent bigrams, one pass at a time.

"""
logger.info("building phrases. will run %i passes" % self.phrase_pass)

for p in range(self.phrase_pass):

# build vocab
logger.info("Start building phrases using sentences. Pass %i" % p)
self.build_phrase_vocab( sentences )

## lets train for phrases
new_sentences = self.find_bigrams(sentences, float(self.thresholds[p]) )
sentences = new_sentences;
return sentences


def find_bigrams(self, sentences, threshold):
"""
Find bigrams made of words that frequently appear together but only infrequently
in other contexts. Candidate bigrams are scored from their unigram and bigram counts.

"""
logger.info("selecting phrases with score above threshold %s" % threshold)
new_sentences = []
total_bigrams = 0
for sentence_no, sentence in enumerate(sentences):

last_word, bigram_word = None, None
last_word_count, score, bigram_count = 0, 0, 0
new_sentence = []
for word in sentence:
out_of_vocab = False
if word in self.vocab:
word_count = self.vocab[word].count
else:
word_count = 0  # avoid referencing an unassigned word_count below
out_of_vocab = True
if last_word is None:
out_of_vocab = True
else:
bigram_word = last_word + '_' + word
if bigram_word in self.vocab:
bigram_count = self.vocab[bigram_word].count
else:
out_of_vocab = True
if last_word_count < self.min_count or word_count < self.min_count:
out_of_vocab = True
if out_of_vocab:
score = 0
else:
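# phrase score, equivalent to (bigram_count - min_count) * train_words / (last_word_count * word_count),
# i.e. the same scoring the word2phrase tool in the original C package uses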
score = ( bigram_count - self.min_count ) / ( float(last_word_count) * (float(word_count) / self.train_words))

if score > threshold:
# replace the last word in the sentence with the new bigram token
new_sentence.pop()
new_sentence.append(bigram_word)
total_bigrams += 1
word_count = 0
else:
new_sentence.append( word )

last_word = word
last_word_count = word_count
new_sentences.append( new_sentence )
Owner commented:
this is not going to work -- you are creating the entire corpus as a list in memory! that wouldn't scale very well :)

the bigram sentences must be coming out as a stream (one sentence at a time).
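A minimal sketch of the streaming alternative suggested here (not part of this PR); it reuses the same `self.vocab`, `self.min_count` and `self.train_words` attributes and the same scoring as `find_bigrams` above, but yields one merged sentence at a time instead of building a list:

    def iter_bigram_sentences(self, sentences, threshold):
        """Yield phrase-merged sentences one at a time, without materializing the whole corpus."""
        for sentence in sentences:
            new_sentence = []
            last_word, last_word_count = None, 0
            for word in sentence:
                word_count = self.vocab[word].count if word in self.vocab else 0
                merge, bigram_word = False, None
                if (last_word is not None and last_word_count >= self.min_count
                        and word_count >= self.min_count):
                    bigram_word = last_word + '_' + word
                    if bigram_word in self.vocab:
                        # same scoring as find_bigrams above
                        score = ((self.vocab[bigram_word].count - self.min_count) * self.train_words /
                                 (float(last_word_count) * word_count))
                        merge = score > threshold
                if merge:
                    new_sentence[-1] = bigram_word  # replace the previous token with the merged bigram
                    word_count = 0  # block chaining a further merge off this token in the same pass
                else:
                    new_sentence.append(word)
                last_word, last_word_count = word, word_count
            yield new_sentence

`build_phrases` could then hand the next pass a small restartable wrapper around this generator, since `build_vocab` and `train` also need to iterate over the sentences.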

logger.info("collected %i bigrams for %i sentences" % ( total_bigrams, (sentence_no+1) ))
return new_sentences


def create_binary_tree(self):
"""
Create a binary Huffman tree using stored vocabulary word counts. Frequent words
@@ -194,6 +269,51 @@ def create_binary_tree(self):

logger.info("built huffman tree with maximum node depth %i" % max_depth)


def build_phrase_vocab(self, sentences):
"""
Build vocabulary from a sequence of sentences, adding every adjacent word pair (bigram) to the dictionary.
Each sentence must be a list of utf8 strings.

"""
logger.info("collecting all words, bigrams and their counts")
sentence_no, vocab = -1, {}
total_words = lambda: sum(v.count for v in vocab.itervalues())
train_words,bigram_total = 0, 0
for sentence_no, sentence in enumerate(sentences):
if sentence_no % 10000 == 0:
logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
(sentence_no, total_words(), len(vocab)))
last_word = None
for word in sentence:
train_words += 1
if word in vocab:
vocab[word].count += 1
else:
vocab[word] = Vocab(count=1)
# start of the sentence: no previous word yet
if last_word is None:
last_word = word
continue
else:
bigram_word = last_word+'_'+word
last_word = word
if bigram_word in vocab:
vocab[bigram_word].count +=1
else:
vocab[bigram_word] = Vocab(count=1)
Owner commented:
this won't scale -- there are too many word combinations, cannot store them all in memory.

the original C code solves this by pruning infrequent entries from memory from time to time (a hack).
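A hedged sketch of that pruning trick (not part of this PR; the `max_vocab_size` and `min_reduce` names are made up here):

    def prune_vocab(vocab, min_reduce):
        """Drop entries rarer than `min_reduce`, to cap memory while counting unigrams and bigrams."""
        for word in list(vocab.keys()):
            if vocab[word].count < min_reduce:
                del vocab[word]

    # inside the counting loop, something along the lines of:
    #     if len(vocab) > max_vocab_size:
    #         prune_vocab(vocab, min_reduce)
    #         min_reduce += 1  # prune more aggressively each time, as the C tool does

Counts collected this way are only approximate, which is why the comment above calls it a hack.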

bigram_total +=1
logger.info("collected %i bigrams from a corpus of %i words and %i sentences" %
(bigram_total, total_words(), sentence_no+1 ))
logger.info("collected %i word types from a corpus of %i words and %i sentences" %
(len(vocab), total_words(), sentence_no + 1))
self.train_words = train_words
# assign a unique index to each word
self.create_unique_index( vocab )
logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))



def build_vocab(self, sentences):
"""
Build vocabulary from a sequence of sentences (can be a once-only generator stream).
@@ -216,6 +336,14 @@ def build_vocab(self, sentences):
(len(vocab), total_words(), sentence_no + 1))

# assign a unique index to each word
self.create_unique_index( vocab )
# add info about each word's Huffman encoding
self.create_binary_tree()
self.reset_weights()



def create_unique_index(self, vocab):
self.vocab, self.index2word = {}, []
for word, v in vocab.iteritems():
if v.count >= self.min_count:
@@ -224,20 +352,13 @@ def build_vocab(self, sentences):
self.vocab[word] = v
logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

# add info about each word's Huffman encoding
self.create_binary_tree()
self.reset_weights()


def train(self, sentences, total_words=None, word_count=0, chunksize=100):
"""
Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
Each sentence must be a list of utf8 strings.

"""
if FAST_VERSION < 0:
import warnings
warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`")
logger.info("training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size))

if not self.vocab:
@@ -588,33 +709,13 @@ def __iter__(self):


class LineSentence(object):
def __init__(self, source):
"""Simple format: one sentence = one line; words already preprocessed and separated by whitespace.

source can be either a string or a file object

Thus, one can use this for just plain files:

sentences = LineSentence('myfile.txt')

Or for compressed files:

sentences = LineSentence(bz2.BZ2File('compressed_text.bz2'))
"""
self.source = source
def __init__(self, fname):
"""Simple format: one sentence = one line; words already preprocessed and separated by whitespace."""
self.fname = fname

def __iter__(self):
"""Iterate through the lines in the source."""
try:
# Assume it is a file-like object and try treating it as such
# Things that don't have seek will trigger an exception
self.source.seek(0)
for line in self.source:
yield line.split()
except AttributeError:
# If it didn't work like a file, use it as a string filename
for line in open(self.source):
yield line.split()
for line in open(self.fname):
yield line.split()
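Note that with this change `LineSentence` accepts only a filename, so the file-object usage from the removed docstring (e.g. `bz2.BZ2File`) would no longer work directly. A minimal usage sketch under that assumption:

    sentences = LineSentence('myfile.txt')  # one preprocessed, whitespace-separated sentence per line
    model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)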


