Merge pull request #1 from piskvorky/cythonize_setup
various fixes
Björn Esser committed Apr 22, 2014
2 parents 1f2c1a2 + 909012f commit a9f24b5
Showing 3 changed files with 71 additions and 69 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -9,7 +9,7 @@ before_install:
   - sudo apt-get update -qq
   - sudo apt-get install -qq libatlas-dev liblapack-dev gfortran python-numpy python-scipy
 install:
-  - pip install numpy
-  - pip install scipy
+  - pip install --quiet numpy
+  - pip install --quiet scipy
   - python setup.py install
 script: python setup.py test
123 changes: 62 additions & 61 deletions gensim/models/word2vec.py
@@ -84,86 +84,87 @@
 try:
     # try to compile and use the faster cython version
     import pyximport
-    pyximport.install(setup_args={"include_dirs": [os.path.dirname(__file__), get_include()]})
+    models_dir = os.path.dirname(__file__) or os.getcwd()
+    pyximport.install(setup_args={"include_dirs": [models_dir, get_include()]})
     from word2vec_inner import train_sentence_sg, train_sentence_cbow, FAST_VERSION
 except:
     # failed... fall back to plain numpy (20-80x slower training than the above)
     FAST_VERSION = -1

     def train_sentence_sg(model, sentence, alpha, work=None):
         """
         Update skip-gram hierarchical softmax model by training on a single sentence.

         The sentence is a list of Vocab objects (or None, where the corresponding
         word is not in the vocabulary). Called internally from `Word2Vec.train()`.

         """
         for pos, word in enumerate(sentence):
             if word is None:
                 continue  # OOV word in the input sentence => skip
             reduced_window = random.randint(model.window)  # `b` in the original word2vec code

             # now go over all words from the (reduced) window, predicting each one in turn
             start = max(0, pos - model.window + reduced_window)
             for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                 if pos2 == pos or word2 is None:
                     # don't train on OOV words and on the `word` itself
                     continue

                 l1 = model.syn0[word2.index]
                 # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance)
                 l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                 fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
                 ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                 model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

                 # TODO add negative sampling?

                 l1 += dot(ga, l2a)  # learn input -> hidden

         return len([word for word in sentence if word is not None])

     def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
         """
         Update CBOW hierarchical softmax model by training on a single sentence.

         The sentence is a list of Vocab objects (or None, where the corresponding
         word is not in the vocabulary). Called internally from `Word2Vec.train()`.

         """
         for pos, word in enumerate(sentence):
             if word is None:
                 continue  # OOV word in the input sentence => skip
             reduced_window = random.randint(model.window)  # `b` in the original word2vec code

             # Combine all context words into an appropriate input
             start = max(0, pos - model.window + reduced_window)
             l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)
             count = 0
             for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                 if pos2 == pos or word2 is None:
                     pass
                 else:
                     count += 1
                     l1 += model.syn0[word2.index]

             if count > 0:
                 l1 = l1 / count

             l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
             fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
             ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
             model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

             for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                 if pos2 == pos or word2 is None:
                     pass
                 else:
                     model.syn0[word2.index] += dot(ga, l2a)

         return len([word for word in sentence if word is not None])


class Vocab(object):
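
The vectorized step both fallbacks share is worth seeing in isolation: one sigmoid over every inner node on the target word's Huffman path, then two rank-1 updates. A minimal numpy sketch with made-up sizes (all names and shapes below are illustrative, not part of this commit):

    import numpy as np

    layer1_size, codelen = 100, 4  # toy sizes, chosen only for illustration
    alpha = 0.025  # learning rate
    l1 = np.random.rand(layer1_size).astype(np.float32)            # context-word vector, standing in for syn0[word2.index]
    l2a = np.random.rand(codelen, layer1_size).astype(np.float32)  # inner-node vectors, standing in for syn1[word.point]
    code = np.random.randint(2, size=codelen)                      # Huffman code of the target word

    fa = 1.0 / (1.0 + np.exp(-np.dot(l1, l2a.T)))  # propagate hidden -> output for all codelen nodes at once
    ga = (1 - code - fa) * alpha                   # per-node error gradients, scaled by the learning rate
    neu1e = np.dot(ga, l2a)                        # input gradient, taken from the pre-update node vectors
    l2a += np.outer(ga, l1)                        # learn hidden -> output
    l1 += neu1e                                    # learn input -> hidden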
@@ -713,7 +714,7 @@ def __iter__(self):
                         yield sentence
                     break
                 last_token = text.rfind(' ')  # the last token may have been split in two... keep it for the next iteration
-                words, rest = (utils.to_unicode(text[:last_token].split()), text[last_token:].strip()) if last_token >= 0 else ([], text)
+                words, rest = (utils.to_unicode(text[:last_token]).split(), text[last_token:].strip()) if last_token >= 0 else ([], text)
                 sentence.extend(words)
                 while len(sentence) >= max_sentence_length:
                     yield sentence[:max_sentence_length]
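
Two details in the word2vec.py hunks are easy to miss. First, os.path.dirname(__file__) is an empty string when the module is loaded from its own directory, so the new `or os.getcwd()` fallback keeps pyximport's include path non-empty. Second, the iterator fix moves a misplaced parenthesis: to_unicode is now applied to the byte string before splitting, instead of (incorrectly) to the list returned by split(). Whether the compiled path actually loaded can be checked via the FAST_VERSION flag; a quick sketch (an illustration, not part of the commit):

    # FAST_VERSION is -1 when the plain-numpy fallback is in use,
    # and >= 0 once the cython routines compiled and imported cleanly
    from gensim.models import word2vec

    if word2vec.FAST_VERSION == -1:
        print("pure-numpy fallback active (20-80x slower training)")
    else:
        print("compiled word2vec_inner active, FAST_VERSION=%d" % word2vec.FAST_VERSION)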
13 changes: 7 additions & 6 deletions setup.py
@@ -110,11 +110,12 @@ def read(fname):
 try:
     from Cython.Distutils import build_ext
     import numpy
+    models_dir = os.path.join(os.path.dirname(__file__), 'gensim', 'models')

     ext_modules = [
         Extension('gensim_addons.models.word2vec_inner',
             ['gensim_addons/models/word2vec_inner.pyx'],
-            include_dirs = [numpy.get_include()])
+            include_dirs = [models_dir, numpy.get_include()])
     ]

     native_ext = True
@@ -126,7 +127,7 @@ def read(fname):
     Please install Cython (http://cython.org/), if you
     want to use the highly optimized version of word2vec.

-    Usually you can install it using:
+    Usually you can install it (optional) using:

         pip install -U cython
@@ -136,11 +137,11 @@ def read(fname):
     or

-    the package-management of your distribution
+    the package-management of your distribution.

-    If you install Cython after installing gensim, the
-    optimized version of word2vec is automatically
-    generated on the first call of the function.
+    If you install Cython *after* installing gensim, the
+    optimized version of word2vec will still be automatically
+    generated, on the first use of word2vec.

     =========================================================
     ''')
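
The new models_dir entry mirrors the pyximport change in word2vec.py: both build routes now compile word2vec_inner.pyx with gensim/models on the include path, so support files shipped next to the pure-Python models are found either way. A post-install sanity check (a sketch, assuming the layout this setup.py declares, where the native extension lands in the gensim_addons namespace):

    # if Cython and numpy were present at install time, the optimized
    # module should be importable directly; an ImportError means the
    # pyximport or pure-Python fallback in word2vec.py will run instead
    from gensim_addons.models import word2vec_inner

    print(word2vec_inner.FAST_VERSION)  # >= 0 for the compiled implementation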
