In [1]:
from gensim.models import Word2Vec
import numpy as np
from numba.decorators import jit, autojit
import timeit
from nltk.corpus import brown
import cython
from re import sub
import psutil

# Seed NumPy's global random number generator.
np.random.seed(42)

# Floating-point dtype used when allocating the SIF weight array below.
REAL = np.float32

In [2]:
%load_ext cython

In [3]:
def normalize_text(sentences):
    """Lower-case every token and strip all non-ASCII-letter characters, in place.

    ``sentences`` must be an indexable, mutable sequence of token sequences.
    Each entry is replaced by a new list of cleaned tokens; tokens without any
    letters become empty strings (they are kept, not dropped). Returns ``None``.
    """
    for position, tokens in enumerate(sentences):
        cleaned = []
        for token in tokens:
            cleaned.append(sub("[^a-zA-Z]", "", token.lower()))
        sentences[position] = cleaned

In [4]:
# Copy the corpus sentences into a plain list; normalize_text assigns into it by index.
sentences = [s for s in brown.sents()]
# Cleans the tokens in place (normalize_text returns nothing).
normalize_text(sentences)

In [5]:
# Build an untrained model, then scan the corpus for the vocabulary, then train.
# NOTE(review): `size` / `iter` are the gensim 3.x keyword names; newer gensim
# releases are understood to use `vector_size` / `epochs` — confirm the pinned version.
# NOTE(review): later cells index `model.wv.vocab[w]` for every token without a
# fallback, so they presumably rely on min_count=1 keeping all tokens — verify.
model = Word2Vec(size=50, iter=10, workers=psutil.cpu_count(), sg=1, window=5, negative=5, min_count=1)
model.build_vocab(sentences)
model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)

(7963167, 11611920)

In [9]:
def compute_sif_weights(wv, sif_param=1e-3):
    """Pre-compute one summation weight per vocabulary word and store it on ``wv.sif``.

    For ``sif_param > 0`` the weight of a word is
    ``sif_param / (sif_param + p(word))`` where ``p(word)`` is the word's count
    divided by the sum of all counts in ``wv.vocab``. Weights are laid out in
    ``wv.index2word`` order. For any other ``sif_param`` every weight is 1.

    The result is a ``REAL`` array of length ``len(wv.vocab)``; nothing is returned.
    """
    vocabulary = wv.vocab

    # Non-positive parameter: uniform weights, no frequency statistics needed.
    if not sif_param > 0:
        wv.sif = np.ones(shape=len(vocabulary), dtype=REAL)
        return

    weights = np.zeros(shape=len(vocabulary), dtype=REAL)
    total_count = sum(entry.count for entry in vocabulary.values())

    for position, word in enumerate(wv.index2word):
        probability = vocabulary[word].count / total_count
        weights[position] = sif_param / (sif_param + probability)

    wv.sif = weights

In [10]:
# Attach the weight array to the trained model's keyed vectors as `model.wv.sif` (default sif_param).
compute_sif_weights(model.wv)

In [38]:
def sentence_summary(sentences, model):
    """Baseline: sum the word vectors of each sentence via ``model.wv[...]`` lookups.

    Each sentence is passed as-is to ``model.wv.__getitem__``; the rows it
    returns are summed along axis 0. Returns the per-sentence sums stacked
    into one array, one row per sentence.
    """
    keyed_vectors = model.wv
    reduce_sum = np.sum  # local alias, resolved once outside the loop

    summaries = []
    for tokens in sentences:
        summaries.append(reduce_sum(keyed_vectors[tokens], axis=0))
    return np.vstack(summaries)

def sentence_summary_0(sentences, model):
    """Sum word vectors per sentence, resolving words to row indices by hand.

    Every word is mapped to ``model.wv.vocab[word].index`` and the matching
    rows of ``model.wv.vectors`` are summed along axis 0. Returns the
    per-sentence sums stacked into one array, one row per sentence.
    """
    vocabulary = model.wv.vocab
    embedding_matrix = model.wv.vectors
    reduce_sum = np.sum  # local alias, resolved once outside the loop

    summaries = []
    for tokens in sentences:
        row_ids = [vocabulary[token].index for token in tokens]

        summaries.append(reduce_sum(embedding_matrix[row_ids], axis=0))
    return np.vstack(summaries)

In [13]:
%%timeit -n 1000
sentence_summary(sentences[:200], model)

14.9 ms ± 736 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit -n 1000
sentence_summary_0(sentences[:200], model)

3.31 ms ± 42.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# Pre-resolve every sentence to an array of vocabulary row indices so the
# variants below can skip the per-word dictionary lookup. np.intc matches the
# C `int` element type of the `int[:]` memoryviews used in the Cython cells.
sentences_idx = [np.asarray([model.wv.vocab[w].index for w in s], dtype=np.intc) for s in sentences]

In [16]:
def sentence_summary_1(sentences, model):
    """Sum word vectors per sentence from pre-computed row indices (loop + append).

    Each element of ``sentences`` is used directly to index
    ``model.wv.vectors``; the selected rows are summed along axis 0. Returns
    the per-sentence sums stacked into one array, one row per sentence.
    """
    embedding_matrix = model.wv.vectors
    reduce_sum = np.sum  # local alias, resolved once outside the loop

    summaries = []
    for row_ids in sentences:
        summaries.append(reduce_sum(embedding_matrix[row_ids], axis=0))
    return np.vstack(summaries)

def sentence_summary_2(sentences, model):
    """Sum word vectors per sentence from pre-computed row indices (list comprehension).

    Same computation as the loop-and-append variant, expressed as a single
    comprehension fed to ``np.vstack``. Returns one row per sentence.
    """
    embedding_matrix = model.wv.vectors
    reduce_sum = np.sum  # local alias, resolved once outside the comprehension

    return np.vstack([reduce_sum(embedding_matrix[row_ids], axis=0) for row_ids in sentences])

In [17]:
%%timeit -n 1000
sentence_summary_1(sentences_idx[:200], model)

1.92 ms ± 44.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit -n 1000
sentence_summary_2(sentences_idx[:200], model)

2.13 ms ± 69 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [19]:
%%cython
import numpy as np
cimport numpy as np
import cython

def sentence_summary_3(sentences, model):
    """Sum word vectors per sentence using typed memoryviews (bounds checks left on).

    ``sentences`` is an iterable of iterables of integer row indices into
    ``model.wv.vectors``; ``model.wv.vectors`` must be acceptable as a 2-D
    ``float`` memoryview. Returns a new ``(len(sentences), model.vector_size)``
    float32 array whose row ``k`` is the sum of the rows selected by sentence ``k``.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.vectors

    cdef int sentence_index, word_index, d

    output = np.zeros((len(sentences), size), dtype=np.float32)
    # `sv` is a typed view onto `output`; writes through it fill the returned array.
    cdef float[:,:] sv = output

    for sentence_index, sentence in enumerate(sentences):
        # Each element of `sentence` is converted to a C int on assignment to word_index.
        for word_index in sentence:
            for d in range(size):
                sv[sentence_index, d] += vectors[word_index, d]

    return output

@cython.boundscheck(False)
@cython.wraparound(False)
def sentence_summary_4(sentences, model):
    """Sum word vectors per sentence using typed memoryviews, bounds checks disabled.

    ``sentences`` is an iterable of iterables of integer row indices into
    ``model.wv.vectors``. Because ``boundscheck`` and ``wraparound`` are turned
    off, every index must already lie in ``[0, number_of_rows)``; negative or
    out-of-range indices are not detected.

    Returns a new ``(len(sentences), model.vector_size)`` float32 array whose
    row ``k`` is the sum of the rows selected by sentence ``k``.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.vectors

    cdef int sentence_index, word_index, d

    output = np.zeros((len(sentences), size), dtype=np.float32)
    # `sv` is a typed view onto `output`; writes through it fill the returned array.
    cdef float[:,:] sv = output

    for sentence_index, sentence in enumerate(sentences):
        # Each element of `sentence` is converted to a C int on assignment to word_index.
        for word_index in sentence:
            for d in range(size):
                sv[sentence_index, d] += vectors[word_index, d]

    return output

def sentence_summary_5(sentences, model):
    """Sum word vectors per sentence; the inner loops run in a nogil cdef helper.

    ``sentences`` must support ``len()`` and integer indexing, and every
    element must be acceptable as a 1-D C ``int`` memoryview (for example a
    NumPy array of dtype ``np.intc``) holding row indices into
    ``model.wv.vectors``.

    Returns a new ``(len(sentences), model.vector_size)`` float32 array whose
    row ``k`` is the sum of the rows selected by sentence ``k``.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.vectors

    output = np.zeros((len(sentences), size), dtype=np.float32)
    # `sv` is a typed view onto `output`; writes through it fill the returned array.
    cdef float[:,:] sv = output

    cdef int[:] sentence_view
    cdef int sentence_len
    # Typed loop index: it is handed to a C `int` parameter on every iteration.
    cdef int i
    cdef int n_sentences = len(sentences)

    for i in range(n_sentences):
        # Index the container once; the length comes from the acquired view.
        sentence_view = sentences[i]
        sentence_len = <int>sentence_view.shape[0]
        sentence_summary_5_cloop(size, sentence_view, sentence_len, i, vectors, sv)

    return output

cdef void sentence_summary_5_cloop(int size, int[:] sentence_view, int sentence_len, int sentence_idx, float[:,:] vectors, float[:,:] summary_vectors) nogil:
    # Adds, for each of the first `sentence_len` entries of `sentence_view`,
    # the first `size` components of the selected `vectors` row onto row
    # `sentence_idx` of `summary_vectors`. Declared nogil.
    cdef int pos, dim, row

    for pos in range(sentence_len):
        row = sentence_view[pos]

        for dim in range(size):
            summary_vectors[sentence_idx, dim] += vectors[row, dim]
            
            
@cython.boundscheck(False)
@cython.wraparound(False)
def sentence_summary_6(sentences, model):
    """Sum word vectors per sentence in a nogil cdef helper, bounds checks disabled.

    ``sentences`` must support ``len()`` and integer indexing, and every
    element must be acceptable as a 1-D C ``int`` memoryview (for example a
    NumPy array of dtype ``np.intc``) holding row indices into
    ``model.wv.vectors``. With ``boundscheck`` and ``wraparound`` off, the
    indices are not validated.

    Returns a new ``(len(sentences), model.vector_size)`` float32 array whose
    row ``k`` is the sum of the rows selected by sentence ``k``.
    """
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.vectors

    output = np.zeros((len(sentences), size), dtype=np.float32)
    # `sv` is a typed view onto `output`; writes through it fill the returned array.
    cdef float[:,:] sv = output

    cdef int[:] sentence_view
    cdef int sentence_len
    # Typed loop index: it is handed to a C `int` parameter on every iteration.
    cdef int i
    cdef int n_sentences = len(sentences)

    for i in range(n_sentences):
        # Index the container once; the length comes from the acquired view.
        sentence_view = sentences[i]
        sentence_len = <int>sentence_view.shape[0]
        sentence_summary_6_cloop(size, sentence_view, sentence_len, i, vectors, sv)

    return output

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void sentence_summary_6_cloop(int size, int[:] sentence_view, int sentence_len, int sentence_idx, float[:,:] vectors, float[:,:] summary_vectors) nogil:
    # Adds, for each of the first `sentence_len` entries of `sentence_view`,
    # the first `size` components of the selected `vectors` row onto row
    # `sentence_idx` of `summary_vectors`. Declared nogil; memoryview bounds
    # checks and negative-index wraparound are switched off by the decorators.
    cdef int pos, dim, row

    for pos in range(sentence_len):
        row = sentence_view[pos]

        for dim in range(size):
            summary_vectors[sentence_idx, dim] += vectors[row, dim]

In [20]:
%%timeit -n 1000
sentence_summary_3(sentences_idx[:200], model)

621 µs ± 5.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
%%timeit -n 1000
sentence_summary_4(sentences_idx[:200], model)

534 µs ± 10.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
%%timeit -n 1000
sentence_summary_5(sentences_idx[:200], model)

366 µs ± 5.58 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%%timeit -n 1000
sentence_summary_6(sentences_idx[:200], model)

292 µs ± 15.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
%%cython

# Use Cython's bundled PyCapsule declarations instead of a header at an
# absolute path on one developer's machine, so the cell compiles anywhere.
from cpython.pycapsule cimport PyCapsule_GetPointer

import numpy as np
cimport numpy as np
import cython

import scipy.linalg.blas as fblas

ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t

# C signature of single-precision AXPY (Y += alpha * X) as called through a raw pointer.
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
# `_cpointer` is read as a PyCapsule with a NULL name wrapping the BLAS routine.
# PyCapsule_GetPointer raises if it is not, instead of silently yielding NULL.
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCapsule_GetPointer(fblas.saxpy._cpointer, NULL)

# Constants passed by address to saxpy: alpha = 1.0 and unit strides.
cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1

@cython.boundscheck(False)
@cython.wraparound(False)
def sentence_summary_7(sentences, model):
    """Sum word vectors per sentence with BLAS saxpy on raw data pointers.

    ``model.wv.vectors`` must be a C-contiguous float32 array of shape
    ``(n_words, model.vector_size)``; this is verified once per call and a
    ``ValueError`` is raised otherwise.

    ``sentences`` must support ``len()`` and integer indexing. Each element
    must be a C-contiguous 1-D NumPy array of 32-bit integers holding valid
    row indices. The elements are read through raw pointers and are NOT
    validated here (dtype, contiguity and index range are the caller's
    responsibility).

    Returns a new ``(len(sentences), model.vector_size)`` float32 array whose
    row ``k`` is the sum of the rows selected by sentence ``k``.
    """
    cdef int size = model.vector_size

    # One-time check outside the hot loop: the pointer arithmetic below assumes
    # a dense row-major float32 matrix with `size` columns.
    vectors_obj = model.wv.vectors
    if vectors_obj.dtype != np.float32 or not vectors_obj.flags.c_contiguous:
        raise ValueError("model.wv.vectors must be a C-contiguous float32 array")
    if vectors_obj.ndim != 2 or vectors_obj.shape[1] != size:
        raise ValueError("model.wv.vectors must have shape (n_words, model.vector_size)")
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(vectors_obj))

    output = np.zeros((len(sentences), size), dtype=np.float32)
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))

    cdef INT_t *sentence_view
    # Typed so the values handed to the C helper need no per-iteration unboxing.
    cdef int sentence_len
    cdef int i
    cdef int n_sentences = len(sentences)

    for i in range(n_sentences):
        # Hold a reference to the array for as long as its raw pointer is in use.
        sentence = sentences[i]
        sentence_view = <INT_t *>(np.PyArray_DATA(sentence))
        sentence_len = len(sentence)

        sentence_summary_7_cloop(size, sentence_view, sentence_len, i, vectors, sv)

    return output

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void sentence_summary_7_cloop(const int size, const INT_t *sentence_view, const int sentence_len,
                                   const int sentence_idx, const REAL_t *vectors, REAL_t *summary_vectors) nogil:
    # For each of the `sentence_len` indices in `sentence_view`, adds the
    # `size`-float row at that index of `vectors` onto row `sentence_idx` of
    # `summary_vectors` via saxpy with alpha = 1 and unit strides.
    # Both buffers are addressed as flat row-major arrays with `size` columns.
    cdef int i
    # Row offsets are computed in Py_ssize_t: `index * size` in 32-bit int
    # arithmetic can overflow for large matrices.
    cdef Py_ssize_t row_width = size
    # The destination row is the same for every word, so resolve it once.
    cdef REAL_t *target_row = &summary_vectors[sentence_idx * row_width]

    for i in range(sentence_len):
        saxpy(&size, &ONEF, &vectors[sentence_view[i] * row_width], &ONE, target_row, &ONE)

In [25]:
# Sanity check: the saxpy-based variant should agree with the memoryview variant on the same input.
np.allclose(sentence_summary_7(sentences_idx[:200], model), sentence_summary_6(sentences_idx[:200], model))

True

In [26]:
%%timeit -n 1000
sentence_summary_7(sentences_idx[:200], model)

116 µs ± 4.84 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
