In [1]:
%load_ext cython
from gensim.models import Word2Vec
import numpy as np
import timeit
from nltk.corpus import brown
from re import sub
import psutil

import os

np.random.seed(42)

# Define data types for later use in cython
REAL = np.float32 
INT = np.intc

# Simple in-place normalization
def normalize_text(sentences):
    for i, s in enumerate(sentences):
        sentences[i] = [sub("[^a-zA-Z]", "", w.lower()) for w in s]

sentences = [s for s in brown.sents()]
normalize_text(sentences)

# Train standard word2vec model on preprocessed brown corpus
model = Word2Vec(size=50, iter=10, workers=psutil.cpu_count(), sg=1, window=5, negative=5, min_count=5)
model.build_vocab(sentences)
model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)

/Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Medium_Repo/Sentence Summaries Fast Please


(7377494, 11611920)

In [2]:
def sif_embeddings(sentences, model, alpha=1e-3):
    """Compute the SIF embeddings for a list of sentences

    Parameters
    ----------
    sentences : list
        The sentences to compute the embeddings for
    model : `~gensim.models.base_any2vec.BaseAny2VecModel`
        A gensim model that contains the word vectors and the vocabulary
    alpha : float, optional
        Parameter which is used to weigh each individual word based on its probability p(w).

    Returns
    -------
    numpy.ndarray 
        SIF sentence embedding matrix of dim len(sentences) * dimension
    """
    
    vlookup = model.wv.vocab  # Gives us access to word index and count
    vectors = model.wv        # Gives us access to word vectors
    size = model.vector_size  # Embedding size
    
    Z = 0
    for k in vlookup:
        Z += vlookup[k].count # Compute the normalization constant Z
    
    output = []
    
    # Iterate all sentences
    for s in sentences:
        count = 0
        v = np.zeros(size, dtype=REAL) # Summary vector
        # Iterare all words
        for w in s:
            # A word must be present in the vocabulary
            if w in vlookup:
                for i in range(size):
                    v[i] += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w][i]
                count += 1 
                
        if count > 0:
            for i in range(size):
                v[i] *= 1/count
        output.append(v)
    return np.vstack(output).astype(REAL)

In [3]:
%%timeit -n 10
sif_embeddings(sentences[:200], model)

1.69 s ± 83.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
def sif_embeddings_1(sentences, model, alpha=1e-3):
    vlookup = model.wv.vocab
    vectors = model.wv
    size = model.vector_size
    
    Z = 0
    for k in vlookup:
        Z += vlookup[k].count
    
    output = []
    for s in sentences:
        count = 0
        v = np.zeros(size, dtype=REAL)
        for w in s:
            if w in vlookup:
                v += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w]
                count += 1
        if count > 0:
            v *= 1/count
        output.append(v)
    return np.vstack(output).astype(REAL)

In [5]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_1(sentences[:200], model), atol=1e-6)

True

In [6]:
%%timeit -n 100
sif_embeddings_1(sentences[:200], model)

27.5 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
def compute_sif_weights(wv, alpha=1e-3):
        """Precompute the SIF weights

        Parameters
        ----------
        wv : `~gensim.models.keyedvectors.BaseKeyedVectors`
            A gensim keyedvectors child that contains the word vectors and the vocabulary
        alpha : float
            Parameter which is used to weigh each individual word based on its probability p(w).

        """
        if alpha > 0:
            corpus_size = 0
            wv.sif = np.zeros(shape=len(wv.vocab), dtype=REAL) 
            # Make sure to set the dtypes right for Cython

            for k in wv.index2word:
                corpus_size += wv.vocab[k].count

            for idx, k in enumerate(wv.index2word):
                pw = wv.vocab[k].count / corpus_size
                wv.sif[idx] = alpha / (alpha+pw)
        else:
            wv.sif = np.ones(shape=len(wv.vocab), dtype=REAL)

compute_sif_weights(model.wv)

In [8]:
def sif_embeddings_2(sentences, model, alpha=1e-3):
    vlookup = model.wv.vocab
    vectors = model.wv
    size = model.vector_size
    
    output = []
    for s in sentences:
        count = 0
        v = np.zeros(size, dtype=REAL)
        for w in s:
            if w in vlookup:
                v +=  vectors.sif[vlookup[w].index]*vectors[w]
                count += 1
        if count > 0:
            v *= 1/count
        output.append(v)
    return np.vstack(output).astype(REAL)

In [9]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_2(sentences[:200], model), atol=1e-6)

True

In [10]:
%%timeit -n 100
sif_embeddings_2(sentences[:200], model)

25.2 ms ± 680 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
def sif_embeddings_3(sentences, model, alpha=1e-3):
    vlookup = model.wv.vocab
    vectors = model.wv
    
    output = []
    for s in sentences:
        
        idx = [vlookup[w].index for w in s if w in vlookup]
        
        v = np.sum(vectors.vectors[idx] * vectors.sif[idx][:, None], axis=0)
        if len(idx) > 0:
            v *= 1/len(idx)
        output.append(v)
    return np.vstack(output).astype(REAL)

In [12]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_3(sentences[:200], model), atol=1e-6)

True

In [13]:
%%timeit -n 1000
sif_embeddings_3(sentences[:200], model)

6.67 ms ± 799 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
model.wv.sif_vectors = model.wv.vectors * model.wv.sif[:, None]

def sif_embeddings_4(sentences, model):
    vlookup = model.wv.vocab
    vectors = model.wv.sif_vectors
    
    output = []
    for s in sentences:
        idx = [vlookup[w].index for w in s if w in vlookup]
        v = np.sum(vectors[idx], axis=0)
        if len(idx) > 0:
            v *= 1/len(idx)
        output.append(v)
    return np.vstack(output).astype(REAL)

In [15]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_4(sentences[:200], model), atol=1e-6)

True

In [16]:
%%timeit -n 1000
sif_embeddings_4(sentences[:200], model)

4.72 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
def sif_embeddings_5(sentences, model):
    vlookup = model.wv.vocab
    vectors = model.wv.sif_vectors
    
    output = np.zeros(shape=(len(sentences), model.vector_size))
    
    for i in range(len(sentences)):
        idx = [vlookup[w].index for w in sentences[i] if w in vlookup]
        output[i] = np.sum(vectors[idx], axis=0) * ( (1/len(idx)) if len(idx)>0 else 1)
    return output.astype(REAL)

In [18]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_5(sentences[:200], model), atol=1e-6)

True

In [19]:
%%timeit -n 1000
sif_embeddings_5(sentences[:200], model)

4.02 ms ± 185 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Convert sentence to list of numbers

In [1]:
sentences_idx = [np.asarray([model.wv.vocab[w].index for w in s if w in model.wv.vocab], dtype=np.intc) for s in sentences]

NameError: name 'sentences' is not defined

In [21]:
def sif_embeddings_6(sentences, model):
    vectors = model.wv.sif_vectors
    output = np.zeros(shape=(len(sentences), model.vector_size), dtype=REAL)
    
    for i,s in enumerate(sentences):
        output[i] = np.sum(vectors[s], axis=0) * ( (1/len(s)) if len(s)>0 else 1)
    return output.astype(REAL)

In [22]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_6(sentences_idx[:200], model), atol=1e-6)

True

In [23]:
%%timeit -n 1000
sif_embeddings_6(sentences_idx[:200], model)

2.21 ms ± 11.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
%%cython
import numpy as np
cimport numpy as np
import cython

def sif_embeddings_7(sentences, model):
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors

    cdef int sentence_index, word_index, d, count = 0
    cdef float inv = 1.
    np_sum = np.sum
    
    output = np.zeros((len(sentences), size), dtype=np.float32)   
    cdef float[:,:] sv = output
    
    for sentence_index, sentence in enumerate(sentences):
        if len(sentence) > 0:
            count = 0
            for word_index in sentence:
                count += 1
                for d in range(size):
                    sv[sentence_index, d] += vectors[word_index, d]

            inv = (1./ <float>count)
            for d in range(size):
                sv[sentence_index, d] *= inv
    return output

In [25]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_7(sentences_idx[:200], model), atol=1e-6)

True

In [26]:
%%timeit -n 1000
sif_embeddings_7(sentences_idx[:200], model)

594 µs ± 3.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
%%cython
import numpy as np
cimport numpy as np
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
def sif_embeddings_8(sentences, model):
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors

    cdef int sentence_index, word_index, d, count = 0
    cdef float inv = 1.
    np_sum = np.sum
    
    output = np.zeros((len(sentences), size), dtype=np.float32)   
    cdef float[:,:] sv = output
    
    for sentence_index, sentence in enumerate(sentences):
        if len(sentence) > 0:
            count = 0
            for word_index in sentence:
                count += 1
                for d in range(size):
                    sv[sentence_index, d] += vectors[word_index, d]

            inv = (1./ <float>count)
            for d in range(size):
                sv[sentence_index, d] *= inv
    return output

In [28]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_8(sentences_idx[:200], model), atol=1e-6)

True

In [29]:
%%timeit -n 1000
sif_embeddings_8(sentences_idx[:200], model)

526 µs ± 10.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [30]:
%%cython
import numpy as np
cimport numpy as np
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
def sif_embeddings_9(sentences, model):
    cdef int size = model.vector_size
    cdef float[:,:] vectors = model.wv.sif_vectors

    np_sum = np.sum
    np_asarray = np.asarray
    
    output = np.zeros((len(sentences), size), dtype=np.float32)   
    cdef float[:,:] sv = output
    
    cdef int[:] sentence_view
    cdef int sentence_len
    
    
    for i in xrange(len(sentences)):
        if len(sentences[i]) > 0:
            sentence_view = sentences[i]
            sentence_len = len(sentences[i])
            sif_embeddings_9_cloop(size, sentence_view, sentence_len, i, vectors, sv)
        
    return output

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void sif_embeddings_9_cloop(int size, int[:] sentence_view, int sentence_len, int sentence_idx, float[:,:] vectors, float[:,:] summary_vectors) nogil:
    cdef int i,d, word_index, count = 0
    cdef float inv = 1.
    
    for i in xrange(sentence_len):
        word_index = sentence_view[i]
        count += 1
        for d in xrange(size):
            summary_vectors[sentence_idx, d] += vectors[word_index, d]
    
    inv = (1./ <float>count)
    for d in xrange(size):
        summary_vectors[sentence_idx, d] *= inv

In [31]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_9(sentences_idx[:200], model), atol=1e-6)

True

In [32]:
%%timeit -n 1000
sif_embeddings_9(sentences_idx[:200], model)

303 µs ± 11.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
%%cython

cdef extern from "voidptr.h":
    void* PyCObject_AsVoidPtr(object obj)

import numpy as np
cimport numpy as np
import cython

import scipy.linalg.blas as fblas

ctypedef np.float32_t REAL_t
ctypedef np.int32_t INT_t

ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)

ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer)

cdef REAL_t ONEF = <REAL_t>1.0
cdef int ONE = 1

@cython.boundscheck(False)
@cython.wraparound(False)
def sif_embeddings_10(sentences, model):
    cdef int size = model.vector_size
    cdef REAL_t *vectors = <REAL_t *>(np.PyArray_DATA(model.wv.sif_vectors))
    
    output = np.zeros((len(sentences), size), dtype=np.float32)   
    cdef REAL_t *sv = <REAL_t *>(np.PyArray_DATA(output))
    
    cdef INT_t *sentence_view
    
    for i in xrange(len(sentences)):
        if len(sentences[i]):
            sentence_view = <INT_t *>(np.PyArray_DATA(sentences[i]))
            sentence_len = len(sentences[i])
            sif_embeddings_10_cloop(size, sentence_view, sentence_len, i, vectors, sv)
    return output

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void sif_embeddings_10_cloop(const int size, const INT_t *sentence_view, const int sentence_len, 
                                   const int sentence_idx, const REAL_t *vectors, REAL_t *summary_vectors) nogil:
    
    cdef int i,d, word_index
    cdef REAL_t inv = ONEF, count = <REAL_t> 0.
    
    for i in xrange(sentence_len):
        count += ONEF
        word_index = sentence_view[i]
        saxpy(&size, &ONEF, &vectors[word_index * size], &ONE, &summary_vectors[sentence_idx * size], &ONE)
        
    inv = ONEF / count
    sscal(&size, &inv, &summary_vectors[sentence_idx * size], &ONE)

CompileError: command 'gcc' failed with exit status 1

In [34]:
np.allclose(sif_embeddings(sentences[:200], model), sif_embeddings_10(sentences_idx[:200], model), atol=1e-6)

True

In [35]:
%%timeit -n 1000
sif_embeddings_10(sentences_idx[:200], model)

124 µs ± 8.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [38]:
%%time
sif_embeddings(sentences, model)

CPU times: user 6min 15s, sys: 925 ms, total: 6min 16s
Wall time: 6min 17s


array([[ 0.13711028, -0.3364488 ,  0.21267296, ...,  0.0463994 ,
        -0.13668068, -0.02847308],
       [ 0.07414544, -0.297035  ,  0.09073199, ..., -0.0087327 ,
        -0.083715  , -0.05233829],
       [ 0.13163339, -0.33895665,  0.10850863, ..., -0.00495466,
        -0.1570635 , -0.03875722],
       ...,
       [ 0.01167195, -0.29158774,  0.09907509, ..., -0.00638369,
         0.0765533 , -0.06349101],
       [-0.05498754, -0.14868212,  0.17586865, ..., -0.07050015,
         0.02253566, -0.03000972],
       [-0.03447687, -0.19199331,  0.15758486, ..., -0.02365185,
         0.00581935,  0.05083567]], dtype=float32)

In [39]:
%%time
sif_embeddings_10(sentences_idx, model)

CPU times: user 238 ms, sys: 7.87 ms, total: 246 ms
Wall time: 61.9 ms


array([[ 0.13711031, -0.3364488 ,  0.21267295, ...,  0.0463994 ,
        -0.13668068, -0.02847308],
       [ 0.07414544, -0.29703495,  0.09073199, ..., -0.0087327 ,
        -0.08371499, -0.05233828],
       [ 0.13163339, -0.33895662,  0.10850862, ..., -0.00495466,
        -0.1570635 , -0.03875722],
       ...,
       [ 0.01167195, -0.29158774,  0.09907508, ..., -0.00638369,
         0.07655329, -0.063491  ],
       [-0.05498753, -0.14868212,  0.17586863, ..., -0.07050015,
         0.02253566, -0.03000972],
       [-0.03447687, -0.19199331,  0.15758485, ..., -0.02365185,
         0.00581935,  0.05083568]], dtype=float32)