# Speed comparison

In [1]:
from fse.models import Average
from fse.models.average import train_average_np
from fse.models.average_inner import train_average_cy

from fse import IndexedList

import numpy as np

import gensim.downloader as api
# Fetch the Quora duplicate-questions corpus through gensim's downloader.
data = api.load("quora-duplicate-questions")

# Keep only the first question of every record.
sentences = [record["question1"] for record in data]

# Benchmark on the first 500 questions, wrapped in an IndexedList.
s = IndexedList(sentences[:500])
print(len(s))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


500


In [2]:
import os

from gensim.models.keyedvectors import Word2VecKeyedVectors, FastTextKeyedVectors

# Directory holding the pre-trained models. Set the FSE_MODEL_DIR environment
# variable to point somewhere else; when it is unset, the location originally
# hard-coded in this notebook is used, so existing setups keep working.
MODEL_DIR = os.environ.get("FSE_MODEL_DIR", "/Volumes/Ext_HDD/Models/Static")

# mmap="r" asks gensim to memory-map the stored arrays read-only instead of
# reading them fully into RAM.
w2v = Word2VecKeyedVectors.load(os.path.join(MODEL_DIR, "google_news.model"), mmap="r")
ft = FastTextKeyedVectors.load(os.path.join(MODEL_DIR, "ft_crawl_300d_2m.model"), mmap="r")

# Test W2V Model

To test whether the fast version is available, import the variable `FAST_VERSION` from `fse.models.average`:

- `1`: the Cython version is available.
- `-1`: the Cython version is not available.

If the Cython compilation fails, you will be notified.

In [3]:
# FAST_VERSION reports whether the compiled Cython routines are usable:
# 1 means available, -1 means not available.
from fse.models.average import FAST_VERSION
FAST_VERSION

1

In [4]:
%%timeit
# Baseline: time model construction with default arguments (no lang_freq).
# NOTE: names assigned inside a %%timeit cell are not kept in the notebook
# namespace, which is why w2v_avg is built again in a later cell.
w2v_avg = Average(w2v)

2.53 ms ± 24.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
%%timeit
# Same construction, but with lang_freq="en"; the note below this cell
# attributes the extra time to inducing word frequencies.
w2v_avg = Average(w2v, lang_freq="en")

2.17 s ± 8.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The slowest part of the initialization is inducing word frequencies (`lang_freq="en"`), which is needed because some pre-trained embeddings do not come with word frequencies. This step is only necessary for the SIF and uSIF models, not for the Average model.

In [6]:
# Build the Word2Vec-based model and run the preparation steps by hand so the
# two low-level training routines can be timed in isolation in the next cells.
w2v_avg = Average(w2v)
# Scan the sentences; the returned mapping provides "max_index", used below.
statistics = w2v_avg.scan_sentences(s)
# presumably sizes/initialises the sentence-vector store (sv) — TODO confirm
w2v_avg.prep.prepare_vectors(sv=w2v_avg.sv, total_sentences=statistics["max_index"], update=False)
# Working memory that is handed to the training routines via `memory=`.
memory = w2v_avg._get_thread_working_mem()

In [7]:
%%timeit
# Time the NumPy routine, using the model's own sentence vectors as target
# and the working memory prepared in the setup cell.
train_average_np(model=w2v_avg, indexed_sentences=s, target=w2v_avg.sv.vectors, memory=memory)

20.9 ms ± 960 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
%%timeit
# Time the Cython routine with exactly the same arguments as the NumPy run.
train_average_cy(model=w2v_avg, indexed_sentences=s, target=w2v_avg.sv.vectors, memory=memory)

2.4 ms ± 52.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


For 500 sentences, the Cython version is about 8.7x faster than the NumPy version (20.9 ms vs. 2.4 ms) when using a Word2Vec type model.

In [9]:
# Correctness check: run both routines on separate zero-filled buffers shaped
# like the model's sentence vectors, then compare what each one produced.
out_w2v_np, out_w2v_cy = (np.zeros_like(w2v_avg.sv.vectors) for _ in range(2))

# Each call gets its own freshly requested working memory.
train_average_np(
    model=w2v_avg,
    indexed_sentences=s,
    target=out_w2v_np,
    memory=w2v_avg._get_thread_working_mem(),
)
train_average_cy(
    model=w2v_avg,
    indexed_sentences=s,
    target=out_w2v_cy,
    memory=w2v_avg._get_thread_working_mem(),
)

# Cell output: True when both buffers agree within np.allclose's tolerances.
np.allclose(out_w2v_np, out_w2v_cy)

True

# Test FastText Model

In [10]:
# Same manual preparation as for the Word2Vec model, now for the FastText vectors.
# NOTE: this rebinds `statistics` and `memory`, which the Word2Vec timing cells
# also read; re-run the Word2Vec setup cell before re-timing those routines.
ft_avg = Average(ft)
# Scan the sentences; the returned mapping provides "max_index", used below.
statistics = ft_avg.scan_sentences(s)
# presumably sizes/initialises the sentence-vector store (sv) — TODO confirm
ft_avg.prep.prepare_vectors(sv=ft_avg.sv, total_sentences=statistics["max_index"], update=False)
# Working memory that is handed to the training routines via `memory=`.
memory = ft_avg._get_thread_working_mem()

In [11]:
%%timeit
# Time the NumPy routine, using the FastText model's own sentence vectors as
# target and the working memory prepared in the setup cell.
train_average_np(model=ft_avg, indexed_sentences=s, target=ft_avg.sv.vectors, memory=memory)

70.6 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit
# Time the Cython routine with exactly the same arguments as the NumPy run.
train_average_cy(model=ft_avg, indexed_sentences=s, target=ft_avg.sv.vectors, memory=memory)

7.2 ms ± 52.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


With a FastText type model, the Cython routine is about 10 times faster (70.6 ms vs. 7.2 ms).

In [13]:
# Correctness check: run both routines on separate zero-filled buffers shaped
# like the model's sentence vectors, then compare what each one produced.
out_ft_np, out_ft_cy = (np.zeros_like(ft_avg.sv.vectors) for _ in range(2))

# Each call gets its own freshly requested working memory.
train_average_np(
    model=ft_avg,
    indexed_sentences=s,
    target=out_ft_np,
    memory=ft_avg._get_thread_working_mem(),
)
train_average_cy(
    model=ft_avg,
    indexed_sentences=s,
    target=out_ft_cy,
    memory=ft_avg._get_thread_working_mem(),
)

# Cell output: True when both buffers agree within np.allclose's tolerances.
np.allclose(out_ft_np, out_ft_cy)

True