Fix FastText RAM usage in tests (+ fixes for wheel building) (#2791)
* pin `bucket` parameter (to avoid RAM issues on CI system) + get rid of the win32 skip

* fix flake8

* partially fix doc building

* better workaround for docs build

* fix sphinx-gallery

* avoid test error

* get back loading of old models (because of their large buckets)

* Update setup.py

Co-Authored-By: Radim Řehůřek <me@radimrehurek.com>

* Update gensim/test/test_fasttext.py

Co-Authored-By: Radim Řehůřek <me@radimrehurek.com>

* define missing buckets & fix formatting

Co-authored-by: Ivan Menshikh <imenshikh@embedika.ru>
Co-authored-by: Radim Řehůřek <me@radimrehurek.com>
3 people committed Apr 13, 2020
1 parent ec222e8 commit a2ec4c3
Showing 5 changed files with 49 additions and 45 deletions.
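The common thread in the test changes below: gensim's FastText allocates a dense float32 row for every ngram hash bucket, so the ngram matrix alone costs roughly bucket * size * 4 bytes. A minimal back-of-the-envelope sketch (assuming the library's usual 2,000,000-bucket default; the numbers are illustrative):

def ngram_matrix_bytes(bucket, size, dtype_bytes=4):
    """Approximate size of FastText's `vectors_ngrams` matrix."""
    return bucket * size * dtype_bytes

default = ngram_matrix_bytes(bucket=2_000_000, size=100)  # typical defaults
pinned = ngram_matrix_bytes(bucket=5_000, size=10)        # BUCKET pinned below
print(f"default: ~{default / 2**20:.0f} MiB, pinned: ~{pinned / 2**10:.0f} KiB")
# default: ~763 MiB, pinned: ~195 KiB, hence the failures on 32-bit CI workers

Pinning bucket=BUCKET in every test keeps each model's footprint trivial, which is why most per-test win32 skips could be dropped.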
74 changes: 39 additions & 35 deletions gensim/test/test_fasttext.py
@@ -33,9 +33,12 @@
logger = logging.getLogger(__name__)

IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)

MAX_WORDVEC_COMPONENT_DIFFERENCE = 1.0e-10

+# Limit the size of FastText ngram buckets, for RAM reasons.
+# See https://github.com/RaRe-Technologies/gensim/issues/2790
+BUCKET = 5000

FT_HOME = os.environ.get("FT_HOME")
FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None

@@ -67,7 +70,7 @@ def setUp(self):
self.test_new_model_file = datapath('lee_fasttext_new.bin')

def test_training(self):
-model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(sentences)
self.model_sanity(model)

@@ -87,7 +90,7 @@ def test_training(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
-model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
self.models_equal(model, model2)

# verify oov-word vector retrieval
@@ -99,7 +102,7 @@ def test_training(self):

def testFastTextTrainParameters(self):

-model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(sentences=sentences)

self.assertRaises(TypeError, model.train, corpus_file=11111)
@@ -112,7 +115,7 @@ def test_training_fromfile(self):
with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
utils.save_as_line_sentence(sentences, corpus_file)

-model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(corpus_file=corpus_file)
self.model_sanity(model)

@@ -151,10 +154,9 @@ def models_equal(self, model, model2):
most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0]
self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word]))

-@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_persistence(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
-model = FT_gensim(sentences, min_count=1)
+model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
model.save(tmpf)
self.models_equal(model, FT_gensim.load(tmpf))
# test persistence of the KeyedVectors of a model
@@ -169,7 +171,7 @@ def test_persistence_fromfile(self):
utils.save_as_line_sentence(sentences, corpus_file)

tmpf = get_tmpfile('gensim_fasttext.tst')
-model = FT_gensim(corpus_file=corpus_file, min_count=1)
+model = FT_gensim(corpus_file=corpus_file, min_count=1, bucket=BUCKET)
model.save(tmpf)
self.models_equal(model, FT_gensim.load(tmpf))
# test persistence of the KeyedVectors of a model
@@ -179,10 +181,9 @@ def test_persistence_fromfile(self):
self.assertTrue(np.allclose(wv.vectors_ngrams, loaded_wv.vectors_ngrams))
self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))

-@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_norm_vectors_not_saved(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
-model = FT_gensim(sentences, min_count=1)
+model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
model.init_sims()
model.save(tmpf)
loaded_model = FT_gensim.load(tmpf)
@@ -406,7 +407,7 @@ def test_cbow_hs_training(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -435,7 +436,7 @@ def test_cbow_hs_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -468,7 +469,7 @@ def test_sg_hs_training(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -497,7 +498,7 @@ def test_sg_hs_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -530,7 +531,7 @@ def test_cbow_neg_training(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -559,7 +560,7 @@ def test_cbow_neg_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -592,7 +593,7 @@ def test_sg_neg_training(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -621,7 +622,7 @@ def test_sg_neg_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -650,7 +651,7 @@ def test_sg_neg_training_fromfile(self):
self.assertGreaterEqual(overlap_count, 2)

def test_online_learning(self):
-model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
+model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(new_sentences, update=True) # update vocab
@@ -664,7 +665,8 @@ def test_online_learning_fromfile(self):
utils.save_as_line_sentence(sentences, corpus_file)
utils.save_as_line_sentence(new_sentences, new_corpus_file)

-model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
+model_hs = FT_gensim(
+    corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(corpus_file=new_corpus_file, update=True) # update vocab
@@ -674,7 +676,7 @@

def test_online_learning_after_save(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
-model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
+model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
model_neg.save(tmpf)
model_neg = FT_gensim.load(tmpf)
self.assertTrue(len(model_neg.wv.vocab), 12)
@@ -689,7 +691,8 @@ def test_online_learning_after_save_fromfile(self):
utils.save_as_line_sentence(new_sentences, new_corpus_file)

tmpf = get_tmpfile('gensim_fasttext.tst')
-model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
+model_neg = FT_gensim(
+    corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
model_neg.save(tmpf)
model_neg = FT_gensim.load(tmpf)
self.assertTrue(len(model_neg.wv.vocab), 12)
@@ -720,33 +723,30 @@ def online_sanity(self, model):
sim = model.wv.n_similarity(['war'], ['terrorism'])
self.assertLess(0., sim)

-@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_sg_hs_online(self):
-model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1)
+model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
self.online_sanity(model)

-@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_sg_neg_online(self):
-model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1)
+model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
self.online_sanity(model)

-@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_cbow_hs_online(self):
model = FT_gensim(
-    sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1
+    sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1,
+    bucket=BUCKET,
)
self.online_sanity(model)

-@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_cbow_neg_online(self):
model = FT_gensim(
sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5,
-    min_count=5, iter=1, seed=42, workers=1, sample=0
+    min_count=5, iter=1, seed=42, workers=1, sample=0, bucket=BUCKET
)
self.online_sanity(model)

def test_get_vocab_word_vecs(self):
-model = FT_gensim(size=10, min_count=1, seed=42)
+model = FT_gensim(size=10, min_count=1, seed=42, bucket=BUCKET)
model.build_vocab(sentences)
original_syn0_vocab = np.copy(model.wv.vectors_vocab)
model.wv.adjust_vectors()
@@ -755,7 +755,7 @@ def test_get_vocab_word_vecs(self):
def test_persistence_word2vec_format(self):
"""Test storing/loading the model in word2vec format."""
tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
-model = FT_gensim(sentences, min_count=1, size=10)
+model = FT_gensim(sentences, min_count=1, size=10, bucket=BUCKET)
model.wv.save_word2vec_format(tmpf, binary=True)
loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
@@ -769,7 +769,7 @@ def test_bucket_ngrams(self):
self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))

def test_estimate_memory(self):
-model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
+model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3, bucket=BUCKET)
model.build_vocab(sentences)
report = model.estimate_memory()
self.assertEqual(report['vocab'], 2800)
@@ -780,6 +780,7 @@ def test_estimate_memory(self):
self.assertEqual(report['buckets_word'], 640)
self.assertEqual(report['total'], 6160)

+@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def testLoadOldModel(self):
"""Test loading fasttext models from previous version"""

@@ -835,7 +836,7 @@ def test_cbow_hs_against_wrapper(self):

model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -856,7 +857,7 @@ def test_sg_hs_against_wrapper(self):

model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-sorted_vocab=1, workers=1, min_alpha=0.0)
+sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -1334,6 +1335,7 @@ def _check_roundtrip(self, sg):
"hs": 1,
"negative": 5,
"seed": 42,
"bucket": BUCKET,
"workers": 1}

with temporary_file("roundtrip_model_to_model.bin") as fpath:
@@ -1387,6 +1389,7 @@ def _check_roundtrip_file_file(self, sg):
"min_count": 1,
"hs": 1,
"negative": 0,
"bucket": BUCKET,
"seed": 42,
"workers": 1}

@@ -1486,6 +1489,7 @@ def _check_load_fasttext_format(self, sg):
"min_count": 1,
"hs": 1,
"negative": 5,
"bucket": BUCKET,
"seed": 42,
"workers": 1}

2 changes: 1 addition & 1 deletion gensim/test/test_nmf.py
@@ -98,7 +98,7 @@ def testTransform(self):
vec = matutils.sparse2full(transformed, 2)
expected = [0.35023746, 0.64976251]
# must contain the same values, up to re-ordering
-self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-4))
+self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-3))

def testTopTopics(self):
top_topics = self.model.top_topics(common_corpus)
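The only NMF change loosens the transform tolerance by one order of magnitude. For reference, np.allclose checks |a - b| <= atol + rtol * |b| elementwise, so a short sketch of the effect (the observed values here are hypothetical):

import numpy as np

vec = [0.3504, 0.6496]               # hypothetical transformed values
expected = [0.35023746, 0.64976251]
# drift per component is ~1.6e-4; rtol=1e-3 allows ~3.5e-4, rtol=1e-4 only ~3.5e-5
print(np.allclose(sorted(vec), sorted(expected), rtol=1e-3))  # True
print(np.allclose(sorted(vec), sorted(expected), rtol=1e-4))  # False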
4 changes: 2 additions & 2 deletions gensim/test/test_similarities.py
@@ -570,7 +570,7 @@ def __iter__(self):
for line in infile:
yield line.lower().strip().split()

-model = FastText(LeeReader(datapath('lee.cor')))
+model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
model.init_sims()
index = self.indexer(model, 10)

@@ -733,7 +733,7 @@ def __iter__(self):
for line in infile:
yield line.lower().strip().split()

-model = FastText(LeeReader(datapath('lee.cor')))
+model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
model.init_sims()
index = self.indexer(model)

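Both similarity tests stream the Lee corpus through a small reader class before indexing. The same capped-bucket pattern, extracted as a standalone sketch (the file path is illustrative):

from gensim.models import FastText

class LineCorpus:
    """Stream lowercase, whitespace-tokenized lines from a text file."""
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path) as infile:
            for line in infile:
                yield line.lower().strip().split()

model = FastText(LineCorpus("lee.cor"), bucket=5000)  # capped ngram hash table
model.init_sims()  # precompute normalized vectors before building a similarity index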
11 changes: 5 additions & 6 deletions gensim/test/test_sklearn_api.py
@@ -1299,7 +1299,7 @@ def testModelNotFitted(self):

class TestFastTextWrapper(unittest.TestCase):
def setUp(self):
-self.model = FTTransformer(size=10, min_count=0, seed=42)
+self.model = FTTransformer(size=10, min_count=0, seed=42, bucket=5000)
self.model.fit(texts)

def testTransform(self):
@@ -1327,12 +1327,11 @@ def testTransform(self):

def testConsistencyWithGensimModel(self):
# training a FTTransformer
-self.model = FTTransformer(size=10, min_count=0, seed=42, workers=1)
+self.model = FTTransformer(size=10, min_count=0, seed=42, workers=1, bucket=5000)
self.model.fit(texts)

# training a Gensim FastText model with the same params
-gensim_ftmodel = models.FastText(texts, size=10, min_count=0, seed=42,
-                                 workers=1)
+gensim_ftmodel = models.FastText(texts, size=10, min_count=0, seed=42, workers=1, bucket=5000)

# vectors returned by FTTransformer
vecs_transformer_api = self.model.transform(
Expand All @@ -1350,7 +1349,7 @@ def testConsistencyWithGensimModel(self):
self.assertTrue(passed)

def testPipeline(self):
-model = FTTransformer(size=10, min_count=1)
+model = FTTransformer(size=10, min_count=1, bucket=5000)
model.fit(w2v_texts)

class_dict = {'mathematics': 1, 'physics': 0}
@@ -1396,7 +1395,7 @@ def testPersistence(self):
self.assertTrue(passed)

def testModelNotFitted(self):
-ftmodel_wrapper = FTTransformer(size=10, min_count=0, seed=42)
+ftmodel_wrapper = FTTransformer(size=10, min_count=0, seed=42, bucket=5000)
word = texts[0][0]
self.assertRaises(NotFittedError, ftmodel_wrapper.transform, word)

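FTTransformer passes its constructor keywords through to the underlying gensim FastText model, so the same bucket cap applies there. A minimal usage sketch (the toy corpus is illustrative):

from gensim.sklearn_api import FTTransformer

texts = [["human", "interface", "computer"], ["graph", "minors", "trees"]]
ft = FTTransformer(size=10, min_count=1, seed=42, bucket=5000)  # small ngram table
ft.fit(texts)
vecs = ft.transform(["graph", "computer"])  # one 10-dimensional row per word
print(vecs.shape)  # (2, 10)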
3 changes: 2 additions & 1 deletion setup.py
@@ -284,7 +284,7 @@ def run(self):
# https://packaging.python.org/discussions/install-requires-vs-requirements/
#
docs_testenv = linux_testenv + distributed_env + [
-    'sphinx',
+    'sphinx <= 2.4.4',  # avoid `sphinx >= 3.0` that breaks the build
'sphinxcontrib-napoleon',
'plotly',
#
@@ -304,6 +304,7 @@ def run(self):
'statsmodels',
'pyemd',
'pandas',
+    'matplotlib',  # sphinx-gallery expects this dep
]

if sys.version_info < (3, 7):
Expand Down
