Fix FastText RAM usage in tests (+ fixes for wheel building) #2791

Merged: 10 commits, Apr 13, 2020
74 changes: 39 additions & 35 deletions gensim/test/test_fasttext.py
@@ -33,9 +33,12 @@
logger = logging.getLogger(__name__)

IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)

MAX_WORDVEC_COMPONENT_DIFFERENCE = 1.0e-10

# Limit the size of FastText ngram buckets, for RAM reasons.
# See https://github.com/RaRe-Technologies/gensim/issues/2790
BUCKET = 5000

FT_HOME = os.environ.get("FT_HOME")
FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None

@@ -67,7 +70,7 @@ def setUp(self):
self.test_new_model_file = datapath('lee_fasttext_new.bin')

def test_training(self):
-        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(sentences)
self.model_sanity(model)

@@ -87,7 +90,7 @@ def test_training(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
-        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
self.models_equal(model, model2)

# verify oov-word vector retrieval
@@ -99,7 +102,7 @@ def test_training(self):

def testFastTextTrainParameters(self):

-        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(sentences=sentences)

self.assertRaises(TypeError, model.train, corpus_file=11111)
@@ -112,7 +115,7 @@ def test_training_fromfile(self):
with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
utils.save_as_line_sentence(sentences, corpus_file)

-            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(corpus_file=corpus_file)
self.model_sanity(model)

@@ -151,10 +154,9 @@ def models_equal(self, model, model2):
most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0]
self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word]))

-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_persistence(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
-        model = FT_gensim(sentences, min_count=1)
+        model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
model.save(tmpf)
self.models_equal(model, FT_gensim.load(tmpf))
# test persistence of the KeyedVectors of a model
@@ -169,7 +171,7 @@ def test_persistence_fromfile(self):
utils.save_as_line_sentence(sentences, corpus_file)

tmpf = get_tmpfile('gensim_fasttext.tst')
-            model = FT_gensim(corpus_file=corpus_file, min_count=1)
+            model = FT_gensim(corpus_file=corpus_file, min_count=1, bucket=BUCKET)
model.save(tmpf)
self.models_equal(model, FT_gensim.load(tmpf))
# test persistence of the KeyedVectors of a model
@@ -179,10 +181,9 @@ def test_persistence_fromfile(self):
self.assertTrue(np.allclose(wv.vectors_ngrams, loaded_wv.vectors_ngrams))
self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))

-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_norm_vectors_not_saved(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
-        model = FT_gensim(sentences, min_count=1)
+        model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
model.init_sims()
model.save(tmpf)
loaded_model = FT_gensim.load(tmpf)
@@ -406,7 +407,7 @@ def test_cbow_hs_training(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -435,7 +436,7 @@ def test_cbow_hs_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -468,7 +469,7 @@ def test_sg_hs_training(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -497,7 +498,7 @@ def test_sg_hs_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -530,7 +531,7 @@ def test_cbow_neg_training(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -559,7 +560,7 @@ def test_cbow_neg_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -592,7 +593,7 @@ def test_sg_neg_training(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -621,7 +622,7 @@ def test_sg_neg_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -650,7 +651,7 @@ def test_sg_neg_training_fromfile(self):
self.assertGreaterEqual(overlap_count, 2)

def test_online_learning(self):
-        model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
+        model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(new_sentences, update=True) # update vocab
@@ -664,7 +665,8 @@ def test_online_learning_fromfile(self):
utils.save_as_line_sentence(sentences, corpus_file)
utils.save_as_line_sentence(new_sentences, new_corpus_file)

-            model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
+            model_hs = FT_gensim(
+                corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(corpus_file=new_corpus_file, update=True) # update vocab
@@ -674,7 +676,7 @@ def test_online_learning_fromfile(self):

def test_online_learning_after_save(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
-        model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
+        model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
model_neg.save(tmpf)
model_neg = FT_gensim.load(tmpf)
self.assertTrue(len(model_neg.wv.vocab), 12)
@@ -689,7 +691,8 @@ def test_online_learning_after_save_fromfile(self):
utils.save_as_line_sentence(new_sentences, new_corpus_file)

tmpf = get_tmpfile('gensim_fasttext.tst')
-            model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
+            model_neg = FT_gensim(
+                corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
model_neg.save(tmpf)
model_neg = FT_gensim.load(tmpf)
self.assertTrue(len(model_neg.wv.vocab), 12)
@@ -720,33 +723,30 @@ def online_sanity(self, model):
sim = model.wv.n_similarity(['war'], ['terrorism'])
self.assertLess(0., sim)

-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_sg_hs_online(self):
-        model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1)
+        model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
self.online_sanity(model)

-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_sg_neg_online(self):
-        model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1)
+        model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
self.online_sanity(model)

-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_cbow_hs_online(self):
model = FT_gensim(
-            sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1
+            sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1,
+            bucket=BUCKET,
)
self.online_sanity(model)

-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_cbow_neg_online(self):
model = FT_gensim(
sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5,
-            min_count=5, iter=1, seed=42, workers=1, sample=0
+            min_count=5, iter=1, seed=42, workers=1, sample=0, bucket=BUCKET
)
self.online_sanity(model)

def test_get_vocab_word_vecs(self):
-        model = FT_gensim(size=10, min_count=1, seed=42)
+        model = FT_gensim(size=10, min_count=1, seed=42, bucket=BUCKET)
model.build_vocab(sentences)
original_syn0_vocab = np.copy(model.wv.vectors_vocab)
model.wv.adjust_vectors()
@@ -755,7 +755,7 @@ def test_get_vocab_word_vecs(self):
def test_persistence_word2vec_format(self):
"""Test storing/loading the model in word2vec format."""
tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
-        model = FT_gensim(sentences, min_count=1, size=10)
+        model = FT_gensim(sentences, min_count=1, size=10, bucket=BUCKET)
model.wv.save_word2vec_format(tmpf, binary=True)
loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
@@ -769,7 +769,7 @@ def test_bucket_ngrams(self):
self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))

def test_estimate_memory(self):
-        model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
+        model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3, bucket=BUCKET)
model.build_vocab(sentences)
report = model.estimate_memory()
self.assertEqual(report['vocab'], 2800)
@@ -780,6 +780,7 @@ def test_estimate_memory(self):
self.assertEqual(report['buckets_word'], 640)
self.assertEqual(report['total'], 6160)

+    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def testLoadOldModel(self):
"""Test loading fasttext models from previous version"""

@@ -835,7 +836,7 @@ def test_cbow_hs_against_wrapper(self):

model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                                 sorted_vocab=1, workers=1, min_alpha=0.0)
+                                 sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -856,7 +857,7 @@ def test_sg_hs_against_wrapper(self):

model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                                 sorted_vocab=1, workers=1, min_alpha=0.0)
+                                 sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -1334,6 +1335,7 @@ def _check_roundtrip(self, sg):
"hs": 1,
"negative": 5,
"seed": 42,
"bucket": BUCKET,
"workers": 1}

with temporary_file("roundtrip_model_to_model.bin") as fpath:
@@ -1387,6 +1389,7 @@ def _check_roundtrip_file_file(self, sg):
"min_count": 1,
"hs": 1,
"negative": 0,
"bucket": BUCKET,
"seed": 42,
"workers": 1}

@@ -1486,6 +1489,7 @@ def _check_load_fasttext_format(self, sg):
"min_count": 1,
"hs": 1,
"negative": 5,
"bucket": BUCKET,
"seed": 42,
"workers": 1}

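Note (not part of the patch): the RAM savings come from FastText's ngram table, which gensim stores as a dense (bucket, size) float32 matrix, so the footprint grows linearly with the bucket count. A rough sketch of the arithmetic, assuming float32 storage and ignoring the much smaller buckets_word mapping:

```python
def ngram_matrix_bytes(bucket, size, dtype_bytes=4):
    """Approximate footprint of FastText's vectors_ngrams array."""
    return bucket * size * dtype_bytes

# gensim's default bucket=2_000_000 with the size=10 vectors used in these tests:
print(ngram_matrix_bytes(2_000_000, 10) / 2 ** 20)  # ~76 MiB per model
# the BUCKET = 5000 cap applied throughout this patch:
print(ngram_matrix_bytes(5_000, 10) / 2 ** 20)      # ~0.19 MiB per model
```

At the default, a few concurrently live models are enough to exhaust a 32-bit Appveyor worker; with the cap in place, most of the `@unittest.skipIf(IS_WIN32, ...)` guards above could be dropped.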
2 changes: 1 addition & 1 deletion gensim/test/test_nmf.py
@@ -98,7 +98,7 @@ def testTransform(self):
vec = matutils.sparse2full(transformed, 2)
expected = [0.35023746, 0.64976251]
# must contain the same values, up to re-ordering
-        self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-4))
+        self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-3))

def testTopTopics(self):
top_topics = self.model.top_topics(common_corpus)
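Note (not part of the patch): `np.allclose(a, b, rtol=r)` passes when `|a - b| <= atol + r * |b|` holds elementwise (`atol` defaults to 1e-8), so raising `rtol` from 1e-4 to 1e-3 tolerates roughly ten times more relative drift in the NMF topic weights across platforms and BLAS builds. A quick illustration:

```python
import numpy as np

expected = np.array([0.35023746, 0.64976251])
observed = expected * (1 + 5e-4)  # 0.05% relative drift, as might vary by BLAS

print(np.allclose(expected, observed, rtol=1e-4))  # False: exceeds the old bound
print(np.allclose(expected, observed, rtol=1e-3))  # True: within the new bound
```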
4 changes: 2 additions & 2 deletions gensim/test/test_similarities.py
@@ -570,7 +570,7 @@ def __iter__(self):
for line in infile:
yield line.lower().strip().split()

-        model = FastText(LeeReader(datapath('lee.cor')))
+        model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
model.init_sims()
index = self.indexer(model, 10)

@@ -733,7 +733,7 @@ def __iter__(self):
for line in infile:
yield line.lower().strip().split()

-        model = FastText(LeeReader(datapath('lee.cor')))
+        model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
model.init_sims()
index = self.indexer(model)

11 changes: 5 additions & 6 deletions gensim/test/test_sklearn_api.py
@@ -1299,7 +1299,7 @@ def testModelNotFitted(self):

class TestFastTextWrapper(unittest.TestCase):
def setUp(self):
-        self.model = FTTransformer(size=10, min_count=0, seed=42)
+        self.model = FTTransformer(size=10, min_count=0, seed=42, bucket=5000)
self.model.fit(texts)

def testTransform(self):
@@ -1327,12 +1327,11 @@ def testTransform(self):

def testConsistencyWithGensimModel(self):
# training a FTTransformer
-        self.model = FTTransformer(size=10, min_count=0, seed=42, workers=1)
+        self.model = FTTransformer(size=10, min_count=0, seed=42, workers=1, bucket=5000)
self.model.fit(texts)

# training a Gensim FastText model with the same params
-        gensim_ftmodel = models.FastText(texts, size=10, min_count=0, seed=42,
-                                         workers=1)
+        gensim_ftmodel = models.FastText(texts, size=10, min_count=0, seed=42, workers=1, bucket=5000)

# vectors returned by FTTransformer
vecs_transformer_api = self.model.transform(
@@ -1350,7 +1349,7 @@ def testConsistencyWithGensimModel(self):
self.assertTrue(passed)

def testPipeline(self):
-        model = FTTransformer(size=10, min_count=1)
+        model = FTTransformer(size=10, min_count=1, bucket=5000)
model.fit(w2v_texts)

class_dict = {'mathematics': 1, 'physics': 0}
@@ -1396,7 +1395,7 @@ def testPersistence(self):
self.assertTrue(passed)

def testModelNotFitted(self):
-        ftmodel_wrapper = FTTransformer(size=10, min_count=0, seed=42)
+        ftmodel_wrapper = FTTransformer(size=10, min_count=0, seed=42, bucket=5000)
word = texts[0][0]
self.assertRaises(NotFittedError, ftmodel_wrapper.transform, word)

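Note (not part of the patch): these wrapper tests only compare like with like because `FTTransformer` hands its constructor parameters through to the underlying `gensim.models.FastText`, so both sides must be built with identical hyperparameters, `bucket` included. A minimal sketch of the pattern on toy data:

```python
from gensim import models
from gensim.sklearn_api import FTTransformer

toy_texts = [["human", "interface", "computer"], ["graph", "minors", "trees"]]
params = dict(size=10, min_count=0, seed=42, workers=1, bucket=5000)

wrapper = FTTransformer(**params)
wrapper.fit(toy_texts)
raw_model = models.FastText(toy_texts, **params)
# with identical params the two models share one ngram hash space, so vectors
# for both in-vocabulary and out-of-vocabulary words are directly comparable
```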
3 changes: 2 additions & 1 deletion setup.py
@@ -284,7 +284,7 @@ def run(self):
# https://packaging.python.org/discussions/install-requires-vs-requirements/
#
docs_testenv = linux_testenv + distributed_env + [
-    'sphinx',
+    'sphinx <= 2.4.4',  # avoid `sphinx >= 3.0` that breaks the build
'sphinxcontrib-napoleon',
'plotly',
#
@@ -304,6 +304,7 @@ def run(self):
'statsmodels',
'pyemd',
'pandas',
+    'matplotlib',  # sphinx-gallery expects this dep
]

if sys.version_info < (3, 7):
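Note (not part of the patch): a quick, hypothetical way to sanity-check that the new pin excludes the Sphinx 3.x releases that broke the docs build, using the `packaging` library that pip itself relies on:

```python
from packaging.specifiers import SpecifierSet

docs_pin = SpecifierSet("<=2.4.4")
print("2.4.4" in docs_pin)  # True: the newest release the docs env may install
print("3.0.0" in docs_pin)  # False: excluded, as intended
```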