diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 7214d6b2b0..7bc52bcb75 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -639,6 +639,14 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         else:
             self.dictionary = dictionary
 
+    @property
+    def input(self):
+        """Read-only alias of `self.fname`, for code that looks the source up as `self.input`.
+
+        NOTE(review): presumably the inherited `getstream()` reads `self.input` -- confirm against the base class.
+        """
+        return self.fname
+
     def get_texts(self):
         """Iterate over the dump, yielding a list of tokens for each article that passed
         the length and namespace filtering.
diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 6660542b48..e13e06ca36 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -769,6 +769,18 @@ def test_removed_table_markup(self):
             for word in table_markup:
                 self.assertTrue(word not in text)
 
+    def test_get_stream(self):
+        """The first item yielded by `getstream()` must start with the `<mediawiki xml...` opening tag."""
+        wiki = self.corpus_class(self.enwiki)
+        stream = wiki.getstream()
+        try:
+            first_line = next(stream).decode()
+        finally:
+            # Do not leave the iterator suspended after one item; close it so the dump handle is released.
+            stream.close()
+        # Skip the leading "<" and compare the tag name plus the start of its first attribute.
+        self.assertEqual(first_line[1:14], "mediawiki xml")
+
     # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
     #     corpus = self.corpus_class(self.enwiki)