Skip to content

Commit

Permalink
svmlightcorpus.py: Add sequence serialization of corpus
Browse files Browse the repository at this point in the history
The current version of serialization supports only lists as labels; this
commit adds support for any type of sequence.

Closes: #2113
  • Loading branch information
aquatiko committed Oct 27, 2018
1 parent 7e4965e commit 6eafa23
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
3 changes: 3 additions & 0 deletions gensim/corpora/svmlightcorpus.py
Expand Up @@ -111,6 +111,9 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
"""
logger.info("converting corpus to SVMlight format: %s", fname)

if labels is not False:
# Cast any sequence (incl. a numpy array) to a list, to simplify the processing below.
labels = list(labels)
offsets = []
with utils.smart_open(fname, 'wb') as fout:
for docno, doc in enumerate(corpus):
Expand Down
13 changes: 12 additions & 1 deletion gensim/test/test_corpora.py
Expand Up @@ -23,7 +23,7 @@
ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
from gensim.interfaces import TransformedCorpus
from gensim.utils import to_unicode
from gensim.test.utils import datapath, get_tmpfile
from gensim.test.utils import datapath, get_tmpfile, common_corpus


class DummyTransformer(object):
Expand Down Expand Up @@ -382,6 +382,17 @@ def setUp(self):
self.corpus_class = svmlightcorpus.SvmLightCorpus
self.file_extension = '.svmlight'

def test_serialization(self):
    """Check that serializing with labels round-trips for list and array labels.

    `common_corpus` is written with `self.corpus_class.serialize` twice, to the
    same temporary file: first with the labels as a plain list, then with the
    same labels wrapped in `np.array`. After each write the file is loaded
    again and the document at index 1 is compared with the expected vector.
    """
    tmp_path = get_tmpfile("svml.corpus")
    list_labels = [1] * len(common_corpus)
    # Expected content of the document at index 1 once the file is read back.
    expected_doc = [(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]

    def reload_second_doc(label_seq):
        # Serialize with the given labels, then load the file and fetch doc #1.
        self.corpus_class.serialize(tmp_path, common_corpus, labels=label_seq)
        return self.corpus_class(tmp_path)[1]

    self.assertEqual(reload_second_doc(list_labels), expected_doc)
    self.assertEqual(reload_second_doc(np.array(list_labels)), expected_doc)


class TestBleiCorpus(CorpusTestCase):
def setUp(self):
Expand Down

0 comments on commit 6eafa23

Please sign in to comment.