#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains math helper functions.
"""

from __future__ import with_statement

import logging
import math

import numpy
import scipy.sparse
import scipy.linalg
from scipy.linalg.lapack import get_lapack_funcs

from six import iteritems, itervalues, string_types
from six.moves import xrange, zip as izip

from gensim import utils

# scipy is not a stable package yet, locations change, so try to work
# around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8)
try:
    from scipy.linalg.basic import triu
except ImportError:
    from scipy.linalg.special_matrices import triu

try:
    from numpy import triu_indices
except ImportError:
    # numpy < 1.4
    def triu_indices(n, k=0):
        m = numpy.ones((n, n), int)
        a = triu(m, k)
        return numpy.where(a != 0)

blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]

try:
    # with bottleneck installed, we can use faster partial sorting
    import bottleneck

    def argsort(x, topn=None):
        """Return indices of the `topn` greatest elements in numpy array `x`, in order."""
        if topn is None:
            topn = x.size
        if topn <= 0:
            return []
        if topn >= x.size:
            return numpy.argsort(x)[::-1]
        biggest = bottleneck.argpartsort(x, x.size - topn)[-topn:]
        # the indices in `biggest` are not sorted by magnitude => sort & return
        return biggest.take(numpy.argsort(x.take(biggest))[::-1])
except ImportError:
    # no bottleneck => fall back to numpy
    def argsort(x, topn=None):
        if topn is None:
            topn = x.size
        return numpy.argsort(x)[::-1][:topn]
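
# Example usage of `argsort` (a minimal sketch with made-up data):
#
#   >>> x = numpy.array([3.0, 1.0, 4.0, 1.0, 5.0])
#   >>> argsort(x, topn=2)  # indices of the two largest values, largest first
#   array([4, 2])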


logger = logging.getLogger("gensim.matutils")


def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0):
    """
    Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
    with documents as columns.

    If the number of terms, documents and non-zero elements is known, you can pass
    them here as parameters and a more memory-efficient code path will be taken.

    The input corpus may be a non-repeatable stream (generator).

    This is the mirror function to `Sparse2Corpus`.
    """
    try:
        # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
        # (as is the case with MmCorpus for example), we can use a more efficient code path
        if num_terms is None:
            num_terms = corpus.num_terms
        if num_docs is None:
            num_docs = corpus.num_docs
        if num_nnz is None:
            num_nnz = corpus.num_nnz
    except AttributeError:
        pass  # not an MmCorpus...
    if printprogress:
        logger.info("creating sparse matrix from corpus")
    if num_terms is not None and num_docs is not None and num_nnz is not None:
        # faster and much more memory-friendly version of creating the sparse csc
        posnow, indptr = 0, [0]
        indices = numpy.empty((num_nnz,), dtype=numpy.int32)  # HACK assume feature ids fit in 32bit integer
        data = numpy.empty((num_nnz,), dtype=dtype)
        for docno, doc in enumerate(corpus):
            if printprogress and docno % printprogress == 0:
                logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs))
            posnext = posnow + len(doc)
            indices[posnow: posnext] = [feature_id for feature_id, _ in doc]
            data[posnow: posnext] = [feature_weight for _, feature_weight in doc]
            indptr.append(posnext)
            posnow = posnext
        assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros"
        result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
    else:
        # slower version; determine the sparse matrix parameters during iteration
        num_nnz, data, indices, indptr = 0, [], [], [0]
        for docno, doc in enumerate(corpus):
            if printprogress and docno % printprogress == 0:
                logger.info("PROGRESS: at document #%i" % docno)
            indices.extend([feature_id for feature_id, _ in doc])
            data.extend([feature_weight for _, feature_weight in doc])
            num_nnz += len(doc)
            indptr.append(num_nnz)
        if num_terms is None:
            num_terms = max(indices) + 1 if indices else 0
        num_docs = len(indptr) - 1
        # now num_docs, num_terms and num_nnz contain the correct values
        data = numpy.asarray(data, dtype=dtype)
        indices = numpy.asarray(indices)
        result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
    return result
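
# Example usage of `corpus2csc` (a minimal sketch; `corpus` is a hypothetical
# two-document stream in gensim bag-of-words format):
#
#   >>> corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]
#   >>> corpus2csc(corpus).shape  # (num_terms, num_docs): documents are columns
#   (3, 2)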


def pad(mat, padrow, padcol):
    """
    Add additional rows/columns to a numpy.matrix `mat`. The new rows/columns
    will be initialized with zeros.
    """
    if padrow < 0:
        padrow = 0
    if padcol < 0:
        padcol = 0
    rows, cols = mat.shape
    return numpy.bmat([[mat, numpy.matrix(numpy.zeros((rows, padcol)))],
                       [numpy.matrix(numpy.zeros((padrow, cols + padcol)))]])
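
# Example usage of `pad` (a minimal sketch):
#
#   >>> p = pad(numpy.matrix([[1, 2], [3, 4]]), 1, 1)  # one extra row and column of zeros
#   >>> p.shape
#   (3, 3)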


def zeros_aligned(shape, dtype, order='C', align=128):
    """Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary."""
    nbytes = numpy.prod(shape, dtype=numpy.int64) * numpy.dtype(dtype).itemsize
    buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8)  # problematic on win64 ("maximum allowed dimension exceeded")
    start_index = -buffer.ctypes.data % align
    return buffer[start_index: start_index + nbytes].view(dtype).reshape(shape, order=order)
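
# Example usage of `zeros_aligned` (a minimal sketch):
#
#   >>> a = zeros_aligned((100, 4), dtype=numpy.float32)
#   >>> a.ctypes.data % 128  # start address sits on the default 128-byte boundary
#   0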


def ismatrix(m):
    return (isinstance(m, numpy.ndarray) and m.ndim == 2) or scipy.sparse.issparse(m)


def any2sparse(vec, eps=1e-9):
    """Convert a numpy/scipy vector into gensim document format (=list of 2-tuples)."""
    if isinstance(vec, numpy.ndarray):
        return dense2vec(vec, eps)
    if scipy.sparse.issparse(vec):
        return scipy2sparse(vec, eps)
    return [(int(fid), float(fw)) for fid, fw in vec if numpy.abs(fw) > eps]


def scipy2sparse(vec, eps=1e-9):
    """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
    vec = vec.tocsr()
    assert vec.shape[0] == 1
    return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps]
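
# Example usage of `any2sparse` (a minimal sketch):
#
#   >>> any2sparse(numpy.array([0.0, 0.5, 2.0]))
#   [(1, 0.5), (2, 2.0)]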


class Scipy2Corpus(object):
    """
    Convert a sequence of dense/sparse vectors into a streamed gensim corpus object.

    This is the mirror function to `corpus2csc`.
    """
    def __init__(self, vecs):
        """
        `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array,
        or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors.
        """
        self.vecs = vecs

    def __iter__(self):
        for vec in self.vecs:
            if isinstance(vec, numpy.ndarray):
                yield full2sparse(vec)
            else:
                yield scipy2sparse(vec)

    def __len__(self):
        return len(self.vecs)
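
# Example usage of `Scipy2Corpus` (a minimal sketch over a dense 2d array):
#
#   >>> corpus = Scipy2Corpus(numpy.array([[1.0, 0.0], [0.0, 2.0]]))
#   >>> [doc for doc in corpus]
#   [[(0, 1.0)], [(1, 2.0)]]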


def sparse2full(doc, length):
    """
    Convert a document in sparse document format (=sequence of 2-tuples) into a dense
    numpy array (of size `length`).

    This is the mirror function to `full2sparse`.
    """
    result = numpy.zeros(length, dtype=numpy.float32)  # fill with zeroes (default value)
    doc = dict(doc)
    # overwrite some of the zeroes with explicit values
    result[list(doc)] = list(itervalues(doc))
    return result


def full2sparse(vec, eps=1e-9):
    """
    Convert a dense numpy array into the sparse document format (sequence of 2-tuples).

    Values of magnitude < `eps` are treated as zero (ignored).

    This is the mirror function to `sparse2full`.
    """
    vec = numpy.asarray(vec, dtype=float)
    nnz = numpy.nonzero(abs(vec) > eps)[0]
    return list(zip(nnz, vec.take(nnz)))

dense2vec = full2sparse
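
# Example round trip between the dense and sparse formats (a minimal sketch):
#
#   >>> full2sparse(numpy.array([0.0, 0.5, 0.0, 2.0]))
#   [(1, 0.5), (3, 2.0)]
#   >>> sparse2full([(1, 0.5), (3, 2.0)], 4)
#   array([ 0. ,  0.5,  0. ,  2. ], dtype=float32)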


def full2sparse_clipped(vec, topn, eps=1e-9):
    """
    Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
    """
    # use numpy.argsort and only form tuples that are actually returned.
    # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
    if topn <= 0:
        return []
    vec = numpy.asarray(vec, dtype=float)
    nnz = numpy.nonzero(abs(vec) > eps)[0]
    # rank by absolute value, as promised in the docstring (not by signed value)
    biggest = nnz.take(argsort(abs(vec).take(nnz), topn))
    return list(zip(biggest, vec.take(biggest)))
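
# Example usage of `full2sparse_clipped` (a minimal sketch):
#
#   >>> full2sparse_clipped([0.1, -3.0, 2.0], topn=2)
#   [(1, -3.0), (2, 2.0)]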


def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
    """
    Convert corpus into a dense numpy array (documents will be columns). You
    must supply the number of features `num_terms`, because dimensionality
    cannot be deduced from the sparse vectors alone.

    You can optionally supply `num_docs` (=the corpus length) as well, so that
    a more memory-efficient code path is taken.

    This is the mirror function to `Dense2Corpus`.
    """
    if num_docs is not None:
        # we know the number of documents => don't bother column_stacking
        docno, result = -1, numpy.empty((num_terms, num_docs), dtype=dtype)
        for docno, doc in enumerate(corpus):
            result[:, docno] = sparse2full(doc, num_terms)
        assert docno + 1 == num_docs
    else:
        # materialize the list: numpy's stacking functions expect a sequence, not a generator
        result = numpy.column_stack([sparse2full(doc, num_terms) for doc in corpus])
    return result.astype(dtype)
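
# Example usage of `corpus2dense` (a minimal sketch; same hypothetical corpus
# as in the `corpus2csc` example above):
#
#   >>> corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]
#   >>> corpus2dense(corpus, num_terms=3).shape  # documents are columns
#   (3, 2)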


class Dense2Corpus(object):
    """
    Treat a dense numpy array as a sparse, streamed gensim corpus.

    No data copy is made (changes to the underlying matrix imply changes in the
    corpus).

    This is the mirror function to `corpus2dense`.
    """
    def __init__(self, dense, documents_columns=True):
        if documents_columns:
            self.dense = dense.T
        else:
            self.dense = dense

    def __iter__(self):
        for doc in self.dense:
            yield full2sparse(doc.flat)

    def __len__(self):
        return len(self.dense)
#endclass Dense2Corpus
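
# Example usage of `Dense2Corpus` (a minimal sketch; documents are columns of
# the input array by default):
#
#   >>> dense = numpy.array([[1.0, 0.0], [0.0, 2.0]])
#   >>> [doc for doc in Dense2Corpus(dense)]
#   [[(0, 1.0)], [(1, 2.0)]]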


class Sparse2Corpus(object):
    """
    Convert a matrix in scipy.sparse format into a streaming gensim corpus.

    This is the mirror function to `corpus2csc`.
    """
    def __init__(self, sparse, documents_columns=True):
        if documents_columns:
            self.sparse = sparse.tocsc()
        else:
            self.sparse = sparse.tocsr().T  # make sure shape[1]=number of docs (needed in len())

    def __iter__(self):
        for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
            yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))

    def __len__(self):
        return self.sparse.shape[1]
#endclass Sparse2Corpus
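
# Example usage of `Sparse2Corpus` (a minimal sketch):
#
#   >>> sp = scipy.sparse.csc_matrix(numpy.array([[1.0, 0.0], [0.0, 2.0]]))
#   >>> [doc for doc in Sparse2Corpus(sp)]
#   [[(0, 1.0)], [(1, 2.0)]]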


def veclen(vec):
    if len(vec) == 0:
        return 0.0
    length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
    assert length > 0.0, "sparse documents must not contain any explicit zero entries"
    return length


blas_nrm2 = blas('nrm2', numpy.array([], dtype=float))
blas_scal = blas('scal', numpy.array([], dtype=float))


def unitvec(vec):
    """
    Scale a vector to unit length. The only exception is the zero vector, which
    is returned back unchanged.

    Output will be in the same format as input (i.e., gensim vector=>gensim vector,
    or numpy array=>numpy array, scipy.sparse=>scipy.sparse).
    """
    if scipy.sparse.issparse(vec):  # scipy.sparse input: normalize while keeping it sparse
        vec = vec.tocsr()
        veclen = numpy.sqrt(numpy.sum(vec.data ** 2))
        if veclen > 0.0:
            return vec / veclen
        else:
            return vec

    if isinstance(vec, numpy.ndarray):
        vec = numpy.asarray(vec, dtype=float)
        veclen = blas_nrm2(vec)
        if veclen > 0.0:
            return blas_scal(1.0 / veclen, vec)
        else:
            return vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except StopIteration:  # empty sequence = the zero vector, return unchanged
        return vec

    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format?
        length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        if length != 1.0:
            return [(termid, val / length) for termid, val in vec]
        else:
            return list(vec)
    else:
        raise ValueError("unknown input type")
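
# Example usage of `unitvec` (a minimal sketch; the output format matches the
# input format):
#
#   >>> unitvec(numpy.array([3.0, 4.0]))
#   array([ 0.6,  0.8])
#   >>> unitvec([(0, 3.0), (1, 4.0)])
#   [(0, 0.6), (1, 0.8)]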


def cossim(vec1, vec2):
    """
    Return cosine similarity between two sparse vectors.

    The similarity is a number within [-1.0, 1.0]; higher means more similar.
    """
    vec1, vec2 = dict(vec1), dict(vec2)
    if not vec1 or not vec2:
        return 0.0
    vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1)))
    vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2)))
    assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries"
    if len(vec2) < len(vec1):
        vec1, vec2 = vec2, vec1  # swap references so that we iterate over the shorter vector
    result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1))
    result /= vec1len * vec2len  # rescale by vector lengths
    return result
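
# Example usage of `cossim` (a minimal sketch with two sparse documents):
#
#   >>> cossim([(0, 1.0), (1, 1.0)], [(0, 1.0)])  # equals 1 / sqrt(2)
#   0.7071067811865475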


def qr_destroy(la):
    """
    Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process.

    Using this function should be less memory-intensive than calling `scipy.linalg.qr(la[0])`,
    because the memory used in `la[0]` is reclaimed earlier.
    """
    a = numpy.asfortranarray(la[0])
    del la[0], la  # now `a` is the only reference to the input matrix
    m, n = a.shape
    # perform q, r = QR(a); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(a.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (a,))
    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
    del a  # free up mem
    assert info >= 0
    r = triu(qr[:n, :n])
    if m < n:  # rare case, #features < #topics
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
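
# Example usage of `qr_destroy` (a minimal sketch; note that the input list is
# emptied as a side effect, so pass in a copy if you still need the matrix):
#
#   >>> a = numpy.random.rand(5, 3)
#   >>> q, r = qr_destroy([a.copy()])
#   >>> numpy.allclose(numpy.dot(q, r), a)
#   True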


class MmWriter(object):
    """
    Store a corpus in Matrix Market format.

    Note that the output is written one document at a time, not the whole
    matrix at once (unlike scipy.io.mmwrite). This allows us to process corpora
    which are larger than the available RAM.

    NOTE: the output file is created in a single pass through the input corpus, so
    that the input can be a once-only stream (iterator).

    To achieve this, a fake MM header is written first, statistics are collected
    during the pass (shape of the matrix, number of non-zeroes), followed by a seek
    back to the beginning of the file, rewriting the fake header with proper values.
    """
    HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n'  # the only supported MM format

    def __init__(self, fname):
        self.fname = fname
        if fname.endswith(".gz") or fname.endswith('.bz2'):
            raise NotImplementedError("compressed output not supported with MmWriter")
        self.fout = open(self.fname, 'wb+')  # open for both reading and writing
        self.headers_written = False

    def write_headers(self, num_docs, num_terms, num_nnz):
        self.fout.write(MmWriter.HEADER_LINE)
        if num_nnz < 0:
            # we don't know the matrix shape/density yet, so only log a general line
            logger.info("saving sparse matrix to %s" % self.fname)
            self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
        else:
            logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s" %
                        (num_docs, num_terms, num_nnz, self.fname))
            self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
        self.last_docno = -1
        self.headers_written = True

    def fake_headers(self, num_docs, num_terms, num_nnz):
        stats = '%i %i %i' % (num_docs, num_terms, num_nnz)
        if len(stats) > 50:
            raise ValueError('Invalid stats: matrix too large!')
        self.fout.seek(len(MmWriter.HEADER_LINE))
        self.fout.write(utils.to_utf8(stats))

    def write_vector(self, docno, vector):
        """
        Write a single sparse vector to the file.

        Sparse vector is any iterable yielding (field id, field value) pairs.
        """
        assert self.headers_written, "must write Matrix Market file headers before writing data!"
        assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
        vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
        for termid, weight in vector:  # write term ids in sorted order
            # +1 because MM format starts counting from 1
            self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
        self.last_docno = docno
        return (vector[-1][0], len(vector)) if vector else (-1, 0)

    @staticmethod
    def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
        """
        Save the vector space representation of an entire corpus to disk.

        Note that the documents are processed one at a time, so the whole corpus
        is allowed to be larger than the available RAM.
        """
        mw = MmWriter(fname)

        # write empty headers to the file (with enough space to be overwritten later)
        mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line

        # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
        _num_terms, num_nnz = 0, 0
        docno, poslast = -1, -1
        offsets = []
        if hasattr(corpus, 'metadata'):
            orig_metadata = corpus.metadata
            corpus.metadata = metadata
            if metadata:
                docno2metadata = {}
        else:
            metadata = False
        for docno, doc in enumerate(corpus):
            if metadata:
                bow, data = doc
                docno2metadata[docno] = data
            else:
                bow = doc
            if docno % progress_cnt == 0:
                logger.info("PROGRESS: saving document #%i" % docno)
            if index:
                posnow = mw.fout.tell()
                if posnow == poslast:
                    offsets[-1] = -1
                offsets.append(posnow)
                poslast = posnow
            max_id, veclen = mw.write_vector(docno, bow)
            _num_terms = max(_num_terms, 1 + max_id)
            num_nnz += veclen
        if metadata:
            utils.pickle(docno2metadata, fname + '.metadata.cpickle')
            corpus.metadata = orig_metadata

        num_docs = docno + 1
        num_terms = num_terms or _num_terms

        if num_docs * num_terms != 0:
            logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % (
                num_docs, num_terms,
                100.0 * num_nnz / (num_docs * num_terms),
                num_nnz,
                num_docs * num_terms))

        # now write proper headers, by seeking and overwriting the spaces written earlier
        mw.fake_headers(num_docs, num_terms, num_nnz)

        mw.close()
        if index:
            return offsets

    def __del__(self):
        """
        Automatic destructor which closes the underlying file.

        There must be no circular references contained in the object for __del__
        to work! Closing the file explicitly via the close() method is preferred
        and safer.
        """
        self.close()  # does nothing if called twice (on an already closed file), so no worries

    def close(self):
        logger.debug("closing %s" % self.fname)
        if hasattr(self, 'fout'):
            self.fout.close()
#endclass MmWriter
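
# Example usage of `MmWriter.write_corpus` (a minimal sketch; '/tmp/corpus.mm'
# is a hypothetical output path):
#
#   >>> corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]
#   >>> MmWriter.write_corpus('/tmp/corpus.mm', corpus)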


class MmReader(object):
    """
    Wrap a term-document matrix on disk (in Matrix Market format), and present it
    as an object which supports iteration over the rows (~documents).

    Note that the file is read into memory one document at a time, not the whole
    matrix at once (unlike scipy.io.mmread). This allows us to process corpora
    which are larger than the available RAM.
    """
    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on the local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns).

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
        """
        logger.info("initializing corpus reader from %s" % input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                        (self.input, header))
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                    (self.num_docs, self.num_terms, self.num_nnz))

    def __len__(self):
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """
        Skip file headers that appear before the first document.
        """
        for line in input_file:
            if line.startswith(b'%'):
                continue
            break

    def __iter__(self):
        """
        Iteratively yield vectors from the underlying file, in the format (row_no, vector),
        where vector is a list of (col_no, value) 2-tuples.

        Note that the total number of vectors returned is always equal to the
        number of rows specified in the header; empty documents are inserted and
        yielded where appropriate, even if they are not explicitly stored in the
        Matrix Market file.
        """
        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                docid, termid, val = utils.to_unicode(line).split()  # needed for python3
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is previd)
                    if previd >= 0:
                        yield previd, document
                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in xrange(previd + 1, docid):
                        yield previd, []
                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []
                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in xrange(previd + 1, self.num_docs):
            yield previd, []

    def docbyoffset(self, offset):
        """Return the document stored at file offset `offset` (in bytes)."""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            fin = open(self.input)
        else:
            fin = self.input
        fin.seek(offset)  # works for gzip/bz2 input, too

        previd, document = -1, []
        for line in fin:
            docid, termid, val = line.split()
            if not self.transposed:
                termid, docid = docid, termid
            # -1 because matrix market indexes are 1-based => convert to 0-based
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                if previd >= 0:
                    return document
                previd = docid
            document.append((termid, val,))  # add another field to the current document
        return document
#endclass MmReader |
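
# Example usage of `MmReader` (a minimal sketch; reads back the hypothetical
# file written in the `MmWriter` example above):
#
#   >>> mm = MmReader('/tmp/corpus.mm')
#   >>> for docno, doc in mm:
#   ...     print(docno, doc)
#   0 [(0, 1.0), (2, 3.0)]
#   1 [(1, 2.0)]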