-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
indexedcorpus.py
188 lines (148 loc) · 6.28 KB
/
indexedcorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Base Indexed Corpus class."""
import logging
import numpy
from gensim import interfaces, utils
logger = logging.getLogger(__name__)
class IndexedCorpus(interfaces.CorpusABC):
"""Indexed corpus is a mechanism for random-accessing corpora.
While the standard corpus interface in gensim allows iterating over corpus,
we'll show it with :class:`~gensim.corpora.mmcorpus.MmCorpus`.
.. sourcecode:: pycon
>>> from gensim.corpora import MmCorpus
>>> from gensim.test.utils import datapath
>>>
>>> corpus = MmCorpus(datapath('testcorpus.mm'))
>>> for doc in corpus:
... pass
:class:`~gensim.corpora.indexedcorpus.IndexedCorpus` allows accessing the documents with index
in :math:`{O}(1)` look-up time.
.. sourcecode:: pycon
>>> document_index = 3
>>> doc = corpus[document_index]
Notes
-----
This functionality is achieved by storing an extra file (by default named the same as the `fname.index`)
that stores the byte offset of the beginning of each document.
"""
def __init__(self, fname, index_fname=None):
"""
Parameters
----------
fname : str
Path to corpus.
index_fname : str, optional
Path to index, if not provided - used `fname.index`.
"""
try:
if index_fname is None:
index_fname = utils.smart_extension(fname, '.index')
self.index = utils.unpickle(index_fname)
# change self.index into a numpy.ndarray to support fancy indexing
self.index = numpy.asarray(self.index)
logger.info("loaded corpus index from %s", index_fname)
except Exception:
self.index = None
self.length = None
@classmethod
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
progress_cnt=None, labels=None, metadata=False):
"""Serialize corpus with offset metadata, allows to use direct indexes after loading.
Parameters
----------
fname : str
Path to output file.
corpus : iterable of iterable of (int, float)
Corpus in BoW format.
id2word : dict of (str, str), optional
Mapping id -> word.
index_fname : str, optional
Where to save resulting index, if None - store index to `fname`.index.
progress_cnt : int, optional
Number of documents after which progress info is printed.
labels : bool, optional
If True - ignore first column (class labels).
metadata : bool, optional
If True - ensure that serialize will write out article titles to a pickle file.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import MmCorpus
>>> from gensim.test.utils import get_tmpfile
>>>
>>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
>>> output_fname = get_tmpfile("test.mm")
>>>
>>> MmCorpus.serialize(output_fname, corpus)
>>> mm = MmCorpus(output_fname) # `mm` document stream now has random access
>>> print(mm[1]) # retrieve document no. 42, etc.
[(1, 0.1)]
"""
if getattr(corpus, 'fname', None) == fname:
raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)
if index_fname is None:
index_fname = utils.smart_extension(fname, '.index')
kwargs = {'metadata': metadata}
if progress_cnt is not None:
kwargs['progress_cnt'] = progress_cnt
if labels is not None:
kwargs['labels'] = labels
offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)
if offsets is None:
raise NotImplementedError(
"Called serialize on class %s which doesn't support indexing!" % serializer.__name__
)
# store offsets persistently, using pickle
# we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
# the offsets that are actually stored on disk - we're not storing self.index in any case, the
# load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
# backwards compatibility
logger.info("saving %s index to %s", serializer.__name__, index_fname)
utils.pickle(offsets, index_fname)
def __len__(self):
"""Get the index length.
Notes
-----
If the corpus is not indexed, also count corpus length and cache this value.
Returns
-------
int
Length of index.
"""
if self.index is not None:
return len(self.index)
if self.length is None:
logger.info("caching corpus length")
self.length = sum(1 for _ in self)
return self.length
def __getitem__(self, docno):
"""Get document by `docno` index.
Parameters
----------
docno : {int, iterable of int}
Document number or iterable of numbers (like a list of str).
Returns
-------
list of (int, float)
If `docno` is int - return document in BoW format.
:class:`~gensim.utils.SlicedCorpus`
If `docno` is iterable of int - return several documents in BoW format
wrapped to :class:`~gensim.utils.SlicedCorpus`.
Raises
------
RuntimeError
If index isn't exist.
"""
if self.index is None:
raise RuntimeError("Cannot call corpus[docid] without an index")
if isinstance(docno, (slice, list, numpy.ndarray)):
return utils.SlicedCorpus(self, docno)
elif isinstance(docno, (int, numpy.integer,)):
return self.docbyoffset(self.index[docno])
# TODO: no `docbyoffset` method, should be defined in this class
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')