/
interfaces.py
70 lines (52 loc) · 2.31 KB
/
interfaces.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python2.5
# -*- coding: utf-8 -*-
#
# author Radim Rehurek, radimrehurek@seznam.cz
import logging

import utils
class CorpusABC(utils.SaveLoad):
    """
    Interface for corpora. A 'corpus' is simply an iterable, where each
    iteration step yields one document. A document is a list of
    (fieldId, fieldValue) 2-tuples.

    See the corpora module for some example corpus implementations.

    Note that although a default len() method is provided, it is very
    inefficient (performs a linear scan through the corpus to determine its
    length). Wherever the corpus size is known in advance (or at least doesn't
    change so that it can be cached), the len() method should be overridden.
    """

    def __iter__(self):
        """
        Iterate over the corpus, one document per step.

        The base class has no documents of its own: this implementation always
        raises NotImplementedError and must be overridden by subclasses.
        """
        raise NotImplementedError('cannot instantiate abstract base class')

    def __len__(self):
        """
        Return the number of documents in the corpus.

        This method is just the least common denominator and should really be
        overridden when possible: it iterates over the whole corpus once and
        counts the documents, logging a warning each time it is called.
        """
        # Relies on the module-level `import logging`; without it this line
        # raises NameError before any counting happens.
        logging.warning("performing full corpus scan to determine its length; was this intended?")
        # sum() over an empty generator is 0, so an empty corpus yields length 0.
        return sum(1 for _ in self)
#endclass CorpusABC
class TransformationABC(utils.SaveLoad):
    """
    Interface for transformations. A 'transformation' is any object which accepts
    a sparse document via the dictionary notation [] and returns another sparse
    document in its stead.

    See the tfidfmodel module for an example of a transformation.
    """

    class TransformedCorpus(CorpusABC):
        """
        Corpus wrapper that applies a callable to every document of an
        underlying corpus.

        The callable is applied on the fly during iteration, one document at a
        time; nothing is precomputed or stored.
        """

        def __init__(self, fnc, corpus):
            """
            Remember `fnc` (callable taking one document and returning the
            transformed document) and `corpus` (the iterable of documents to
            wrap).
            """
            self.fnc, self.corpus = fnc, corpus

        def __len__(self):
            """Return the length of the wrapped corpus, as reported by len()."""
            return len(self.corpus)

        def __iter__(self):
            """Yield `fnc(doc)` for each document `doc` of the wrapped corpus."""
            for doc in self.corpus:
                yield self.fnc(doc)
    #endclass TransformedCorpus

    def __getitem__(self, vec):
        """
        Transform a single document `vec` and return the result.

        Abstract: this implementation always raises NotImplementedError and
        must be overridden by subclasses. The `vec` parameter is required so
        that `transformation[doc]` on a non-overriding class fails with
        NotImplementedError rather than with a TypeError about the number of
        arguments.
        """
        raise NotImplementedError('cannot instantiate abstract base class')

    def apply(self, corpus):
        """
        Helper function used in derived classes. Applies the transformation to
        a whole corpus (as opposed to a single document) and returns another corpus.

        The returned object is a TransformedCorpus which calls this object's
        `__getitem__` on each document as it is iterated over.
        """
        return TransformationABC.TransformedCorpus(self.__getitem__, corpus)
#endclass TransformationABC