tokenfilter.py
# Copyright 2015 moco_beta
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Iterator, List, Dict, Tuple, Any

from .tokenizer import Token


class TokenFilter(ABC):
    """
    Base TokenFilter class.

    A TokenFilter modifies or transforms the input token sequence
    according to the rule described in the apply() method.
    Subclasses must implement the apply() method.

    Added in *version 0.3.4*
    """
    @abstractmethod
    def apply(self, tokens: Iterator[Token]) -> Iterator[Any]:
        pass

    def __call__(self, tokens: Iterator[Token]) -> Iterator[Any]:
        return self.apply(tokens)
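
# A minimal sketch of a custom filter (illustrative only, not part of the library):
# a subclass only needs to implement apply(). For example, a hypothetical filter
# that drops tokens whose surface form is shorter than a given length:
#
#   class ShortTokenFilter(TokenFilter):
#       def __init__(self, min_length: int = 2):
#           self.min_length = min_length
#
#       def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
#           for token in tokens:
#               if len(token.surface) >= self.min_length:
#                   yield token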


class LowerCaseFilter(TokenFilter):
    """
    A LowerCaseFilter converts the surface and base_form of tokens to lowercase.

    Added in *version 0.3.4*
    """
    def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
        for token in tokens:
            token.surface = token.surface.lower()
            token.base_form = token.base_form.lower()
            yield token


class UpperCaseFilter(TokenFilter):
    """
    An UpperCaseFilter converts the surface and base_form of tokens to uppercase.

    Added in *version 0.3.4*
    """
    def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
        for token in tokens:
            token.surface = token.surface.upper()
            token.base_form = token.base_form.upper()
            yield token
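
# Usage sketch for the case filters (illustrative; assumes janome's Tokenizer,
# whose tokenize() yields Token objects). Filters are callable, so they can be
# applied directly to a token stream:
#
#   from janome.tokenizer import Tokenizer
#
#   t = Tokenizer()
#   for token in LowerCaseFilter()(t.tokenize('Python で形態素解析')):
#       print(token.surface, token.base_form)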
class POSStopFilter(TokenFilter):
u"""
A POSStopFilter removes tokens associated with part-of-speech tags
listed in the stop tags list and keeps other tokens.
Tag matching rule is prefix-matching. e.g., if '動詞' is given as a stop tag,
'動詞,自立,*,*' and '動詞,非自立,*,*' (or so) are removed.
Added in *version 0.3.4*
"""
def __init__(self, pos_list: List[str]):
"""
Initialize POSStopFilter object.
:param pos_list: stop part-of-speech tags list.
"""
self.pos_list = pos_list
def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
for token in tokens:
if any(token.part_of_speech.startswith(pos) for pos in self.pos_list):
continue
yield token


class POSKeepFilter(TokenFilter):
    """
    A POSKeepFilter keeps tokens associated with part-of-speech tags
    listed in the keep tags list and removes the other tokens.

    The tag matching rule is prefix matching. For example, if '動詞' is given as a keep tag,
    '動詞,自立,*,*' and '動詞,非自立,*,*' (and so on) are kept.

    Added in *version 0.3.4*
    """
    def __init__(self, pos_list: List[str]):
        """
        Initialize POSKeepFilter object.

        :param pos_list: list of part-of-speech tags to keep
        """
        self.pos_list = pos_list

    def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
        for token in tokens:
            if any(token.part_of_speech.startswith(pos) for pos in self.pos_list):
                yield token
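
# Usage sketch for the POS filters (illustrative; assumes janome's Tokenizer):
#
#   from janome.tokenizer import Tokenizer
#
#   t = Tokenizer()
#   # keep only nouns (prefix match on the part-of-speech tag)
#   for token in POSKeepFilter(['名詞'])(t.tokenize('すもももももももものうち')):
#       print(token)
#   # or, conversely, drop particles and auxiliary verbs
#   for token in POSStopFilter(['助詞', '助動詞'])(t.tokenize('すもももももももものうち')):
#       print(token)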


class CompoundNounFilter(TokenFilter):
    """
    A CompoundNounFilter generates compound nouns by joining contiguous noun tokens.

    For example, '形態素解析器' is split into the three noun tokens '形態素/解析/器' by the Tokenizer
    and then re-joined into a single token by this filter.
    Generated tokens are associated with the special part-of-speech tag '名詞,複合,*,*'.

    Added in *version 0.3.4*
    """
    def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
        _ret = None
        for token in tokens:
            if _ret:
                if token.part_of_speech.startswith('名詞') and _ret.part_of_speech.startswith('名詞'):
                    # merge the current noun into the pending compound token
                    _ret.surface += token.surface
                    _ret.part_of_speech = '名詞,複合,*,*'
                    _ret.base_form += token.base_form
                    _ret.reading += token.reading
                    _ret.phonetic += token.phonetic
                else:
                    # the noun run has ended; emit the pending token and start a new one
                    ret = _ret
                    _ret = token
                    yield ret
            else:
                _ret = token
        if _ret:
            yield _ret
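
# Usage sketch (illustrative; assumes janome's Tokenizer). Contiguous nouns such as
# '形態素', '解析', '器' are re-joined into a single token '形態素解析器' tagged '名詞,複合,*,*':
#
#   from janome.tokenizer import Tokenizer
#
#   t = Tokenizer()
#   for token in CompoundNounFilter()(t.tokenize('形態素解析器で遊ぶ')):
#       print(token.surface, token.part_of_speech)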


class ExtractAttributeFilter(TokenFilter):
    """
    An ExtractAttributeFilter extracts a specified attribute of Token.

    **NOTES** This filter must be placed at the end of the token filter chain
    because it yields strings rather than tokens.

    Added in *version 0.3.4*
    """
    def __init__(self, att: str):
        """
        Initialize ExtractAttributeFilter object.

        :param att: name of the attribute to extract from a token. Valid values for *att* are 'surface',
                    'part_of_speech', 'infl_type', 'infl_form', 'base_form', 'reading' and 'phonetic'.
        """
        if att not in ['surface', 'part_of_speech', 'infl_type', 'infl_form', 'base_form', 'reading', 'phonetic']:
            raise Exception(f'Unknown attribute name: {att}')
        self.att = att

    def apply(self, tokens: Iterator[Token]) -> Iterator[str]:
        for token in tokens:
            yield getattr(token, self.att)
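
# Usage sketch (illustrative; assumes janome's Tokenizer). Because this filter
# yields plain strings, it should be the last filter applied:
#
#   from janome.tokenizer import Tokenizer
#
#   t = Tokenizer()
#   surfaces = list(ExtractAttributeFilter('surface')(t.tokenize('東京へ行く')))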


class TokenCountFilter(TokenFilter):
    """
    A TokenCountFilter counts word frequencies in the input text. Here, a 'word' means an attribute of Token.

    This filter generates (word, frequency) pairs.
    When the `sorted` option is set to True, the pairs are sorted in descending order of frequency.

    **NOTES** This filter must be placed at the end of the token filter chain
    because it yields string-integer tuples rather than tokens.

    Added in *version 0.3.5*
    """
    def __init__(self, att: str = 'surface', sorted: bool = False):
        """
        Initialize TokenCountFilter object.

        :param att: name of the attribute to extract from a token. Valid values for *att* are 'surface',
                    'part_of_speech', 'infl_type', 'infl_form', 'base_form', 'reading' and 'phonetic'.
        :param sorted: if True, sort the pairs by term frequency in descending order
        """
        if att not in ['surface', 'part_of_speech', 'infl_type', 'infl_form', 'base_form', 'reading', 'phonetic']:
            raise Exception(f'Unknown attribute name: {att}')
        self.att = att
        self.sorted = sorted

    def apply(self, tokens: Iterator[Token]) -> Iterator[Tuple[str, int]]:
        token_counts: Dict[str, int] = defaultdict(int)
        for token in tokens:
            token_counts[getattr(token, self.att)] += 1
        if self.sorted:
            return ((k, v) for k, v in sorted(token_counts.items(), key=lambda x: x[1], reverse=True))
        else:
            return ((k, v) for k, v in token_counts.items())
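
# Usage sketch (illustrative; assumes janome's Tokenizer). Like ExtractAttributeFilter,
# this must be the last filter in a chain because it yields (str, int) pairs rather than tokens:
#
#   from janome.tokenizer import Tokenizer
#
#   t = Tokenizer()
#   for word, count in TokenCountFilter(att='base_form', sorted=True)(t.tokenize('すもももももももものうち')):
#       print(word, count)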