Skip to content

Commit

Permalink
Add word filters (#116)
Browse files Browse the repository at this point in the history
Added WordStopFilter and WordKeepFilter.
  • Loading branch information
mocobeta committed Jul 1, 2023
1 parent 88f1d66 commit 8f92b61
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 3 deletions.
49 changes: 46 additions & 3 deletions janome/tokenfilter.py
Expand Up @@ -14,9 +14,9 @@

from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Iterator, List, Dict, Tuple, Any
from typing import Iterator, Tuple, Any, List, Dict

from .tokenizer import Token
from janome.tokenizer import Token


class TokenFilter(ABC):
Expand Down Expand Up @@ -66,7 +66,7 @@ def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:


class POSStopFilter(TokenFilter):
u"""
"""
A POSStopFilter removes tokens associated with part-of-speech tags
listed in the stop tags list and keeps other tokens.
Expand Down Expand Up @@ -116,6 +116,49 @@ def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
yield token


class WordStopFilter(TokenFilter):
    """
    A WordStopFilter removes tokens whose surface form is listed in the stop words list.

    Added in *version 0.5.0*
    """

    def __init__(self, stop_words: List[str]) -> None:
        """
        Initialize WordStopFilter object.

        :param stop_words: stop words list.
        """
        # Keep the list as a public attribute for backward compatibility.
        self.stop_words = stop_words
        # Frozen-set copy so apply() does O(1) membership tests instead of
        # an O(n) scan of the list for every token.
        self._stop_words_set = frozenset(stop_words)

    def apply(self, tokens: Iterator['Token']) -> Iterator['Token']:
        """
        Yield only the tokens whose surface form is NOT in the stop words list.

        :param tokens: stream of tokens to filter.
        """
        for token in tokens:
            if token.surface in self._stop_words_set:
                continue
            yield token


class WordKeepFilter(TokenFilter):
    """
    A WordKeepFilter keeps tokens whose surface form is listed in the keep words list.

    Added in *version 0.5.0*
    """

    def __init__(self, keep_words: List[str]) -> None:
        """
        Initialize WordKeepFilter object.

        :param keep_words: keep words list.
        """
        # Keep the list as a public attribute for backward compatibility.
        self.keep_words = keep_words
        # Frozen-set copy so apply() does O(1) membership tests instead of
        # an O(n) scan of the list for every token.
        self._keep_words_set = frozenset(keep_words)

    def apply(self, tokens: Iterator['Token']) -> Iterator['Token']:
        """
        Yield only the tokens whose surface form IS in the keep words list.

        :param tokens: stream of tokens to filter.
        """
        for token in tokens:
            if token.surface in self._keep_words_set:
                yield token


class CompoundNounFilter(TokenFilter):
"""
A CompoundNounFilter generates compound nouns.
Expand Down
12 changes: 12 additions & 0 deletions tests/test_tokenfilter.py
Expand Up @@ -20,6 +20,8 @@
UpperCaseFilter,
POSStopFilter,
POSKeepFilter,
WordStopFilter,
WordKeepFilter,
CompoundNounFilter,
ExtractAttributeFilter,
TokenCountFilter
Expand Down Expand Up @@ -60,6 +62,16 @@ def test_pos_keep_filter(self):
tokens = tf.apply(self.t.tokenize('東京駅で降りる'))
self.assertEqual(['名詞,固有名詞,地域,一般', '動詞,自立,*,*'], list(map(lambda token: token.part_of_speech, tokens)))

def test_word_stop_filter(self):
    """WordStopFilter must drop tokens whose surface is in the stop list."""
    word_filter = WordStopFilter(['東京', '駅'])
    filtered = word_filter.apply(self.t.tokenize('東京駅で降りる'))
    surfaces = [token.surface for token in filtered]
    self.assertEqual(['で', '降りる'], surfaces)

def test_word_keep_filter(self):
    """WordKeepFilter must keep only tokens whose surface is in the keep list."""
    word_filter = WordKeepFilter(['東京', '駅'])
    filtered = word_filter.apply(self.t.tokenize('東京駅で降りる'))
    surfaces = [token.surface for token in filtered]
    self.assertEqual(['東京', '駅'], surfaces)

def test_compound_noun_filter(self):
tf = CompoundNounFilter()
tokens = tf.apply(self.t.tokenize('浜松町駅から東京モノレールで羽田空港ターミナルへ向かう'))
Expand Down

0 comments on commit 8f92b61

Please sign in to comment.