Skip to content

Commit

Permalink
Add word filters (#116)
Browse files Browse the repository at this point in the history
Added WordStopFilter and WordKeepFilter.
  • Loading branch information
mocobeta committed Jul 1, 2023
1 parent 88f1d66 commit 8f92b61
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 3 deletions.
49 changes: 46 additions & 3 deletions janome/tokenfilter.py
Expand Up @@ -14,9 +14,9 @@

from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Iterator, List, Dict, Tuple, Any
from typing import Iterator, Tuple, Any, List, Dict

from .tokenizer import Token
from janome.tokenizer import Token


class TokenFilter(ABC):
Expand Down Expand Up @@ -66,7 +66,7 @@ def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:


class POSStopFilter(TokenFilter):
u"""
"""
A POSStopFilter removes tokens associated with part-of-speech tags
listed in the stop tags list and keeps other tokens.
Expand Down Expand Up @@ -116,6 +116,49 @@ def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
yield token


class WordStopFilter(TokenFilter):
    """
    A WordStopFilter removes tokens whose surface form is listed in the stop words list.

    Added in *version 0.5.0*
    """

    def __init__(self, stop_words: List[str]) -> None:
        """
        Initialize WordStopFilter object.

        :param stop_words: stop words list.
        """
        # Keep the list as a public attribute for backward compatibility.
        self.stop_words = stop_words
        # Frozen-set copy so apply() does O(1) membership tests instead of
        # an O(n) scan of the list for every token.
        self._stop_words_set = frozenset(stop_words)

    def apply(self, tokens: Iterator['Token']) -> Iterator['Token']:
        """
        Yield only the tokens whose surface form is NOT in the stop words list.

        :param tokens: stream of tokens to filter.
        """
        for token in tokens:
            if token.surface in self._stop_words_set:
                continue
            yield token


class WordKeepFilter(TokenFilter):
    """
    A WordKeepFilter keeps tokens whose surface form is listed in the keep words list.

    Added in *version 0.5.0*
    """

    def __init__(self, keep_words: List[str]) -> None:
        """
        Initialize WordKeepFilter object.

        :param keep_words: keep words list.
        """
        # Keep the list as a public attribute for backward compatibility.
        self.keep_words = keep_words
        # Frozen-set copy so apply() does O(1) membership tests instead of
        # an O(n) scan of the list for every token.
        self._keep_words_set = frozenset(keep_words)

    def apply(self, tokens: Iterator['Token']) -> Iterator['Token']:
        """
        Yield only the tokens whose surface form IS in the keep words list.

        :param tokens: stream of tokens to filter.
        """
        for token in tokens:
            if token.surface in self._keep_words_set:
                yield token


class CompoundNounFilter(TokenFilter):
"""
A CompoundNounFilter generates compound nouns.
Expand Down
12 changes: 12 additions & 0 deletions tests/test_tokenfilter.py
Expand Up @@ -20,6 +20,8 @@
UpperCaseFilter,
POSStopFilter,
POSKeepFilter,
WordStopFilter,
WordKeepFilter,
CompoundNounFilter,
ExtractAttributeFilter,
TokenCountFilter
Expand Down Expand Up @@ -60,6 +62,16 @@ def test_pos_keep_filter(self):
tokens = tf.apply(self.t.tokenize('東京駅で降りる'))
self.assertEqual(['名詞,固有名詞,地域,一般', '動詞,自立,*,*'], list(map(lambda token: token.part_of_speech, tokens)))

def test_word_stop_filter(self):
    """WordStopFilter must drop tokens whose surface is in the stop list."""
    word_filter = WordStopFilter(['東京', '駅'])
    filtered = word_filter.apply(self.t.tokenize('東京駅で降りる'))
    surfaces = [token.surface for token in filtered]
    self.assertEqual(['で', '降りる'], surfaces)

def test_word_keep_filter(self):
    """WordKeepFilter must keep only tokens whose surface is in the keep list."""
    word_filter = WordKeepFilter(['東京', '駅'])
    filtered = word_filter.apply(self.t.tokenize('東京駅で降りる'))
    surfaces = [token.surface for token in filtered]
    self.assertEqual(['東京', '駅'], surfaces)

def test_compound_noun_filter(self):
tf = CompoundNounFilter()
tokens = tf.apply(self.t.tokenize('浜松町駅から東京モノレールで羽田空港ターミナルへ向かう'))
Expand Down

0 comments on commit 8f92b61

Please sign in to comment.