Add word filters #116

Merged (3 commits) on Jul 1, 2023

49 changes: 46 additions & 3 deletions janome/tokenfilter.py
@@ -14,9 +14,9 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Iterator, List, Dict, Tuple, Any
+from typing import Iterator, Tuple, Any, List, Dict
 
-from .tokenizer import Token
+from janome.tokenizer import Token
 
 
 class TokenFilter(ABC):
@@ -66,7 +66,7 @@ def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
 
 
 class POSStopFilter(TokenFilter):
-    u"""
+    """
     A POSStopFilter removes tokens associated with part-of-speech tags
     listed in the stop tags list and keeps other tokens.
 
@@ -116,6 +116,49 @@ def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
                 yield token
 
 
+class WordStopFilter(TokenFilter):
+    """
+    A WordStopFilter removes tokens whose surface form is listed in the stop words list.
+
+    Added in *version 0.5.0*
+    """
+
+    def __init__(self, stop_words: List[str]):
+        """
+        Initialize WordStopFilter object.
+
+        :param stop_words: stop words list.
+        """
+        self.stop_words = stop_words
+
+    def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
+        for token in tokens:
+            if token.surface in self.stop_words:
+                continue
+            yield token
+
+
+class WordKeepFilter(TokenFilter):
+    """
+    A WordKeepFilter keeps tokens whose surface form is listed in the keep words list.
+
+    Added in *version 0.5.0*
+    """
+
+    def __init__(self, keep_words: List[str]) -> None:
+        """
+        Initialize WordKeepFilter object.
+
+        :param keep_words: keep words list.
+        """
+        self.keep_words = keep_words
+
+    def apply(self, tokens: Iterator[Token]) -> Iterator[Token]:
+        for token in tokens:
+            if token.surface in self.keep_words:
+                yield token
+
+
 class CompoundNounFilter(TokenFilter):
     """
     A CompoundNounFilter generates compound nouns.
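
For quick reference, here is a minimal usage sketch of the two new filters, applying each directly to a `Tokenizer` stream; the sample text and expected surfaces mirror the tests added below.

```python
from janome.tokenizer import Tokenizer
from janome.tokenfilter import WordStopFilter, WordKeepFilter

t = Tokenizer()

# WordStopFilter drops tokens whose surface form is in the stop list.
stop = WordStopFilter(['東京', '駅'])
print([tok.surface for tok in stop.apply(t.tokenize('東京駅で降りる'))])
# -> ['で', '降りる']

# WordKeepFilter keeps only tokens whose surface form is in the keep list.
keep = WordKeepFilter(['東京', '駅'])
print([tok.surface for tok in keep.apply(t.tokenize('東京駅で降りる'))])
# -> ['東京', '駅']
```
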
12 changes: 12 additions & 0 deletions tests/test_tokenfilter.py
@@ -20,6 +20,8 @@
     UpperCaseFilter,
     POSStopFilter,
     POSKeepFilter,
+    WordStopFilter,
+    WordKeepFilter,
     CompoundNounFilter,
     ExtractAttributeFilter,
     TokenCountFilter
@@ -60,6 +62,16 @@ def test_pos_keep_filter(self):
         tokens = tf.apply(self.t.tokenize('東京駅で降りる'))
         self.assertEqual(['名詞,固有名詞,地域,一般', '動詞,自立,*,*'], list(map(lambda token: token.part_of_speech, tokens)))
 
+    def test_word_stop_filter(self):
+        tf = WordStopFilter(['東京', '駅'])
+        tokens = tf.apply(self.t.tokenize('東京駅で降りる'))
+        self.assertEqual(['で', '降りる'], list(map(lambda token: token.surface, tokens)))
+
+    def test_word_keep_filter(self):
+        tf = WordKeepFilter(['東京', '駅'])
+        tokens = tf.apply(self.t.tokenize('東京駅で降りる'))
+        self.assertEqual(['東京', '駅'], list(map(lambda token: token.surface, tokens)))
+
     def test_compound_noun_filter(self):
         tf = CompoundNounFilter()
         tokens = tf.apply(self.t.tokenize('浜松町駅から東京モノレールで羽田空港ターミナルへ向かう'))
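
Beyond direct `apply` calls, the new filters compose with the existing ones through janome's `Analyzer`. A sketch, assuming the keyword-only `Analyzer` API (janome 0.4+); the stop-word list here is illustrative, not part of this PR:

```python
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter, WordStopFilter

# Keep only nouns, then drop uninformative surface forms.
analyzer = Analyzer(
    tokenizer=Tokenizer(),
    token_filters=[POSKeepFilter(['名詞']), WordStopFilter(['それ', 'こと'])],
)

for token in analyzer.analyze('東京駅でそれを見た'):
    print(token.surface)  # expected: 東京, 駅 (それ is removed by WordStopFilter)
```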