Add 'baseform_unk' option for backward compatibility.
mocobeta committed Aug 6, 2017
1 parent 4add22d commit 54d407f
Showing 2 changed files with 35 additions and 7 deletions.
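For orientation, a minimal usage sketch of the new option (illustrative only, not part of this commit; the expected values follow the tests added below):

from janome.tokenizer import Tokenizer

t = Tokenizer()
# Default (baseform_unk=True): unknown tokens keep their surface form as base_form,
# matching the behaviour before this commit.
print(t.tokenize(u'2009年10月16日')[0].base_form)                       # u'2009'
# With baseform_unk=False, base_form of unknown tokens stays '*'.
print(t.tokenize(u'2009年10月16日', baseform_unk=False)[0].base_form)   # u'*'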
16 changes: 9 additions & 7 deletions janome/tokenizer.py
@@ -175,35 +175,36 @@ def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_len
self.user_dic = None
self.max_unknown_length = max_unknown_length

-def tokenize(self, text, stream = False, wakati = False):
+def tokenize(self, text, stream=False, wakati=False, baseform_unk=True):
u"""
Tokenize the input text.
:param text: unicode string to be tokenized
:param stream: (Optional) if given True use stream mode. default is False.
:param wakati: (Optinal) if given True returns surface forms only. default is False.
+:param baseform_unk: (Optional) if given True sets base_form attribute for unknown tokens. default is True.
:return: list of tokens (stream=False, wakati=False) or token generator (stream=True, wakati=False) or list of string (stream=False, wakati=True) or string generator (stream=True, wakati=True)
"""
if self.wakati:
wakati = True
if stream:
-return self.__tokenize_stream(text, wakati)
+return self.__tokenize_stream(text, wakati, baseform_unk)
else:
-return list(self.__tokenize_stream(text, wakati))
+return list(self.__tokenize_stream(text, wakati, baseform_unk))

-def __tokenize_stream(self, text, wakati = False):
+def __tokenize_stream(self, text, wakati, baseform_unk):
text = text.strip()
text_length = len(text)
processed = 0
while processed < text_length:
-tokens, pos = self.__tokenize_partial(text[processed:], wakati)
+tokens, pos = self.__tokenize_partial(text[processed:], wakati, baseform_unk)
for token in tokens:
yield token
processed += pos


-def __tokenize_partial(self, text, wakati = False):
+def __tokenize_partial(self, text, wakati, baseform_unk):
if self.wakati and not wakati:
raise WakatiModeOnlyException
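The new keyword is threaded from tokenize() through __tokenize_stream() and into __tokenize_partial() above, so it applies in stream mode as well. A small sketch (illustrative only, not part of this commit):

from janome.tokenizer import Tokenizer

t = Tokenizer()
# stream=True returns a token generator; baseform_unk is passed through unchanged.
for token in t.tokenize(u'2009年10月16日', stream=True, baseform_unk=False):
    print(token.surface, token.base_form)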

@@ -247,7 +248,8 @@ def __tokenize_partial(self, text, wakati = False):
assert unknown_entries
for entry in unknown_entries:
left_id, right_id, cost, part_of_speech = entry
-dummy_dict_entry = (buf, left_id, right_id, cost, part_of_speech, '*', '*', buf, '*', '*')
+base_form = buf if baseform_unk else '*'
+dummy_dict_entry = (buf, left_id, right_id, cost, part_of_speech, '*', '*', base_form, '*', '*')
lattice.add(Node(dummy_dict_entry, NodeType.UNKNOWN))

pos += lattice.forward()
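The behavioural core of the commit is the conditional added above. A standalone sketch of the same logic (illustrative only; the function name is hypothetical):

def unknown_base_form(surface, baseform_unk=True):
    # Mirrors the changed line: keep the surface form as base_form by default,
    # or leave it as '*' when baseform_unk is False.
    return surface if baseform_unk else '*'

assert unknown_base_form(u'2009') == u'2009'
assert unknown_base_form(u'2009', baseform_unk=False) == u'*'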
26 changes: 26 additions & 0 deletions tests/test_tokenizer.py
@@ -93,6 +93,32 @@ def test_tokenize_unknown(self):
self._check_token(tokens[9], u'モバキャス', u'名詞,固有名詞,一般,*,*,*,モバキャス,*,*', NodeType.UNKNOWN)
self._check_token(tokens[10], u'」', u'記号,括弧閉,*,*,*,*,」,」,」', NodeType.SYS_DICT)

def test_tokenize_unknown_no_baseform(self):
text = u'2009年10月16日'
tokens = Tokenizer().tokenize(text, baseform_unk=False)
self.assertEqual(6, len(tokens))
self._check_token(tokens[0], u'2009', u'名詞,数,*,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[1], u'年', u'名詞,接尾,助数詞,*,*,*,年,ネン,ネン', NodeType.SYS_DICT)
self._check_token(tokens[2], u'10', u'名詞,数,*,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[3], u'月', u'名詞,一般,*,*,*,*,月,ツキ,ツキ', NodeType.SYS_DICT)
self._check_token(tokens[4], u'16', u'名詞,数,*,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[5], u'日', u'名詞,接尾,助数詞,*,*,*,日,ニチ,ニチ', NodeType.SYS_DICT)

text = u'マルチメディア放送(VHF-HIGH帯)「モバキャス」'
tokens = Tokenizer().tokenize(text, baseform_unk=False)
self.assertEqual(11, len(tokens))
self._check_token(tokens[0], u'マルチメディア', u'名詞,一般,*,*,*,*,マルチメディア,マルチメディア,マルチメディア', NodeType.SYS_DICT)
self._check_token(tokens[1], u'放送', u'名詞,サ変接続,*,*,*,*,放送,ホウソウ,ホーソー', NodeType.SYS_DICT)
self._check_token(tokens[2], u'(', u'記号,括弧開,*,*,*,*,(,(,(', NodeType.SYS_DICT)
self._check_token(tokens[3], u'VHF', u'名詞,固有名詞,組織,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[4], u'-', u'名詞,サ変接続,*,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[5], u'HIGH', u'名詞,一般,*,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[6], u'帯', u'名詞,接尾,一般,*,*,*,帯,タイ,タイ', NodeType.SYS_DICT)
self._check_token(tokens[7], u')', u'記号,括弧閉,*,*,*,*,),),)', NodeType.SYS_DICT)
self._check_token(tokens[8], u'「', u'記号,括弧開,*,*,*,*,「,「,「', NodeType.SYS_DICT)
self._check_token(tokens[9], u'モバキャス', u'名詞,固有名詞,一般,*,*,*,*,*,*', NodeType.UNKNOWN)
self._check_token(tokens[10], u'」', u'記号,括弧閉,*,*,*,*,」,」,」', NodeType.SYS_DICT)

def test_tokenize_with_userdic(self):
text = u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'
udic_file = os.path.join(parent_dir, 'tests/user_ipadic.csv')
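In test_tokenize_unknown_no_baseform above, the expected feature strings differ from those in test_tokenize_unknown only in the base_form field (the 7th of the 9 comma-separated values). A quick illustrative check:

with_base    = u'名詞,固有名詞,一般,*,*,*,モバキャス,*,*'.split(u',')
without_base = u'名詞,固有名詞,一般,*,*,*,*,*,*'.split(u',')
# Only index 6 (base_form) differs between the two expectations.
assert [i for i in range(9) if with_base[i] != without_base[i]] == [6]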
