Commit

Merge pull request #41 from mocobeta/unknown-baseform
Set base_form attribute for unknown tokens
mocobeta committed Aug 6, 2017
2 parents b342d6d + 54d407f, commit a4472eb
Showing 2 changed files with 38 additions and 10 deletions.
janome/tokenizer.py (16 changes: 9 additions & 7 deletions)
@@ -175,35 +175,36 @@ def __init__(self, udic='', udic_enc='utf8', udic_type='ipadic', max_unknown_len
         self.user_dic = None
         self.max_unknown_length = max_unknown_length
 
-    def tokenize(self, text, stream = False, wakati = False):
+    def tokenize(self, text, stream=False, wakati=False, baseform_unk=True):
         u"""
         Tokenize the input text.
         :param text: unicode string to be tokenized
         :param stream: (Optional) if given True use stream mode. default is False.
         :param wakati: (Optinal) if given True returns surface forms only. default is False.
+        :param baseform_unk: (Optional) if given True sets base_form attribute for unknown tokens. default is True.
         :return: list of tokens (stream=False, wakati=False) or token generator (stream=True, wakati=False) or list of string (stream=False, wakati=True) or string generator (stream=True, wakati=True)
         """
         if self.wakati:
             wakati = True
         if stream:
-            return self.__tokenize_stream(text, wakati)
+            return self.__tokenize_stream(text, wakati, baseform_unk)
         else:
-            return list(self.__tokenize_stream(text, wakati))
+            return list(self.__tokenize_stream(text, wakati, baseform_unk))
 
-    def __tokenize_stream(self, text, wakati = False):
+    def __tokenize_stream(self, text, wakati, baseform_unk):
         text = text.strip()
         text_length = len(text)
         processed = 0
         while processed < text_length:
-            tokens, pos = self.__tokenize_partial(text[processed:], wakati)
+            tokens, pos = self.__tokenize_partial(text[processed:], wakati, baseform_unk)
             for token in tokens:
                 yield token
             processed += pos
 
 
-    def __tokenize_partial(self, text, wakati = False):
+    def __tokenize_partial(self, text, wakati, baseform_unk):
         if self.wakati and not wakati:
             raise WakatiModeOnlyException
@@ -247,7 +248,8 @@ def __tokenize_partial(self, text, wakati = False):
                 assert unknown_entries
                 for entry in unknown_entries:
                     left_id, right_id, cost, part_of_speech = entry
-                    dummy_dict_entry = (buf, left_id, right_id, cost, part_of_speech, '*', '*', '*', '*', '*')
+                    base_form = buf if baseform_unk else '*'
+                    dummy_dict_entry = (buf, left_id, right_id, cost, part_of_speech, '*', '*', base_form, '*', '*')
                     lattice.add(Node(dummy_dict_entry, NodeType.UNKNOWN))
 
             pos += lattice.forward()
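After this change, unknown tokens (surfaces not found in the system or user dictionary) carry their surface form as base_form by default; passing baseform_unk=False restores the previous '*' value. A minimal usage sketch, assuming a janome build that includes this commit and the surface / base_form attributes Token already exposes:

    from janome.tokenizer import Tokenizer

    t = Tokenizer()

    # Default (baseform_unk=True): unknown tokens such as the numerals below
    # get their own surface as base_form.
    for token in t.tokenize(u'2009年10月16日'):
        print(token.surface, token.base_form)   # 2009 -> 2009, 年 -> 年, ...

    # Previous behaviour: with baseform_unk=False, unknown tokens keep '*' as base_form.
    for token in t.tokenize(u'2009年10月16日', baseform_unk=False):
        print(token.surface, token.base_form)   # 2009 -> *, 年 -> 年, ...

The tests below assert exactly this difference.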
tests/test_tokenizer.py (32 changes: 29 additions & 3 deletions)
@@ -58,19 +58,45 @@ def test_tokenize2(self):
         text = u'𠮷野屋'
         tokens = Tokenizer().tokenize(text)
         self.assertEqual(3, len(tokens))
-        self._check_token(tokens[0], u'𠮷', u'記号,一般,*,*,*,*,*,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[0], u'𠮷', u'記号,一般,*,*,*,*,𠮷,*,*', NodeType.UNKNOWN)
         self._check_token(tokens[1], u'野', u'名詞,一般,*,*,*,*,野,ノ,ノ', NodeType.SYS_DICT)
         self._check_token(tokens[2], u'屋', u'名詞,接尾,一般,*,*,*,屋,ヤ,ヤ', NodeType.SYS_DICT)
 
         text = u'한국어'
         tokens = Tokenizer().tokenize(text)
         self.assertEqual(1, len(tokens))
-        self._check_token(tokens[0], u'한국어', u'記号,一般,*,*,*,*,*,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[0], u'한국어', u'記号,一般,*,*,*,*,한국어,*,*', NodeType.UNKNOWN)
 
     def test_tokenize_unknown(self):
         text = u'2009年10月16日'
         tokens = Tokenizer().tokenize(text)
         self.assertEqual(6, len(tokens))
+        self._check_token(tokens[0], u'2009', u'名詞,数,*,*,*,*,2009,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[1], u'年', u'名詞,接尾,助数詞,*,*,*,年,ネン,ネン', NodeType.SYS_DICT)
+        self._check_token(tokens[2], u'10', u'名詞,数,*,*,*,*,10,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[3], u'月', u'名詞,一般,*,*,*,*,月,ツキ,ツキ', NodeType.SYS_DICT)
+        self._check_token(tokens[4], u'16', u'名詞,数,*,*,*,*,16,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[5], u'日', u'名詞,接尾,助数詞,*,*,*,日,ニチ,ニチ', NodeType.SYS_DICT)
+
+        text = u'マルチメディア放送(VHF-HIGH帯)「モバキャス」'
+        tokens = Tokenizer().tokenize(text)
+        self.assertEqual(11, len(tokens))
+        self._check_token(tokens[0], u'マルチメディア', u'名詞,一般,*,*,*,*,マルチメディア,マルチメディア,マルチメディア', NodeType.SYS_DICT)
+        self._check_token(tokens[1], u'放送', u'名詞,サ変接続,*,*,*,*,放送,ホウソウ,ホーソー', NodeType.SYS_DICT)
+        self._check_token(tokens[2], u'(', u'記号,括弧開,*,*,*,*,(,(,(', NodeType.SYS_DICT)
+        self._check_token(tokens[3], u'VHF', u'名詞,固有名詞,組織,*,*,*,VHF,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[4], u'-', u'名詞,サ変接続,*,*,*,*,-,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[5], u'HIGH', u'名詞,一般,*,*,*,*,HIGH,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[6], u'帯', u'名詞,接尾,一般,*,*,*,帯,タイ,タイ', NodeType.SYS_DICT)
+        self._check_token(tokens[7], u')', u'記号,括弧閉,*,*,*,*,),),)', NodeType.SYS_DICT)
+        self._check_token(tokens[8], u'「', u'記号,括弧開,*,*,*,*,「,「,「', NodeType.SYS_DICT)
+        self._check_token(tokens[9], u'モバキャス', u'名詞,固有名詞,一般,*,*,*,モバキャス,*,*', NodeType.UNKNOWN)
+        self._check_token(tokens[10], u'」', u'記号,括弧閉,*,*,*,*,」,」,」', NodeType.SYS_DICT)
+
+    def test_tokenize_unknown_no_baseform(self):
+        text = u'2009年10月16日'
+        tokens = Tokenizer().tokenize(text, baseform_unk=False)
+        self.assertEqual(6, len(tokens))
         self._check_token(tokens[0], u'2009', u'名詞,数,*,*,*,*,*,*,*', NodeType.UNKNOWN)
         self._check_token(tokens[1], u'年', u'名詞,接尾,助数詞,*,*,*,年,ネン,ネン', NodeType.SYS_DICT)
         self._check_token(tokens[2], u'10', u'名詞,数,*,*,*,*,*,*,*', NodeType.UNKNOWN)
@@ -79,7 +105,7 @@ def test_tokenize_unknown(self):
         self._check_token(tokens[5], u'日', u'名詞,接尾,助数詞,*,*,*,日,ニチ,ニチ', NodeType.SYS_DICT)
 
         text = u'マルチメディア放送(VHF-HIGH帯)「モバキャス」'
-        tokens = Tokenizer().tokenize(text)
+        tokens = Tokenizer().tokenize(text, baseform_unk=False)
         self.assertEqual(11, len(tokens))
         self._check_token(tokens[0], u'マルチメディア', u'名詞,一般,*,*,*,*,マルチメディア,マルチメディア,マルチメディア', NodeType.SYS_DICT)
         self._check_token(tokens[1], u'放送', u'名詞,サ変接続,*,*,*,*,放送,ホウソウ,ホーソー', NodeType.SYS_DICT)
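For reading the expected strings in these tests: each one is the comma-joined token detail, with base_form in the seventh field, which is why only that position differs between test_tokenize_unknown and test_tokenize_unknown_no_baseform. A short sketch of that mapping, assuming janome's usual Token attribute names:

    from janome.tokenizer import Tokenizer

    token = Tokenizer().tokenize(u'2009年10月16日')[0]   # unknown token '2009'
    detail = ','.join([token.part_of_speech,  # 名詞,数,*,*  (four part-of-speech fields)
                       token.infl_type,       # *
                       token.infl_form,       # *
                       token.base_form,       # 2009  ('*' when baseform_unk=False)
                       token.reading,         # *
                       token.phonetic])       # *
    print(detail)  # 名詞,数,*,*,*,*,2009,*,*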
