from typing import List


class NLTKMosesTokenizer:
    """Create the Moses Tokenizer implemented in NLTK.
    From:
        https://www.nltk.org/_modules/nltk/tokenize/moses.html

    Examples:
    >>> tokenizer = prenlp.tokenizer.NLTKMosesTokenizer()
    >>> tokenizer('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    >>> tokenizer.tokenize('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception:
            # Importing the moses module fails when the required NLTK data
            # packages are missing; download them and retry the import.
            import nltk
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text, escape=False)


class Mecab:
    """Create the Mecab morphological analyzer.
    From:
        https://bitbucket.org/eunjeon/mecab-ko-dic/src

    Examples:
    >>> tokenizer = prenlp.tokenizer.Mecab()
    >>> tokenizer('모든 이야기에는 끝이 있지만, 인생에서의 모든 끝은 새로운 시작을 의미한다.')
    ['모든', '이야기', '에', '는', '끝', '이', '있', '지만', ',', '인생', '에서', '의', '모든', '끝', '은', '새로운', '시작', '을', '의미', '한다', '.']
    >>> tokenizer.tokenize('모든 이야기에는 끝이 있지만, 인생에서의 모든 끝은 새로운 시작을 의미한다.')
    ['모든', '이야기', '에', '는', '끝', '이', '있', '지만', ',', '인생', '에서', '의', '모든', '끝', '은', '새로운', '시작', '을', '의미', '한다', '.']
    """
    def __init__(self):
        try:
            from konlpy.tag import Mecab
        except ImportError:
            raise ImportError(
                'Mecab is not installed. '
                'You can install Mecab with "sh scripts/install_mecab.sh". '
                'For details, refer to the installation guide at '
                'https://github.com/lyeoni/prenlp/blob/master/scripts/install_mecab.sh '
                'or https://bitbucket.org/eunjeon/mecab-ko-dic/src')
        self.tokenizer = Mecab()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.morphs(text)


class SentencePiece:
    """Create the SentencePiece subword tokenizer.
    From:
        https://github.com/google/sentencepiece

    Examples:
    >>> prenlp.tokenizer.SentencePiece.train(input='corpus.txt', model_prefix='sentencepiece', vocab_size=10000)
    >>> tokenizer = prenlp.tokenizer.SentencePiece.load('sentencepiece.model')
    >>> tokenizer('Time is the most valuable thing a man can spend.')
    ['▁Time', '▁is', '▁the', '▁most', '▁valuable', '▁thing', '▁a', '▁man', '▁can', '▁spend', '.']
    >>> tokenizer.tokenize('Time is the most valuable thing a man can spend.')
    ['▁Time', '▁is', '▁the', '▁most', '▁valuable', '▁thing', '▁a', '▁man', '▁can', '▁spend', '.']
    >>> tokenizer.detokenize(['▁Time', '▁is', '▁the', '▁most', '▁valuable', '▁thing', '▁a', '▁man', '▁can', '▁spend', '.'])
    'Time is the most valuable thing a man can spend.'
    """
    def __init__(self):
        try:
            import sentencepiece
        except ImportError:
            raise ImportError(
                'SentencePiece is not installed. '
                'You can install the Python binary package of SentencePiece with "pip install sentencepiece". '
                'For details, refer to the official installation guide at '
                'https://github.com/google/sentencepiece')
        self.sentencepiece = sentencepiece
        self.processor = sentencepiece.SentencePieceProcessor()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.processor.EncodeAsPieces(text)

    def detokenize(self, tokens: List[str]) -> str:
        return self.processor.DecodePieces(tokens)
    @classmethod
    def train(cls, input: str, model_prefix: str, vocab_size: int,
              character_coverage: float = 1.0,
              model_type: str = 'bpe',
              max_sentence_length: int = 100000,
              pad_id: int = 0,
              unk_id: int = 1,
              bos_id: int = 2,
              eos_id: int = 3,
              pad_token: str = '[PAD]',
              unk_token: str = '[UNK]',
              bos_token: str = '[BOS]',
              eos_token: str = '[EOS]',
              user_defined_symbols: str = '[SEP],[CLS],[MASK]') -> None:
        """Train a SentencePiece model.

        Args:
            input (str): one-sentence-per-line raw corpus file
            model_prefix (str): output model name prefix.
                <model_prefix>.model and <model_prefix>.vocab are generated.
            vocab_size (int): vocabulary size
            character_coverage (float): amount of characters covered by the model.
                Good defaults are 0.9995 for languages with a rich character set,
                such as Japanese or Chinese, and 1.0 for languages with a small character set.
            model_type (str): model type. Choose from 'bpe', 'unigram', 'char', or 'word'.
                The input sentence must be pretokenized when using the 'word' type.
        """
        # Build the flag string expected by SentencePieceTrainer and run training.
        template = '--input={} --model_prefix={} --vocab_size={} \
                    --character_coverage={} \
                    --model_type={} \
                    --max_sentence_length={} \
                    --pad_id={} --pad_piece={} \
                    --unk_id={} --unk_piece={} \
                    --bos_id={} --bos_piece={} \
                    --eos_id={} --eos_piece={} \
                    --user_defined_symbols={}'
        cmd = template.format(input, model_prefix, vocab_size,
                              character_coverage,
                              model_type,
                              max_sentence_length,
                              pad_id, pad_token,
                              unk_id, unk_token,
                              bos_id, bos_token,
                              eos_id, eos_token,
                              user_defined_symbols)
        cls().sentencepiece.SentencePieceTrainer.train(cmd)
    @classmethod
    def load(cls, model: str) -> 'SentencePiece':
        """Load a pre-trained SentencePiece model.

        Args:
            model (str): path to the pre-trained sentencepiece model file
        """
        tokenizer = cls()
        tokenizer.processor.load(model)
        return tokenizer
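

if __name__ == '__main__':
    # Minimal end-to-end sketch, not part of the library API: train a
    # SentencePiece model on a tiny throwaway corpus, reload it, and
    # round-trip a sentence. 'demo_corpus.txt' and the 'demo_sp' model
    # prefix are hypothetical names used only for this example, and the
    # small vocab_size is an assumption chosen so a corpus this tiny can
    # still support it.
    with open('demo_corpus.txt', 'w', encoding='utf-8') as f:
        sentences = ['Time is the most valuable thing a man can spend.',
                     'PreNLP package provides a variety of text preprocessing tools.']
        for line in sentences * 50:
            f.write(line + '\n')

    SentencePiece.train(input='demo_corpus.txt', model_prefix='demo_sp',
                        vocab_size=100)
    tokenizer = SentencePiece.load('demo_sp.model')

    pieces = tokenizer('Time is the most valuable thing a man can spend.')
    print(pieces)                        # e.g. ['▁Time', '▁is', ...]
    print(tokenizer.detokenize(pieces))  # 'Time is the most valuable thing a man can spend.'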