## KoNLPy as pre-tokenizer

In [1]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyPreTokenizer
from konlpy.tag import Komoran

# Example sentence; the later cells tokenize this same string.
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'

# Build the pre-tokenizer on top of a Komoran tagger instance.
komoran_tagger = Komoran()
komoran_pretok = KoNLPyPreTokenizer(komoran_tagger)

# The pre-tokenizer is called directly on the sentence; show its result.
pretokenized_sent = komoran_pretok(sent_ko)
print(pretokenized_sent)

신종 코로나바이러스 감염증 ( 코로나 19 ) 사태 가 심각 하 ㅂ니다


In [2]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyPretokBertWordPieceTokenizer
from huggingface_konlpy.transformers_konlpy import KoNLPyPretokBertTokenizer


import os

# Output directory for the trained vocabulary. Nothing else in this notebook
# creates it, so make sure it exists before the (slow) training step; that
# way a fresh checkout survives Restart & Run All instead of failing at
# save_model() on a missing directory.
komoran_vocab_dir = './tokenizers/KomoranBertWordPieceTokenizer/'
os.makedirs(komoran_vocab_dir, exist_ok=True)

# WordPiece tokenizer that uses the Komoran pre-tokenizer from the first cell.
komoran_bertwordpiece_tokenizer = KoNLPyPretokBertWordPieceTokenizer(
    konlpy_pretok = komoran_pretok)

# Train on the COVID news sentences with a 3000-entry vocabulary.
komoran_bertwordpiece_tokenizer.train(
    files = ['../data/2020-07-29_covid_news_sents.txt'],
    vocab_size = 3000)

# Persist the trained model under the name 'covid'; the vocabulary file
# loaded just below is expected at <directory>/covid-vocab.txt.
komoran_bertwordpiece_tokenizer.save_model(
    directory=komoran_vocab_dir,
    name='covid')

# Transformers-side tokenizer built from the same pre-tokenizer and the
# vocabulary file saved above.
komoran_pretok_berttokenizer = KoNLPyPretokBertTokenizer(
    konlpy_pretok = komoran_pretok,
    vocab_file = './tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt')

In [3]:
from huggingface_konlpy import compose

# Encode the sample sentence into vocabulary ids.
indices = komoran_pretok_berttokenizer.encode(sent_ko)

# Map every id back to its token string via the tokenizer's lookup table.
komoran_id_to_token = komoran_pretok_berttokenizer.ids_to_tokens
tokens = [komoran_id_to_token[token_id] for token_id in indices]

# compose() post-processes the token list; print the result space-separated.
composed_tokens = compose(tokens)
print(' '.join(composed_tokens))

[CLS] 신종 코로나바이러스 감염증 ( 코로나 19 ) 사태 가 심 ##각 하 [UNK] [SEP]


## KoNLPy WordPiece Tokenizer

### with and without tag

In [4]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyWordPieceTokenizer
from konlpy.tag import Mecab

# Mecab-backed WordPiece tokenizer with use_tag switched off.
mecab_wordpiece_notag = KoNLPyWordPieceTokenizer(
    Mecab(),
    use_tag=False,
)

# Tokenize the sample sentence and show the pieces on one line.
notag_pieces = mecab_wordpiece_notag.tokenize(sent_ko)
print(' '.join(notag_pieces))

신종 코로나 ##바이러스 감염증 ##( ##코로나 ##19 ##) 사태 ##가 심각 ##합니다


In [5]:
# Same tokenizer as the previous cell, but with use_tag switched on.
mecab_wordpiece_usetag = KoNLPyWordPieceTokenizer(
    Mecab(),
    use_tag=True,
)

# Tokenize the sample sentence and show the pieces on one line.
usetag_pieces = mecab_wordpiece_usetag.tokenize(sent_ko)
print(' '.join(usetag_pieces))

신종/NNG 코로나/NNP ##바이러스/NNG 감염증/NNG ##(/SSO ##코로나/NNP ##19/SN ##)/SSC 사태/NNG ##가/JKS 심각/XR ##합니다/XSA+EC


In [6]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyBertWordPieceTrainer

# Trainer for a BERT-style WordPiece vocabulary over Mecab output, use_tag off.
mecab_wordpiece_notag_trainer = KoNLPyBertWordPieceTrainer(Mecab(), use_tag=False)

# Train on the COVID news sentence file.
notag_train_files = ['../data/2020-07-29_covid_news_sents.txt']
mecab_wordpiece_notag_trainer.train(files = notag_train_files)

# Save under the 'notag' name. Kept as the final bare expression so the
# cell's displayed output stays the same.
mecab_wordpiece_notag_trainer.save_model(
    './tokenizers/BertStyleMecab/',
    'notag',
)

Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 80584.70it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:14<00:00, 4790.16it/s]


[/mnt/lovit/git/transformers_konlpy_trainer/tutorials/tokenizers/BertStyleMecab/notag-vocab.txt]


In [7]:
from huggingface_konlpy.transformers_konlpy import KoNLPyBertTokenizer

# WordPiece front-end (Mecab, use_tag off) handed to the BERT tokenizer.
notag_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False)

# BERT tokenizer that loads the 'notag' vocabulary file.
konlpy_bert_notag = KoNLPyBertTokenizer(
    konlpy_wordpiece = notag_wordpiece,
    vocab_file = './tokenizers/BertStyleMecab/notag-vocab.txt',
)

# Tokenize the sample sentence and show the pieces on one line.
bert_notag_pieces = konlpy_bert_notag.tokenize(sent_ko)
print(' '.join(bert_notag_pieces))

신종 코로나 ##바이러스 감염증 ##( ##코로나 ##19 ##) 사태 ##가 심각 ##합니다


In [8]:
# --- Train and save the 'usetag' vocabulary --------------------------------
mecab_wordpiece_usetag_trainer = KoNLPyBertWordPieceTrainer(
    Mecab(),
    use_tag=True,
)
usetag_train_files = ['../data/2020-07-29_covid_news_sents.txt']
mecab_wordpiece_usetag_trainer.train(files = usetag_train_files)
mecab_wordpiece_usetag_trainer.save_model(
    './tokenizers/BertStyleMecab/',
    'usetag',
)

# --- Build the BERT tokenizer from the saved vocabulary --------------------
usetag_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=True)
konlpy_bert_usetag = KoNLPyBertTokenizer(
    konlpy_wordpiece = usetag_wordpiece,
    vocab_file = './tokenizers/BertStyleMecab/usetag-vocab.txt',
)

# --- Encode the sample sentence and map the ids back to tokens -------------
indices = konlpy_bert_usetag.encode(sent_ko)
usetag_id_to_token = konlpy_bert_usetag.ids_to_tokens
tokens = [usetag_id_to_token[token_id] for token_id in indices]

# compose() post-processes the token list; print the result space-separated.
usetag_composed = compose(tokens)
print(' '.join(usetag_composed))

Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 81053.35it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:14<00:00, 4826.94it/s]


[/mnt/lovit/git/transformers_konlpy_trainer/tutorials/tokenizers/BertStyleMecab/usetag-vocab.txt]
[CLS] 신종/NNG 코로나/NNP ##바이러스/NNG 감염증/NNG ##(/SSO ##코로나/NNP ##19/SN ##)/SSC 사태/NNG ##가/JKS 심각/XR 합 니 다 [SEP]
