# import

In [1]:
import json
import os
import re
from pprint import pprint

from soynlp.normalizer import repeat_normalize

In [2]:
def dump_jsonl(data, output_path, append=False):
    """Write records to ``output_path`` in JSON Lines format.

    Each item of ``data`` is serialized with ``json.dumps`` and written on
    its own line, terminated by ``"\\n"``.

    Args:
        data: Iterable of JSON-serializable objects.
        output_path: Path of the file to write.
        append: When true the file is opened in ``"a+"`` mode and the records
            are added after any existing content; otherwise it is opened in
            ``"w"`` mode, which truncates it first.
    """
    if append:
        mode = "a+"
    else:
        mode = "w"

    with open(output_path, mode, encoding="utf-8") as sink:
        # ensure_ascii=False writes non-ASCII characters as-is instead of
        # \uXXXX escapes.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )


def load_jsonl(input_path) -> list:
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising.

    Args:
        input_path: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            # str.rstrip/strip take a *set of characters*, not a pattern, so
            # the former rstrip("\n|\r") also removed trailing "|" characters
            # and let corrupted lines parse. Strip whitespace only.
            payload = line.strip()
            if not payload:
                # json.loads("") raises, so tolerate empty lines explicitly.
                continue
            data.append(json.loads(payload))
    return data

In [3]:
# Change the process working directory; the relative paths used later in this
# notebook ("datasets/...", "etc/...") resolve against it.
# NOTE(review): this is a hard-coded, user-specific absolute path, so the
# notebook will fail on any machine where it does not exist.
os.chdir("/home/nsw0311/nas_storage")

# Target vocabulary size handed to BpeTrainer in the training cell.
vocab_size = 24000

# load data

In [4]:
def processing(text):
    """Return ``text`` with leading and trailing whitespace removed.

    The regex clean-up steps listed in the body are commented out, so the
    only transformation currently applied is ``str.strip``.

    Args:
        text: Input string.

    Returns:
        The stripped string.
    """
    # Disabled normalization steps, kept for reference:
    #     text = re.sub(r' +', r' ', text.strip())
    #     text = re.sub(r'(.{8,}?)\1+', r'\1', text)
    #     text = re.sub(r'[^ ㄱ-ㅎㅏ-ㅣ가-힣A-Za-z0-9~!@#$%\^\&*\(\)_\+\-=\[\]{},\./<>\?]', r'', text)
    #     text = re.sub(r'http.+', r'<url>', text)
    cleaned = text.strip()
    return cleaned

In [5]:
# Load the dialog corpus. Despite the ".json" extension the file is parsed
# line by line by load_jsonl, i.e. one JSON document per line. The path is
# relative to the working directory set by os.chdir above.
data = load_jsonl("datasets/RG/dialog_data.json")

In [6]:
# Peek at the first record's "content" with its first 8 characters dropped,
# the same slice gen() yields for training.
# NOTE(review): what those 8 leading characters contain is not visible in
# this notebook — confirm the offset against the dataset format.
data[0]["content"][8:]

''

In [7]:
def gen():
    """Yield one training string per record of the module-level ``data``.

    Each yielded value is the record's ``"content"`` entry with its first
    8 characters sliced off.
    NOTE(review): presumably those 8 characters are a fixed-width prefix that
    should not be learned by the tokenizer — confirm against the dataset.
    """
    yield from (record["content"][8:] for record in data)

In [8]:
# Special tokens, in the order they are later passed to the BPE trainer.
# The encode output recorded further down shows ids that match the list
# position here (e.g. "<sep>" -> 3, "<user1>" -> 15), so do not reorder.
user_defined_symbols = [
    "<pad>",
    "<unk>",
    "<cls>",
    "<sep>",
    "<mask>",
    "<bos>",
    "<eos>",
    "<tsep>",
    "<name>",
    "<url>",
    "<file>",
    "<image>",
    "<video>",
    "<location>",
    # Ten numbered "<userN>" tokens followed by ten numbered "<unkN>" tokens.
    *[f"<user{i}>" for i in range(10)],
    *[f"<unk{i}>" for i in range(10)],
]

# Numbered "<unusedN>" placeholder tokens (presumably reserved so tokens can
# be assigned later without resizing the vocabulary — confirm).
unused_token_num = 100
unused_list = [f"<unused{i}>" for i in range(unused_token_num)]

# Full special-token list given to the trainer: named tokens first, then the
# unused placeholders.
whole_user_defined_symbols = [*user_defined_symbols, *unused_list]

pprint(user_defined_symbols)

['<pad>',
 '<unk>',
 '<cls>',
 '<sep>',
 '<mask>',
 '<bos>',
 '<eos>',
 '<tsep>',
 '<name>',
 '<url>',
 '<file>',
 '<image>',
 '<video>',
 '<location>',
 '<user0>',
 '<user1>',
 '<user2>',
 '<user3>',
 '<user4>',
 '<user5>',
 '<user6>',
 '<user7>',
 '<user8>',
 '<user9>',
 '<unk0>',
 '<unk1>',
 '<unk2>',
 '<unk3>',
 '<unk4>',
 '<unk5>',
 '<unk6>',
 '<unk7>',
 '<unk8>',
 '<unk9>']


# train

In [9]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Create a tokenizer around an untrained BPE model (no vocab or merges are
# passed); "<unk>" is declared as the model's unknown token.
gpt_tokenizer = Tokenizer(BPE(unk_token="<unk>"))

In [10]:
from tokenizers import normalizers
from tokenizers.normalizers import NFKC, BertNormalizer


# First step: Unicode NFKC normalization.
n1 = NFKC()
# Second step: BertNormalizer with every option switched off.
# NOTE(review): with all four flags False this step looks like a no-op —
# confirm whether it is still needed.
n2 = BertNormalizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

# Chain the two normalizers; n1 is applied before n2.
gpt_tokenizer.normalizer = normalizers.Sequence([n1, n2])

In [11]:
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Metaspace

# Metaspace pre-tokenization with an ASCII underscore as the replacement
# character and a prefix space added; the undecoded sample output recorded
# below shows "_" at the start of words.
# NOTE(review): a literal "_" in the corpus is presumably indistinguishable
# from this marker after pre-tokenization — confirm that is acceptable.
gpt_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [Metaspace(replacement="_", add_prefix_space=True)]
)

In [12]:
# Post-processing step skipped: no post-processor is configured for this tokenizer.

In [13]:
from tokenizers.trainers import BpeTrainer

# Configure BPE training with the target vocab_size; the special tokens passed
# here are the named tokens plus the "<unusedN>" placeholders.
trainer = BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=whole_user_defined_symbols,
)
# gen() returns a fresh one-shot generator over `data`; call gen() again if
# training has to be re-run.
gpt_tokenizer.train_from_iterator(gen(), trainer)






In [14]:
# Smoke test: encode a sample sentence, print its token ids, then decode them
# back. No decoder has been set on the tokenizer yet at this point, so the
# decoded text still shows the raw "_"-marked tokens.
output = gpt_tokenizer.encode("테스트용인데 잘 되는거같아?")
print(output.ids)

gpt_tokenizer.decode(output.ids)

[13461, 4173, 6645, 6516, 8324, 7211, 164]


'_테스트 용 인데 _잘 _되는거 같아 ?'

In [15]:
from tokenizers import decoders

# Attach a decoder and decode the ids from the previous cell again; the
# recorded output now reads as the original sentence.
# NOTE(review): BPEDecoder is given "_" as its suffix, although "_" is the
# Metaspace replacement character set on the pre-tokenizer (a word-start
# marker, not a suffix). decoders.Metaspace may be the closer match and the
# result may differ between tokenizers versions — confirm.
gpt_tokenizer.decoder = decoders.BPEDecoder(suffix="_")
gpt_tokenizer.decode(output.ids)

'테스트용인데 잘 되는거같아?'

# Convert to a transformers tokenizer and save

In [16]:
from transformers import GPT2TokenizerFast


# Wrap the trained `tokenizers` object in a transformers fast tokenizer so it
# can be saved and reloaded through the transformers API.
fast_tokenizer = GPT2TokenizerFast(tokenizer_object=gpt_tokenizer)

In [17]:
# Bind the standard special-token roles to tokens that were passed to the
# trainer as special tokens.
fast_tokenizer.pad_token = "<pad>"
fast_tokenizer.unk_token = "<unk>"
fast_tokenizer.cls_token = "<cls>"
fast_tokenizer.sep_token = "<sep>"
fast_tokenizer.mask_token = "<mask>"
fast_tokenizer.bos_token = "<bos>"
fast_tokenizer.eos_token = "<eos>"

# Register the named tokens as additional special tokens. The call's return
# value is the cell output (0 in the recorded run, presumably because every
# token is already in the trained vocabulary — confirm). The "<unusedN>"
# placeholders are not included in this list.
special_tokens_dict = {"additional_special_tokens": user_defined_symbols}
fast_tokenizer.add_special_tokens(special_tokens_dict)

0

In [18]:
# Round-trip check through the transformers wrapper: encode a sentence that
# contains special tokens ("<user1>", "<sep>"), print the ids, then print the
# decoded text.
e = fast_tokenizer.encode("<user1>'테스트용'으로 \"잘\" 되는지 보고이따<sep>")
print(e)
print(fast_tokenizer.decode(e))

[15, 10080, 22599, 4173, 140, 6534, 10861, 4369, 135, 14674, 6846, 10759, 3]
<user1> '테스트용'으로 "잘" 되는지 보고이따<sep>


In [19]:
# Save the tokenizer files under "etc/DialogBPE", relative to the working
# directory set by os.chdir earlier; the returned tuple of written file paths
# is the cell output.
fast_tokenizer.save_pretrained("etc/DialogBPE")

('etc/DialogBPE/tokenizer_config.json',
 'etc/DialogBPE/special_tokens_map.json',
 'etc/DialogBPE/vocab.json',
 'etc/DialogBPE/merges.txt',
 'etc/DialogBPE/added_tokens.json',
 'etc/DialogBPE/tokenizer.json')