In [1]:
from tokenizers import Tokenizer
from datasets import load_dataset
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

In [2]:
# Load the 'train' split of the raw WikiText-2 configuration of the wikitext dataset.
dataset = load_dataset(
    path='wikitext',
    name='wikitext-2-raw-v1',
    split='train',
)

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [4]:
# Peek at the first ten rows; slicing the dataset gives a dict mapping the
# column name ('text') to a list of values, blank entries included.
head_rows = dataset[:10]
print(head_rows)

{'text': ['', ' = Valkyria Chronicles III = \n', '', ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game m

In [5]:
# Drop empty / whitespace-only entries: str.strip returns '' (falsy) for those,
# so filter() keeps only rows with real content. The kept strings are unmodified.
texts = list(filter(str.strip, dataset['text']))

In [8]:
# Peek at the first ten entries of the filtered list to confirm the blank rows are gone.
sample_texts = texts[:10]
print(sample_texts)

[' = Valkyria Chronicles III = \n', ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for

In [9]:
# Initialise the BPE tokenizer.
# unk_token is set explicitly so that symbols missing from the learned
# vocabulary have a fallback token; it matches the "[UNK]" entry reserved in
# the trainer's special tokens below and the WordPiece tokenizer's setup.
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Normalizer and pre-tokenizer: lowercase the text, then split it with the
# Whitespace pre-tokenizer before the BPE model is applied.
bpe_tokenizer.normalizer = Lowercase()
bpe_tokenizer.pre_tokenizer = Whitespace()

# Trainer configuration: target vocabulary size and the reserved special tokens.
bpe_trainer = BpeTrainer(
    vocab_size=30000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

# Train the tokenizer on the filtered corpus.
bpe_tokenizer.train_from_iterator(texts, trainer=bpe_trainer)

# Tokenize a sample sentence and show the resulting tokens.
bpe_output = bpe_tokenizer.encode("Hello, how are you?")
print("BPE 토크나이저 토큰:", bpe_output.tokens)




BPE 토크나이저 토큰: ['hel', 'lo', ',', 'how', 'are', 'you', '?']


In [10]:
# Trainer configuration: reserved special tokens and the target vocabulary size.
wp_special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
wp_trainer = WordPieceTrainer(
    special_tokens=wp_special_tokens,
    vocab_size=30000,
)

# Build the WordPiece tokenizer, using "[UNK]" as the unknown-token fallback.
wp_tokenizer = Tokenizer(
    WordPiece(unk_token="[UNK]")
)

# Attach the pre-tokenizer (Whitespace) and the normalizer (Lowercase).
wp_tokenizer.pre_tokenizer = Whitespace()
wp_tokenizer.normalizer = Lowercase()

# Learn the vocabulary from the filtered corpus.
wp_tokenizer.train_from_iterator(
    texts,
    trainer=wp_trainer,
)

# Encode a sample sentence and print its tokens.
wp_output = wp_tokenizer.encode("Hello, how are you?")
print("WordPiece 토크나이저 토큰:", wp_output.tokens)




WordPiece 토크나이저 토큰: ['hell', '##o', ',', 'how', 'are', 'you', '?']
