In [1]:
import os
import numpy as np
import pandas as pd

from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from transformers import PreTrainedTokenizerFast

In [2]:
os.makedirs("./tokenizer/bpe", exist_ok=True)

In [3]:
train_df = pd.read_csv("../dataset/cleaned_train.csv")
valid_df = pd.read_csv("../dataset/cleaned_dev.csv")
df = pd.concat([train_df, valid_df], ignore_index=True)

In [4]:
def train_tokenizer(df, vocab_size=30000, min_frequency=2):
    all_text = df['dialogue'].tolist() + df['summary'].tolist()
    
    tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = ByteLevel()
    
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>",
                        "#Person1#", "#Person2#", "#Person3#", "#Person4#", "#Person5#", 
                        "#Person6#", "#Person7#", "#PhoneNumber#", "#Address#", "#PassportNumber#", 
                        "#CardNumber#", "#Email#", "#DateOfBirth#"]
    )
    
    tokenizer.train_from_iterator(all_text, trainer=trainer)
    
    return tokenizer

In [5]:
def convert_to_pretrained_tokenizer(tokenizer):
    return PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>"
    )

In [6]:
def get_max_length(df, tokenizer, percentile=95):
    dialogue_lengths = [len(tokenizer.encode(str(text))) for text in df['dialogue']]
    summary_lengths = [len(tokenizer.encode(str(text))) for text in df['summary']]
    all_lengths = dialogue_lengths + summary_lengths
    return int(np.percentile(all_lengths, percentile))

In [7]:
def tokenize_function(examples, tokenizer, max_length):
    return tokenizer(
        examples["dialogue"],
        examples["summary"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

In [8]:
base_tokenizer = train_tokenizer(df)
pretrained_tokenizer = convert_to_pretrained_tokenizer(base_tokenizer)
max_length = get_max_length(df, pretrained_tokenizer, percentile=95)
print(max_length)

dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(
    lambda examples: tokenize_function(examples, pretrained_tokenizer, max_length),
    batched=True
)

pretrained_tokenizer.save_pretrained("./tokenizer/bpe/")

print(f"토크나이저 어휘 크기: {len(pretrained_tokenizer)}")
print(f"처리된 데이터셋 크기: {len(tokenized_datasets)}")




245


Map:   0%|          | 0/12956 [00:00<?, ? examples/s]

토크나이저 어휘 크기: 30000
처리된 데이터셋 크기: 12956


In [9]:
def sample_and_tokenize(df, tokenizer, n_samples=5):
    # 랜덤 샘플링
    sampled_df = df.sample(n=n_samples, random_state=42)
    
    for _, row in sampled_df.iterrows():
        dialogue = row['dialogue']
        summary = row['summary']
        
        # Dialogue 토큰화
        dialogue_tokens = tokenizer.encode(dialogue)
        
        # Summary 토큰화
        summary_tokens = tokenizer.encode(summary)
        
        print(f"Original Dialogue: {dialogue}")
        print(f"Tokenized Dialogue: {dialogue_tokens}")
        print(f"Decoded Dialogue: {tokenizer.decode(dialogue_tokens)}")
        print("\n")
        print(f"Original Summary: {summary}")
        print(f"Tokenized Summary: {summary_tokens}")
        print(f"Decoded Summary: {tokenizer.decode(summary_tokens)}")
        print("\n" + "="*50 + "\n")

In [10]:
sample_and_tokenize(df, pretrained_tokenizer, n_samples=3)

Original Dialogue: #Person1#: 아이고, 지난 3개월 동안 10파운드나 쪘어, 옷들이 하나도 안 맞아.
#Person2#: 나라면 불평하지 않을 거야, 너는 훨씬 더 잘 생겨 보여. 사실, 너는 또 5파운드를 더 찌워도 여전히 잘 보일 거야.
Tokenized Dialogue: [5, 139, 42, 6703, 28, 1307, 670, 2564, 719, 1045, 28638, 13218, 10116, 28, 13000, 8907, 388, 1676, 30, 138, 6, 139, 42, 19053, 20515, 1160, 572, 28, 974, 1609, 421, 541, 6752, 937, 30, 887, 28, 974, 924, 844, 7929, 421, 6360, 16902, 2425, 541, 3932, 572, 30]
Decoded Dialogue: #Person1# Ġ : ĠìķĦìĿ´ê³ł, Ġì§ĢëĤľ Ġ3 ê°ľìĽĶ ĠëıĻìķĪ Ġ10 íĮĮìļ´ëĵľëĤĺ Ġìª ĺìĸ´, Ġìĺ·ëĵ¤ìĿ´ ĠíķĺëĤĺëıĦ ĠìķĪ Ġë§ŀìķĦ. Ċ #Person2# Ġ : ĠëĤĺëĿ¼ë©´ Ġë¶Īíıīíķĺì§Ģ ĠìķĬìĿĦ Ġê±°ìķ¼, ĠëĦĪëĬĶ ĠíĽ¨ìĶ¬ ĠëįĶ Ġìŀĺ ĠìĥĿê²¨ Ġë³´ìĹ¬. ĠìĤ¬ìĭ¤, ĠëĦĪëĬĶ ĠëĺĲ Ġ5 íĮĮìļ´ëĵľë¥¼ ĠëįĶ Ġì°Į ìĽĮëıĦ ĠìĹ¬ìłĦíŀĪ Ġìŀĺ Ġë³´ìĿ¼ Ġê±°ìķ¼.


Original Summary: #Person1#은 체중이 증가했지만 #Person2#는 #Person1#이 잘 보인다고 생각한다.
Tokenized Summary: [5, 1345, 11986, 6728, 2282, 139, 6, 679, 160, 139, 5, 249, 541, 8381, 1433, 30]
Decoded Summary: #Person1# ĠìĿĢ Ġì²´ì¤ĳìĿ´ Ġì¦Ŀê°Ģ íĸĪì§Ģë§Į Ġ #Person2# Ġ