In [1]:
import os
import numpy as np
import pandas as pd

from typing import List
from datasets import Dataset
from transformers import T5TokenizerFast
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make sure the folder the retrained tokenizer is written to exists.
os.makedirs("./tokenizer/sentencepiece", exist_ok=True)

# Base checkpoint whose tokenizer is loaded, and the model_max_length set on it.
model_id, max_len = "psyche/KoT5-summarization", 1024

# Read both cleaned splits, then stack them into one frame with a fresh index.
train_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../dataset/cleaned_train.csv", "../dataset/cleaned_dev.csv")
)
df = pd.concat((train_df, valid_df), ignore_index=True)

In [3]:
def train_tokenizer(df: pd.DataFrame, vocab_size: int = 32000, model_prefix: str = "t5_tokenizer") -> T5TokenizerFast:
    """Retrain the base checkpoint's tokenizer on the dialogue and summary text.

    Loads the tokenizer of the notebook-level ``model_id`` (with
    ``model_max_length`` set to the notebook-level ``max_len``), trains a new
    tokenizer of the same kind on the corpus with ``train_new_from_iterator``,
    and then registers the dataset's placeholder tokens as additional special
    tokens.

    Args:
        df: Frame providing the "dialogue" and "summary" columns used as the
            training corpus.
        vocab_size: Vocabulary size requested from the trainer. The special
            tokens below are registered after training.
        model_prefix: Not used by this implementation; kept so existing
            callers that pass it keep working.

    Returns:
        The retrained tokenizer with the additional special tokens registered.
    """
    # Empty CSV cells may come back from pandas as NaN (a float). Drop missing
    # values and coerce everything else to str so the trainer only ever
    # receives text, matching the str() coercion done in get_max_length.
    all_text = [
        str(text)
        for column in ("dialogue", "summary")
        for text in df[column].dropna()
    ]

    tokenizer = T5TokenizerFast.from_pretrained(model_id, model_max_length=max_len)
    tokenizer = tokenizer.train_new_from_iterator(all_text, vocab_size=vocab_size)

    # Control tokens plus the anonymisation placeholders (#Person1#, ...) that
    # are registered as special tokens on the retrained tokenizer.
    special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>", "<sep>",
                      "#Person1#", "#Person2#", "#Person3#", "#Person4#", "#Person5#",
                      "#Person6#", "#Person7#", "#PhoneNumber#", "#Address#", "#PassportNumber#",
                      "#CardNumber#", "#Email#", "#DateOfBirth#"]
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    return tokenizer

def get_max_length(df: pd.DataFrame, tokenizer: T5TokenizerFast, percentile: int = 95) -> int:
    """Return the token length at the given percentile over all texts.

    Every value of the "dialogue" column and then of the "summary" column is
    converted with ``str`` and encoded with ``tokenizer.encode``; the lengths
    of both columns are pooled into a single distribution.

    Args:
        df: Frame providing the "dialogue" and "summary" columns.
        tokenizer: Object whose ``encode`` method returns a sized sequence.
        percentile: Percentile of the pooled length distribution to report.

    Returns:
        The percentile value truncated to an ``int``.
    """
    def token_count(value) -> int:
        return len(tokenizer.encode(str(value)))

    pooled_lengths = []
    for column in ("dialogue", "summary"):
        pooled_lengths.extend(token_count(value) for value in df[column])

    cutoff = np.percentile(pooled_lengths, percentile)
    return int(cutoff)

def tokenize_function(examples: dict, tokenizer: T5TokenizerFast, max_length: int) -> dict:
    """Tokenize a batch of dialogue/summary rows for a sequence-to-sequence model.

    The dialogues are encoded as the model input and the summaries are passed
    as ``text_target``. Passing the summaries as the second positional
    argument would make the tokenizer treat them as ``text_pair`` and merge
    them into the same input sequence as the dialogue, instead of encoding
    them as targets.

    Args:
        examples: Batch mapping with "dialogue" and "summary" entries.
        tokenizer: Callable tokenizer that accepts ``text_target``.
        max_length: Length every sequence is padded and truncated to.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    return tokenizer(
        examples["dialogue"],
        text_target=examples["summary"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [4]:
# Retrain the tokenizer on the combined train + dev text.
tokenizer = train_tokenizer(df)

# Token length at the 95th percentile of all dialogues and summaries.
max_length = get_max_length(df, tokenizer, percentile=95)
print(f"Max length: {max_length}")

# Tokenize every row in batches; the tokenizer and length limit are forwarded
# to tokenize_function as keyword arguments.
dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
)

# Write the retrained tokenizer to disk; the returned tuple of file paths is
# left as the cell's displayed output.
tokenizer.save_pretrained("./tokenizer/sentencepiece")






Max length: 188


Map: 100%|██████████| 12956/12956 [00:00<00:00, 16374.21 examples/s]


('./tokenizer/sentencepiece/tokenizer_config.json',
 './tokenizer/sentencepiece/special_tokens_map.json',
 './tokenizer/sentencepiece/tokenizer.json')