In [None]:
import os
import numpy as np
import pandas as pd

from typing import List
from datasets import Dataset
from transformers import AutoTokenizer, PreTrainedTokenizerFast, T5TokenizerFast

In [None]:
# Make sure the folder the trained tokenizer is saved into exists.
os.makedirs("./tokenizer/sentencepiece", exist_ok=True)

# Passed as `model_max_length` when the base tokenizer is loaded.
max_len = 512

# Checkpoint handed to `from_pretrained` as the template for the new tokenizer.
# Alternative checkpoint kept for reference: "psyche/KoT5-summarization"
model_id = "philschmid/bart-large-cnn-samsum"

# Read both splits, then stack them into one frame with a fresh 0..n-1 index.
train_df, valid_df = map(
    pd.read_csv,
    ("../dataset/en_train.csv", "../dataset/en_dev.csv"),
)
df = pd.concat((train_df, valid_df), ignore_index=True)

In [None]:
# Tokens registered as additional special tokens on the trained tokenizer.
# NOTE(review): the groupings below are inferred from the token names
# (speaker tags, masked-field placeholders, a separator) — confirm against
# the dataset before relying on them.
special_tokens = (
    # Presumably speaker tags.
    [
        '#Person1#', '#Person2#', '#Person3#', '#Person4#',
        '#Person5#', '#Person6#', '#Person7#',
    ]
    # Presumably placeholders for masked fields.
    + [
        '#SSN#', '#Email#', '#Address#', '#Reaction#', '#CarNumber#',
        '#Movietitle#', '#DateOfBirth#', '#CardNumber#', '#PhoneNumber#',
        '#PassportNumber#',
    ]
    # Separator token.
    + ['<sep>']
)

# Numbered tokens "<extra_id_0>" through "<extra_id_499>".
extra_tokens = list(f"<extra_id_{i}>" for i in range(500))

In [None]:
def train_tokenizer(df: pd.DataFrame, vocab_size: int = 32000, model_prefix: str = "t5_tokenizer") -> T5TokenizerFast:
    """Train a new fast tokenizer on the dialogue and summary text of ``df``.

    The tokenizer loaded from the module-level ``model_id`` (with
    ``model_max_length`` set to the module-level ``max_len``) serves as the
    template; ``train_new_from_iterator`` then fits a fresh vocabulary on the
    corpus. Afterwards the module-level ``special_tokens`` and ``extra_tokens``
    are registered as additional special tokens.

    Args:
        df: Frame with ``dialogue`` and ``summary`` columns.
        vocab_size: Target vocabulary size passed to the trainer.
        model_prefix: Currently unused; kept so existing callers keep working.

    Returns:
        The newly trained tokenizer with the extra special tokens added.

    Raises:
        ValueError: If neither column contains any non-missing text.
    """
    # Missing cells come out of pandas as NaN (a float), which is not valid
    # training text, so they are dropped. Everything else is coerced with
    # str(), mirroring how get_max_length treats the same columns.
    corpus = [
        str(text)
        for column in ("dialogue", "summary")
        for text in df[column].dropna()
    ]
    if not corpus:
        # Fail before loading the base checkpoint: training on nothing would
        # only produce a degenerate vocabulary.
        raise ValueError("Cannot train a tokenizer: 'dialogue' and 'summary' contain no text.")

    # NOTE(review): the tokenizer class is fixed to T5TokenizerFast regardless
    # of which checkpoint model_id names — confirm the pairing is intended.
    base_tokenizer = T5TokenizerFast.from_pretrained(model_id, model_max_length=max_len)
    tokenizer = base_tokenizer.train_new_from_iterator(corpus, vocab_size=vocab_size)
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens + extra_tokens})

    return tokenizer

def get_max_length(df: pd.DataFrame, tokenizer: T5TokenizerFast, percentile: int = 95) -> int:
    """Return a percentile of encoded lengths over both text columns.

    Every value in ``dialogue`` and then ``summary`` is converted with
    ``str()``, encoded with ``tokenizer.encode``, and its length recorded.
    The requested percentile of the pooled lengths is truncated to ``int``.

    Args:
        df: Frame with ``dialogue`` and ``summary`` columns.
        tokenizer: Object exposing ``encode(text)``.
        percentile: Percentile forwarded to ``np.percentile``.

    Returns:
        The percentile of all encoded lengths, as an integer.
    """
    token_counts = []
    for column in ("dialogue", "summary"):
        for text in df[column]:
            encoded = tokenizer.encode(str(text))
            token_counts.append(len(encoded))
    cutoff = np.percentile(token_counts, percentile)
    return int(cutoff)

def tokenize_function(examples: dict, tokenizer: T5TokenizerFast, max_length: int) -> dict:
    """Encode a batch by calling the tokenizer on its dialogues and summaries.

    ``examples["dialogue"]`` and ``examples["summary"]`` are passed as the
    tokenizer's first and second positional arguments, padded to
    ``max_length`` and truncated to it.

    Args:
        examples: Batch mapping with ``dialogue`` and ``summary`` entries.
        tokenizer: Callable tokenizer.
        max_length: Length used for both padding and truncation.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    # NOTE(review): the summary goes in as the second positional argument, so
    # it is presumably encoded together with the dialogue as a pair rather than
    # as a separate target — confirm this is what downstream code expects.
    dialogues = examples["dialogue"]
    summaries = examples["summary"]
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(dialogues, summaries, **encode_options)

In [None]:
# Fit the tokenizer on the combined frame, then derive a length cutoff from it.
tokenizer = train_tokenizer(df)
max_length = get_max_length(df, tokenizer, percentile=95)
print(f"Max length: {max_length}")

# Encode the whole frame in batches; the tokenizer and cutoff are forwarded to
# tokenize_function as keyword arguments.
dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
)

# Write the trained tokenizer into the output folder.
tokenizer.save_pretrained("./tokenizer/sentencepiece")