In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset

In [5]:
# --- Model / tokenization settings ---
# Local checkpoint directory handed to AutoTokenizer.from_pretrained.
model_name_or_path = "/mnt/ml-data/huggingface/flan-ul2"
# Token budget used for both questions and answers when tokenizing
# (sequences are truncated, and by default padded, to this length).
MAX_LENGTH = 256

# --- Dataset location ---
# Directory passed to load_dataset, plus the per-split file names
# (presumably JSON Lines, judging by the extension).
DATA_ROOT_DIR = "/mnt/ml-data/data-synthesis/finetuning/"
train_data, test_data = "gsm8k_train.jsonl", "gsm8k_test.jsonl"

In [None]:
# Load the "train" and "test" splits from the files named in the config cell.
# Processed data is cached under ./cache (relative to the working directory).
dataset = load_dataset(
    path=DATA_ROOT_DIR,
    data_files=dict(train=train_data, test=test_data),
    cache_dir="./cache",
)

In [6]:
# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

# Carve a 20% validation split out of the training data. The guard keeps this
# cell idempotent: re-running it must not split the already-reduced "train"
# set a second time.
if "validation" not in dataset:
    dataset_train_val_split = dataset["train"].train_test_split(
        test_size=0.2,
        seed=SPLIT_SEED,
    )
    dataset["validation"] = dataset_train_val_split["test"]
    dataset["train"] = dataset_train_val_split["train"]

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Only borrow EOS as the pad token when the tokenizer has none of its own.
# Overriding an existing pad token makes pad_token_id equal eos_token_id, so
# any masking of labels by pad id would also hide every real end-of-sequence
# token from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples, padding="max_length"):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with "question" and "answer" entries, each a
            list of strings (the shape `Dataset.map(batched=True)` provides).
        padding: Padding strategy forwarded to the tokenizer for both the
            inputs and the labels.

    Returns:
        The tokenizer output for the questions, with an added "labels" entry:
        a tensor of answer token ids in which padding positions are replaced
        by -100.
    """
    model_inputs = tokenizer(
        examples["question"],
        max_length=MAX_LENGTH,
        padding=padding,
        truncation=True,
    )

    label_encodings = tokenizer(
        examples["answer"],
        max_length=MAX_LENGTH,
        padding=padding,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
    )
    # Mask padding by attention mask rather than by comparing against
    # pad_token_id: when the pad token shares an id with a real token (e.g.
    # pad_token set to eos_token), an id comparison would also erase the
    # genuine end-of-sequence token from the targets.
    is_padding = label_encodings["attention_mask"] == 0
    model_inputs["labels"] = label_encodings["input_ids"].masked_fill(is_padding, -100)
    return model_inputs

In [None]:
# Tokenize every split in batches. The original text columns are dropped so
# only the tokenizer outputs (and labels) remain, and the map cache is
# bypassed so edits to preprocess_function always take effect.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    num_proc=1,
    desc="Tokenizing dataset...",
)

In [None]:
# Index the row first so only one example is read, not the whole column.
dataset["train"][0]["question"]

In [None]:
# Index the row first so only one example's token ids are read, not the
# whole (padded) input_ids column.
tokenized_datasets["train"][0]["input_ids"]