In [13]:
from datasets import load_dataset, DatasetDict, Value
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig 
from transformers import GPT2Config
from transformers import Trainer, TrainingArguments
from torch.cuda import empty_cache

raw_datasets = load_dataset("tiny_shakespeare")

Found cached dataset tiny_shakespeare (/home/nguyenan1/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)
100%|██████████| 3/3 [00:00<00:00, 1676.83it/s]


In [14]:
def get_training_corpus():
    return (
        raw_datasets["train"][i : i + 1000]["text"]
        for i in range(0, len(raw_datasets["train"]), 1000)
    )
training_corpus = get_training_corpus()

In [21]:
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)







In [22]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
context_length = 128

def tokenize(element):

    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

Loading cached processed dataset at /home/nguyenan1/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e/cache-99e16b84bb2ae807.arrow
Loading cached processed dataset at /home/nguyenan1/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e/cache-6156f2fbbe37109b.arrow


In [23]:
config = GPT2Config(vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id)
model = GPT2LMHeadModel(config)

tokenizer.pad_token = tokenizer.eos_token


In [28]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    evaluation_strategy="steps",
    eval_steps=1_000,
    logging_steps=1_000,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    #save_steps=500,
    save_strategy="steps",
    fp16=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
empty_cache()
trainer.train()

100%|██████████| 72/72 [01:22<00:00,  1.15s/it]

{'train_runtime': 82.4748, 'train_samples_per_second': 85.808, 'train_steps_per_second': 0.873, 'train_loss': 5.947112189398871, 'epoch': 2.92}





TrainOutput(global_step=72, training_loss=5.947112189398871, metrics={'train_runtime': 82.4748, 'train_samples_per_second': 85.808, 'train_steps_per_second': 0.873, 'train_loss': 5.947112189398871, 'epoch': 2.92})