In [1]:
import torch

In [2]:
# Hub identifier of the pretrained checkpoint; both the tokenizer and the model
# in the following cells are loaded from this name.
checkpoints = "openai-community/gpt2"

In [3]:
from datasets import load_dataset

# Load the short-jokes dataset; the bare `jokes` expression displays its splits
# and features as the cell output.
jokes = load_dataset("Maximofn/short-jokes-dataset")
jokes

DatasetDict({
    train: Dataset({
        features: ['ID', 'Joke'],
        num_rows: 231657
    })
})

In [4]:
# Keep only the first 1% of the "train" split. `select(range(n))` takes a
# contiguous prefix of the data, not a random sample
# (presumably chosen to keep the run short - confirm).
percent_of_train_dataset = 0.01
subset_dataset = jokes["train"].select(range(int(len(jokes["train"]) * percent_of_train_dataset)))
subset_dataset

Dataset({
    features: ['ID', 'Joke'],
    num_rows: 2316
})

In [5]:
# Split the raw subset 95% / 5% in its original order (shuffle=False).
# NOTE(review): `percent_of_train_dataset` is reused here with a different meaning
# than in the previous cell (train share of the subset, not subset share of the corpus).
# NOTE(review): in the cells visible here, `train_dataset` is only read for its
# column names and `eval_dataset` is never used; the split actually fed to the
# Trainer is recomputed later with an 80/20 ratio on the tokenized data.
percent_of_train_dataset = 0.95
split_dataset = subset_dataset.train_test_split(train_size=int(subset_dataset.num_rows * percent_of_train_dataset), seed=19, shuffle=False)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Size of the train set: {len(train_dataset)}. Size of the validation set: {len(eval_dataset)}")

Size of the train set: 2200. Size of the validation set: 116


In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the checkpoint. GPT-2 is a built-in
# architecture, so no repository-provided code has to be executed and
# `trust_remote_code` is deliberately left at its safe default.
tokenizer = AutoTokenizer.from_pretrained(checkpoints)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

In [7]:
from transformers import AutoModelForCausalLM

# Load the pretrained causal language model and make its config use the
# end-of-sequence id as the padding id.
model = AutoModelForCausalLM.from_pretrained(checkpoints)
model.config.pad_token_id = model.config.eos_token_id

In [8]:
# Remember the vocabulary size before any tokens are added; it is compared with
# the size after resizing a few cells below.
vocab_size = model.config.vocab_size
vocab_size

50257

In [9]:
new_tokens = ['<SJ>', '<EJ>']   # Start and end of joke tokens

# Register the two markers with the tokenizer and report how many were added.
num_added_tokens = tokenizer.add_tokens(new_tokens)
print(f"Added {num_added_tokens} tokens")

Added 2 tokens


In [10]:
# Resize the model's token embeddings to the tokenizer's current length so the
# newly added tokens are covered.
model.resize_token_embeddings(len(tokenizer))

# Read the size back from the config to confirm the growth against the value saved earlier.
new_vocab_size = model.config.vocab_size
print(f"Old vocab size: {vocab_size}. New vocab size: {new_vocab_size}. Added {new_vocab_size - vocab_size} tokens")

Old vocab size: 50257. New vocab size: 50259. Added 2 tokens


In [11]:
# Name of the dataset column holding the joke text.
joke_column = "Joke"

def format_joke(example):
    """Wrap the joke text of one example in the `<SJ>` / `<EJ>` markers.

    Args:
        example: mapping for a single row; must contain `joke_column` with a
            string value.

    Returns:
        The same `example` object, with `example[joke_column]` replaced by
        `'<SJ> ' + text + ' <EJ>'`.
    """
    # Read and write through `joke_column` so the column name lives in one place.
    # The row is deliberately updated in place and returned as-is: returning a
    # fresh dict holding every input column would hand columns listed in
    # `remove_columns` back to `Dataset.map`.
    example[joke_column] = '<SJ> ' + example[joke_column] + ' <EJ>'
    return example

In [12]:
# Every column except the joke text is scheduled for removal when the dataset is mapped.
remove_columns = list(filter(lambda name: name != joke_column, train_dataset.column_names))
remove_columns

['ID']

In [13]:
# Wrap every joke in the <SJ>/<EJ> markers and drop the columns listed in `remove_columns`.
# NOTE(review): the result is bound back to `subset_dataset`, so re-running this
# cell operates on the already-formatted data rather than on the raw subset.
subset_dataset = subset_dataset.map(format_joke, remove_columns=remove_columns)
subset_dataset

Dataset({
    features: ['Joke'],
    num_rows: 2316
})

In [14]:
# Inspect the first formatted example.
subset_dataset[0]

{'Joke': '<SJ> [me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking" <EJ>'}

In [15]:
def tokenize_function(examples):
    """Tokenize the joke texts of a batch.

    Args:
        examples: mapping that holds the joke strings under `joke_column`.

    Returns:
        The tokenizer's output for those texts.
    """
    joke_texts = examples[joke_column]
    encoded = tokenizer(joke_texts)
    return encoded

In [16]:
# Tokenize in batches and drop the raw text column, leaving only the tokenizer outputs.
# NOTE(review): `subset_dataset` is rebound again, so this cell cannot be re-run
# on its own once the text column has been removed.
subset_dataset = subset_dataset.map(tokenize_function, batched=True, remove_columns=[joke_column])
subset_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2316
})

In [17]:
# Split the tokenized data 80% / 20% in its original order. The split is computed
# once and both halves are taken from the same result, instead of running the
# identical split twice.
train_val_split = subset_dataset.train_test_split(train_size=int(subset_dataset.num_rows * 0.8), seed=19, shuffle=False)
subset_dataset_train = train_val_split["train"]
subset_dataset_val = train_val_split["test"]

In [18]:
from transformers import TrainingArguments

metric_name = "accuracy"  # not passed to TrainingArguments below
model_name = "GPT2-small-finetuned-Maximofn-short-jokes-dataset-casualLM"
output_dir = "./training_results"  # NOTE(review): not passed below - checkpoints are written under `model_name`
LR = 2e-5
BS_TRAIN = 8
BS_EVAL = 8
EPOCHS = 3
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 100
MAX_STEPS = 1000  # not passed below; the run length is set by EPOCHS

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, which `load_best_model_at_end` requires in order to pick a checkpoint.
training_args = TrainingArguments(
    output_dir=model_name,  # by default this also names the Hub repository used by push_to_hub
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BS_TRAIN,
    per_device_eval_batch_size=BS_EVAL,
    # Only `warmup_steps` is given: it overrides `warmup_ratio`, so passing both
    # (as before) left the ratio without effect and made the schedule ambiguous to read.
    warmup_steps=WARMUP_STEPS,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type="cosine",
    fp16=True,
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to="tensorboard",
)

In [19]:
from transformers import Trainer, DataCollatorForLanguageModeling

import inspect

# The tokenizer gained two tokens (<SJ>, <EJ>) and the model's embeddings were
# resized to match. Hand the tokenizer to the Trainer so it is saved (and pushed)
# next to every checkpoint; otherwise the stored model has no matching tokenizer.
# The keyword was renamed from `tokenizer` to `processing_class` in newer
# transformers releases, so pick whichever this installation accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=subset_dataset_train,
    eval_dataset=subset_dataset_val,
    # mlm=False -> causal language modelling collation
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    **{_tokenizer_kwarg: tokenizer},
)

In [20]:
# %load_ext tensorboard
# %tensorboard --logdir ./GPT2-small-finetuned-Maximofn-short-jokes-dataset-casualLM

In [21]:
# Run fine-tuning with the Trainer configured in the previous cells.
trainer.train()

  0%|          | 0/696 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 3.793056011199951, 'eval_runtime': 19.4156, 'eval_samples_per_second': 23.898, 'eval_steps_per_second': 2.987, 'epoch': 1.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 3.6969993114471436, 'eval_runtime': 19.4097, 'eval_samples_per_second': 23.906, 'eval_steps_per_second': 2.988, 'epoch': 2.0}
{'loss': 11.7253, 'grad_norm': 8.681195259094238, 'learning_rate': 5.245406611757882e-06, 'epoch': 2.16}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 3.7049918174743652, 'eval_runtime': 19.4128, 'eval_samples_per_second': 23.902, 'eval_steps_per_second': 2.988, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 809.279, 'train_samples_per_second': 6.865, 'train_steps_per_second': 0.86, 'train_loss': 9.440610162143049, 'epoch': 3.0}


TrainOutput(global_step=696, training_loss=9.440610162143049, metrics={'train_runtime': 809.279, 'train_samples_per_second': 6.865, 'train_steps_per_second': 0.86, 'total_flos': 112578080256000.0, 'train_loss': 9.440610162143049, 'epoch': 3.0})

In [22]:
import torch
import gc


def clear_hardwares():
    """Release memory cached by Python and PyTorch.

    Runs the Python garbage collector, clears the autocast cache and, when a
    CUDA device is available, empties the CUDA caching allocator and collects
    CUDA IPC memory. Safe to call on machines without CUDA.

    Returns:
        None.
    """
    # Collect unreachable Python objects first so the tensors they hold go back
    # to the allocator before the CUDA cache is emptied.
    gc.collect()
    torch.clear_autocast_cache()
    # The CUDA calls are guarded: `ipc_collect` initialises CUDA and raises when
    # no CUDA runtime/device is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


# NOTE(review): the helper is invoked twice back to back and the reason is not
# stated - presumably so memory that only becomes reclaimable after the first
# pass is released too. Confirm whether a single call is enough.
clear_hardwares()
clear_hardwares()

In [23]:
def generate_joke(prompt_text, max_new_tokens=256):
    """Generate a joke that continues `prompt_text`.

    The prompt is prefixed with the `<SJ>` start marker used during fine-tuning
    and generation stops when the model emits the `<EJ>` end marker (or after
    `max_new_tokens` new tokens).

    Args:
        prompt_text: beginning of the joke, as plain text.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded text of the full sequence (prompt plus continuation), with
        the marker tokens left in place.
    """
    text = f"<SJ> {prompt_text}"
    tokens = tokenizer(text, return_tensors="pt").to(model.device)
    # Look the end-of-joke id up directly instead of encoding the string and
    # taking the last piece.
    end_of_joke_id = tokenizer.convert_tokens_to_ids("<EJ>")
    with torch.no_grad():
        output = model.generate(
            **tokens,
            max_new_tokens=max_new_tokens,
            eos_token_id=end_of_joke_id,
            # Same value generate() previously fell back to on its own; passing
            # it explicitly avoids the "Setting `pad_token_id`" warning.
            pad_token_id=end_of_joke_id,
        )
    return tokenizer.decode(output[0], skip_special_tokens=False)

In [24]:
# Try the fine-tuned model on a sample prompt.
generate_joke("Why didn't the frog cross the road?")

Setting `pad_token_id` to `eos_token_id`:50258 for open-end generation.


"<SJ> Why didn't the frog cross the road? Because he was too big for it. <EJ>"