In [1]:
import math
import torch
import transformers
from datasets import load_dataset, concatenate_datasets

In [2]:
# https://huggingface.co/distilbert/distilgpt2
# https://huggingface.co/openai-community/gpt2/tree/main
model_id = "distilbert/distilgpt2"
tokenizer_id = model_id

# https://huggingface.co/datasets/salgara/Grimes_tales
dataset_id = "salgara/Grimes_tales"

In [3]:
dataset = load_dataset(dataset_id)["train"]
dataset

Dataset({
    features: ['Title', 'Story', 'Rating', 'Voters'],
    num_rows: 216
})

[HuggingFace Course: Tokenizers](https://huggingface.co/learn/nlp-course/en/chapter2/4)

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|endoftext|>'}

In [5]:
# A whitespace-before-the-word and a whitespace symbol
tokenizer.vocab["Ġ"]

220

In [6]:
tokenizer(" ")

{'input_ids': [220], 'attention_mask': [1]}

In [7]:
# A newline symbol
tokenizer.vocab["Ċ"]

198

In [8]:
tokenizer("\n")

{'input_ids': [198], 'attention_mask': [1]}

In [9]:
tokenizer.vocab

{'giving': 13992,
 'apore': 11656,
 'Ġ});': 14980,
 'Ġevaluating': 22232,
 'rition': 10168,
 'Ġpigs': 22333,
 'ĠPas': 17454,
 'Ġplanning': 5410,
 'ĠâĤ¬': 10432,
 'æĪ': 22755,
 'utions': 3508,
 'Ġobserve': 12414,
 'Ġuncover': 23658,
 'Besides': 23937,
 'Ġkidnapping': 25201,
 'ithe': 31470,
 '456': 29228,
 'Ġprogressives': 31778,
 'araoh': 33766,
 'Ġcoordinated': 22080,
 'Ġclarified': 28464,
 'Ġjung': 34799,
 'Ġhostilities': 39082,
 'ĠLondon': 3576,
 'android': 19411,
 'ĠGENERAL': 41877,
 'ĠSuperintendent': 34058,
 'ijn': 48848,
 'ils': 4487,
 '([': 26933,
 'grey': 49502,
 'Ġuncomfortable': 12916,
 'walking': 44065,
 'Ġshield': 7614,
 'ller': 6051,
 'ĠAin': 31899,
 'ĠAs': 1081,
 'ĠCelt': 16333,
 'Ġgeop': 30324,
 'oring': 3255,
 'ARK': 14175,
 'Ġpursue': 10660,
 'Ġâĺħ': 23883,
 'Ġaddition': 3090,
 'Ġnoun': 23227,
 'HE': 13909,
 'ĠSharon': 26918,
 'menu': 26272,
 'Ġbrushed': 31782,
 'Ġshowdown': 33338,
 'ruction': 2762,
 'ĠBret': 39420,
 'Ġspinach': 39129,
 'ĠHazel': 42805,
 'Ġjewelry': 22

In [10]:
# How many different tokens will the word "study" and derived get?
nonsense = "Study or not to study your studies? Studying is light, but not studying is darkness."
tokens = tokenizer.tokenize(nonsense)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token, Token ID")
for token, token_id in zip(tokens, token_ids):
    print(f"{token:<10} {token_id}")

# Read some more: https://discuss.huggingface.co/t/bpe-tokenizers-and-spaces-before-words/475/2

Token, Token ID
Study      39841
Ġor        393
Ġnot       407
Ġto        284
Ġstudy     2050
Ġyour      534
Ġstudies   3640
?          30
ĠStud      3604
ying       1112
Ġis        318
Ġlight     1657
,          11
Ġbut       475
Ġnot       407
Ġstudying  11065
Ġis        318
Ġdarkness  11854
.          13


In [None]:
token_ids = tokenizer(nonsense)["input_ids"]
tokenizer.decode(token_ids)

In [None]:
dataset["Story"][:3]

[Training a causal language model from scratch](https://huggingface.co/learn/nlp-course/en/chapter7/6)

In [None]:
tokenized1 = tokenizer(dataset["Story"][:3])
print(tokenized1.keys())
print("Length:", [len(x) for x in tokenized1["input_ids"]])
print("Total tokens:", sum(len(x) for x in tokenized1["input_ids"]))

[tokenizer() parameters](https://huggingface.co/docs/transformers/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.__call__)

In [None]:
tokenized2 = tokenizer(dataset["Story"][:3], truncation=True, max_length=500, return_overflowing_tokens=True)
print(tokenized2.keys())
print("Length:", [len(x) for x in tokenized2["input_ids"]])
print("Total tokens:", sum(len(x) for x in tokenized2["input_ids"]))

In [None]:
tokenized3 = tokenizer(dataset["Story"][:3], truncation=True, max_length=500, return_overflowing_tokens=True, padding="max_length")
print(tokenized3.keys())
print("Length:", [len(x) for x in tokenized3["input_ids"]])
print("Total tokens:", sum(len(x) for x in tokenized3["input_ids"]))

In [None]:
[y for x in tokenized1["input_ids"] for y in x] == [y for x in tokenized2["input_ids"] for y in x]

In [None]:
[y for x in tokenized1["input_ids"] for y in x] == [y for x in tokenized3["input_ids"] for y in x if y != tokenizer.pad_token_id]

In [None]:
res = tokenizer(
    dataset["Story"][:10],
    truncation=True,
    return_overflowing_tokens=True,
    padding="max_length",
    return_tensors="pt"
)
res

In [None]:
res["input_ids"][1]

In [None]:
res["attention_mask"][1]

In [None]:
# return_tensors="pt" doesn't work with .map()!
# https://discuss.huggingface.co/t/dataset-map-return-only-list-instead-torch-tensors/15767
# Use ds.set_format("pt", columns=["input_ids"], output_all_columns=True) after .map()

def tokenize_batch(examples):
    print("Number of examples:", len(examples["Story"]))

    res = tokenizer(
        examples["Story"],
        truncation=True,
        return_overflowing_tokens=True,
        padding="max_length",  # Defaults to the max length of the model
        return_tensors="pt"
    )
    print("Result shape:", res["input_ids"].shape)
    return res


In [None]:
dataset

In [None]:
# SPOILER ALERT! DO NOT SCROLL FURTHER DOWN! UNCOMMENT THE FOLLOWING LINE AND FIX THE ERROR!
# dataset.map(tokenize_batch, batched=True)

In [None]:
dataset["Story"][:3]

In [None]:
ds = dataset.train_test_split(test_size=0.2, seed=42)

# Use remove_columns to drop the columns that don't have the same number of rows as the tokenized columns
# https://discuss.huggingface.co/t/how-to-use-map-or-similar-when-one-row-is-mapped-to-multiple-rows/8374

train_dataset = ds["train"].map(tokenize_batch, remove_columns=dataset.column_names, batched=True)
train_dataset.set_format("pt", columns=["input_ids"], output_all_columns=True)
print(train_dataset)
test_dataset = ds["test"].map(tokenize_batch, remove_columns=dataset.column_names, batched=True)
test_dataset.set_format("pt", columns=["input_ids"], output_all_columns=True)
print(test_dataset)

In [None]:
train_dataset["input_ids"][:10]

In [None]:
train_dataset["input_ids"].shape

[HuggingFace: Causal language modeling](https://huggingface.co/docs/transformers/en/tasks/language_modeling)

[HuggingFace course: Fine-tune a pretrained model](https://huggingface.co/docs/transformers/en/training)

In [None]:
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device

In [None]:
model = transformers.AutoModelForCausalLM.from_pretrained(model_id).to(device)

In [None]:
train_dataset[:2]["input_ids"].shape

In [None]:
# We can send inputs shorter than the model's context lengths (1024)
res = model(train_dataset["input_ids"][0][:100].to(device))
tokenizer.decode(res.logits.argmax(dim=-1))

In [None]:
coll = data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = coll([train_dataset[:2]])

for key in out:
    print(f"{key} shape: {out[key].shape}")

assert torch.allclose(out["input_ids"], train_dataset[:2]["input_ids"])

# Note that labels are the same as input_ids
assert torch.allclose(out["input_ids"], out["labels"])

In [None]:
# https://huggingface.co/datasets/karpathy/tiny_shakespeare
shakespeare_dataset_id = "karpathy/tiny_shakespeare"

shakespeare = load_dataset(shakespeare_dataset_id)

def tokenize_batch_shakespeare(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        return_overflowing_tokens=True,
        padding="max_length",  # Defaults to the max length of the model
        return_tensors="pt"
    )

shakespeare_train = shakespeare["train"].map(tokenize_batch_shakespeare, remove_columns=["text"], batched=True)
shakespeare_train.set_format("pt", columns=["input_ids"], output_all_columns=True)
shakespeare_val = shakespeare["validation"].map(tokenize_batch_shakespeare, remove_columns=["text"], batched=True)
shakespeare_val.set_format("pt", columns=["input_ids"], output_all_columns=True)
shakespeare_test = shakespeare["test"].map(tokenize_batch_shakespeare, remove_columns=["text"], batched=True)
shakespeare_test.set_format("pt", columns=["input_ids"], output_all_columns=True)


In [None]:
all_train_datasets = concatenate_datasets([train_dataset, shakespeare_train])
all_test_datasets = concatenate_datasets([test_dataset, shakespeare_val, shakespeare_test])

all_train_datasets["input_ids"].shape

In [None]:
model

In [None]:
training_args = transformers.TrainingArguments(
    output_dir="shakespeare_grim_gpt2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    seed=42,
    save_total_limit=3,
    load_best_model_at_end=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.001,
    push_to_hub=False,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_datasets,
    eval_dataset=all_test_datasets,
    data_collator=data_collator,
)

steps_per_epoch = math.ceil(len(all_train_datasets) / training_args.per_device_train_batch_size)
print("Total number of training examples:", len(all_train_datasets))
print("Number of steps per epoch:", steps_per_epoch)
print("Total number of steps:", steps_per_epoch * training_args.num_train_epochs)

trainer.train()

In [None]:
# import math

# eval_results = trainer.evaluate()
# print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
generator = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

In [None]:
name = "Arthur"

In [None]:
def gen(prompt):
    print(generator(prompt, max_new_tokens=200)[0]["generated_text"])

In [None]:
gen(f"A long time ago there lived a king named {name} who was known for his love of outrageous wigs")

In [None]:
gen(f"There was once a peasant named {name} who owned a cat and was afraid of the big black wolf")