In [1]:
import math
import torch
import transformers
from datasets import load_dataset, concatenate_datasets

In [2]:
# https://huggingface.co/distilbert/distilgpt2
# https://huggingface.co/openai-community/gpt2/tree/main
model_id = "distilbert/distilgpt2"
tokenizer_id = model_id

# https://huggingface.co/datasets/salgara/Grimes_tales
dataset_id = "salgara/Grimes_tales"

In [3]:
dataset = load_dataset(dataset_id)["train"]
dataset

Dataset({
    features: ['Title', 'Story', 'Rating', 'Voters'],
    num_rows: 216
})

[HuggingFace Course: Tokenizers](https://huggingface.co/learn/nlp-course/en/chapter2/4)

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|endoftext|>'}

In [5]:
# A whitespace-before-the-word and a whitespace symbol
tokenizer.vocab["Ġ"]

220

In [6]:
tokenizer(" ")

{'input_ids': [220], 'attention_mask': [1]}

In [7]:
# A newline symbol
tokenizer.vocab["Ċ"]

198

In [8]:
tokenizer("\n")

{'input_ids': [198], 'attention_mask': [1]}

In [9]:
tokenizer.vocab

{'ĠSunny': 32241,
 'Ġregarded': 11987,
 'Ġundermined': 31256,
 'ĠDAV': 42274,
 'Vi': 38432,
 'Ġchamp': 24092,
 'SM': 12310,
 'hops': 21936,
 'ĠVermont': 16033,
 'ĠBuy': 11763,
 'ctl': 34168,
 'ĠBoh': 44366,
 'Effective': 44831,
 'Ġcut': 2005,
 'dom': 3438,
 'Ġgoat': 26917,
 'Ġovers': 10753,
 'ĠOl': 6544,
 'oping': 15816,
 '302': 22709,
 'ĠP': 350,
 'rans': 26084,
 'Ġcongr': 19030,
 'Ġreinstated': 40685,
 'Medical': 37158,
 'Ġreversal': 27138,
 'Ġspecifying': 31577,
 'Ġlitres': 49622,
 'ĠSimple': 17427,
 'Ġstud': 941,
 'Ġmodules': 13103,
 '0000000000000000': 25645,
 'inated': 3898,
 'Ġsynd': 11150,
 'Ġ116': 18693,
 'liness': 26061,
 'pour': 48681,
 'Ġnausea': 32122,
 'Wow': 22017,
 'ayette': 27067,
 'laughter': 27815,
 'ĠTend': 48664,
 'Ġclub': 3430,
 'Ġframework': 9355,
 'ĠYork': 1971,
 'ĠGor': 19097,
 'Ġpressured': 32098,
 'Ġpick': 2298,
 'Ġoffer': 2897,
 'APP': 24805,
 'SAY': 27358,
 'icker': 15799,
 'UTERS': 14974,
 'Ġ122': 19409,
 'Ġcreatine': 49346,
 'ĠVariant': 38215,
 'isal': 28

In [10]:
# How many different tokens will the word "study" and derived get?
nonsense = "Study or not to study your studies? Studying is light, but not studying is darkness."
tokens = tokenizer.tokenize(nonsense)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token, Token ID")
for token, token_id in zip(tokens, token_ids):
    print(f"{token:<10} {token_id}")

# Read some more: https://discuss.huggingface.co/t/bpe-tokenizers-and-spaces-before-words/475/2

Token, Token ID
Study      39841
Ġor        393
Ġnot       407
Ġto        284
Ġstudy     2050
Ġyour      534
Ġstudies   3640
?          30
ĠStud      3604
ying       1112
Ġis        318
Ġlight     1657
,          11
Ġbut       475
Ġnot       407
Ġstudying  11065
Ġis        318
Ġdarkness  11854
.          13


In [11]:
token_ids = tokenizer(nonsense)["input_ids"]
tokenizer.decode(token_ids)

'Study or not to study your studies? Studying is light, but not studying is darkness.'

In [12]:
dataset["Story"][:3]

['In the old times, when it was still of some use to wish for the thing one wanted, there lived a King whose daughters were all handsome, but the youngest was so beautiful that the sun himself, who has seen so much, wondered each time he shone over her because of her beauty. Near the royal castle there was a great dark wood, and in the wood under an old linden-tree was a well; and when the day was hot, the King\'s daughter used to go forth into the wood and sit by the brink of the cool well, and if the time seemed long, she would take out a golden ball, and throw it up and catch it again, and this was her favourite pastime.\n\nNow it happened one day that the golden ball, instead of falling back into the maiden\'s little hand which had sent it aloft, dropped to the ground near the edge of the well and rolled in. The king\'s daughter followed it with her eyes as it sank, but the well was deep, so deep that the bottom could not be seen. Then she began to weep, and she wept and wept as if

[Training a causal language model from scratch](https://huggingface.co/learn/nlp-course/en/chapter7/6)

In [13]:
tokenized1 = tokenizer(dataset["Story"][:3])
print(tokenized1.keys())
print("Length:", [len(x) for x in tokenized1["input_ids"]])
print("Total tokens:", sum(len(x) for x in tokenized1["input_ids"]))

Token indices sequence length is longer than the specified maximum sequence length for this model (1743 > 1024). Running this sequence through the model will result in indexing errors


dict_keys(['input_ids', 'attention_mask'])
Length: [1743, 1264, 2298]
Total tokens: 5305


[tokenizer() parameters](https://huggingface.co/docs/transformers/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.__call__)

`return_overflowing_tokens=True` will split the long sequences into multiple shorter sequences.

In [14]:
tokenized2 = tokenizer(dataset["Story"][:3], truncation=True, max_length=500, return_overflowing_tokens=True)
print(tokenized2.keys())
print("Length:", [len(x) for x in tokenized2["input_ids"]])
print("Total tokens:", sum(len(x) for x in tokenized2["input_ids"]))
print(tokenized2["overflow_to_sample_mapping"])

dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])
Length: [500, 500, 500, 243, 500, 500, 264, 500, 500, 500, 500, 298]
Total tokens: 5305
[0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2]


In [15]:
tokenized3 = tokenizer(
    dataset["Story"][:3],
    truncation=True,
    max_length=500,
    return_overflowing_tokens=True,
    padding="max_length",
)
print(tokenized3.keys())
print("Length:", [len(x) for x in tokenized3["input_ids"]])
print("Total tokens:", sum(len(x) for x in tokenized3["input_ids"]))

dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])
Length: [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]
Total tokens: 6000


In [16]:
[y for x in tokenized1["input_ids"] for y in x] == [y for x in tokenized2["input_ids"] for y in x]

True

In [17]:
[y for x in tokenized1["input_ids"] for y in x] == [y for x in tokenized3["input_ids"] for y in x if y != tokenizer.pad_token_id]

True

In [18]:
res = tokenizer(
    dataset["Story"][:10],
    truncation=True,
    return_overflowing_tokens=True,
    padding="max_length",
    return_tensors="pt"
)
res

{'input_ids': tensor([[  818,   262,  1468,  ...,  2061,  8072,   198],
        [ 5832,   502,    30,  ..., 50256, 50256, 50256],
        [   32,  3797,   550,  ...,   530,  9392,   262],
        ...,
        [  401,   395, 14210,  ...,  3244,   262, 10614],
        [  373, 26322,  1143,  ..., 50256, 50256, 50256],
        [  464,  7540,  1752,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'overflow_to_sample_mapping': tensor([0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8,
        8, 8, 9])}

In [19]:
res["input_ids"][1]

tensor([ 5832,   502,    30,  ..., 50256, 50256, 50256])

In [20]:
res["attention_mask"][1]

tensor([1, 1, 1,  ..., 0, 0, 0])

In [21]:
# return_tensors="pt" doesn't work with .map()!
# https://discuss.huggingface.co/t/dataset-map-return-only-list-instead-torch-tensors/15767
# Use ds.set_format("pt", columns=["input_ids"], output_all_columns=True) after .map()

def tokenize_batch(examples):
    print("Number of examples:", len(examples["Story"]))

    res = tokenizer(
        examples["Story"],
        truncation=True,
        return_overflowing_tokens=True,
        padding="max_length",  # Defaults to the max length of the model
        return_tensors="pt"
    )
    print("Result shape:", res["input_ids"].shape)
    return res


In [22]:
dataset

Dataset({
    features: ['Title', 'Story', 'Rating', 'Voters'],
    num_rows: 216
})

In [23]:
# SPOILER ALERT! DO NOT SCROLL FURTHER DOWN! UNCOMMENT THE FOLLOWING LINE AND FIX THE ERROR!
# dataset.map(tokenize_batch, batched=True)  # Old columns have fewer rows than the new ones, use `remove_columns=dataset.column_names`

In [24]:
dataset["Story"][:3]

['In the old times, when it was still of some use to wish for the thing one wanted, there lived a King whose daughters were all handsome, but the youngest was so beautiful that the sun himself, who has seen so much, wondered each time he shone over her because of her beauty. Near the royal castle there was a great dark wood, and in the wood under an old linden-tree was a well; and when the day was hot, the King\'s daughter used to go forth into the wood and sit by the brink of the cool well, and if the time seemed long, she would take out a golden ball, and throw it up and catch it again, and this was her favourite pastime.\n\nNow it happened one day that the golden ball, instead of falling back into the maiden\'s little hand which had sent it aloft, dropped to the ground near the edge of the well and rolled in. The king\'s daughter followed it with her eyes as it sank, but the well was deep, so deep that the bottom could not be seen. Then she began to weep, and she wept and wept as if

In [25]:
ds = dataset.train_test_split(test_size=0.2, seed=42)

# Use remove_columns to drop the columns that don't have the same number of rows as the tokenized columns
# https://discuss.huggingface.co/t/how-to-use-map-or-similar-when-one-row-is-mapped-to-multiple-rows/8374

train_dataset = ds["train"].map(tokenize_batch, remove_columns=dataset.column_names, batched=True)
train_dataset.set_format("pt", columns=["input_ids"], output_all_columns=True)
print(train_dataset)
test_dataset = ds["test"].map(tokenize_batch, remove_columns=dataset.column_names, batched=True)
test_dataset.set_format("pt", columns=["input_ids"], output_all_columns=True)
print(test_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
    num_rows: 369
})
Dataset({
    features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
    num_rows: 90
})


In [26]:
train_dataset["input_ids"][:10]

tensor([[   32,  3595,  4898,  ...,   290,   750,   407],
        [ 2328,  5223,   546,  ...,   257,  4320,    11],
        [  475,  1115,  5527,  ..., 50256, 50256, 50256],
        ...,
        [   11,   290,   788,  ..., 50256, 50256, 50256],
        [   32,  1499,   805,  ..., 50256, 50256, 50256],
        [ 1858,   373,  1752,  ..., 50256, 50256, 50256]])

In [27]:
train_dataset["input_ids"].shape

torch.Size([369, 1024])

[HuggingFace: Causal language modeling](https://huggingface.co/docs/transformers/en/tasks/language_modeling)

[HuggingFace course: Fine-tune a pretrained model](https://huggingface.co/docs/transformers/en/training)

In [28]:
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device

device(type='cuda')

In [29]:
model = transformers.AutoModelForCausalLM.from_pretrained(model_id).to(device)

In [30]:
train_dataset[:2]["input_ids"].shape

torch.Size([2, 1024])

In [31]:
tokenizer.decode(train_dataset["input_ids"][0][:100])

'A poor wood-cutter lived with his wife and three daughters in a little hut on the edge of a lonely forest. One morning as he was about to go to his work, he said to his wife, "Let my dinner be brought into the forest to me by my eldest daughter, or I shall never get my work done, and in order that she may not miss her way," he added, "I will take a bag of millet with me and strew the seeds on the'

In [32]:
# We can send inputs shorter than the model's context lengths (1024)
res = model(train_dataset["input_ids"][0][:100].to(device))
tokenizer.decode(res.logits.argmax(dim=-1))

' The manworkingburningter, in his wife and two children in the small house in the outskirts of the forest hill.\n day, he was walking to leave to work house, he saw, himself wife, "I me wife be dinner to the house." be." the wife son." who to will be be my dinner done." and I the to I will be be the work of and said. "I will not my few of woodlet and me and putow it milk of the ground'

In [33]:
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator([train_dataset[:2]])

for key in out:
    print(f"{key} shape: {out[key].shape}")

assert torch.allclose(out["input_ids"], train_dataset[:2]["input_ids"])

# Note that labels are the same as input_ids
assert torch.allclose(out["input_ids"], out["labels"])

input_ids shape: torch.Size([1, 2, 1024])
attention_mask shape: torch.Size([1, 2, 1024])
overflow_to_sample_mapping shape: torch.Size([1, 2])
labels shape: torch.Size([1, 2, 1024])


In [34]:
# https://huggingface.co/datasets/karpathy/tiny_shakespeare
shakespeare_dataset_id = "karpathy/tiny_shakespeare"

shakespeare = load_dataset(shakespeare_dataset_id)

def tokenize_batch_shakespeare(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        return_overflowing_tokens=True,
        padding="max_length",  # Defaults to the max length of the model
        return_tensors="pt"
    )

shakespeare_train = shakespeare["train"].map(tokenize_batch_shakespeare, remove_columns=["text"], batched=True)
shakespeare_train.set_format("pt", columns=["input_ids"], output_all_columns=True)
shakespeare_val = shakespeare["validation"].map(tokenize_batch_shakespeare, remove_columns=["text"], batched=True)
shakespeare_val.set_format("pt", columns=["input_ids"], output_all_columns=True)
shakespeare_test = shakespeare["test"].map(tokenize_batch_shakespeare, remove_columns=["text"], batched=True)
shakespeare_test.set_format("pt", columns=["input_ids"], output_all_columns=True)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [35]:
all_train_datasets = concatenate_datasets([train_dataset, shakespeare_train])
all_test_datasets = concatenate_datasets([test_dataset, shakespeare_val, shakespeare_test])

all_train_datasets["input_ids"].shape

torch.Size([664, 1024])

In [36]:
model = model.to("cpu")
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [37]:
training_args = transformers.TrainingArguments(
    num_train_epochs=5,
    output_dir="out/shakespeare_grim_gpt2",
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    bf16=True,
    seed=13,
    save_total_limit=3,
    load_best_model_at_end=True,
    
    learning_rate=5e-5,
    weight_decay=0.001,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_datasets,
    eval_dataset=all_test_datasets,
    data_collator=data_collator,
)

steps_per_epoch = math.ceil(len(all_train_datasets) / training_args.per_device_train_batch_size)
print("Total number of training examples:", len(all_train_datasets))
print("Number of steps per epoch:", steps_per_epoch)
print("Total number of steps:", steps_per_epoch * training_args.num_train_epochs)

trainer.train()

Total number of training examples: 664
Number of steps per epoch: 42
Total number of steps: 210




Epoch,Training Loss,Validation Loss
1,3.7477,3.361568
2,3.6035,3.314357
3,3.5406,3.290583
4,3.5028,3.283373
5,3.4865,3.277186


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=105, training_loss=3.576228550502232, metrics={'train_runtime': 46.6013, 'train_samples_per_second': 71.243, 'train_steps_per_second': 2.253, 'total_flos': 867505211965440.0, 'train_loss': 3.576228550502232, 'epoch': 5.0})

In [38]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")



Perplexity: 26.50


In [39]:
generator = transformers.pipeline("text-generation", model=model.to("cuda"), tokenizer=tokenizer, device=device)

In [40]:
name = "Arthur"

In [41]:
def gen(prompt):
    print(generator(prompt, max_new_tokens=200)[0]["generated_text"])

In [42]:
gen(f"A long time ago there lived a king named {name} who was known for his love of outrageous wigs")

A long time ago there lived a king named Arthur who was known for his love of outrageous wigs and gold, and had married a princess named Elizabeth who had been married long ago, who was known for his passion for animal breeding, and was known for his passion for animal breeding."
He was then known to be so generous and compassionate that he was so much willing to fight his wickedness, even when the princess went on to conquer the world at night."
"For three years," said the king, "we had a good time together."
"And when she went on to conquer her kingdom by the sword of the god," said the queen; "it was the most beautiful time it's been here."
The King's wife went on to the seaside in the hopes of rescuing the princess, but in the end the king was rescued and the king was given a reward to fight for the king."
After three days at sea, and when Princess Elizabeth found out, she ordered that Edward should not marry her, she became terrified of that and was ready for battle.



In [43]:
gen(f"There was once a peasant named {name} who owned a cat and was afraid of the big black wolf")

There was once a peasant named Arthur who owned a cat and was afraid of the big black wolf. He had been a hunter and hunted people by the wolf. But the wolf did not come to the nearest home and said that he saw a man with great wealth that was looking at it. When he saw the man, he went into an underground cellar which was a black hole. When he saw the man there stood a man and said "Go into, you must be as handsome as I am". At that time the man had become very jealous with the black hole. When he was inside, he wanted to get away and let it go. When he was out of town, he said, "Where am I? I haven't gone outside for a while." The man gave the man a book which was written over a wood and cut into a piece in pieces. Then he gave the man a book which was called A Thousand Years of Men. Then he took the book and put it through his fingers. Then the man thought about it again and said, "And with the knowledge of this
