# Credits

Original Notebook [here](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) 

# Import dependencies

In [46]:
import math
import transformers
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline, set_seed

print(transformers.__version__)

4.36.1


# Load dataset

In [23]:
# Use  Wikitext 2 dataset

full_datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

In [24]:
full_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [25]:
# Downsize dataset because I am on a CPU!
num_train = 400
num_test = 100
num_validation = 100

datasets = DatasetDict({
    "train": Dataset.from_dict({
            "text": full_datasets["train"]["text"][0:num_train]
        }),
    "test": Dataset.from_dict({
            "text": full_datasets["test"]["text"][0:num_test]
        }),
    "validation": Dataset.from_dict({
            "text": full_datasets["validation"]["text"][0:num_validation]
        })
})

In [26]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 400
    })
    test: Dataset({
        features: ['text'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 100
    })
})

# Causal language Model

In [27]:
model_checkpoint = "distilgpt2"

## Tokenizer

In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [29]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [30]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Map (num_proc=4):   0%|          | 0/400 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [31]:
tokenized_datasets["train"][1]

{'input_ids': [796, 569, 18354, 7496, 17740, 6711, 796, 220, 198],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [32]:
# This may be a little too much for the GPU memory so we take a reasonable block size
# block_size = tokenizer.model_max_length
block_size = 128

In [33]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [34]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/400 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [35]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 147
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 47
    })
})

In [36]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

' game and follows the " Nameless ", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven ". \n The game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II. While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers. Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries, along with Valkyria Chronicles II director Takeshi Oz'

## Explanation of group text function

```Python
examples = {
    "input_ids": [
        [101, 2054, 2003, 1037, 2158, 1012, 15, 102],
        [101, 1045, 2064, 2022, 1037, 2158, 1012, 102],
        # ... other input_ids lists
    ],
    "attention_mask": [
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        # ... other attention_mask lists
    ],
    # ... other keys in examples
}

block_size = 4

concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])

# total_length = 16

result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated_examples.items()
}

# result = {'input_ids': [[101, 2054, 2003, 1037],
#  [2158, 1012, 15, 102],
#  [101, 1045, 2064, 2022],
#  [1037, 2158, 1012, 102]],
# 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]}
```

## Model 

In [37]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [38]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01
)

In [39]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"]
)

In [40]:
train_output = trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,4.165716
2,No log,4.092966
3,No log,4.051461
4,No log,4.026987
5,No log,4.013767
6,No log,4.009618
7,No log,4.005503
8,No log,4.003449
9,No log,4.003416
10,No log,4.004277


In [41]:
train_output

TrainOutput(global_step=190, training_loss=3.708026765522204, metrics={'train_runtime': 1533.8826, 'train_samples_per_second': 0.958, 'train_steps_per_second': 0.124, 'total_flos': 48013277921280.0, 'train_loss': 3.708026765522204, 'epoch': 10.0})

In [42]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 54.83


# Save model

In [43]:
custom_model_name = "./distilgpt2-tutorial-model"

In [44]:
tokenizer.save_pretrained(custom_model_name)

('./distilgpt2-tutorial-model/tokenizer_config.json',
 './distilgpt2-tutorial-model/special_tokens_map.json',
 './distilgpt2-tutorial-model/vocab.json',
 './distilgpt2-tutorial-model/merges.txt',
 './distilgpt2-tutorial-model/added_tokens.json',
 './distilgpt2-tutorial-model/tokenizer.json')

In [45]:
model.save_pretrained(custom_model_name)

In [47]:
generator = pipeline("text-generation", model="./distilgpt2-test-model")

In [52]:
set_seed(10)
generator("Character designer Raita Honjou and composer Hitoshi Sakimoto,", max_length=200, num_return_sequences=7)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Character designer Raita Honjou and composer Hitoshi Sakimoto, in the design for the video game, published the video game adaptation of Resident Evil Revelations in September 2014.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'},
 {'generated_text': 'Character designer Raita Honjou and composer Hitoshi Sakimoto, who contributed the sound design, announced the final designs for the film on Friday (Jan. 18). "We had only very limited budget and could only confirm that, due to difficulties in planning, production direction, and the budget on the film we had not achieved it," said Honjou, the studio president and director of the film production team. "We had been doing all this for a while wh