In [1]:
import os
from dotenv import load_dotenv

# Load variables from .env file
load_dotenv()

# Access environment variables
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

## Directly using pretrained model using pipeline

In [2]:
from transformers import pipeline

In [3]:
pipe = pipeline("text-generation", model="distilbert/distilgpt2")

Device set to use mps:0


In [4]:
print(pipe("Today I will teach you"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Today I will teach you a few basic principles for getting rid of your phone for a few days.\n\n\n1. Phone the most important thing you need to do\n2. Do everything you need to\n3. Have the right to do'}]


There are two types of language modeling, causal and masked. GPT-2 is an example of a causal language model. \
Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on the left.

Will finetine distilbert/distilgpt2

Will be using rexarski/eli5_category dataset for finetuning

In [5]:
from datasets import load_dataset

In [25]:
eli5 = load_dataset("rexarski/eli5_category", split="train[:20]")

No config specified, defaulting to: eli5_category/default
Reusing dataset eli5_category (/Users/lucky/.cache/huggingface/datasets/rexarski___eli5_category/default/1.0.0/80106cc49322f1f5075e1387be4a5b74b95e0f56c40ff142b8999d0606aa1908)


In [26]:
eli5 = eli5.train_test_split(test_size=0.2)
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 16
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 4
    })
})

In [27]:
eli5['train']

Dataset({
    features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
    num_rows: 16
})

Let's now load a distilbert/distilgpt2 tokenizer

In [28]:
from transformers import AutoTokenizer

In [29]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [30]:
tokenizer

GPT2TokenizerFast(name_or_path='distilbert/distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [31]:
eli5 = eli5.flatten() # remove nestedness

In [32]:
eli5['train'][0]

{'q_id': '5ld14u',
 'title': 'Why are internet speeds in America so slow',
 'selftext': '',
 'category': 'Technology',
 'subreddit': 'explainlikeimfive',
 'title_urls': ['url'],
 'selftext_urls': ['url'],
 'answers.a_id': ['dburxe8',
  'dbursr8',
  'dbuuxnd',
  'dbuutes',
  'dbuy0vg',
  'dbuxrff',
  'dbuyci8',
  'dbuy3lo',
  'dbuxujv',
  'dbuxyrh',
  'dbuyev3',
  'dbuy1ya',
  'dbux4mj'],
 'answers.text': ["It mainly has to do with the monopoly that ISP's hold over different areas. Most people have one choice for their internet provider. It's not like you can switch to a different company if you aren't happy with their shit service. Another thing is that the capacity for fast internet exists it's just that they have no reason to give it to you. My city recently became a prospective city for google fiber and TWC immediately upgraded our 20mb/s connection (maximum available) to 100mb/s for free and also started offering 200mb/s and 300mb/s for not much extra. They're also doing everything

In [33]:
len(eli5['train'][0]['answers.text'])

13

let's join all list of strings into single string \
Joining a List of Strings: \
example:
```python
a = ['Hello', 'world', 'from', 'Python']
res = '***'.join(a)
res
```
result: 
```bash
'Hello***world***from***Python'
```

In [34]:
txt = " ".join(eli5['train'][9]['answers.text'])

In [35]:
tokenizer(txt)

Token indices sequence length is longer than the specified maximum sequence length for this model (1591 > 1024). Running this sequence through the model will result in indexing errors


{'input_ids': [2215, 345, 1011, 1165, 1263, 257, 26633, 11, 340, 23687, 534, 1658, 2522, 31111, 11, 6666, 340, 284, 599, 8597, 13, 770, 20406, 13, 383, 13349, 318, 3375, 546, 734, 1180, 1243, 13, 554, 262, 3670, 11, 356, 389, 1085, 284, 1975, 326, 262, 13349, 318, 3375, 546, 366, 301, 1186, 10813, 534, 1658, 2522, 31111, 11, 6666, 340, 284, 599, 8597, 1, 2102, 11, 287, 262, 2912, 6903, 11, 13349, 318, 3375, 546, 366, 1858, 389, 734, 1688, 366, 83, 29080, 1, 326, 534, 5422, 5983, 656, 13, 1881, 318, 262, 1658, 2522, 31111, 11, 981, 262, 584, 318, 262, 491, 4891, 64, 526, 8975, 340, 20406, 772, 618, 262, 4144, 1816, 866, 262, 366, 3506, 1, 12403, 13, 770, 4325, 749, 1690, 351, 6588, 515, 11758, 13, 383, 2793, 3833, 2354, 286, 262, 460, 14, 10985, 293, 290, 262, 4894, 286, 534, 1767, 1838, 6588, 17556, 284, 1282, 503, 286, 262, 20584, 14, 12924, 14, 1073, 365, 355, 345, 4144, 340, 13, 770, 318, 644, 5640, 262, 277, 6457, 1359, 1245, 13, 632, 338, 3608, 290, 25103, 355, 890, 355, 340, 4325

In [36]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [37]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

    

  table = cls._concat_blocks(blocks, axis=0)


    

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  table = cls._concat_blocks(blocks, axis=0)


In [38]:
tokenized_eli5['train']

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 16
})

This dataset contains the token sequences, but some of these are longer than the maximum input length for the model.

now use a second preprocessing function to

- concatenate all the sequences 
- split the concatenated sequences into shorter chunks defined by block_size, which should be both shorter than the maximum input length and short enough for your GPU RAM.

concatenate
```python
a = [[12,], [13, 15], [12]]
[ele for lst in a for ele in lst]
```
```bash
[12, 13, 15, 12]

```

In [87]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {}
    for k in examples.keys():
        concatenated_examples[k] = [ele for lst in examples[k] for ele in lst]
    total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [89]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [92]:
print(len(lm_dataset['train'][0]['input_ids']))

128


Now create a batch of examples using DataCollatorForLanguageModeling. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In the 🤗 Transformers library, a data collator is a helper that prepares a batch of data just before feeding it into the model during training or evaluation.

⸻

🧠 Why it’s needed

Models expect input tensors of the same length, but natural language sequences are different lengths.

So we need to:
- Pad shorter sequences
- Possibly create labels
- Stack inputs into tensors

Instead of doing this manually every time, we use a data collator.

In [95]:
from transformers import DataCollatorForLanguageModeling

In [96]:
tokenizer

GPT2TokenizerFast(name_or_path='distilbert/distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [97]:
tokenizer.pad_token = tokenizer.eos_token

In [98]:
tokenizer

GPT2TokenizerFast(name_or_path='distilbert/distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [99]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Load DistilGPT2 with AutoModelForCausalLM 

At this point, only three steps remain:

- Define your training hyperparameters in TrainingArguments. The only required parameter is output_dir which specifies where to save your model. You’ll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model).
- Pass the training arguments to Trainer along with the model, datasets, and data collator.
- Call train() to finetune your model.

In [101]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [109]:
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [110]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [116]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    num_train_epochs=1,
)

In [117]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,

)

  trainer = Trainer(


In [118]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.977015


TrainOutput(global_step=8, training_loss=3.9569272994995117, metrics={'train_runtime': 43.0267, 'train_samples_per_second': 1.441, 'train_steps_per_second': 0.186, 'total_flos': 2025049817088.0, 'train_loss': 3.9569272994995117, 'epoch': 1.0})

In [119]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 53.36


In [122]:
from transformers import pipeline

prompt = "Somatic hypermutation allows the immune system to"
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
generator(prompt)

Device set to use mps:0


[{'generated_text': "Somatic hypermutation allows the immune system to generate many different types of neurons. For example, it's necessary for a certain type of neuron to synthesize two different types of cells, the type they use, and the type, the type the"}]