In [5]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()

Found cached dataset eli5 (/Users/mg/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [7]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [8]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1338 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1468 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2060 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1060 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1245 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1672 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3471 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2277 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

Downloading pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()



  0%|          | 0/3345 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.9392, 'learning_rate': 1.7010463378176384e-05, 'epoch': 0.45}
{'loss': 3.8725, 'learning_rate': 1.4020926756352765e-05, 'epoch': 0.9}


  0%|          | 0/296 [00:00<?, ?it/s]

{'eval_loss': 3.739006996154785, 'eval_runtime': 488.0337, 'eval_samples_per_second': 4.842, 'eval_steps_per_second': 0.607, 'epoch': 1.0}
{'loss': 3.8149, 'learning_rate': 1.1031390134529149e-05, 'epoch': 1.35}
{'loss': 3.7736, 'learning_rate': 8.041853512705531e-06, 'epoch': 1.79}


  0%|          | 0/296 [00:00<?, ?it/s]

{'eval_loss': 3.722926616668701, 'eval_runtime': 500.2298, 'eval_samples_per_second': 4.724, 'eval_steps_per_second': 0.592, 'epoch': 2.0}
{'loss': 3.7565, 'learning_rate': 5.0523168908819146e-06, 'epoch': 2.24}
{'loss': 3.7355, 'learning_rate': 2.062780269058296e-06, 'epoch': 2.69}


  0%|          | 0/296 [00:00<?, ?it/s]

{'eval_loss': 3.719756841659546, 'eval_runtime': 500.6927, 'eval_samples_per_second': 4.719, 'eval_steps_per_second': 0.591, 'epoch': 3.0}
{'train_runtime': 21150.1117, 'train_samples_per_second': 1.264, 'train_steps_per_second': 0.158, 'train_loss': 3.806183363111921, 'epoch': 3.0}


TrainOutput(global_step=3345, training_loss=3.806183363111921, metrics={'train_runtime': 21150.1117, 'train_samples_per_second': 1.264, 'train_steps_per_second': 0.158, 'train_loss': 3.806183363111921, 'epoch': 3.0})

In [15]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/296 [00:00<?, ?it/s]

Perplexity: 41.25


In [16]:
prompt = "Somatic hypermutation allows the immune system to"

In [21]:
from transformers import AutoTokenizer

inputs = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Somatic hypermutation allows the immune system to respond rapidly without compromising immune function.\n\nBut I think there are other ways of preventing you from developing hypermutation which is the case with the case of TK-1 and the case of K-2. These are known as 'tok-knock'. This is called the'sorecase' effect. The TK-1 phenomenon is a natural phenomenon called'mismatch'. In this case you need a lot more to get started. It's the most common case"]