In [1]:
# !pip install transformers[torch]
# !pip install transformers datasets evaluate

# Train with Trainer

## DataLoad

In [1]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()
eli5["train"][0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


{'q_id': 'tfakz',
 'title': 'What would happen if a wire were attached at the North Pole and connected it at the South Pole?',
 'selftext': 'Would it produce a current?  Would it do nothing at all?',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c4m4wue'],
 'answers.text': ["If we assume the wire is somehow fixed, such that the earth rotates through it, then yes it would produce current as it cuts the Earth's magnetic field (albeit not in the most efficient orientation for doing so).\n\nIf we assume it rotates with the earth (ie, it is in the same location at all times), then the simple answer is that it would not produce any current as the flux through its area never changes. However I'm not sure if the earth's field is stationary with respect to its axis of rotation, so there could still be some differential motion between the field and wire, resulting in current."],
 'answers.score': [5],
 'title_urls.url': [],
 'selftext_urls.url': [],
 'answers_urls.url': []}

## Tokenize

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [3]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [4]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2193 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2277 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1076 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1160 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1165 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1114 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1128 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1723 > 1024). Running this sequence through the model will result in indexing errors


## chunk to 128

In [5]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [6]:
length = len(tokenized_eli5["train"])
print(f"total size:{length}")
for i, d in enumerate(tokenized_eli5["train"]):
    if i == 5:
        break
    print(len(d["input_ids"]))

total size:4000
127
159
233
423
269


In [7]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
length = len(lm_dataset["train"])
print(f"total size:{length}")
for i, d in enumerate(lm_dataset["train"]):
    if i == 5:
        break
    print(len(d["input_ids"]))

total size:9064
128
128
128
128
128


## Collator

In [9]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Train

In [24]:
training_args = TrainingArguments(
    output_dir="custom_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.744712
2,3.759400,3.740784
3,3.759400,3.738842


TrainOutput(global_step=852, training_loss=3.7451888250073355, metrics={'train_runtime': 716.4535, 'train_samples_per_second': 37.954, 'train_steps_per_second': 1.189, 'total_flos': 888147655262208.0, 'train_loss': 3.7451888250073355, 'epoch': 3.0})

In [14]:
out = data_collator([lm_dataset["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [15]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 42.61


In [26]:
tokenizer.save_pretrained("custom_eli5_clm-model")

('custom_eli5_clm-model/tokenizer_config.json',
 'custom_eli5_clm-model/special_tokens_map.json',
 'custom_eli5_clm-model/vocab.json',
 'custom_eli5_clm-model/merges.txt',
 'custom_eli5_clm-model/added_tokens.json',
 'custom_eli5_clm-model/tokenizer.json')

## Test prompt

In [16]:
prompt = "Somatic hypermutation allows the immune system to"

### 1

In [27]:
from transformers import pipeline

generator = pipeline("text-generation", model="custom_eli5_clm-model")
generator(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Somatic hypermutation allows the immune system to attack the brain and use the technique to treat some serious diseases like cancer. \n\nThe other issue is that we see the same pathological pathways affecting the brain and the brain as well; the ones'}]

### 2

In [28]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM


tokenizer = AutoTokenizer.from_pretrained("custom_eli5_clm-model")
inputs = tokenizer(prompt, return_tensors="pt").input_ids
model = AutoModelForCausalLM.from_pretrained("custom_eli5_clm-model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Somatic hypermutation allows the immune system to make its own decisions. The immune system might have to adapt faster and further to overcome the stress of our environment, or so it would say. It's also possible that the environment is already in a phase, and not as yet. \n\n_URL_0_\n\n > What happens if the virus infects a human being?\n\nFirst off, since the organism is not physically stable it will likely undergo a reaction that doesn't change the environment that it is currently in.\n\n"]

# Without trainer

In [29]:
from torch.utils.data.dataloader import DataLoader

lm_dataset.set_format("torch")
train_dataloader = DataLoader(lm_dataset["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(lm_dataset["test"], batch_size=32)


In [30]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]


In [42]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.stack(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()


In [32]:
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [33]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [34]:
from accelerate import Accelerator

accelerator = Accelerator("fp16")

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)


In [35]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_training_steps,
)


In [36]:
from torch.nn import CrossEntropyLoss
import torch
loss_fct = CrossEntropyLoss(reduce=False)
def criterion(inputs, logits):
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    return loss_per_sample.sum()



In [43]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 10

model.train()
completed_steps = 0
torch.cuda.empty_cache()
for epoch in range(num_train_epochs):
    torch.cuda.empty_cache()
    for step, batch in tqdm(enumerate(train_dataloader, start=1), total=num_training_steps):
        logits = model(batch["input_ids"]).logits

        loss = criterion(batch["input_ids"], logits)
        if step % 10 == 0:
            accelerator.print(
                {
                    # "lr": get_lr(),
                    # "samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": loss * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained("custom2_eli5_clm-model", save_function=accelerator.save)
            # if accelerator.is_main_process:
            #     tokenizer.save_pretrained(output_dir)
            #     repo.push_to_hub(
            #         commit_message=f"Training in progress step {step}", blocking=False
            #     )


  0%|          | 0/284 [00:00<?, ?it/s]

{'steps': 1, 'loss/train': tensor(1063.5063, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 2, 'loss/train': tensor(1084.1747, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 3, 'loss/train': tensor(1065.4995, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 4, 'loss/train': tensor(1053.1116, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 6, 'loss/train': tensor(1078.5829, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 7, 'loss/train': tensor(1069.3918, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 8, 'loss/train': tensor(1052.0006, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 9, 'loss/train': tensor(1025.6552, device='cuda:0', grad_fn=<MulBackward0>)}
{'loss/eval': 3.992100238800049, 'perplexity': 54.16853713989258}
{'steps': 11, 'loss/train': tensor(1067.7668, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 12, 'loss/train': tensor(1036.7092, device='cuda:0', grad_fn=<MulBackward0>)}
{'steps': 13, 'loss/train': tensor(1035.2695, device='cuda:0',

In [45]:
tokenizer = AutoTokenizer.from_pretrained("custom_eli5_clm-model")
prompt = "Somatic hypermutation allows the immune system to"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

In [46]:
model.eval()
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [47]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Somatic hypermutation allows the immune system to detect the specific gene for the disease so that it can be detected. So here\'s what happens: When a person can detect the gene of the target and then see if it\'s mutated (e.g. the immune system is the person with the infection). This is called "intentionally defective." If you\'re being asked a question of a specific type of gene, an idiot can answer a simple question. This could be about the specific type of gene in a human\'s mind, the type']