source: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb#scrollTo=IreSlFmlIrIm

In [1]:
import os

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)
from datasets import load_dataset
import evaluate
import nltk
import numpy as np
import wandb

nltk.download("punkt", quiet=True)

2023-02-09 10:49:55.615603: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-09 10:49:56.131713: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib/python3.8/dist-packages/torch/lib:/usr/local/lib/python3.8/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-09 10:49:56.131755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object 

True

In [2]:
ft_output_dir = os.getenv("HF_FINETUNE_OUTPUT_DIR")
checkpoint = "t5-base"
model_name = checkpoint.split("/")[-1]
dataset_name = "xsum"
hub_model_id = f"{model_name}-{dataset_name}"
model_output_dir = os.path.join(ft_output_dir, hub_model_id)

os.environ["WANDB_PROJECT"] = hub_model_id

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.parallelize()

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
model.num_parameters() / 1e6  # param in millions

222.903552

In [5]:
model.get_memory_footprint() / 1e9  # GB

0.891614208

In [6]:
ds = load_dataset(dataset_name)
ds

  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [7]:
tcut = 1000
vcut = 100
ds["train"] = ds["train"].select(range(tcut))
ds["validation"] = ds["validation"].select(range(vcut))

ds["test"] = ds["test"].select(range(vcut))

In [8]:
example = ds["train"][0]
print(example["document"], "\n")
print(example["summary"])

The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but it is al

In [9]:
def preprocess(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_input = tokenizer(
        inputs, max_length=4096, pad_to_multiple_of=8, truncation=True
    )
    model_input["labels"] = tokenizer(examples["summary"])["input_ids"]
    return model_input

In [10]:
tk_ds = ds.map(preprocess, batched=True).remove_columns(ds["train"].column_names)
tk_ds


  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [11]:
rouge = evaluate.load("rouge")

In [12]:
# create baseline with lead_3 summary
lead_3_summary = [
    "\n".join(nltk.sent_tokenize(doc)[:3]) for doc in ds["validation"]["document"]
]
baseline_rouge = rouge.compute(
    predictions=lead_3_summary, references=ds["validation"]["summary"]
)
baseline_rouge

{'rouge1': 0.18405139538681847,
 'rouge2': 0.031020388473231353,
 'rougeL': 0.11822066552852066,
 'rougeLsum': 0.14333582368209413}

In [13]:
args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    bf16=True,  # TODO: bf16 vs fp16?
    gradient_accumulation_steps=1,
    predict_with_generate=True,
    hub_model_id=hub_model_id,
    report_to="wandb",
)


In [14]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result

In [15]:
collator = DataCollatorForSeq2Seq(tokenizer)


In [16]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tk_ds["train"],
    eval_dataset=tk_ds["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [17]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 222903552
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mlukaemon[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500/config.json
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500/generation_config.json


{'loss': 2.376, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.5}


Model weights saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500/tokenizer_config.json
Special tokens file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500/special_tokens_map.json
Copy vocab file to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500/spiece.model
Saving model checkpoint to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000/config.json
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000/generation_config.json


{'loss': 2.2919, 'learning_rate': 2.5e-05, 'epoch': 1.0}


Model weights saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000/special_tokens_map.json
Copy vocab file to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000/spiece.model
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_

{'eval_loss': 1.9816639423370361, 'eval_rouge1': 0.33324764435370924, 'eval_rouge2': 0.12019595829311905, 'eval_rougeL': 0.2765138269059063, 'eval_rougeLsum': 0.276605534219733, 'eval_runtime': 18.2555, 'eval_samples_per_second': 5.478, 'eval_steps_per_second': 5.478, 'epoch': 1.0}


Saving model checkpoint to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500/config.json
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500/generation_config.json


{'loss': 1.9396, 'learning_rate': 1.25e-05, 'epoch': 1.5}


Model weights saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500/tokenizer_config.json
Special tokens file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500/special_tokens_map.json
Copy vocab file to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1500/spiece.model
Deleting older checkpoint [/workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000/config.json
Configuration saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000/generation_config.json


{'loss': 1.906, 'learning_rate': 0.0, 'epoch': 2.0}


Model weights saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000/special_tokens_map.json
Copy vocab file to /workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-2000/spiece.model
Deleting older checkpoint [/workspaces/seed/cache/hf_finetune/t5-base-xsum/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id":

{'eval_loss': 1.9849032163619995, 'eval_rouge1': 0.32457836952118846, 'eval_rouge2': 0.11521211821495334, 'eval_rougeL': 0.2675680741452409, 'eval_rougeLsum': 0.26696278971877474, 'eval_runtime': 18.5042, 'eval_samples_per_second': 5.404, 'eval_steps_per_second': 5.404, 'epoch': 2.0}
{'train_runtime': 239.7017, 'train_samples_per_second': 8.344, 'train_steps_per_second': 8.344, 'train_loss': 2.1283717956542967, 'epoch': 2.0}


TrainOutput(global_step=2000, training_loss=2.1283717956542967, metrics={'train_runtime': 239.7017, 'train_samples_per_second': 8.344, 'train_steps_per_second': 8.344, 'train_loss': 2.1283717956542967, 'epoch': 2.0})

In [18]:
wandb.finish()

VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▁█
eval/rouge1,█▁
eval/rouge2,█▁
eval/rougeL,█▁
eval/rougeLsum,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▃▃▆███
train/global_step,▁▃▃▆███

0,1
eval/loss,1.9849
eval/rouge1,0.32458
eval/rouge2,0.11521
eval/rougeL,0.26757
eval/rougeLsum,0.26696
eval/runtime,18.5042
eval/samples_per_second,5.404
eval/steps_per_second,5.404
train/epoch,2.0
train/global_step,2000.0
