source: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb#scrollTo=IreSlFmlIrIm

In [1]:
import os

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
)
from datasets import load_dataset
import evaluate
import nltk
import numpy as np
import wandb

nltk.download("punkt", quiet=True)

2023-02-10 00:02:04.508811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-10 00:02:05.036762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib/python3.8/dist-packages/torch/lib:/usr/local/lib/python3.8/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-10 00:02:05.036804: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object 

True

In [2]:
ft_output_dir = os.getenv("HF_FINETUNE_OUTPUT_DIR")
checkpoint = "google/flan-t5-base"
model_name = checkpoint.split("/")[-1]
dataset_name = "xsum"
hub_model_id = f"{model_name}-{dataset_name}"
model_output_dir = os.path.join(ft_output_dir, hub_model_id)

os.environ["WANDB_PROJECT"] = hub_model_id

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.parallelize()

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
model.num_parameters() / 1e6  # param in millions

247.577856

In [5]:
model.get_memory_footprint() / 1e9  # GB

0.990311424

In [6]:
ds = load_dataset(dataset_name)
ds

  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [7]:
# tcut = 100
# vcut = 20
# ds["train"] = ds["train"].select(range(tcut))
# ds["validation"] = ds["validation"].select(range(vcut))

# ds["test"] = ds["test"].select(range(vcut))

In [8]:
example = ds["train"][0]
print(example["document"], "\n")
print(example["summary"])

The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but it is al

In [9]:
def preprocess(examples):
    model_input = tokenizer(
        examples["document"],
        padding=True,
        pad_to_multiple_of=8,
        truncation=True,
        max_length=1024,
    )
    model_input["labels"] = tokenizer(examples["summary"])["input_ids"]
    return model_input

In [10]:
tk_ds = ds.map(preprocess, batched=True).remove_columns(ds["train"].column_names)
tk_ds


  0%|          | 0/205 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11334
    })
})

In [11]:
rouge = evaluate.load("rouge")

In [12]:
# create baseline with lead_3 summary
lead_3_summary = [
    "\n".join(nltk.sent_tokenize(doc)[:3]) for doc in ds["validation"]["document"]
]
baseline_rouge = rouge.compute(
    predictions=lead_3_summary, references=ds["validation"]["summary"]
)
baseline_rouge

{'rouge1': 0.18462746453805412,
 'rouge2': 0.025211936030726384,
 'rougeL': 0.11982331294470741,
 'rougeLsum': 0.14510302298962025}

In [13]:
args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,
    bf16=True,  # TODO: bf16 vs fp16?
    gradient_accumulation_steps=4,
    predict_with_generate=True,
    save_strategy="epoch",
    load_best_model_at_end=True,
    hub_model_id=hub_model_id,
    report_to="wandb",
)


In [14]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result

In [15]:
collator = DataCollatorForSeq2Seq(tokenizer)


In [16]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tk_ds["train"],
    eval_dataset=tk_ds["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [17]:
trainer.train()

***** Running training *****
  Num examples = 204045
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 6376
  Number of trainable parameters = 247577856
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mlukaemon[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.1363, 'learning_rate': 4.607904642409034e-05, 'epoch': 0.08}
{'loss': 2.1042, 'learning_rate': 4.215809284818068e-05, 'epoch': 0.16}
{'loss': 2.094, 'learning_rate': 3.8237139272271016e-05, 'epoch': 0.24}
{'loss': 2.086, 'learning_rate': 3.431618569636136e-05, 'epoch': 0.31}
{'loss': 2.0754, 'learning_rate': 3.0395232120451694e-05, 'epoch': 0.39}
{'loss': 2.0572, 'learning_rate': 2.6474278544542037e-05, 'epoch': 0.47}
{'loss': 2.0505, 'learning_rate': 2.2553324968632372e-05, 'epoch': 0.55}
{'loss': 2.0495, 'learning_rate': 1.863237139272271e-05, 'epoch': 0.63}
{'loss': 2.0363, 'learning_rate': 1.471141781681305e-05, 'epoch': 0.71}
{'loss': 2.0447, 'learning_rate': 1.0790464240903388e-05, 'epoch': 0.78}
{'loss': 2.0346, 'learning_rate': 6.869510664993727e-06, 'epoch': 0.86}
{'loss': 2.04, 'learning_rate': 2.9485570890840656e-06, 'epoch': 0.94}


***** Running Evaluation *****
  Num examples = 11332
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_t

In [None]:
wandb.finish()

0,1
eval/loss,▁
eval/rouge1,▁
eval/rouge2,▁
eval/rougeL,▁
eval/rougeLsum,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▅▅▆▆▇███
train/global_step,▁▂▂▃▃▄▅▅▆▆▇███

0,1
eval/loss,1.55931
eval/rouge1,0.3738
eval/rouge2,0.15332
eval/rougeL,0.30589
eval/rougeLsum,0.30587
eval/runtime,1111.5346
eval/samples_per_second,10.195
eval/steps_per_second,1.275
train/epoch,1.0
train/global_step,6376.0
