# Summarization with HuggingFace

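This notebook fine-tunes `t5-small` on the BillSum dataset, following the Hugging Face summarization task guide: https://huggingface.co/docs/transformers/tasks/summarization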

In [15]:
try:
    import transformers
except:
    !pip install -q transformers datasets evaluate rouge_score accelerate

    from datasets import load_dataset

    from transformers import AutoTokenizer
    from transformers import DataCollatorForSeq2Seq
    from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

    import evaluate


import time
import numpy as np

In [2]:
# Load the `ca_test` split of BillSum and hold out 20% of it for evaluation.
# The fixed seed makes the train/test partition (and therefore the reported
# metrics) reproducible across kernel restarts; without it every run
# shuffles the examples differently.
billsum = load_dataset("billsum", split="ca_test").train_test_split(
    test_size=0.2,
    seed=42,
)

Downloading builder script:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [3]:
# Uncomment to inspect the first raw training example.
#billsum["train"][0]

In [4]:
# Name of the pretrained checkpoint; later cells reuse it for the collator and model.
checkpoint = "t5-small"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [5]:
# Task prefix prepended to every input document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping that provides a "text" column (documents)
            and a "summary" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    # Prepend the task prefix to each document.
    prompted_docs = []
    for doc in examples["text"]:
        prompted_docs.append(prefix + doc)

    encoded = tokenizer(prompted_docs, max_length=1024, truncation=True)

    # `text_target` asks the tokenizer to encode the summaries as targets.
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = encoded_targets["input_ids"]

    return encoded

In [6]:
# Apply the preprocessing to every split, passing examples in batches.
tokenized_billsum = billsum.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [7]:
# Collator handed to the trainer below to assemble batches.
# NOTE(review): `model` receives `checkpoint`, which is the string "t5-small",
# not a loaded model object (the model is only created in a later cell).
# Confirm whether the collator should get the model instance instead; doing so
# would require loading the model before this cell.
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = checkpoint)

In [8]:
# ROUGE metric object used by `compute_metrics`.
rouge = evaluate.load(path="rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Positions
            equal to -100 are treated as padding in both arrays.

    Returns:
        dict mapping each metric name returned by ``rouge.compute`` plus
        ``"gen_len"`` (mean number of non-pad prediction tokens) to a value
        rounded to 4 decimal places.
    """
    predictions, labels = eval_pred

    # Some models return a tuple whose first element holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is the loss ignore-index, not a real token id, so it cannot be
    # decoded. Labels always contain it; predictions can contain it too when
    # batches of different lengths are padded together, so sanitize both.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Count only real tokens; sanitized padding no longer inflates the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [10]:
# Load the pretrained sequence-to-sequence weights for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
# Training hyper-parameters collected in one mapping so they are easy to scan.
training_config = {
    "output_dir": "my_awesome_billsum_model",
    "evaluation_strategy": "epoch",      # evaluate at the end of every epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.01,
    "save_total_limit": 3,
    "num_train_epochs": 4,
    "predict_with_generate": True,       # metrics are computed on generated text
    "fp16": True,
    "push_to_hub": False,
}

training_args = Seq2SeqTrainingArguments(**training_config)

In [14]:
# Wire together the model, data, collator and metric function for training.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_billsum["train"],
    "eval_dataset": tokenized_billsum["test"],
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}

trainer = Seq2SeqTrainer(**trainer_kwargs)

In [16]:
# Run fine-tuning and report how long it took in wall-clock seconds.
start_time = time.time()
trainer.train()
end_time = time.time()

elapsed_seconds = end_time - start_time
print("Time elapsed: {:.3f}s.".format(elapsed_seconds))

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.802133,0.1238,0.0351,0.1028,0.1028,19.0
2,No log,2.579862,0.1384,0.0488,0.1127,0.1128,19.0
3,No log,2.516688,0.1428,0.0532,0.1185,0.1185,19.0
4,No log,2.49836,0.1452,0.0565,0.1208,0.1212,19.0


Time elapsed: 286.982s.


In [19]:
# Run a final evaluation pass; the returned metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 2.4983601570129395,
 'eval_rouge1': 0.1452,
 'eval_rouge2': 0.0565,
 'eval_rougeL': 0.1208,
 'eval_rougeLsum': 0.1212,
 'eval_gen_len': 19.0,
 'eval_runtime': 21.1169,
 'eval_samples_per_second': 11.744,
 'eval_steps_per_second': 0.758,
 'epoch': 4.0}