Finetune on xsum training data and evaluate on test data

https://huggingface.co/docs/transformers/tasks/summarization (reference)

In [1]:
!pip install datasets
!pip install transformers
!pip install rouge_score
!pip install sentencepiece # need this for pegasus
!pip install evaluate
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.2-py3-none-a

### Load Dataset

In [None]:
import datasets
xsum = datasets.load_dataset("xsum")

In [2]:
import datasets
xsum = datasets.load_dataset("xsum", split="test")
# xsum = xsum.train_test_split(test_size=0.2)
# print(xsum)
# train_testvalid = xsum['train'].train_test_split(test_size=0.1)
# print(train_testvalid)
# test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# xsum = datasets.DatasetDict({
#     'train': train_testvalid['train'],
#     'test': test_valid['test'],
#     'valid': test_valid['train']})
xsum = xsum.train_test_split(test_size=0.2)
print(xsum)
train_testvalid = xsum['train'].train_test_split(test_size=0.1)
train_testvalid_1 = train_testvalid['test'].train_test_split(test_size=0.1)
test_valid = train_testvalid_1['test'].train_test_split(test_size=0.5)
xsum = datasets.DatasetDict({
    'train': train_testvalid_1['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

xsum

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 9067
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 2267
    })
})


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 816
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 46
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 45
    })
})

### Instantiate Models Instances

In [3]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import torch

model_name = "t5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
t5_tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=t5_tokenizer, model=model_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Finetuning

In [4]:
import evaluate
import numpy as np

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples['document']]
    model_inputs = t5_tokenizer(inputs, max_length=1024, truncation=True)

    labels = t5_tokenizer(text_target=examples['summary'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
  tokenizer = t5_tokenizer
  predictions, labels = eval_pred
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [5]:
tokenized_xsum = xsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [6]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

access_token = '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'

training_args = Seq2SeqTrainingArguments(
    output_dir="xsum_finetuned_on_train",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    fp16=True,
    push_to_hub=True,
    hub_token = access_token,
    hub_strategy="all_checkpoints",
)


trainer = Seq2SeqTrainer(
    model=t5_model,
    args=training_args,
    train_dataset=tokenized_xsum["train"],
    eval_dataset=tokenized_xsum["validation"],
    tokenizer=t5_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)],
)


trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.448921,0.217,0.0268,0.1568,0.1565,18.9111
2,No log,2.33585,0.241,0.041,0.1904,0.1903,18.4667
3,No log,2.313697,0.254,0.0509,0.2037,0.2041,18.6222


TrainOutput(global_step=153, training_loss=2.588308496412888, metrics={'train_runtime': 105.9647, 'train_samples_per_second': 23.102, 'train_steps_per_second': 1.444, 'total_flos': 2917079565189120.0, 'train_loss': 2.588308496412888, 'epoch': 3.0})

In [7]:
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

'https://huggingface.co/mpalaval/xsum_finetuned_on_train/tree/main/'

### Testing

In [8]:
from transformers import pipeline, T5ForConditionalGeneration, AutoTokenizer
import torch

model_name = "mpalaval/xsum_finetuned_on_train"
# summarizer = pipeline("summarization", model="mpalaval/exp2_xsum_model")
summarizer = T5ForConditionalGeneration.from_pretrained(model_name)
t5_tokenizer = AutoTokenizer.from_pretrained(model_name)
# device = "cuda" if torch.cuda.is_available() else "cpu"

preds, gts = [], []
counter = 0
prefix = "summarize: "
for idx in range(xsum['test'].num_rows):
  # summary = summarizer(prefix + xsum['test'][idx]['document'])
  batch = t5_tokenizer(prefix + xsum['test'][idx]['document'], truncation=True, padding="longest", return_tensors="pt")#.to(device)
  translated = summarizer.generate(**batch)
  tgt_text = t5_tokenizer.batch_decode(translated, skip_special_tokens=True)

  preds.append(tgt_text[0])
  gts.append(xsum['test'][idx]['summary'])
  counter += 1

  if counter % 10 == 0:
    print('done with test', counter)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



done with test 10
done with test 20
done with test 30
done with test 40


In [None]:
with open('predictions_exp1', 'wb') as f:
  pickle.dump(preds, f)
with open('gts_exp1', 'wb') as f:
  pickle.dump(gts, f)

In [9]:
result = rouge.compute(predictions=preds, references=gts, use_stemmer=True)
result

{'rouge1': 0.24112154108634146,
 'rouge2': 0.05586264696543851,
 'rougeL': 0.1909727945931663,
 'rougeLsum': 0.1922019053582461}