### Setup Development Environment

In [None]:
# %pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade

The purposes of some of these libraries are as follows:
- `pytesseract` is an optical character recognition (OCR) tool for python. That is, it will recognize and "read" the text embedded in images.
- `tensorboard` is a visualization tool provided with TensorFlow.
- `py7zr` is a library for reading 7z files. 7z is a file format with a high compression ratio.

In [None]:
# %pip install ipywidgets

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook. The stored token is read
# again further down (`HfFolder.get_token()`) and is needed for `trainer.push_to_hub()`.
notebook_login()

### Load and prepare the dataset

#### Load the `samsum` dataset from the Hugging Face library

In [2]:
from datasets import load_dataset

# Fetch the SAMSum dialogue-summarization corpus (all of its splits) from the Hub.
dataset = load_dataset("samsum")

# Report the number of examples in the two splits used by this notebook.
train_split, test_split = dataset["train"], dataset["test"]
print(f"Train dataset size: {len(train_split)}")
print(f"Test dataset size: {len(test_split)}")

Train dataset size: 14732
Test dataset size: 819


#### Let's check out an example from the dataset

In [3]:
# Display the first training record to see which fields every example carries.
dataset['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [4]:
from random import randrange

# Draw one training example uniformly at random (no seed is set, so the pick
# differs between runs) and print its dialogue followed by its reference summary.
train_split = dataset['train']
random_index = randrange(len(train_split))
sample = train_split[random_index]

print(f"Dialogue: \n{sample['dialogue']}\n--------------")
print(f"Summary: \n{sample['summary']}\n--------------")

Dialogue: 
Rory: yo.. mess?
Reed: lets go
Rory: your still in your bed arent you -_-
Reed: just a few more minutes
Rory: dude youve been sleeping since 5
Reed: SO??
Reed: sleep has no bounds XD
Rory: yeah okay
Rory: GET UPP
Reed: okay okay
Rory: -_-
--------------
Summary: 
Rory encourages Reed to get up from bed.
--------------


#### Convert inputs (text) to token IDs

[How to use AutoTokenizer?](https://github.com/huggingface/transformers/blob/main/src/transformers/models/auto/tokenization_auto.py)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the checkpoint to fine-tune; the model itself is loaded from the same id further down.
model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

Before we can start training, we need to preprocess our data. Abstractive Summarization is a text-generation task. Our model will take a text as input and generate a summary as output. To batch our data efficiently, we first need to know how long our inputs and outputs are once tokenized, so we measure the maximum number of tokens in the dialogues and in the summaries.

In [6]:
from datasets import concatenate_datasets
import numpy as np


def _tokenize_column(corpus, column):
    """Tokenize one text column of `corpus` (truncation on) and drop the raw text columns."""
    return corpus.map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )


# Lengths are measured over train + test together so the limits cover both splits.
# NOTE(review): truncation is enabled, so the reported source maximum (512 in the
# recorded output) is presumably the tokenizer's cap rather than the true longest
# dialogue — confirm if exact lengths matter.
full_corpus = concatenate_datasets([dataset["train"], dataset["test"]])

# Longest tokenized dialogue -> maximum encoder input length.
tokenized_inputs = _tokenize_column(full_corpus, "dialogue")
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized summary -> maximum label length.
tokenized_targets = _tokenize_column(full_corpus, "summary")
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

Max source length: 512
Max target length: 95


We preprocess our dataset before training and save it to disk.

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Turn a batch of raw examples into tokenized model inputs with labels.

    Args:
        sample: Batch mapping with a "dialogue" list and a "summary" list.
        padding: Padding strategy forwarded to the tokenizer for both inputs
            and targets. With "max_length", padded label positions are
            replaced by -100 so they are ignored by the loss.

    Returns:
        The tokenizer output for the prefixed dialogues, extended with a
        "labels" entry holding the tokenized summaries.
    """
    # T5 expects a task prefix in front of every input text.
    prompts = ["summarize: " + dialogue for dialogue in sample["dialogue"]]

    # Encoder inputs, truncated to the measured maximum source length.
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # Targets are tokenized through the `text_target` keyword argument.
    target_encoding = tokenizer(
        text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True
    )
    label_ids = target_encoding["input_ids"]

    # Only when padding to a fixed length here: mask pad tokens with -100 so the
    # loss skips them.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    # The key must be "labels" to match the model.forward signature.
    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split in batches and drop the raw columns, leaving only model-ready features.
raw_columns = ["dialogue", "summary", "id"]
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=raw_columns)
feature_names = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {feature_names}")

# Persist the processed splits so they can be reloaded later without re-tokenizing;
# note that the test split is stored under the "eval" directory name.
for split_name, target_dir in (("train", "data/train"), ("test", "data/eval")):
    tokenized_dataset[split_name].save_to_disk(target_dir)

### Fine-tune and evaluate FLAN T5

In [8]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [9]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id (same checkpoint the tokenizer was loaded from earlier)
model_id="google/flan-t5-base"

# load model from the hub; this `model` object is what the data collator and the
# trainer below receive
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)



We want to evaluate our model during training. The `Trainer` supports evaluation during training when we provide a `compute_metrics` function.
The most commonly used metric for evaluating summarization tasks is ROUGE (`rouge_score`), short for Recall-Oriented Understudy for Gisting Evaluation. This metric does not behave like standard accuracy: it compares a generated summary against a set of reference summaries.

In [10]:
# %pip install evaluate

In [11]:
import evaluate 
import nltk 
import numpy as np 
from nltk.tokenize import sent_tokenize

# `sent_tokenize` needs the Punkt sentence-tokenizer data. Older NLTK releases read
# the "punkt" resource, while NLTK >= 3.9 looks for "punkt_tab" instead. The install
# cell upgrades nltk, so fetch both to keep `postprocess_text` working on either
# version (already-present resources are simply reported as up to date).
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

# ROUGE scorer used by `compute_metrics`.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Each text is stripped of leading/trailing whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: Decoded model outputs (list of strings).
        labels: Decoded reference summaries (list of strings).

    Returns:
        A `(preds, labels)` tuple of lists holding the reformatted strings.
    """

    def _one_sentence_per_line(text):
        # Strip first, then split into sentences and put each on its own line.
        return "\n".join(sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_preds: `(predictions, labels)` pair handed over by the trainer.
            `predictions` may itself be a tuple, in which case its first
            element is used. Both arrays may contain -100 as a filler value.

    Returns:
        Dict with every score returned by `metric.compute`, scaled to
        percentages and rounded to 4 decimals, plus "gen_len": the mean number
        of non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded. It is used to mask labels,
    # and it can also appear in the predictions when generation batches of
    # different lengths are concatenated, so map it back to the pad token in BOTH
    # arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing (strip + one sentence per line for rougeLSum)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Express scores as percentages, rounded for readability.
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Average generated length, counting only non-padding positions.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /home/sanslab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Before we can start training, we need to create a DataCollator that will take care of padding our inputs and labels. We will use the `DataCollatorForSeq2Seq` from the 🤗 Transformers library.

In [12]:
from transformers import DataCollatorForSeq2Seq
 
# we want to ignore tokenizer pad token in the loss
# (-100 is the same filler value `preprocess_function` writes into padded label positions)
label_pad_token_id = -100
# Data collator: batches tokenized examples for the trainer, padding labels with -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8 # pad_to_multiple_of=8 means that the training will use dynamic padding to pad the input to a multiple of 8
)

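The last step is to define the hyperparameters (`TrainingArguments`) we want to use for our training. The `Trainer` integrates with the Hugging Face Hub and can automatically push our checkpoints, logs and metrics to a repository during training. In this notebook `push_to_hub=False`, so nothing is pushed automatically; instead, we push the final result explicitly once training has finished.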

In [13]:
# Dataset name; used below as the suffix of the output directory / Hub repository id.
dataset_id = "samsum"

In [14]:
from huggingface_hub import HfFolder # HfFolder is a helper class to interact with the local cache and the hub repository 
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
 
# Hugging Face repository id: "<model name without the organisation prefix>-<dataset id>",
# i.e. "flan-t5-base-samsum" for the ids defined in this notebook. It doubles as the
# local output directory.
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"
 
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True, # evaluate on generated text, which `compute_metrics` decodes
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints on disk
    evaluation_strategy="epoch", # NOTE(review): newer transformers releases name this `eval_strategy` — confirm against the installed version
    save_strategy="epoch",
    save_total_limit=2,
    # NOTE(review): no `metric_for_best_model` is set, so "best" presumably falls back
    # to the Trainer default (evaluation loss) — confirm; the line below is left disabled.
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False, # no automatic upload; the model is pushed explicitly after training
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(), # token stored by the earlier Hub login
)
 
# Create Trainer instance: trains on the tokenized train split, evaluates on the
# tokenized test split, and scores each evaluation with `compute_metrics` (ROUGE).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [15]:
# %pip install transformers[torch]

In [16]:
# Start training: runs the fine-tuning configured above (5 epochs, with an
# evaluation pass and a checkpoint at the end of every epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.4566,1.383415,46.9151,22.8925,39.1161,43.0414,17.449328
2,1.3394,1.374056,47.2947,23.5658,39.8063,43.487,17.181929
3,1.2786,1.36921,47.2141,23.4837,39.7822,43.2157,17.161172
4,1.2274,1.377577,47.6914,24.1243,40.1764,43.9611,17.404151
5,1.2028,1.377059,47.3328,23.5144,39.6487,43.4161,17.235653


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=9210, training_loss=1.3030041225570033, metrics={'train_runtime': 3193.8529, 'train_samples_per_second': 23.063, 'train_steps_per_second': 2.884, 'total_flos': 5.043922658131968e+16, 'train_loss': 1.3030041225570033, 'epoch': 5.0})

In [17]:
# Evaluate the model currently held by the trainer on the test split.
# NOTE(review): with `load_best_model_at_end=True` this is presumably the best
# checkpoint rather than the last epoch — the recorded eval_loss matches epoch 3.
trainer.evaluate()



{'eval_loss': 1.3692095279693604,
 'eval_rouge1': 47.2141,
 'eval_rouge2': 23.4837,
 'eval_rougeL': 39.7822,
 'eval_rougeLsum': 43.2157,
 'eval_gen_len': 17.16117216117216,
 'eval_runtime': 29.3305,
 'eval_samples_per_second': 27.923,
 'eval_steps_per_second': 3.512,
 'epoch': 5.0}

In [None]:
# Save our tokenizer and create model card
# (the tokenizer was not passed to the trainer, so it is written to the output
# directory explicitly here)
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
# (explicit upload, since `push_to_hub=False` disabled automatic pushes during training)
trainer.push_to_hub()

In [20]:
# Display the first test record (raw dialogue plus reference summary) for reference.
dataset['test'][0]

{'id': '13862856',
 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye",
 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."}

In [None]:
from transformers import pipeline
from random import randrange

import torch

# Use the first CUDA GPU when one is available and fall back to the CPU (-1)
# otherwise, so this cell also runs on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# load model and tokenizer from huggingface hub with pipeline
summarizer = pipeline("summarization", model="stevehoang9/flan-t5-base-samsum", device=pipeline_device)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

# the pipeline returns one result dict per input; show the generated summary text
print(f"flan-t5-base summary:\n{res[0]['summary_text']}")