In [1]:
# !pip install datasets
# !pip install sentencepiece
# !pip install transformers[torch]

## Load dataset

In [2]:
from datasets import load_dataset

# Fetch the "samsum" dataset; later cells read its 'dialogue' and 'summary' columns.
dataset = load_dataset("samsum")

In [3]:
# Keep only rows whose index is a multiple of 10, to make the experiment faster.
# NOTE: `dataset` is overwritten from itself, so re-running this cell alone subsamples
# the already-reduced data again; re-run the loading cell first to start fresh.
dataset = dataset.filter(lambda example, idx: idx % 10 == 0, with_indices=True)

In [4]:
# Inspect the training split after subsampling.
dataset["train"]

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 1474
})

In [5]:
def show_samples(dataset, num_samples=3, seed=42):
    """Print the summary and dialogue of a few shuffled training examples.

    Args:
        dataset: Mapping with a "train" entry that supports ``shuffle(seed=...)``
            followed by ``select(indices)``.
        num_samples: How many examples to print.
        seed: Seed forwarded to ``shuffle`` so the selection is repeatable.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked = shuffled_train.select(range(num_samples))
    for row in picked:
        summary_text = row["summary"]
        dialogue_text = row["dialogue"]
        # Leading newline separates consecutive examples in the printed output.
        print(f"\n'>> summary: {summary_text}'")
        print(f"'>> dialogue: {dialogue_text}'")

# Print three shuffled training examples using the default seed.
show_samples(dataset)



'>> summary: Isabelle texted Mason instead of her maid. Mason will ask her to was Isabelle's clothes.'
'>> dialogue: Isabella: Have you washed my clothes?
Mason: Was I even supposed to?
Isabella: Oh sorry, I was sending text to the maid
Mason: LOL. no worries
Mason: I havent seen him since morning btw
Isabella: He might have gone to marked with mom
Mason: ok I will ask her to wash your clothes when she comes back.'

'>> summary: Anca is upset about the Brazilian president election. Thiago is convinced that the new president won't make the situation better. Anca thinks that giving people more weapon and clearing the Amazon rainforest will make it even worse.'
'>> dialogue: Anca: There've been recently very few things that upset me as much as the Brazilian election
Jenny: I know, it seems quite insane
Thiago: it's not good, but I think people outside Brazil also don't really understand how it is here now
Ricardo: exactly
Anca: yeah, I've heard it many times - corruption, crime bla

## Tokenize

In [6]:
from transformers import AutoTokenizer

# The checkpoint name is reused later to load the matching seq2seq model.
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Tokenize one sample sentence to inspect the encoding the tokenizer produces.
inputs = tokenizer("I loved reading the Hunger Games!")
inputs


{'input_ids': [336, 259, 28387, 11807, 287, 62893, 295, 12507, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
# Map the ids back to token strings to see how the sentence was split.
tokenizer.convert_ids_to_tokens(inputs.input_ids)


['▁I', '▁', 'loved', '▁reading', '▁the', '▁Hung', 'er', '▁Games', '!', '</s>']

In [9]:
# Token budgets: dialogues are truncated to 512 tokens, summaries to 30 tokens.
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    """Tokenize a batch of dialogues and attach the tokenized summaries as labels.

    Args:
        examples: Batch mapping with "dialogue" and "summary" entries, as passed by
            ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the dialogues with an extra "labels" entry holding
        the ``input_ids`` of the tokenized summaries.
    """
    model_inputs = tokenizer(
        examples["dialogue"], max_length=max_input_length, truncation=True
    )
    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager for encoding target-side text.
    labels = tokenizer(
        text_target=examples["summary"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [10]:
# Apply preprocess_function in batches; the result gains the tokenizer outputs
# plus a "labels" column.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/82 [00:00<?, ? examples/s]



## Metric

In [11]:
# Toy prediction/reference pair used below to try out the ROUGE metric.
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [12]:
!pip install rouge_score



In [13]:
from datasets import load_metric

# NOTE(review): the warning printed by this cell says `trust_remote_code=True` will
# become mandatory for this call in the next major `datasets` release — revisit when
# upgrading.
rouge_score = load_metric("rouge")

  rouge_score = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [14]:
# Score the toy pair; the result is keyed by ROUGE variant
# (rouge1, rouge2, rougeL, rougeLsum).
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores


{'rouge1': AggregateScore(low=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923), mid=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923), high=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)),
 'rouge2': AggregateScore(low=Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272), mid=Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272), high=Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272)),
 'rougeL': AggregateScore(low=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923), mid=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923), high=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)),
 'rougeLsum': AggregateScore(low=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923), mid=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.92307692307

In [15]:
# Each entry exposes low/mid/high aggregates; `.mid` carries precision, recall and fmeasure.
scores["rouge1"].mid


Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)

## Make baseline

In [16]:
!python3 -m pip install nltk




In [17]:
import nltk

# Download the "punkt" data package; presumably required by sent_tokenize used
# below — verify.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line.

    Sentences are split with ``sent_tokenize``; shorter texts yield fewer lines.
    """
    sentences = sent_tokenize(text)
    leading = sentences[:3]
    return "\n".join(leading)


# Try the three-sentence baseline on one training dialogue.
print(three_sentence_summary(dataset["train"][1]["dialogue"]))


Lucas: Hey!
How was your day?
Demi: Hey there!


In [19]:
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: Split exposing "dialogue" and "summary" columns.
        metric: Object whose ``compute(predictions=..., references=...)`` does the scoring.

    Returns:
        Whatever ``metric.compute`` returns for the baseline predictions.
    """
    lead3_predictions = []
    for dialogue in dataset["dialogue"]:
        lead3_predictions.append(three_sentence_summary(dialogue))
    references = dataset["summary"]
    return metric.compute(predictions=lead3_predictions, references=references)


In [20]:
import pandas as pd

# Baseline ROUGE on the validation split, reported as mid F-measure in percent
# (two decimals) for each ROUGE variant.
score = evaluate_baseline(dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {
    rouge_name: round(score[rouge_name].mid.fmeasure * 100, 2)
    for rouge_name in rouge_names
}
rouge_dict


{'rouge1': 31.02, 'rouge2': 9.24, 'rougeL': 23.39, 'rougeLsum': 27.99}

## Load model

In [21]:
from transformers import AutoModelForSeq2SeqLM

# Load seq2seq weights from the same checkpoint the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [22]:
from huggingface_hub import notebook_login

# Interactive Hub login (renders a widget); presumably needed for the push_to_hub
# calls later in the notebook — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Number of full batches in the training split, so logging happens about once per epoch.
logging_steps = len(tokenized_datasets["train"]) // batch_size
# Strip the organisation prefix: "google/mt5-small" -> "mt5-small".
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # NOTE(review): the directory name says "amazon-en-es" although this notebook trains
    # on samsum; the later pipeline cell loads a Hub repo with this same name, so rename
    # both together if this is changed.
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Evaluate on generated sequences so compute_metrics can decode and score them.
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)


In [24]:
import numpy as np


def compute_metrics(eval_pred):
    """Decode generated ids and labels, then score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple whose first element holds the generated ids.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure in percent, rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding sentinel, not a vocabulary id; swap it for the pad token so
    # batch_decode does not fail on it. Done for predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode predictions and references back to text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline between sentences.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Report the mid aggregate's F-measure as a percentage.
    return {key: round(value.mid.fmeasure * 100, 4) for key, value in result.items()}


In [25]:
from transformers import DataCollatorForSeq2Seq

# Collator used to build batches for both the Trainer and the manual DataLoaders below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [27]:
# Drop the original raw columns (those present in `dataset["train"]`) so only the
# tokenizer outputs and labels remain — presumably because the collator cannot
# batch raw strings; confirm.
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)

In [30]:
# Raw (untokenized) training example, for reference.
dataset["train"][1]

{'id': '13727633',
 'dialogue': 'Lucas: Hey! How was your day?\r\nDemi: Hey there! \r\nDemi: It was pretty fine, actually, thank you!\r\nDemi: I just got promoted! :D\r\nLucas: Whoa! Great news!\r\nLucas: Congratulations!\r\nLucas: Such a success has to be celebrated.\r\nDemi: I agree! :D\r\nDemi: Tonight at Death & Co.?\r\nLucas: Sure!\r\nLucas: See you there at 10pm?\r\nDemi: Yeah! See you there! :D',
 'summary': 'Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.'}

In [28]:
# Collate the first two training examples to check the batch the collator builds.
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[   259,  78642,    267,    336,   6789,    345,  18007,    260,   1272,
            521,   3007,   2155,    291,    259,  74732,    267,  85719,    309,
            259,  78642,    267,    336,    277,   1578,   8448,    521,    259,
          82548,  16725,      1,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0],
        [ 469

In [33]:
from transformers import Seq2SeqTrainer

# Wire together the model, training arguments, tokenized splits, collator and the
# ROUGE-based compute_metrics defined earlier.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [34]:
# Run fine-tuning; `args` requests an evaluation at the end of every epoch.
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,12.421,4.176147,8.5214,1.3412,8.1063,8.2251
2,4.6683,2.83433,20.6485,7.1917,18.6741,19.4706
3,3.6666,2.561632,20.3673,6.1998,18.2531,19.0305
4,3.3157,2.500199,28.4326,11.0801,25.391,26.4882
5,3.1834,2.458581,29.0975,11.3058,26.0004,27.5342
6,3.0983,2.419072,31.5865,11.3633,27.6063,29.6726
7,3.0338,2.425838,31.845,11.9743,28.3534,29.8196
8,2.9805,2.421282,31.833,11.5704,28.3537,29.7517




TrainOutput(global_step=1480, training_loss=4.538251131934088, metrics={'train_runtime': 633.1691, 'train_samples_per_second': 18.624, 'train_steps_per_second': 2.337, 'total_flos': 4215989791088640.0, 'train_loss': 4.538251131934088, 'epoch': 8.0})

In [35]:
# Upload the result of the Trainer run to the Hub, tagged as a summarization model.
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

events.out.tfevents.1703292154.3bc8a2773ed0.7476.0:   0%|          | 0.00/9.94k [00:00<?, ?B/s]

'https://huggingface.co/naninya/mt5-small-finetuned-amazon-en-es/tree/main/'

## Without Trainer: custom training loop with Accelerate

In [36]:
# Switch the dataset output format to "torch" for the manual DataLoader-based loop below.
tokenized_datasets.set_format("torch")

In [37]:
# Reload from the original checkpoint so the manual loop does not continue from the
# weights fine-tuned by the Trainer run above.
# NOTE(review): `data_collator` was constructed with the previous `model` object and is
# not rebuilt here — confirm that is intended.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [38]:
from torch.utils.data import DataLoader

batch_size = 8
# Training batches are shuffled; evaluation batches keep dataset order.
# Both loaders use the seq2seq collator for batching.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)


In [39]:
from torch.optim import AdamW

# AdamW over all model parameters. Note the learning rate here (2e-5) differs from the
# Trainer run above (5.6e-5), so the two runs are not directly comparable.
optimizer = AdamW(model.parameters(), lr=2e-5)


In [40]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the returned objects
# replace the originals and are the ones used by the training loop.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)


In [41]:
from transformers import get_scheduler

num_train_epochs = 10
# Measured on the dataloader returned by accelerator.prepare(), since this cell runs
# after that call.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [42]:
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of lists where each string has been stripped and its
        sentences (split with ``nltk.sent_tokenize``) joined by newlines.
    """
    # First pass: trim surrounding whitespace on both lists.
    stripped = []
    for batch in (preds, labels):
        stripped.append([text.strip() for text in batch])

    # Second pass: one sentence per line, predictions first, then labels.
    preds, labels = (
        ["\n".join(nltk.sent_tokenize(text)) for text in batch] for batch in stripped
    )

    return preds, labels


In [43]:
from huggingface_hub import get_full_repo_name, create_repo

# NOTE(review): the repository name mentions "bert" and "squad", but this notebook
# fine-tunes mT5 for summarization — looks like a leftover name; confirm before reuse.
model_name = "test-bert-finetuned-squad-accelerate"
repo_name = get_full_repo_name(model_name)
create_repo(repo_name)

RepoUrl('https://huggingface.co/naninya/test-bert-finetuned-squad-accelerate', endpoint='https://huggingface.co', repo_type='model', repo_id='naninya/test-bert-finetuned-squad-accelerate')

In [44]:
from huggingface_hub import Repository

# Local working directory cloned from the Hub repo created above; the training loop
# saves checkpoints here and pushes them after each epoch.
output_dir = "results-mt5-finetuned-squad-accelerate"
repo = Repository(output_dir, clone_from=repo_name)


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/naninya/test-bert-finetuned-squad-accelerate into local empty directory.


In [45]:
from tqdm.auto import tqdm
import torch
import numpy as np

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation pass: generate summaries and accumulate ROUGE batch by batch ----
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            # Pad along the sequence dimension so tensors from different processes
            # have matching shapes before they are gathered.
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # -100 marks ignored label positions; replace it with the pad id so the
            # labels can be decoded.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # ---- metric: mid F-measure in percent for every ROUGE variant ----
    # NOTE(review): unlike compute_metrics used with the Trainer, no `use_stemmer=True`
    # is passed here, so scores from the two runs are not computed identically.
    result = rouge_score.compute()
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # ---- save & upload the checkpoint for this epoch ----
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer and pushes to the Hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )


  0%|          | 0/1850 [00:00<?, ?it/s]



Epoch 0: {'rouge1': 2.9722, 'rouge2': 0.1109, 'rougeL': 2.6671, 'rougeLsum': 2.9348}


Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


Epoch 1: {'rouge1': 7.0238, 'rouge2': 1.4853, 'rougeL': 6.7428, 'rougeLsum': 6.8547}




Epoch 2: {'rouge1': 7.3848, 'rouge2': 1.3495, 'rougeL': 6.992, 'rougeLsum': 7.2509}




Epoch 3: {'rouge1': 6.9088, 'rouge2': 1.0201, 'rougeL': 6.4183, 'rougeLsum': 6.7219}




Epoch 4: {'rouge1': 8.7614, 'rouge2': 1.7739, 'rougeL': 8.3529, 'rougeLsum': 8.5077}




Epoch 5: {'rouge1': 10.2545, 'rouge2': 2.0624, 'rougeL': 9.8427, 'rougeLsum': 10.0869}




Epoch 6: {'rouge1': 9.5893, 'rouge2': 1.9325, 'rougeL': 9.243, 'rougeLsum': 9.4872}




Epoch 7: {'rouge1': 9.3237, 'rouge2': 1.7473, 'rougeL': 8.9703, 'rougeLsum': 9.2366}




Epoch 8: {'rouge1': 9.1135, 'rouge2': 1.5758, 'rougeL': 8.8621, 'rougeLsum': 9.0814}




Epoch 9: {'rouge1': 9.1074, 'rouge2': 1.5758, 'rougeL': 8.8588, 'rougeLsum': 9.0798}


In [46]:
from transformers import pipeline

# This repo name matches the Trainer run's output_dir, i.e. it is the model uploaded by
# trainer.push_to_hub above — not the checkpoint produced by the manual Accelerate loop.
hub_model_id = "naninya/mt5-small-finetuned-amazon-en-es"
summarizer = pipeline("summarization", model=hub_model_id)

config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

In [54]:
# Look at one raw test example before summarizing it.
dataset["test"][0]

{'id': '13862856',
 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye",
 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."}

In [56]:
# Summarize that test dialogue with the fine-tuned model.
summarizer(dataset["test"][0]["dialogue"])

[{'summary_text': 'Amanda is at the park together. '}]

In [57]:
def print_summary(idx):
    """Print a test dialogue, its reference summary and the generated summary.

    Args:
        idx: Row index into ``dataset["test"]``.
    """
    # Fetch the row once instead of re-indexing the dataset for every field.
    example = dataset["test"][idx]
    dialogue = example["dialogue"]
    summary = example["summary"]
    result = summarizer(dialogue)[0]["summary_text"]
    # NOTE(review): the "Review" label is attached to the dialogue and "Title" to the
    # reference summary; the strings are kept as-is so the printed output is unchanged.
    print(f"'>>> Review: {dialogue}'")
    print(f"\n'>>> Title: {summary}'")
    print(f"\n'>>> Summary: {result}'")


In [58]:
# Compare reference and generated summaries for test example 50.
print_summary(50)

'>>> Review: Helen: Hey, Simo, are you there?
Simon: Yep babe, what's up?
Helen: I was calling you before...
Simon: Sorry I was on the phone, I didn't hear you... Tell me.
Helen: It's a bit embarrassing... The toilet paper is finished, could you fetch me some tissues, please?
Simon: Hahaha sure, no worries!'

'>>> Title: Simon was on the phone before so he didn't hear Helen calling. Simon will fetch Helen some tissues as they're out of toilet paper.'

'>>> Summary: Peter was calling Simon before.'
