In [None]:
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Install git-lfs, which is used for pushing the model and logs to the Hugging Face Hub
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. Per the logged output, the token is stored in
# the configured git credential helper and in the Hugging Face cache directory.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Dataset identifier: passed to `load_dataset` and reused as the suffix of the Hub repository id
dataset_id = "samsum"

In [None]:
from datasets import load_dataset

# Load dataset from the hub; the result is indexed by split name ("train", "test") below
dataset = load_dataset(dataset_id)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

# Expected output for this dataset:
# Train dataset size: 14732
# Test dataset size: 819



  0%|          | 0/3 [00:00<?, ?it/s]

Train dataset size: 14732
Test dataset size: 819


In [None]:
from random import randrange        


# Print one randomly chosen training example (no seed is set in this cell, so the
# example shown differs between runs)
sample = dataset['train'][randrange(len(dataset["train"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")
print(f"summary: \n{sample['summary']}\n---------------")

dialogue: 
Maria: We've passed the security and we're waiting for the boarding
Aldona: no problems?
Philip: not at all, was smooth
Aldona: I told you
Maria: I know, but I am traumatised after the stopped me the last time
Maria: it was so stressful and horrible, quite humiliating
Aldona: I know, but it should not happen again
Philip: She is panicking all the time
Philip: it would be even funny, if it wasn't so annoying
Maria: I'm sorry, it's kind of a phobia I guess
Aldona: no, it will pass, you just had very bad experiences
Maria: maybe you're right
Maria: I feel already better, thanks guys for your support
Aldona: 👍
---------------
summary: 
Maria and Philip passed the security and they are waiting for the boarding. Last time she flew Maria was stopped by security guards.
---------------


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub id of the base checkpoint to fine-tune
model_id="google/flan-t5-small"

# Load tokenizer of flan-t5-small
tokenizer = AutoTokenizer.from_pretrained(model_id)

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/f6b63ff0230b8e19027b922964cab639c1c6da9c/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/f6b63ff0230b8e19027b922964cab639c1c6da9c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/f6b63ff0230b8e19027b922964cab639c1c6da9c/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/f6b63ff0230b8e19027b922964cab639c1c6da9c/tokenizer_config.json


In [None]:
from datasets import concatenate_datasets

# Longest tokenized dialogue across train + test. This value is later used as the
# `max_length` for model inputs: longer sequences are truncated, shorter ones padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda batch: tokenizer(batch["dialogue"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
max_source_length = max(len(token_ids) for token_ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized summary across train + test. This value is later used as the
# `max_length` for target text: longer sequences are truncated, shorter ones padded.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda batch: tokenizer(batch["summary"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
max_target_length = max(len(token_ids) for token_ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")



Max source length: 512


Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Max target length: 95


In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of dialogues and summaries into model features.

    Args:
        sample: Batch with "dialogue" and "summary" entries, each a list of strings.
        padding: Padding strategy forwarded to the tokenizer for both inputs and
            targets.

    Returns:
        The tokenizer output for the prefixed dialogues, with an added "labels"
        entry holding the token ids of the summaries. When ``padding`` is
        "max_length", pad token ids in the labels are replaced by -100.
    """
    # T5 is prompted with a task prefix in front of every dialogue
    prefixed_dialogues = ["summarize: " + dialogue for dialogue in sample["dialogue"]]

    features = tokenizer(
        prefixed_dialogues,
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )

    # Summaries are encoded through the `text_target` keyword argument
    target_encoding = tokenizer(
        text_target=sample["summary"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    # With fixed-length padding, mask the pad positions with -100 so that they are
    # ignored in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        target_encoding["input_ids"] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_encoding["input_ids"]
        ]

    features["labels"] = target_encoding["input_ids"]
    return features

# Run the preprocessing in batches and drop the raw columns, leaving only the
# features produced by `preprocess_function`
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")



Map:   0%|          | 0/819 [00:00<?, ? examples/s]



Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id (same value as used for the tokenizer above)
model_id="google/flan-t5-small"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/f6b63ff0230b8e19027b922964cab639c1c6da9c/config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
  

In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# `sent_tokenize` needs the Punkt sentence-tokenizer data. Recent NLTK releases look
# up the "punkt_tab" resource instead of "punkt", and NLTK is installed with
# --upgrade in this notebook, so fetch both to work with either version.
nltk.download("punkt")
nltk.download("punkt_tab")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Every string is stripped of surrounding whitespace and rewritten with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists of newline-separated strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(texts):
        # Split into sentences and join them back with newlines
        return ["\n".join(sent_tokenize(text)) for text in texts]

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If the
            predictions arrive wrapped in a tuple, its first element is used.

    Returns:
        Dict mapping metric names to values: the ROUGE results scaled to 0-100 and
        rounded to 4 decimals, plus ``gen_len``, the mean number of non-pad tokens
        per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 marks ignored positions and cannot be decoded. It is replaced in the
    # labels and also in the predictions, which can carry -100 when batches of
    # different lengths are padded together.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Generated length = number of non-pad tokens in each prediction
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss, so labels are padded with -100
label_pad_token_id = -100
# Data collator for seq2seq batches, built from the tokenizer and the model.
# NOTE(review): pad_to_multiple_of=8 presumably rounds padded lengths up to a
# multiple of 8 -- confirm against the DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id, e.g. "<model name>-<dataset id>"; also used as the
# local output directory
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies: log every 500 steps, evaluate and save once
    # per epoch, keep at most 2 checkpoints
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model is left unset here.
    # NOTE(review): the previously commented-out value "overall_f1" is not a key
    # returned by compute_metrics (it reports ROUGE scores and gen_len).
    # push to hub parameters (automatic pushing is disabled via push_to_hub=False)
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance.
# The seq2seq data collator defined earlier in this notebook is passed explicitly;
# without it the trainer falls back to its default collator and the configured
# collator is never used.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices


In [None]:
# Start training (epoch count, batch sizes and evaluation schedule come from training_args)
trainer.train()

***** Running training *****
  Num examples = 14732
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9210
  Number of trainable parameters = 76961152


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.8076,1.65673,43.2617,18.9927,35.8416,39.2405,16.561661
2,1.7219,1.643151,43.4938,19.2611,36.1608,39.5837,16.862027
3,1.6871,1.635579,43.6007,18.9886,36.064,39.6453,17.034188
4,1.6654,1.632855,43.6841,19.3734,36.2658,39.6976,16.80464
5,1.6324,1.630559,43.8765,19.5708,36.4267,39.8764,16.807082


***** Running Evaluation *****
  Num examples = 819
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_tok

TrainOutput(global_step=9210, training_loss=1.7107061462733697, metrics={'train_runtime': 3633.9588, 'train_samples_per_second': 20.27, 'train_steps_per_second': 2.534, 'total_flos': 1.3848842600448e+16, 'train_loss': 1.7107061462733697, 'epoch': 5.0})

In [None]:
# Evaluate on the trainer's eval_dataset (the tokenized test split) and show the metrics
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 819
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 1.6305592060089111,
 'eval_rouge1': 43.8765,
 'eval_rouge2': 19.5708,
 'eval_rougeL': 36.4267,
 'eval_rougeLsum': 39.8764,
 'eval_gen_len': 16.807081807081808,
 'eval_runtime': 51.3341,
 'eval_samples_per_second': 15.954,
 'eval_steps_per_second': 2.006,
 'epoch': 5.0}

In [None]:
# Save our tokenizer into the output directory and create the model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub (left disabled in this cell)
#trainer.push_to_hub()

tokenizer config file saved in flan-t5-small-samsum/tokenizer_config.json
Special tokens file saved in flan-t5-small-samsum/special_tokens_map.json


In [None]:

# The first argument of Trainer.push_to_hub is the commit message, not a repository
# name; the target repository is taken from hub_model_id in the training arguments.
trainer.push_to_hub(commit_message="End of training")

Cloning https://huggingface.co/yasminesarraj/flan-t5-small-samsum into local empty directory.
Saving model checkpoint to flan-t5-small-samsum
Configuration saved in flan-t5-small-samsum/config.json
Configuration saved in flan-t5-small-samsum/generation_config.json
Model weights saved in flan-t5-small-samsum/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 32.0k/294M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/yasminesarraj/flan-t5-small-samsum
   efa3ea0..7a9cebc  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/yasminesarraj/flan-t5-small-samsum
   efa3ea0..7a9cebc  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'dataset': {'name': 'samsum', 'type': 'samsum', 'config': 'samsum', 'split': 'test', 'args': 'samsum'}}
To https://huggingface.co/yasminesarraj/flan-t5-small-samsum
   7a9cebc..e3d16e5  main -> main

   7a9cebc..e3d16e5  main -> main



'https://huggingface.co/yasminesarraj/flan-t5-small-samsum/commit/7a9cebca106078c26267c8042fcdced01342f5dd'

In [None]:
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
output_dir = f"{model_id.split('/')[1]}-{dataset_id}"

# Push the trainer's output directory. `--overwrite_output_dir` is a command-line
# flag, not Python: it was parsed as a subtraction and raised a NameError. The
# argument of Trainer.push_to_hub is the commit message, so it is passed by keyword.
trainer.push_to_hub(commit_message=f"Upload {output_dir}")
# Push the model weights to the repository named by `output_dir`
model.push_to_hub(output_dir)

In [None]:
from transformers import pipeline
from random import randrange        

import torch

# load model and tokenizer from huggingface hub with pipeline.
# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# instead of hard-coding device=0, which fails on CPU-only runtimes.
# NOTE(review): this loads the "philschmid/flan-t5-small-samsum" checkpoint, not the
# model fine-tuned above -- confirm this is intended.
summarizer = pipeline(
    "summarization",
    model="philschmid/flan-t5-small-samsum",
    device=0 if torch.cuda.is_available() else -1,
)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-small summary:\n{res[0]['summary_text']}")