# Install packages

In [None]:
!pip install datasets
!pip install transformers
!pip install tqdm
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2

# Import packages

In [None]:
from datasets import load_dataset, load_metric, list_metrics
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollator, T5ForConditionalGeneration, T5TokenizerFast
import torch
from tqdm import tqdm
from typing import Dict, List, Optional
import dataclasses
from dataclasses import dataclass, field
import logging
import os
import sys
import numpy as np
import torch
import json
import datasets
from huggingface_hub import notebook_login
from transformers import (
    T5ForConditionalGeneration, 
    T5Tokenizer, 
    EvalPrediction,
    DataCollator,
    Trainer,
    TrainingArguments)
from google.colab import files

# Switch off the Weights & Biases integration for this session.
# NOTE(review): the training log recorded further down in this notebook reports
# that WANDB_DISABLED is deprecated in favour of the `report_to` training argument.
os.environ["WANDB_DISABLED"] = "true"

# Module-level logger obtained through the `datasets` logging helper.
logger = datasets.logging.get_logger(__name__)

# Hugging Face login

In [None]:
# Interactive Hugging Face Hub login. Presumably required because the training
# cell sets push_to_hub=True — confirm if the notebook is run without pushing.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Load SQuAD dataset

In [None]:
# The dataset loading script is not bundled with the notebook, so it has to be
# uploaded through the Colab file picker. Only prompt when the script is
# missing, so that re-running this cell (or Restart & Run All) does not block
# on a new upload dialog when the file is already in the working directory.
datasetScript = "t5_squad.py"
if not os.path.exists(datasetScript):
    files.upload()

rawDataset = load_dataset(datasetScript)

# Preview one training example: a context and its '{sep_token}'-joined questions.
rawDataset["train"][0]

Saving t5_squad.py to t5_squad.py
Downloading and preparing dataset t5_squad/plain_text to /root/.cache/huggingface/datasets/t5_squad/plain_text/1.0.0/02ae0815e8483cc76579286179faeb8c8fdbdd328e6741f5c465d9b0bddb8a77...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset t5_squad downloaded and prepared to /root/.cache/huggingface/datasets/t5_squad/plain_text/1.0.0/02ae0815e8483cc76579286179faeb8c8fdbdd328e6741f5c465d9b0bddb8a77. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

{'context': 'generate questions: Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'questions': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? {sep_token} What is in front of the Notre Dame Main Building? {sep_token} The Basilica of the Sacred heart at Notre Dame is beside to which structure? {sep_token} What is the Grotto

# Data preprocessing

In [None]:
# Checkpoint name that both the model weights and the tokenizer are loaded from.
baseCheckpoint = "t5-base"

model = T5ForConditionalGeneration.from_pretrained(baseCheckpoint)
tokenizer = T5TokenizerFast.from_pretrained(baseCheckpoint)

# '<sep>' is the marker that addSpecialTokensToFeatureQuestions substitutes for
# '{sep_token}' between questions; register it with the tokenizer here.
tokenizer.sep_token = '<sep>'
tokenizer.add_tokens(['<sep>'])
# NOTE(review): bare expression in the middle of the cell; its value is neither
# assigned nor printed — looks like a leftover sanity check.
tokenizer.sep_token_id

# Resize the model's token embeddings to the tokenizer's current length.
# Must run after the token has been added above.
model.resize_token_embeddings(len(tokenizer))

# max_length values passed to the tokenizer in convertDatasetToFeatures
# (contexts and questions respectively).
maxInputLength =  512
maxTargetLength = 64

def convertDatasetToFeatures(dataset):
    """Tokenize a batch of examples into fixed-length model features.

    Written for ``Dataset.map(..., batched=True)``: ``dataset`` is a batch
    mapping whose ``'context'`` and ``'questions'`` entries are lists of
    strings. Contexts are padded/truncated to ``maxInputLength`` tokens and
    questions to ``maxTargetLength`` tokens, using the module-level
    ``tokenizer``.

    Args:
        dataset: batch mapping with ``'context'`` and ``'questions'`` columns.

    Returns:
        dict with ``input_ids`` / ``attention_mask`` for the source text and
        ``decoder_input_ids`` / ``decoder_attention_mask`` for the target text.
        ``decoder_input_ids`` holds the unshifted target token ids; the data
        collator later exposes them to the model as ``labels``.
    """
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True; calling the tokenizer directly is the current
    # entry point for batch encoding a list of strings.
    sharedOptions = dict(add_special_tokens=True,
                         truncation=True,
                         padding='max_length')

    inputEncodings = tokenizer(dataset['context'],
                               max_length=maxInputLength,
                               **sharedOptions)

    targetEncodings = tokenizer(dataset['questions'],
                                max_length=maxTargetLength,
                                **sharedOptions)

    return {
        'input_ids': inputEncodings['input_ids'],
        'attention_mask': inputEncodings['attention_mask'],
        'decoder_input_ids': targetEncodings['input_ids'],
        'decoder_attention_mask': targetEncodings['attention_mask'],
    }

def addEOSToFeature(feature):
  """Append the literal ' </s>' marker to the context and questions of one example.

  The example mapping is modified in place and also returned, which is the
  shape ``Dataset.map`` expects from a per-example function.
  """
  for column in ('context', 'questions'):
    feature[column] = feature[column] + " </s>"
  return feature


def addSpecialTokensToFeatureQuestions(feature):
  """Swap every '{sep_token}' placeholder in the questions text for '<sep>'.

  Only the 'questions' entry is touched; the example mapping is modified in
  place and returned.
  """
  placeholder = "{sep_token}"
  questionParts = feature['questions'].split(placeholder)
  feature['questions'] = '<sep>'.join(questionParts)
  return feature


# Three preprocessing passes over every split: append the EOS marker, swap the
# '{sep_token}' placeholder for '<sep>', then tokenize in batches.
tokenizedDataset = (
    rawDataset
    .map(addEOSToFeature)
    .map(addSpecialTokensToFeatureQuestions)
    .map(convertDatasetToFeatures, batched=True)
)

tokenizedDataset["train"][0]["context"]

# The raw text columns are dropped once the token-id columns exist.
tokenizedDataset = tokenizedDataset.remove_columns(["context", "questions"])

# Columns exposed as torch tensors when examples are read from the datasets.
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']

# Training split: tensor format, then saved to disk.
trainDataset = tokenizedDataset["train"]
trainDataset.set_format(type='torch', columns=columns)
torch.save(trainDataset, 'train_data.pt')

# Validation split: same treatment.
validDataset = tokenizedDataset["validation"]
validDataset.set_format(type='torch', columns=columns)
torch.save(validDataset, 'valid_data.pt')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Map:   0%|          | 0/18896 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

Map:   0%|          | 0/18896 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

Map:   0%|          | 0/18896 [00:00<?, ? examples/s]



Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

# Fine-tune the T5 model and push it to the Hub

In [None]:
@dataclass
class T2TDataCollator():
  """Stack per-example feature tensors into one training batch.

  Each example must provide the tensors 'input_ids', 'attention_mask',
  'decoder_input_ids' and 'decoder_attention_mask'. The target ids are
  returned under 'labels', with padding positions replaced by the ignore
  index so they do not contribute to the loss.

  Both fields are keyword-configurable; the defaults reproduce the values
  that were previously hard-coded, so ``T2TDataCollator()`` behaves as before.
  """
  # Token id that marks padding in the target sequences (T5 uses 0).
  pad_token_id: int = 0
  # Value written over padding positions in the labels (-100 is the index
  # ignored by PyTorch's cross-entropy loss).
  label_ignore_index: int = -100

  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """Collate a list of example dicts into a dict of stacked tensors.

    Args:
      batch: list of examples, each a mapping from column name to a tensor.
        All examples must share the same tensor shape per column.

    Returns:
      dict with 'input_ids', 'attention_mask', 'labels' and
      'decoder_attention_mask', each stacked along a new leading dimension.
    """
    def stackColumn(name: str) -> torch.Tensor:
      return torch.stack([feature[name] for feature in batch])

    targetIds = stackColumn('decoder_input_ids')
    # masked_fill returns a new tensor, so the examples in `batch` are never
    # modified.
    lmLabels = targetIds.masked_fill(targetIds == self.pad_token_id,
                                     self.label_ignore_index)

    return {
      'input_ids': stackColumn('input_ids'),
      'attention_mask': stackColumn('attention_mask'),
      'labels': lmLabels,
      'decoder_attention_mask': stackColumn('decoder_attention_mask')
    }


# Repository name on the Hugging Face Hub (created under the logged-in account);
# also reused as the run name.
hubModelId = "t5-simple-qg-eng"

# Effective train batch size is 4 * 16 = 64 per device (batch size times
# gradient accumulation). eval_steps is not set, so evaluation runs every
# `logging_steps` (100) steps, as the recorded training log shows.
trainingArgs = TrainingArguments(output_dir="./out",
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  gradient_accumulation_steps=16,
  learning_rate=1e-4,
  num_train_epochs=7,
  logging_steps=100,
  run_name=hubModelId,
  evaluation_strategy="steps",
  save_steps=500,
  # The string "none" disables all logging integrations. Passing None selects
  # the default instead, which is every installed integration. Use
  # "tensorboard" here if the event files should still be written and pushed.
  report_to="none",
  push_to_hub=True,
  # hub_model_id replaces the deprecated push_to_hub_model_id argument.
  hub_model_id=hubModelId
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    eval_dataset=validDataset,
    data_collator=T2TDataCollator(),
    # Hand over the tokenizer so it is saved and pushed together with the
    # model: the embeddings were resized for the added '<sep>' token, so the
    # stock t5-base tokenizer no longer matches the checkpoint.
    tokenizer=tokenizer
)

trainer.train()

# The first positional argument of push_to_hub is the commit message, not the
# repository name (that comes from hub_model_id), so pass it by keyword.
trainer.push_to_hub(commit_message="End of training")

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Cloning https://huggingface.co/mrbalazs5/t5-simple-qg-eng into local empty directory.
***** Running training *****
  Num examples = 18896
  Num Epochs = 7
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 2065
  Number of trainable parameters = 222882816


Step,Training Loss,Validation Loss
100,2.584,1.910817
200,1.9664,1.727507
300,1.8466,1.663376
400,1.7412,1.638291
500,1.7134,1.620192
600,1.694,1.604899
700,1.6297,1.597497
800,1.6261,1.593173
900,1.6149,1.58746
1000,1.569,1.589304


***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
Saving model checkpoint to ./out/checkpoint-500
Configuration saved in ./out/checkpoint-500/config.json
Configuration saved in ./out/checkpoint-500/generation_config.json
Model weights saved in ./out/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2067
  Batch size = 4
Saving model checkpoint to ./out/checkpoint-1000
Configuration save

Upload file pytorch_model.bin:   0%|          | 32.0k/850M [00:00<?, ?B/s]

Upload file runs/Mar11_10-26-32_30219b4c28fe/events.out.tfevents.1678530398.30219b4c28fe.406.0: 100%|#########…

remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/mrbalazs5/t5-simple-qg-eng
   c0a552c..e3aad99  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/mrbalazs5/t5-simple-qg-eng
   c0a552c..e3aad99  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'dataset': {'name': 't5_squad', 'type': 't5_squad', 'config': 'plain_text', 'split': 'validation', 'args': 'plain_text'}}
To https://huggingface.co/mrbalazs5/t5-simple-qg-eng
   e3aad99..5426d64  main -> main

   e3aad99..5426d64  main -> main



'https://huggingface.co/mrbalazs5/t5-simple-qg-eng/commit/e3aad99a54d9c70a208788f5c0795f7dd6e64b2c'