# T5 model
Text-to-Text Transfer Transformer

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import T5ForConditionalGeneration
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Image

## 1. Summary
- Overview
  - Introduced by Google (2020): "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"
  - Unified framework: every NLP task (translation, summarization, QA, classification) is cast as "text-to-text"
- Architecture
  - Encoder–Decoder Transformer (like seq2seq)
      - Encoder: processes input text into contextual representations
      - Decoder: autoregressively generates output tokens
- Usage
  - Convert various NLP tasks (translation, classification, summarization, fill-in-the-blank, etc.) into a "text-to-text" format.
      - For example, a classification task: "sst2 sentence: this movie is great" → "positive"
      - For example, a summarization task: "summarize: ..." → "short summary"

## 2. T5 model
- t5-small
- t5-base
- t5-large
- t5-3b
- t5-11b

In [2]:
# Checkpoint used by every cell below; any size listed above can be substituted.
model_name = "t5-small"

# Load tokenizer, config and the base model (AutoModel, no task head) from the
# same checkpoint so they stay consistent with each other.
tokenizer, config, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoConfig, AutoModel)
)

## 3. Forward

In [3]:
# Encoder side gets the full sentence; decoder side gets a short target prefix.
source_text = "T5, BERT, GPT2 are all based on Transformer architecture"
target_text = "T5, BERT, GPT2"

input_ids = tokenizer(source_text, return_tensors="pt")["input_ids"]
target_ids = tokenizer(target_text, return_tensors="pt")["input_ids"]

# AutoModel (T5Model) requires preprocessing: the decoder input must be prepended
# with the start token, which is the pad token for T5.
# T5ForConditionalGeneration needs no such preprocessing, because it does this
# internally from its `labels` argument.
decoder_input_ids = model._shift_right(target_ids)

### Model parameters
- input_ids -> input of encoder
- decoder_input_ids -> input of decoder

In [4]:
# Put the model in evaluation mode before the forward pass.
model.eval()

# Name the encoder and decoder inputs explicitly, then run the model once.
forward_inputs = dict(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
outputs = model(**forward_inputs)

# Keep the final hidden states and show them as the cell result.
last_hidden_states = outputs.last_hidden_state
last_hidden_states

tensor([[[ 1.8644e-02,  1.9237e-01, -1.0070e-02,  ...,  9.7096e-02,
           4.8512e-04, -1.3691e-01],
         [ 1.1108e-01,  2.3853e-01, -1.7094e-01,  ...,  8.5612e-02,
           6.8864e-04, -1.8778e-01],
         [-2.4158e-02,  9.6821e-04,  7.7325e-02,  ...,  1.2552e-01,
           1.1121e-03, -5.3361e-02],
         ...,
         [-1.8146e-02,  1.1181e-01, -6.2067e-02,  ...,  9.9619e-02,
           6.4774e-04, -5.3506e-01],
         [ 4.9392e-02,  2.4820e-01, -5.2307e-02,  ...,  1.8961e-01,
           2.2361e-04, -2.3843e-01],
         [ 2.9291e-02,  1.8362e-01,  3.1791e-02,  ...,  1.4849e-01,
           8.6182e-05, -3.9248e-02]]], grad_fn=<MulBackward0>)

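The same forward pass written out by hand (simplified from the T5 source code): run the encoder, then feed its hidden states to the decoder. The result should match the output above.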

In [5]:
def t5_forward(model, input_ids, decoder_input_ids, attention_mask=None, decoder_attention_mask=None):
    """Reproduce the model's forward pass by calling its encoder and decoder directly.

    Args:
        model: object exposing ``encoder`` and ``decoder`` callables (here a T5Model).
        input_ids: token ids fed to the encoder.
        decoder_input_ids: token ids fed to the decoder (already shifted right).
        attention_mask: optional padding mask for ``input_ids``. It is passed to the
            encoder and, as ``encoder_attention_mask``, to the decoder so padded
            source positions are not attended to. ``None`` keeps the unmasked
            behaviour, which is only correct for unpadded inputs.
        decoder_attention_mask: optional padding mask for ``decoder_input_ids``.

    Returns:
        The ``last_hidden_state`` attribute of the decoder output.
    """
    encoder_outputs = model.encoder(input_ids=input_ids, attention_mask=attention_mask)
    # Index 0 of the encoder output holds its hidden states.
    hidden_states = encoder_outputs[0]
    decoder_outputs = model.decoder(
        input_ids=decoder_input_ids,
        attention_mask=decoder_attention_mask,
        encoder_hidden_states=hidden_states,
        encoder_attention_mask=attention_mask,
    )

    return decoder_outputs.last_hidden_state

# Same inputs as the model(...) call above, routed through the hand-written forward.
t5_forward(model, input_ids=input_ids, decoder_input_ids=decoder_input_ids)

tensor([[[ 1.8644e-02,  1.9237e-01, -1.0070e-02,  ...,  9.7096e-02,
           4.8512e-04, -1.3691e-01],
         [ 1.1108e-01,  2.3853e-01, -1.7094e-01,  ...,  8.5612e-02,
           6.8864e-04, -1.8778e-01],
         [-2.4158e-02,  9.6821e-04,  7.7325e-02,  ...,  1.2552e-01,
           1.1121e-03, -5.3361e-02],
         ...,
         [-1.8146e-02,  1.1181e-01, -6.2067e-02,  ...,  9.9619e-02,
           6.4774e-04, -5.3506e-01],
         [ 4.9392e-02,  2.4820e-01, -5.2307e-02,  ...,  1.8961e-01,
           2.2361e-04, -2.3843e-01],
         [ 2.9291e-02,  1.8362e-01,  3.1791e-02,  ...,  1.4849e-01,
           8.6182e-05, -3.9248e-02]]], grad_fn=<MulBackward0>)

## 4. Pretrain tasks
- Unsupervised denoising training
  - MLM (Masked Language Model) -> Add noise to the original sentence, then let the model restore the original sentence
      - span mask
- Supervised training -> trained on manually labeled input/target pairs
    - seq2seq

The difference between T5ForConditionalGeneration and T5Model:
- T5ForConditionalGeneration adds an lm_head on top of the decoder

In [6]:
# Swap the base model for the variant with a language-modeling head; the cells
# below rely on it for `.loss` and `generate()`.
# NOTE: this rebinds `model`. Re-running the earlier base-model cells after this
# one will use the LM-head model instead.
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [7]:
# Unsupervised denoising training
# mlm

# Unsupervised denoising training (mlm):
# <extra_id_N> tokens mark the masked spans in the input, and the target lists
# each of those tokens followed by the text it stands for.
corrupted_text = "The <extra_id_0> walks in <extra_id_1> park"
span_targets = "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"

input_ids = tokenizer(corrupted_text, return_tensors="pt")["input_ids"]
labels = tokenizer(span_targets, return_tensors="pt")["input_ids"]

# Passing `labels` lets the forward function create the correct decoder_input_ids itself.
loss = model(input_ids=input_ids, labels=labels).loss
loss.item()

3.7837252616882324

In [8]:
# Supervised training
# seq2seq

# Supervised training (seq2seq): a task-prefixed source sentence and its reference output.
source_text = "translate English to German: What is your name?"
reference_text = "wie heißt du?"

input_ids = tokenizer(source_text, return_tensors="pt")["input_ids"]
labels = tokenizer(reference_text, return_tensors="pt")["input_ids"]

# The forward function creates the correct decoder_input_ids from `labels`,
# so none are passed explicitly.
loss = model(input_ids=input_ids, labels=labels).loss
loss.item()

3.684732437133789

### 4.1 Multiple sentence pairs (batched)

In [9]:
max_source_length = 512
max_target_length = 128

# Example pairs (English → French)
source_sentences = ["Welcome to NYC", "HuggingFace is a company"]
targets = ["Bienvenue à NYC", "HuggingFace est une entreprise"]

# Add task prefix to inputs (T5 uses text-to-text format)
task_prefix = "translate English to French: "
inputs = [task_prefix + sentence for sentence in source_sentences]

# Encode source sentences
encoding = tokenizer(
    inputs,
    padding="longest",              # pad to longest input
    max_length=max_source_length,   # truncate if too long
    truncation=True,
    return_tensors="pt"             # return PyTorch tensors
)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# Encode target sentences
target_encoding = tokenizer(
    targets,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt"
)

# Ignore padding tokens in loss calculation by replacing their ids with -100.
# masked_fill returns a new tensor, so target_encoding.input_ids keeps the real
# token ids instead of being overwritten in place.
target_ids = target_encoding.input_ids
labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

# Forward pass (compute loss)
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss

loss.item()

0.188005730509758

## 5. Specific tasks
- `model(...)` forward output (`logits`) -> [batch_size, seq_len, vocab_size]
- `model.generate(...)` output -> [batch_size, seq_len] token ids

In [10]:
# Translation: the task prefix in the prompt selects the task.
prompt = "translate English to German: Hello, my name is reven"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

# generate() returns token ids; decode the first sequence back to text.
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Hallo, mein Name ist reven'

In [11]:
# inference
# inference: summarization, selected by the "summarize: " prefix on the prompt
prompt = "summarize: Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new “Colossal Clean Crawled Corpus”, we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code."
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

# Generate with max_length=128, then decode the first sequence.
outputs = model.generate(input_ids, max_length=128)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'transfer learning is a powerful technique in natural language processing. the effectiveness of transfer learning has given rise to a diversity of approaches, methodologies, and practice.'