# Model

In [1]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer, T5ForConditionalGeneration

## Model download

By default, we can use `AutoModel` to download a base model. A base model usually contains the common structure of the network without a task-specific output head (e.g. a final feed-forward or dense layer).

In [83]:
# Download the base model, i.e. the network without a task-specific head.
# For this checkpoint AutoModel resolves to T5Model, as the printout of `model` below shows.

model = AutoModel.from_pretrained("google-t5/t5-small")

In [84]:
# Display the model structure: the repr lists every sub-module together with its dimensions.

model

T5Model(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=

In [112]:
# Build a small pair of token-id tensors to feed the model in the cells below.

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

# Encoder side: the task prefix followed by the sentence to translate.
# NOTE(review): "emplae" looks like a typo for "example"; it is left untouched so that
# the saved outputs in this notebook still correspond to this exact input.
input_text = "translate English to French: I have an emplae sentence to be translated to french"
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]

# Decoder side: an empty string, used here only as a dummy decoder input / target.
output_text = ""
output_ids = tokenizer(output_text, return_tensors="pt")["input_ids"]

In [99]:
# Run a forward pass with the dummy inputs to see what the base model returns.
# `input_ids` is passed for the encoder side and `output_ids` as the decoder input.

output = model(input_ids=input_ids, decoder_input_ids=output_ids)

In [100]:
# Show the output of the base model.
# Its main field is 'last_hidden_state', as shown in the printout. It is a tensor of
# hidden states rather than the result of a task-specific output layer.

# If we check the size of this tensor using:
# output.last_hidden_state.shape
# we get torch.Size([1, 1, 512]); the last dimension (512) matches d_model in the config,
# i.e. the embedding dimension of the output.

# A task-specific head can be stacked on top of this tensor to get task-specific outputs
# (e.g. logits over the word dictionary).

output

Seq2SeqModelOutput(last_hidden_state=tensor([[[-7.9999e-02,  1.1467e-01,  7.8933e-02, -2.4194e-02,  2.6898e-03,
          -1.2803e-02, -1.4849e-01,  1.1389e-01,  4.9360e-01, -1.3014e-02,
           1.2757e-01,  5.6641e-02, -1.4565e-02, -6.3053e-02,  1.8939e+00,
          -7.0394e-02, -5.1410e-02, -4.1910e-02,  2.2532e-01, -6.9530e-02,
           7.7248e-02,  1.0673e-01, -2.8337e-02,  2.7084e+00, -8.1234e-02,
          -1.1440e-01,  9.8745e-02,  7.4430e-01,  1.2698e-01, -7.5319e-02,
           1.4852e-01, -9.9355e-02, -6.8250e-02, -1.7115e-01,  2.0939e-01,
           4.7922e-02,  1.4682e-02,  7.1147e-02, -1.7267e-02, -1.6070e-02,
           2.7061e-01, -2.4662e-02, -1.4530e-02,  8.1873e-02,  9.3931e-03,
          -5.7234e-02, -2.4526e-01, -1.2314e-01,  1.0487e-01,  2.2981e-01,
           4.2947e-02,  5.3933e-02,  1.2576e-01,  3.7194e-02, -5.7941e-02,
          -1.0944e-01,  6.8593e-02, -9.8854e-02, -2.4444e-02,  3.2015e-01,
           4.9623e-02, -1.6934e-02, -7.5678e-02, -4.5224e-02,  

In [4]:
# Load the configuration of the checkpoint (its hyperparameters, printed in the next cell).
# Only the `config` object is created here; `model` is not touched.

config = AutoConfig.from_pretrained("google-t5/t5-small")

In [5]:
# Display the configuration; note the "architectures" entry, which names the model class with head.
config

T5Config {
  "_name_or_path": "google-t5/t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 

In [103]:
# Using the config, we can change parameters.
# E.g. setting output_attentions = True requests that attention weights be returned
# (such as the cross-attention of the decoder).
# NOTE(review): this only modifies the `config` object. The `model` loaded earlier is not
# rebuilt from it in this notebook, so the change has no visible effect on `model` here;
# presumably the config must be passed when loading the model (e.g. from_pretrained(..., config=config)) — confirm.

print(config.output_attentions)
config.output_attentions = True
print(config.output_attentions)

False
True


## Model with Head

In [105]:
# We can directly download the model together with its head (the task-specific model).
# The class to use can be obtained from the config: it is listed under "architectures".
# T5ForConditionalGeneration is imported in the first cell with the other transformers
# names, so that all dependencies of the notebook are visible in one place.
# NOTE: this rebinds `model`; from here on it refers to the model with head,
# not to the headless T5Model loaded earlier.

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

In [106]:
# The model class is now 'T5ForConditionalGeneration',
# whereas the headless model obtained previously was 'T5Model'.

model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [113]:
# We use the same inputs as for the base model, but the decoder-side ids are now passed
# as 'labels' instead of 'decoder_input_ids'.
# 'labels' is not simply a renamed 'decoder_input_ids': it is the target sequence, which is
# why the output below also contains a loss.
# NOTE(review): per the Transformers documentation, T5ForConditionalGeneration builds the
# decoder inputs from 'labels' when 'decoder_input_ids' is not given — confirm for the installed version.
# Argument names differ between model classes, so be aware of this.
# To find the argument names, one can read the source code (or documentation) of the corresponding model.

output = model(input_ids=input_ids, labels=output_ids)

In [114]:
# In this case, the output contains 'logits' instead of 'last_hidden_state'.
# The shape of the logits is torch.Size([1, 1, 32128]); the last dimension corresponds to the
# word dictionary size (it matches the 32128-row embedding table in the model printout).

# It is worth mentioning that the loss is also included in this model's output
# (no need to define another one when fine-tuning).

output

Seq2SeqLMOutput(loss=tensor(7.4277, grad_fn=<NllLossBackward0>), logits=tensor([[[-15.2278,  -7.2225, -11.8031,  ..., -43.7653, -43.9005, -43.8208]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-4.8649e-01, -2.3323e+00, -1.1428e+00,  4.3997e-01, -3.7448e+00,
            5.9211e-01,  1.7371e+00,  1.2648e-01, -9.5232e-01,  6.4317e-01,
            6.5032e-02, -2.7661e+00, -2.9257e-01, -2.2728e+00,  1.4708e+00,
            3.6940e+00, -5.9305e-01, -2.2253e+00, -2.2925e+00,  1.2926e+00,
           -1.6622e+00,  1.5806e-01, -9.8186e-01,  6.9422e-01, -2.3424e+00,
           -1.5638e-01, -9.2692e-01,  2.5009e+00,  1.5147e+00, -3.6560e-01,
            3.0006e-01,  9.5156e-01,  2.0886e+00,  3.6983e-01, -1.0588e+00,
            2.6796e+00, -1.4096e+00, -1.1152e+00, -2.3030e+00, -1.3433e+00,
            1.9916e+00,  2.5363e-03, -2.4754e+00,  7.7748e-01, -1.2229e+00,
           -1.9101e+00,  1.9616e+00, -1.2805e+00,  1.0394e+00,  1.6140e-01,
           -2.3916e-01,  2.9783e

## Comments
In this example, we use the case of translation, which needs two inputs: one for the encoder and one for the decoder.
The specific argument names and/or outputs may change from one model to another, but the overall concept stays the same.
The details can be obtained from the configs and the source code.