In [1]:
from functools import partial

from torch.utils.data import DataLoader
from torchtext.datasets import CNNDM, IMDB

In [8]:
from torchtext.models import T5Transform

padding_idx = 0
eos_idx = 1
max_seq_len = 512
t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model"

transform = T5Transform(
    sp_model_path=t5_sp_model_path,
    max_seq_len=max_seq_len,
    eos_idx=eos_idx,
    padding_idx=padding_idx,
)

100%|██████████| 792k/792k [00:00<00:00, 1.39MB/s]


In [9]:
from torchtext.models import T5_BASE_GENERATION
transform = T5_BASE_GENERATION.transform()

In [10]:
from torchtext.models import T5_BASE_GENERATION


t5_base = T5_BASE_GENERATION
transform = t5_base.transform()
model = t5_base.get_model()
model.eval()

Downloading: "https://download.pytorch.org/models/text/t5.base.generation.v2.pt" to C:\Users\user/.cache\torch\hub\checkpoints\t5.base.generation.v2.pt
100%|██████████| 945M/945M [00:32<00:00, 30.7MB/s]  


T5Model(
  (token_embeddings): Embedding(32128, 768, padding_idx=0)
  (encoder): T5Encoder(
    (token_embeddings): Embedding(32128, 768, padding_idx=0)
    (layers): ModuleList(
      (0): T5Layer(
        (self_attn): T5MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=False)
          (relative_attention_bias): Embedding(32, 12)
        )
        (linear1): Linear(in_features=768, out_features=3072, bias=False)
        (linear2): Linear(in_features=3072, out_features=768, bias=False)
        (norm1): T5LayerNorm()
        (norm2): T5LayerNorm()
        (dropout1): Dropout(p=0.0, inplace=False)
        (dropout2): Dropout(p=0.0, inplace=False)
        (dropout3): Dropout(p=0.0, inplace=False)
      )
      (1-11): 11 x T5Layer(
        (self_attn): T5MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=False)
        )
        (linear1): Linear(in_features=768,

In [11]:
from torchtext.prototype.generate import GenerationUtils

sequence_generator = GenerationUtils(model)

In [5]:
imdb_datapipe = IMDB(split="test")

imdb_batch_size = 3
task = "sst2 sentence"
labels = {"1": "negative", "2": "positive"}


def process_labels(labels, x):
    return x[1], labels[str(x[0])]

def apply_prefix(task, x):
    return f"{task}: " + x[0], x[1] 

In [6]:
imdb_datapipe = imdb_datapipe.map(partial(process_labels, labels))
imdb_datapipe = imdb_datapipe.map(partial(apply_prefix, task))
imdb_datapipe = imdb_datapipe.batch(imdb_batch_size)
imdb_datapipe = imdb_datapipe.rows2columnar(["text", "label"])
imdb_dataloader = DataLoader(imdb_datapipe, batch_size=None)

In [7]:
for item in imdb_datapipe:
    print(item)
    break

defaultdict(<class 'list'>, {'text': ['sst2 sentence: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch

In [15]:
batch = next(iter(imdb_dataloader))
input_text = batch["text"]
target = batch["label"]
beam_size = 1

model_input = transform(input_text)
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

for i in range(imdb_batch_size):
    print(f"Example {i+1}:\n")
    print(f"input_text: {input_text[i]}\n")
    print(f"prediction: {output_text[i]}\n")
    print(f"target: {target[i]}\n\n")

Example 1:

input_text: sst2 sentence: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth 

In [105]:
model_input = transform(['sst2 sentence: WTO reaches agreement and extends tariff exemption for e-commerce for another two years'])
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

In [106]:
output_text

['positive']

In [69]:
from torchtext.datasets import Multi30k

multi_batch_size = 5
language_pair = ("en", "de")
multi_datapipe = Multi30k(split="train", language_pair=language_pair)
task = "translate English to Portuguese"

multi_datapipe = multi_datapipe.map(partial(apply_prefix, task))
multi_datapipe = multi_datapipe.batch(multi_batch_size)
multi_datapipe = multi_datapipe.rows2columnar(["english", "german"])
multi_dataloader = DataLoader(multi_datapipe, batch_size=None)

In [70]:
batch = next(iter(multi_dataloader))
input_text = batch["english"]
target = batch["german"]

model_input = transform(input_text)
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

for i in range(multi_batch_size):
    print(f"Example {i+1}:\n")
    print(f"input_text: {input_text[i]}\n")
    print(f"prediction: {output_text[i]}\n")
    print(f"target: {target[i]}\n\n")

Example 1:

input_text: translate English to Portuguese: Two young, White males are outside near many bushes.

prediction: Zwei junge, weiße Männchen sind draußen in der Nähe vieler Büsche.

target: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.


Example 2:

input_text: translate English to Portuguese: Several men in hard hats are operating a giant pulley system.

prediction: Mehrere Männer mit harten Hüten betreiben ein riesiges Zugsystem.

target: Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.


Example 3:

input_text: translate English to Portuguese: A little girl climbing into a wooden playhouse.

prediction: Ein kleines Mädchen klettert in ein Holzspielhaus.

target: Ein kleines Mädchen klettert in ein Spielhaus aus Holz.


Example 4:

input_text: translate English to Portuguese: A man in a blue shirt is standing on a ladder cleaning a window.

prediction: Ein Mann in einem blauen Hemd steht auf einer Leiter, die ein Fenster säubert.

target: E

In [89]:
model_input = transform(['translate Geman to English: I love you'])
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

In [90]:
output_text

['Ich liebe dich']