In [5]:
import torch
import os
import time
from torch import optim
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchdata import datapipes as dp
from torchtext.datasets import (
    IMDB,
    DBpedia,
    CC100,
    PennTreebank,
    AG_NEWS,
    YahooAnswers,
    SQuAD2,
    SST2
)
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import tarfile
from functools import partial
from torchtext.models import (
    T5Transform,
    FLAN_T5_XL_GENERATION,
    T5_BASE_GENERATION,
    RobertaClassificationHead
)
from transformers import T5Tokenizer
from torchtext.prototype.generate import GenerationUtils

In [2]:
from t5_pytorch import T5
# importing the T5 model that has been built by hand

In [19]:
model_params={
    "MODEL":"t5-base",             # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE":8,          # training batch size
    "VALID_BATCH_SIZE":8,          # validation batch size
    "TRAIN_EPOCHS":3,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":50,   # max length of target text
    "SEED": 42                     # set seed for reproducibility 
}

In [20]:
model = T5(
    dim=768,
    enc_num_tokens=model_params['MAX_SOURCE_TEXT_LENGTH'],
    enc_depth=6,
    enc_heads=12,
    enc_dim_head=64,
    enc_mlp_mult=4,
    dec_num_tokens=model_params['MAX_TARGET_TEXT_LENGTH'],
    dec_depth=6,
    dec_heads=12,
    dec_dim_head=64,
    dec_mlp_mult=4,
    dropout=0.,
    tie_token_emb=True
)

In [21]:
# calling the tokeniser from_pretrained directly works.

tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path='t5-base',
    model_max_length=model_params['MAX_SOURCE_TEXT_LENGTH']
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
file_loc = "D:\\gitFolders\\pytorch_hardway\\data\\news_summary"

In [23]:
def embed_sentence(sentence):
    source = tokenizer.batch_encode_plus([sentence[1]],
                                         max_length=model_params['MAX_SOURCE_TEXT_LENGTH'],
                                         pad_to_max_length=True,
                                         truncation=True,
                                         padding="max_length",
                                         return_tensors='pt')
    target = tokenizer.batch_encode_plus([sentence[0]],
                                         max_length=model_params['MAX_TARGET_TEXT_LENGTH'],
                                         pad_to_max_length=True,
                                         truncation=True,
                                         padding="max_length",
                                         return_tensors='pt')
    source_ids = source['input_ids'].squeeze()
    target_ids = target['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()
    target_mask = target['attention_mask'].squeeze()
    return source_ids, source_mask, target_ids, target_mask

In [24]:
from torchdata import datapipes as dp
from torch.utils.data import DataLoader

# when the file contains lot of rows, the pipeline takes more time to process, even to load the text and 
# provide text data

news_file = dp.iter.FileLister(file_loc)
news_open = dp.iter.FileOpener(news_file, mode='r', encoding='utf-8')
news_parser = news_open.parse_csv(delimiter=',')
news_embed = news_parser.map(embed_sentence)
news_batch = news_embed.batch(8)
news_loader = DataLoader(news_batch, batch_size=None)

In [25]:
news_columns = news_batch.rows2columnar(['source_ids', 'source_mask', 'target_ids', 'target_mask'])
columns_loader = DataLoader(news_columns, batch_size=None)
test_batch = next(iter(columns_loader))

In [28]:
source = test_batch['source_ids'][1]
source_mask = test_batch['source_mask'][1]
target = test_batch['target_ids'][1]

In [29]:
print(source.shape)
print(source_mask.shape)
print(target.shape)

torch.Size([512])
torch.Size([512])
torch.Size([50])


In [30]:
output = model(source, target, source_mask)

IndexError: index out of range in self

In [4]:
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))

output = model(src, tgt, mask=src_mask)
print(output.shape)

torch.Size([1, 1024, 512])


In [None]:
ptree_train, ptree_valid, ptree_test = PennTreebank(split=('train', 'valid', 'test'))
print(list(ptree_train)[0])
print(list(ptree_test)[0])
print(list(ptree_valid)[0])
# data is all single sentences, useful for language modelling only

In [None]:
sst2_train, sst2_dev, sst2_valid = SST2(split=('train','dev','test'),)

In [None]:
print(list(sst2_dev)[9])
print(list(sst2_train)[9])
print(list(sst2_valid)[9])

In [None]:
train_sq2, test_sq2 = SQuAD2(split=('train', 'dev'))
task = 'summarize'

In [None]:
list(test_sq2)[0]

In [None]:
def apply_prefix(task, x):
    """The function removes 2 columns and returns a processed tuple""" 
    return f"{task}: " + x[0], x[1]

In [None]:
train_sq2 = train_sq2.map(partial(apply_prefix, task))  # A partial is created with apply_prefix, and for 
# the next variable, data is taken from pipe and applied
# partial(apply_prefix, task) return a function to which data_point 'x', a tuple is applied
test_sq2 = test_sq2.map(partial(apply_prefix, task))

In [None]:
x = list(train_sq2)[2]
x

In [None]:
train_sq2_batch = train_sq2.batch(8)
test_sq2_batch = test_sq2.batch(8)

In [None]:
x = list(train_sq2_batch)
x[0]

In [None]:
test_sq2_rws = test_sq2_batch.rows2columnar(["explanation", "question"])
train_sq2_rws = train_sq2_batch.rows2columnar(["explanation", "question"])

In [None]:
list(train_sq2_rws)[0]

In [None]:
test_sq2_dataloader = DataLoader(test_sq2_rws, shuffle=True, batch_size=None)
train_sq2_dataloader = DataLoader(train_sq2_rws, shuffle=True, batch_size=None)

In [None]:
train_sq2_iter = iter(train_sq2_dataloader)
next(train_sq2_iter)  # Converts into the iterator that can be used for training

In [None]:
text_batch = next(train_sq2_iter)
text = text_batch['explanation']

In [None]:
# As the data is ready for feeding, we can get the model
padding_idx = 0
eos_idx = 1
max_seq_len = 512
t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model"

transform = T5Transform(
    sp_model_path=t5_sp_model_path,
    max_seq_len=max_seq_len,
    eos_idx=eos_idx,
    padding_idx=padding_idx
)

In [None]:
transform("this is a test sentence")  # tensor([  48,   19,    3,    9,  794, 7142,    1])

In [None]:
t5_base = T5_BASE_GENERATION
transform = t5_base.transform()
model = t5_base.get_model()  # model has to be in the .cache/torch/hub/checkpoints/ 
model.eval()

In [None]:
model_input = transform(text)
model_input

In [None]:
from torchtext.prototype.generate import GenerationUtils

sequence_generator = GenerationUtils(model)

In [None]:
for s in text:
    print(s)
    print(len(s))

In [None]:

beam_size = 1
model_output = sequence_generator.generate(model_input,
                                           eos_idx=eos_idx,
                                           num_beams=beam_size)

In [None]:
output_text = transform.decode(model_output.tolist())

In [None]:
for s in output_text:
    print(s)
    print(len(s))

In [None]:
train_imdb, test_imdb = IMDB(split=('train','test'))

In [None]:
list(train_imdb)[0]

In [None]:
lab_class = {
    1:"positive",
    2:"negative"
}

def label_class(point):
    return 'sst2 sentence ' + point[1], lab_class[point[0]]


In [None]:
imdb_class_train = train_imdb.map(label_class)
imdb_class_test = test_imdb.map(label_class)

In [None]:
list(imdb_class_test)[0]

In [None]:
imdb_batch_train = imdb_class_train.batch(8)
imdb_batch_test = imdb_class_test.batch(8)

In [None]:
imdb_train_rws = imdb_batch_train.rows2columnar(['statement', 'class'])
imdb_test_rws = imdb_batch_test.rows2columnar(['statement', 'class'])

In [None]:
imdb_loader_train = DataLoader(imdb_train_rws, batch_size=None,)
imdb_loader_test = DataLoader(imdb_train_rws, batch_size=None,)

In [None]:
imdb_text_data = next(iter(imdb_loader_train))
classify_text = imdb_text_data['statement']

In [None]:
text = imdb_text_data['statement']
model_input = transform(text)
model_input = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=1)

In [None]:
output_text = transform.decode(model_output.tolist())

In [None]:
for i in range(8):
    print(f"Example {i + 1}: \n")
    print(f"Input_text: {text[i]}\n")
    print(f"Output_text: {output_text[i]}\n")

In [None]:
# Loading the T5 base encoder model
from torchtext.models import T5_BASE_ENCODER
t5_encoder_base = T5_BASE_ENCODER
t5_encoder_transform = t5_encoder_base.transform()
input_seq = ['Hello there', 'Yo where is attention']
model_input = t5_encoder_transform(input_seq)
model_input

In [None]:
t5_b_enc_model = t5_encoder_base.get_model()

In [None]:
t5_b_enc_output = t5_b_enc_model(model_input)
t5_b_enc_output.keys()

In [None]:
from torchtext.models import T5_BASE
t5_base = T5_BASE
t5_transform = t5_base.transform()
in_seq = ["Hello Seq", "Attention rocks"]
mod_input = t5_transform(in_seq)
mod_input

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def apply_sst_transform(point):
    return t5_transform(point[0]), point[1]

In [None]:
batch_size = 16

train_dp = sst2_train.map(apply_sst_transform)
train_dp = train_dp.batch(batch_size)
train_dp = train_dp.rows2columnar(['token_ids', 'target'])
train_dl = DataLoader(train_dp, batch_size=None)

dev_dp = sst2_dev.map(apply_sst_transform)
dev_dp = dev_dp.batch(batch_size)
dev_dp = dev_dp.rows2columnar(['token_ids', 'target'])
dev_dl = DataLoader(dev_dp, batch_size=None)

In [None]:
test_text = next(iter(train_dl))
test_text['token_ids']

In [None]:
num_classes = 2
input_dim = 768

classifier_head = RobertaClassificationHead(num_classes=num_classes,
                                            input_dim=input_dim)
t5_model = t5_base.get_model()  # Unable to load the special classification on t5 models
t5_model = t5_model.to(device)

In [None]:
t5_model(test_text['token_ids'][0])  # This wil throw error, as we need to provide a masked data for T5 model to work.

In [None]:
from torchtext import functional as F
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
learn_rate = 1e-5
optimiser = AdamW(t5_model.parameters(), lr=learn_rate)
criteria = CrossEntropyLoss()

In [None]:
def train_step(input, target):
    output = t5_model(input)  # get prediction 
    loss = criteria(output, target)  # get loss, !! wont this error out due to device mismatch? Nope, its taken care
    optimiser.zero_grad()  # optimiser zeroing gradient
    loss.backward()  # back prop
    optimiser.step()  # updating model params


def eval_step(input, target):
    output = t5_model(input)  # getting pred
    loss = criteria(output, target).item()  # getting loss
    return float(loss), (output.argmax(1) == target).type(torch.float).sum().item()
    # return the loss, along with the predicted output


def evaluate():
    t5_model.eval() # push model to eval mode
    tot_loss = 0
    correct_pred = 0
    tot_pred = 0
    counter = 0
    # declare supporting variables
    with torch.no_grad():
        for batch in dev_dl:
            input = F.to_tensor(batch["token_ids"], padding_value=1).to(device)
            target = torch.tensor(batch['target']).to(device)
            loss, preds = eval_step(input, target)
            total_loss += loss
            correct_pred += preds
            tot_pred += len(target)
            counter += 1

    return total_loss / counter, correct_pred / tot_pred

In [None]:
num_epochs = 1

for e in range(num_epochs):
    for batch in train_dl:
        input = batch['token_ids']
        target = F.to_tensor(batch['target']).to(device)
        train_step(input, target)

    loss, acc = evaluate()
    print(f"Epoch = {e}, loss = {loss}, accuracy = {acc}")

In [None]:
from torchtext.models import T5Conf, T5Bundle

#T5Conf is a Dataclass

encoder_conf = T5Conf(encoder_only=True)
model_checkpoint="C:\\Users\\kamal\\.cache\\torch\\hub\\checkpoint\\t5.base.encoder.v2.pt"
model = T5Bundle.build_model(config=encoder_conf, checkpoint=model_checkpoint)

In [None]:
print(encoder_conf)