In [None]:
import pandas as pd
# Read the raw CSV and keep only the rows without any missing value.
df = pd.read_csv("./dataset.csv").dropna()

In [None]:
# Preview the first rows of the loaded dataframe (rendered as the cell output).
df.head()

In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [None]:
# Setting up the device for GPU usage
from torch import cuda
# Use the GPU whenever CUDA reports one as available; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def pad_ids_with_mask(ids, max_length=-1, pad_id=0):
    """Pad or truncate a token-id sequence to a fixed length and build its mask.

    Args:
        ids: Sequence of token ids. It is never modified; new lists are
            always returned.
        max_length: Target length. ``-1`` (the default) disables both padding
            and truncation. Any other negative value is rejected.
        pad_id: Id appended to sequences shorter than ``max_length``.

    Returns:
        A ``(ids, mask)`` tuple of two equal-length lists. ``mask`` holds 1 for
        every kept input id and 0 for every padding position.

    Raises:
        ValueError: If ``max_length`` is negative and not ``-1``.
    """
    # Copy up front so that callers extending the result in place (``+=``)
    # can never mutate the sequence they passed in.
    ids = list(ids)
    if max_length == -1:
        return ids, [1] * len(ids)
    if max_length < 0:
        # A negative slice bound would silently drop tokens from the end while
        # the mask came back empty, leaving ids and mask with different lengths.
        raise ValueError(f"max_length must be -1 or non-negative, got {max_length}")

    kept = ids[:max_length]
    pad_len = max_length - len(kept)
    return kept + [pad_id] * pad_len, [1] * len(kept) + [0] * pad_len

In [None]:
class T5Dataset(Dataset):
  """
  Map-style dataset that converts dataframe rows into fixed-length tensors
  for fine-tuning a T5 model.

  Every row must provide the columns ``sentence_1``, ``sentence_2``,
  ``source`` and ``target``; each value is converted with ``str()``.

  Source layout: marker, sentence_1 ids, marker, sentence_2 ids, marker,
  source ids, EOS, padding. Target layout: target ids, EOS, padding.
  EOS is always placed directly after the last real token (before any
  padding), and truncation keeps one slot free for it.
  """

  # Marker ids inserted in front of the three source segments; the first is
  # used for even positions, the second for odd positions.
  # NOTE(review): presumably sentinel tokens at the top of the mengzi-t5
  # vocabulary -- confirm against the tokenizer in use.
  SEGMENT_MARKER_IDS = (32127, 32126)

  def __init__(self, dataframe, tokenizer, max_source_len, max_target_len):
    """
    Args:
      dataframe: pandas DataFrame holding the four text columns.
      tokenizer: object exposing ``encode(text, add_special_tokens=False)``;
        its ``pad_token_id`` / ``eos_token_id`` are used when defined.
      max_source_len: length of every encoded source sequence (EOS included).
      max_target_len: length of every encoded target sequence (EOS included).
    """
    self.tokenizer = tokenizer
    self.data = dataframe
    self.max_source_len = max_source_len
    self.max_target_len = max_target_len

    # Take the special ids from the tokenizer so padding here agrees with
    # code that masks labels via ``tokenizer.pad_token_id``. Fall back to the
    # values this dataset previously hard-coded (pad=0, eos=1).
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    eos_id = getattr(tokenizer, 'eos_token_id', None)
    self.pad_id = 0 if pad_id is None else pad_id
    self.eos_id = 1 if eos_id is None else eos_id

  def __len__(self):
    """Number of rows in the underlying dataframe."""
    return len(self.data)

  def _close_and_pad(self, ids, max_len):
    """Append EOS right after the real tokens, then pad/truncate to ``max_len``."""
    ids = list(ids)
    if max_len > 0:
      # Reserve the last slot so that a truncated sequence still ends in EOS.
      ids = ids[:max_len - 1]
    ids.append(self.eos_id)
    return pad_ids_with_mask(ids, max_length=max_len, pad_id=self.pad_id)

  def __getitem__(self, index):
    """
    Encode row ``index``.

    Returns:
      dict of ``torch.long`` tensors with keys ``source_ids``,
      ``source_id_mask``, ``target_ids`` and ``target_id_mask``.
    """
    row = self.data.iloc[index]
    segments = (str(row['sentence_1']), str(row['sentence_2']), str(row['source']))

    source_ids = []
    for position, text in enumerate(segments):
      source_ids.append(self.SEGMENT_MARKER_IDS[position % 2])
      source_ids.extend(self.tokenizer.encode(text, add_special_tokens=False))
    source_ids, source_id_mask = self._close_and_pad(source_ids, self.max_source_len)

    target_ids = self.tokenizer.encode(str(row['target']), add_special_tokens=False)
    target_ids, target_id_mask = self._close_and_pad(target_ids, self.max_target_len)

    return {
        'source_ids': torch.tensor(source_ids, dtype=torch.long),
        'source_id_mask': torch.tensor(source_id_mask, dtype=torch.long),
        'target_ids': torch.tensor(target_ids, dtype=torch.long),
        'target_id_mask': torch.tensor(target_id_mask, dtype=torch.long)
    }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
  """
  Run one pass over ``loader``, taking an optimizer step after every batch.

  Args:
    epoch: index of the current epoch; kept for call-site compatibility and
      not used inside the function.
    tokenizer: provides ``pad_token_id``, used to find padded label positions.
    model: called with ``input_ids``, ``attention_mask`` and ``labels``; the
      first element of its output is taken as the loss.
    device: device the batch tensors are moved to.
    loader: iterable of batches (dicts) with the keys ``source_ids``,
      ``source_id_mask`` and ``target_ids``.
    optimizer: optimizer holding the model parameters.

  The loss of every 10th batch (starting with the first) is printed.
  """
  model.train()

  for step, data in enumerate(loader):
    ids = data['source_ids'].to(device, dtype=torch.long)
    mask = data['source_id_mask'].to(device, dtype=torch.long)

    # Use the complete target as labels and let the model build the decoder
    # inputs from them. Shifting manually (targets[:-1] as decoder input,
    # targets[1:] as labels) made the first target token the decoder start
    # token, so it was never a prediction target.
    # ``clone`` keeps the in-place masking below from touching the batch.
    labels = data['target_ids'].to(device, dtype=torch.long).clone()
    # -100 marks positions that must not contribute to the loss.
    labels[labels == tokenizer.pad_token_id] = -100

    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
    loss = outputs[0]

    if step % 10 == 0:
      print(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
def T5Trainer(dataframe, model_params, output_dir="./outputs/"):
  """
  Fine-tune a T5 checkpoint on ``dataframe`` and save the result.

  Args:
    dataframe: training rows; its index is reset before it is wrapped in a
      ``T5Dataset``.
    model_params: dict read for the keys ``SEED``, ``MODEL``,
      ``MAX_SOURCE_TEXT_LENGTH``, ``MAX_TARGET_TEXT_LENGTH``,
      ``TRAIN_BATCH_SIZE``, ``LEARNING_RATE`` and ``TRAIN_EPOCHS``.
    output_dir: directory handed to ``save_pretrained`` for both the model
      and the tokenizer.

  The model is placed on the module-level ``device``.
  """
  # Seed torch and numpy and request deterministic cuDNN kernels.
  seed = model_params["SEED"]
  torch.manual_seed(seed)
  np.random.seed(seed)
  torch.backends.cudnn.deterministic = True

  checkpoint = model_params["MODEL"]
  print(f"[Model]: Loading {checkpoint}...\n")

  # Tokenizer and model both come from the same checkpoint.
  tokenizer = T5Tokenizer.from_pretrained(checkpoint)
  model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)

  # Wrap the (re-indexed) dataframe in the dataset and a shuffling loader.
  training_set = T5Dataset(
    dataframe.reset_index(drop=True),
    tokenizer,
    model_params["MAX_SOURCE_TEXT_LENGTH"],
    model_params["MAX_TARGET_TEXT_LENGTH"],
  )
  training_loader = DataLoader(
    training_set,
    batch_size=model_params["TRAIN_BATCH_SIZE"],
    shuffle=True,
    num_workers=0,
  )

  # Adam over all model parameters.
  optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=model_params["LEARNING_RATE"],
  )

  print('[Initiating Fine Tuning]...\n')
  num_epochs = model_params["TRAIN_EPOCHS"]
  for epoch in range(num_epochs):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

  # Persist the fine-tuned weights together with the tokenizer files.
  print("[Saving Model]...\n")
  model.save_pretrained(output_dir)
  tokenizer.save_pretrained(output_dir)

In [None]:
# Settings consumed by T5Trainer. VALID_BATCH_SIZE and VAL_EPOCHS are not read
# by the T5Trainer defined in this notebook.
model_params = {
    # checkpoint name or path passed to from_pretrained
    "MODEL": "./mengzi-t5-base",
    # training batch size
    "TRAIN_BATCH_SIZE": 8,
    # validation batch size
    "VALID_BATCH_SIZE": 8,
    # number of training epochs
    "TRAIN_EPOCHS": 3,
    # number of validation epochs
    "VAL_EPOCHS": 1,
    # optimizer learning rate
    "LEARNING_RATE": 1e-4,
    # max length of the encoded source text
    "MAX_SOURCE_TEXT_LENGTH": 128,
    # max length of the encoded target text
    "MAX_TARGET_TEXT_LENGTH": 64,
    # seed for reproducibility
    "SEED": 42,
}

In [None]:
# Fine-tune on the loaded dataframe; the model and tokenizer are saved to "outputs".
T5Trainer(dataframe=df, model_params=model_params, output_dir="outputs")

In [None]:
# Load the tokenizer from the local base checkpoint directory.
# NOTE(review): T5Trainer above also saves a tokenizer into "outputs" -- confirm
# that loading from the base checkpoint (not the saved copy) is intended here.
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('./mengzi-t5-base/')