## Installs and imports

In [1]:
%%capture
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]
!pip install torchmetrics==0.6

In [2]:
# Importing libraries
import gc
import os

import numpy as np
import pandas as pd
import torch

from rich import box
from rich.console import Console
from rich.table import Column, Table

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
%cd /content
!rm -r question-answering
!git clone https://github.com/michimichiamo/question-answering/
%cd question-answering

/content
rm: cannot remove 'question-answering': No such file or directory
Cloning into 'question-answering'...
remote: Enumerating objects: 290, done.[K
remote: Counting objects: 100% (277/277), done.[K
remote: Compressing objects: 100% (222/222), done.[K
remote: Total 290 (delta 147), reused 145 (delta 55), pack-reused 13[K
Receiving objects: 100% (290/290), 96.19 MiB | 10.51 MiB/s, done.
Resolving deltas: 100% (150/150), done.
Checking out files: 100% (27/27), done.
/content/question-answering


In [4]:
from util.model import read_npz

# Load the train and validation splits for the 'QG' task from
# ./data/tokenized-qg/ (relative to the current working directory).
tr_ids, tr_contexts, tr_attention_masks, tr_questions = read_npz(
    path='./data/tokenized-qg/',
    split='train',
    task='QG',
)
val_ids, val_contexts, val_attention_masks, val_questions = read_npz(
    path='./data/tokenized-qg/',
    split='val',
    task='QG',
)

In [5]:
# Module-level rich console used for all logging in this notebook.
# record=True is set so the output can later be exported with save_text().
console = Console(record=True)

def display_df(df):
    """Print the first two columns of ``df`` as an ASCII-boxed table."""

    # Local console: inside this function it shadows the module-level `console`.
    console = Console()

    source_column = Column("source_text", justify="center")
    target_column = Column("target_text", justify="center")
    table = Table(
        source_column,
        target_column,
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    # Only positions 0 and 1 of every row are shown; further columns are ignored.
    for row in df.values.tolist():
        table.add_row(row[0], row[1])

    console.print(table)

# Progress table for training: one row (epoch, step, loss) is appended per
# logged step and the whole table is re-printed each time.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)


In [6]:
# Pick the device once: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
#class YourDataSetClass(torch.utils.data.Dataset):
#  """
#  Creating a custom dataset for reading the dataset and 
#  loading it into the dataloader to pass it to the neural network for finetuning the model
#
#  """
#
#  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
#    self.tokenizer = tokenizer
#    self.data = dataframe
#    self.source_len = source_len
#    self.summ_len = target_len
#    self.target_text = self.data[target_text]
#    self.source_text = self.data[source_text]
#
#  def __len__(self):
#    return len(self.target_text)
#
#  def __getitem__(self, index):
#    source_text = str(self.source_text[index])
#    target_text = str(self.target_text[index])
#
#    #cleaning data so as to ensure data is in string type
#    source_text = ' '.join(source_text.split())
#    target_text = ' '.join(target_text.split())
#
#    source = self.tokenizer.batch_encode_plus([source_text], max_length= self.source_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
#    target = self.tokenizer.batch_encode_plus([target_text], max_length= self.summ_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
#
#    source_ids = source['input_ids'].squeeze()
#    source_mask = source['attention_mask'].squeeze()
#    target_ids = target['input_ids'].squeeze()
#    target_mask = target['attention_mask'].squeeze()
#
#    return {
#        'source_ids': source_ids.to(dtype=torch.long), 
#        'source_mask': source_mask.to(dtype=torch.long), 
#        'target_ids': target_ids.to(dtype=torch.long),
#        'target_ids_y': target_ids.to(dtype=torch.long)
#    }

In [8]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over four index-aligned sequences of pre-tokenized samples."""

    def __init__(self, ids, contexts, attention_masks, questions):
        """Store the four sequences as given; nothing is copied or converted here."""
        self.ids, self.contexts = ids, contexts
        self.attention_masks, self.questions = attention_masks, questions

    def __len__(self):
        """Return the number of samples, measured on ``contexts``."""
        return len(self.contexts)

    def __getitem__(self, index):
        """Return ``((id, context, attention_mask), question)`` for ``index``.

        The id is passed through unchanged; the other three entries are
        converted to ``torch.int32`` tensors.
        """
        sample_id = self.ids[index]
        context, attention_mask, question = (
            torch.tensor(field[index], dtype=torch.int32)
            for field in (self.contexts, self.attention_masks, self.questions)
        )

        # Input is the (id, context, mask) triple, target is the question.
        return (sample_id, context, attention_mask), question

In [9]:
def train(model, optimizer, epoch, loader, pad_token, device):
    """
    Run one pass over ``loader``, updating ``model`` after every batch.

    Args:
        model: module called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; element 0 of its output is
            used as the loss.
        optimizer: optimizer whose gradients are reset and which is stepped
            once per batch.
        epoch: epoch number, used only in the progress table.
        loader: iterable of ``((ids, context, attention_mask), question)``
            batches; ``ids`` is ignored here.
        pad_token: token id whose label positions are replaced by -100.
        device: device the batch tensors are moved to.

    Every 10th batch a row is appended to the module-level
    ``training_logger`` table and the table is printed on ``console``.
    """

    model.train()
    for iteration, (X, y) in enumerate(loader, 0):
        # Unpack input (the sample ids are not needed for training)
        _, context, attention_mask = X

        # Prepare input
        context = context.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)

        # Prepare target: the decoder is fed tokens [0, n-1) and the labels
        # are tokens [1, n), i.e. the same sequence shifted by one position.
        # NOTE(review): with this manual shift the first question token is
        # only ever an input, never a label. T5 can build decoder inputs from
        # `labels` by itself - confirm how the questions were tokenized.
        question = y.to(device, dtype=torch.long)
        question_ids = question[:, :-1].contiguous()
        lm_labels = question[:, 1:].clone().detach()
        # -100 marks label positions the loss should skip (padding here).
        lm_labels[lm_labels == pad_token] = -100

        outputs = model(input_ids=context, attention_mask=attention_mask,
                        decoder_input_ids=question_ids, labels=lm_labels)
        loss = outputs[0]

        if iteration % 10 == 0:
            # Log the scalar value: str(loss) would put the whole tensor repr
            # (device, grad_fn, ...) into the Loss column.
            training_logger.add_row(str(epoch), str(iteration), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [10]:
def validate(model, epoch, loader, tokenizer, device):
    """
    Generate text for every batch in ``loader`` and decode it alongside the targets.

    ``epoch`` is accepted but not used. Returns ``(predictions, actuals)``,
    two flat lists of decoded strings in batch order. Progress is printed on
    the module-level ``console`` every 10th batch.
    """

    def to_text(token_rows):
        # Decode each row of token ids, dropping special tokens.
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    # Fixed generation settings used for every batch.
    generation_options = dict(
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for iteration, (X, y) in enumerate(loader, 0):
            # The sample ids in the first slot are not needed here
            _, context, attention_mask = X

            question = y.to(device, dtype=torch.long)
            context = context.to(device, dtype=torch.long)
            attention_mask = attention_mask.to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=context,
                attention_mask=attention_mask,
                **generation_options,
            )
            preds = to_text(generated_ids)
            target = to_text(question)

            if iteration % 10 == 0:
                console.print(f'Completed {iteration}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [11]:
#def T5Trainer(train_dataset, val_dataset, model_params, output_dir="./outputs/" ):
#    
#    """
#    T5 trainer
#
#    """
#
#    # Set random seeds and deterministic pytorch for reproducibility
#    torch.manual_seed(model_params["SEED"]) # pytorch random seed
#    np.random.seed(model_params["SEED"]) # numpy random seed
#    torch.backends.cudnn.deterministic = True
#
#    # logging
#    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")
#
#    # tokenizer for encoding the text
#    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
#
#    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
#    # Further this model is sent to device (GPU/TPU) for using the hardware.
#    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
#    model = model.to(device)
#    
#    # logging
#    console.log(f"[Data]: Reading data...\n")
#
#    # Defining the parameters for creation of dataloaders
#    train_params = {
#        'batch_size': model_params["TRAIN_BATCH_SIZE"],
#        'shuffle': True,
#        'num_workers': 0
#        }
#
#
#    val_params = {
#        'batch_size': model_params["VALID_BATCH_SIZE"],
#        'shuffle': False,
#        'num_workers': 0
#        }
#
#
#    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
#    training_loader = torch.utils.data.DataLoader(train_dataset, **train_params)
#    val_loader = torch.utils.data.DataLoader(val_dataset, **val_params)
#
#
#    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
#    optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])
#
#
#    # Training loop
#    console.log(f'[Initiating Fine Tuning]...\n')
#
#    for epoch in range(model_params["TRAIN_EPOCHS"]):
#        train(epoch, tokenizer, model, device, training_loader, optimizer)
#        
#    console.log(f"[Saving Model]...\n")
#    #Saving the model after training
#    path = os.path.join(output_dir, "model_files")
#    model.save_pretrained(path)
#    tokenizer.save_pretrained(path)
#
#
#    # evaluating test dataset
#    console.log(f"[Initiating Validation]...\n")
#    for epoch in range(model_params["VAL_EPOCHS"]):
#        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
#        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
#        final_df.to_csv(os.path.join(output_dir,'predictions.csv'))
#  
#    console.save_text(os.path.join(output_dir,'logs.txt'))
#    
#    console.log(f"[Validation Completed.]\n")
#    console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
#    console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
#    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

In [11]:
def define_model(model_params):
    """
    Seed the RNGs, then load the tokenizer and model and create the optimizer.

    Reads ``SEED``, ``MODEL`` and ``LEARNING_RATE`` from ``model_params`` and
    returns ``(model, optimizer, tokenizer)``. The model is moved to the
    module-level ``device``.
    """
    # Reproducibility: seed torch and numpy, and request deterministic cuDNN
    seed = model_params["SEED"]
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    checkpoint = model_params["MODEL"]
    console.log(f"[Model]: Loading {checkpoint}...\n")

    # Tokenizer and conditional-generation model come from the same checkpoint
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)

    # Adam over all model parameters with the configured learning rate
    learning_rate = model_params["LEARNING_RATE"]
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    return model, optimizer, tokenizer

def prepare_data(train_dataset, val_dataset, model_params):
    """
    Wrap both datasets in DataLoaders and return ``(train_loader, val_loader)``.

    Batch sizes come from ``TRAIN_BATCH_SIZE`` and ``VALID_BATCH_SIZE`` in
    ``model_params``. Only the training loader shuffles.
    """
    console.log("[Data]: Reading data...\n")

    train_batch_size = model_params["TRAIN_BATCH_SIZE"]
    val_batch_size = model_params["VALID_BATCH_SIZE"]

    # Options common to both loaders: load in the main process, pinned memory
    shared_options = {'num_workers': 0, 'pin_memory': True}

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
        **shared_options,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        shuffle=False,
        **shared_options,
    )

    return train_loader, val_loader


def training_loop(model, optimizer, tokenizer, model_params,
                  train_loader, val_loader, save=False, output_dir="./outputs/"):

    """
    Fine-tune the model, optionally save it, then validate and write the results.

    Args:
        model, optimizer, tokenizer: objects passed on to ``train`` / ``validate``;
            ``tokenizer.pad_token_id`` is used as the padding id during training.
        model_params: dict providing ``TRAIN_EPOCHS`` and ``VAL_EPOCHS``.
        train_loader, val_loader: loaders for the two phases.
        save: when true, model and tokenizer are saved under
            ``<output_dir>/model_files``.
        output_dir: directory receiving ``predictions.csv`` and ``logs.txt``
            (always written) and the saved model (only when ``save``). It is
            created if it does not exist.
    """

    # predictions.csv and logs.txt are written even when save=False, so the
    # directory has to exist before the (long) training starts.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model_dir = os.path.join(output_dir, "model_files")
    predictions_path = os.path.join(output_dir, 'predictions.csv')
    logs_path = os.path.join(output_dir, 'logs.txt')

    # Training loop
    console.log(f'[Initiating Fine Tuning]...\n')

    pad_token = tokenizer.pad_token_id

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(model, optimizer, epoch, train_loader, pad_token, device)

    # Saving the model after training
    if save:
        console.log(f"[Saving Model]...\n")
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

    # Evaluation; each validation epoch overwrites predictions.csv
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(model, epoch, val_loader, tokenizer, device)
        final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
        final_df.to_csv(predictions_path)

    console.save_text(logs_path)

    console.log(f"[Validation Completed.]\n")
    # Only claim the model was saved when it actually was.
    if save:
        console.print(f"""[Model] Model saved @ {model_dir}\n""")
    console.print(f"""[Validation] Generation on Validation data saved @ {predictions_path}\n""")
    console.print(f"""[Logs] Logs saved @ {logs_path}\n""")

In [12]:
# Run configuration read by define_model, prepare_data and training_loop.
model_params = dict(
    MODEL="t5-small",              # checkpoint name (e.g. t5-small / t5-base / t5-large)
    TRAIN_BATCH_SIZE=8,            # training batch size
    VALID_BATCH_SIZE=8,            # validation batch size
    TRAIN_EPOCHS=3,                # number of training epochs
    VAL_EPOCHS=1,                  # number of validation epochs
    LEARNING_RATE=1e-4,            # learning rate
    MAX_SOURCE_TEXT_LENGTH=512,    # max length of source text
    MAX_TARGET_TEXT_LENGTH=50,     # max length of target text
    SEED=42,                       # set seed for reproducibility
)

In [13]:
# Number of samples to keep from each split. None keeps every sample; set a
# positive integer to work on a smaller subset.
# (-1 must not be used as "all": seq[:-1] silently drops the last sample.)
tr_sz = None
val_sz = None

train_dataset = Dataset(tr_ids[:tr_sz], tr_contexts[:tr_sz], tr_attention_masks[:tr_sz], tr_questions[:tr_sz])
val_dataset = Dataset(val_ids[:val_sz], val_contexts[:val_sz], val_attention_masks[:val_sz], val_questions[:val_sz])

In [14]:
# Instantiate model, optimizer and tokenizer, then wrap both datasets in loaders.
model, optimizer, tokenizer = define_model(model_params=model_params)
train_loader, val_loader = prepare_data(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model_params=model_params,
)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [15]:
# Drop the notebook-level names for the datasets (gc is imported with the
# other imports at the top of the notebook).
# NOTE(review): the DataLoaders were built from these objects and presumably
# still reference them, so this is unlikely to free their memory - confirm.
# Re-running the cell that builds the loaders requires rebuilding the datasets first.
del train_dataset, val_dataset
gc.collect()

421

In [None]:
# Full run: fine-tune, validate, and write predictions/logs under ./outputs/
training_loop(
    model,
    optimizer,
    tokenizer,
    model_params,
    train_loader,
    val_loader,
    save=False,
    output_dir="./outputs/",
)

In [17]:
# Options for the inline run below: whether to save the model, and where outputs go
save, output_dir = False, "./outputs/"

In [18]:
# Model training (inline version of `training_loop`).
# Unlike the function, this leaves `predictions`, `actuals` and `final_df`
# in the notebook namespace for inspection.

# predictions.csv and logs.txt are written even when save is False, so make
# sure the directory exists before training starts.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Training loop
console.log(f'[Initiating Fine Tuning]...\n')
for epoch in range(model_params["TRAIN_EPOCHS"]):
    # Argument order follows train(model, optimizer, epoch, loader, pad_token, device)
    train(model, optimizer, epoch, train_loader, tokenizer.pad_token_id, device)

#Saving the model after training    
if save:
    console.log(f"[Saving Model]...\n")
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

# Evaluation
console.log(f"[Initiating Validation]...\n")
for epoch in range(model_params["VAL_EPOCHS"]):
    # Argument order follows validate(model, epoch, loader, tokenizer, device)
    predictions, actuals = validate(model, epoch, val_loader, tokenizer, device)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions.csv'))

console.save_text(os.path.join(output_dir,'logs.txt'))

console.log(f"[Validation Completed.]\n")
# Only claim the model was saved when it actually was.
if save:
    console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

KeyboardInterrupt: ignored

In [None]:
#T5Trainer(train_loader, val_loader, model_params=model_params, output_dir="outputs")