<a href="https://colab.research.google.com/github/michimichiamo/question-answering/blob/main/QuestionGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs and imports

In [9]:
%%capture
# Install the notebook's third-party dependencies; %%capture hides pip's output.
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]
# torchmetrics is the only package pinned to an exact version in this cell.
!pip install torchmetrics==0.6
!pip install datasets
# -U upgrades nltk when an older version is already installed.
!pip install -U nltk

In [10]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Importing the function to load the metrics
from datasets import load_metric

from rich.table import Column, Table
from rich import box
from rich.console import Console

In [11]:
%cd /content
!rm -r question-answering
!git clone https://github.com/michimichiamo/question-answering/
%cd question-answering

/content
rm: cannot remove 'question-answering': No such file or directory
Cloning into 'question-answering'...
remote: Enumerating objects: 357, done.[K
remote: Counting objects: 100% (344/344), done.[K
remote: Compressing objects: 100% (280/280), done.[K
remote: Total 357 (delta 183), reused 175 (delta 63), pack-reused 13[K
Receiving objects: 100% (357/357), 134.35 MiB | 15.72 MiB/s, done.
Resolving deltas: 100% (186/186), done.
Checking out files: 100% (36/36), done.
/content/question-answering


# Read data

In [12]:
from util.model import read_npz

# Load the train and validation splits for the 'QG' task from the same folder.
# Each call yields four parallel collections: ids, contexts, attention masks
# and questions.
tr_ids, tr_contexts, tr_attention_masks, tr_questions = read_npz(
    path='./data/tokenized-qg-ans/',
    split='train',
    task='QG',
)
val_ids, val_contexts, val_attention_masks, val_questions = read_npz(
    path='./data/tokenized-qg-ans/',
    split='val',
    task='QG',
)

In [13]:
# Shared rich console used for logging by the training/validation helpers;
# record=True asks rich to keep a record of what is printed.
console = Console(record=True)

def display_df(df):
    """Print the first two columns of ``df`` as an ASCII table titled "Sample Data"."""
    # A fresh local console is used here, not the module-level recording one.
    printer = Console()

    headers = (
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
    )
    sample_table = Table(*headers, title="Sample Data", pad_edge=False, box=box.ASCII)

    # Only the first two values of every row are shown.
    for record in df.values.tolist():
        sample_table.add_row(record[0], record[1])

    printer.print(sample_table)

# Table that train() appends a row to (epoch, step, loss) and re-prints.
training_logger = Table(
    *(Column(header, justify="center") for header in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)


# Prepare for training

In [14]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Dataset

In [15]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing contexts (plus masks) with target questions.

    Indexing returns ``((id, context, attention_mask), question)``; the three
    sequences are converted to ``torch.int32`` tensors on access, while the id
    is passed through unchanged.
    """

    def __init__(self, ids, contexts, attention_masks, questions):
        """Keep references to the four parallel collections (assumed index-aligned)."""
        self.ids, self.contexts = ids, contexts
        self.attention_masks, self.questions = attention_masks, questions

    def __len__(self):
        """Number of samples, taken from the length of ``contexts``."""
        return len(self.contexts)

    @staticmethod
    def _as_int32(sequence):
        """Copy ``sequence`` into a new int32 tensor."""
        return torch.tensor(sequence, dtype=torch.int32)

    def __getitem__(self, index):
        """Build the ``(X, y)`` pair for the sample at ``index``."""
        sample_id = self.ids[index]
        features = (
            sample_id,
            self._as_int32(self.contexts[index]),
            self._as_int32(self.attention_masks[index]),
        )
        target = self._as_int32(self.questions[index])
        return features, target

## Utility functions

In [16]:
def train(model, optimizer, epoch, loader, pad_token, device):
    """
    Run one training epoch over ``loader`` and return the per-batch losses.

    Args:
        model: model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; the loss is read from
            ``outputs[0]``.
        optimizer: optimizer that updates the model's parameters.
        epoch: epoch index, used only for the logged table row.
        loader: iterable of ``((ids, context, attention_mask), question)``
            batches; ``question`` is indexed as (batch, sequence).
        pad_token: token id whose target positions are masked out of the loss.
        device: device the batch tensors are moved to.

    Returns:
        list[float]: one loss value per batch, in iteration order. Plain floats
        are stored rather than the loss tensors, so the history holds no
        device memory or autograd references and converts cleanly to NumPy.
    """
    model.train()
    loss_history = []
    for iteration, (X, y) in enumerate(loader):
        # The sample ids are not needed for training.
        _, context, attention_mask = X

        context = context.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)

        # Shift the target by one: the decoder is fed tokens [0, n-2] and is
        # trained to predict tokens [1, n-1].
        question = y.to(device, dtype=torch.long)
        question_ids = question[:, :-1].contiguous()
        lm_labels = question[:, 1:].clone().detach()
        # Padding positions get label -100, which the Hugging Face loss ignores.
        lm_labels[question[:, 1:] == pad_token] = -100

        outputs = model(input_ids=context, attention_mask=attention_mask,
                        decoder_input_ids=question_ids, labels=lm_labels)
        loss = outputs[0]
        loss_value = loss.item()

        # Log every 10th batch (including the first one).
        if iteration % 10 == 0:
            training_logger.add_row(str(epoch), str(iteration), str(loss_value))
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_history.append(loss_value)
    return loss_history

In [None]:
def question_generation(model, dataloader, tokenizer, repetition_penalty=5, length_penalty=1.5, temperature=1.0, device=None):
    """
    Generate a question for every sample in ``dataloader`` and decode everything to text.

    Args:
        model: model exposing ``generate``; it is switched to eval mode.
        dataloader: iterable of ``((ids, context, attention_mask), question)`` batches.
        tokenizer: object whose ``decode`` turns a row of token ids into a string.
        repetition_penalty, length_penalty, temperature: forwarded to
            ``model.generate``.
            NOTE(review): decoding uses beam search with ``do_sample=False``, where
            ``temperature`` is expected to have no effect - confirm against the
            installed transformers version.
        device: device the batch tensors are moved to. Defaults to the device of
            the model's parameters, so inputs always match the model.

    Returns:
        tuple[list[str], list[str], list[str]]: decoded contexts, generated
        questions and reference questions, in loader order.
    """
    if device is None:
        device = next(model.parameters()).device

    def _decode(token_rows):
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    tot_context = []
    tot_preds = []
    tot_target = []

    # Inference only: disable dropout and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            # The sample ids are not needed for generation.
            _, context, attention_mask = X

            question = y.to(device, dtype=torch.long)
            context = context.to(device, dtype=torch.long)
            attention_mask = attention_mask.to(device, dtype=torch.long)

            # Generate questions
            generated_ids = model.generate(
                input_ids=context,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                temperature=temperature,
                early_stopping=True,
                do_sample=False,
            )

            tot_context += _decode(context)
            tot_preds += _decode(generated_ids)
            tot_target += _decode(question)

    return tot_context, tot_preds, tot_target

In [17]:
def validate(model, epoch, loader, tokenizer, device):
    """
    Generate text for every batch in ``loader`` and decode it next to the targets.

    The model is put in eval mode and everything runs under ``torch.no_grad()``.
    ``epoch`` is accepted for interface symmetry with ``train`` but not used.

    Returns:
        tuple[list[str], list[str]]: decoded predictions and decoded targets,
        in loader order.
    """
    def _to_text(token_rows):
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    predictions, actuals = [], []
    model.eval()
    with torch.no_grad():
        for iteration, (X, y) in enumerate(loader):
            # The sample ids are not needed here.
            _, context, attention_mask = X

            question = y.to(device, dtype=torch.long)
            context = context.to(device, dtype=torch.long)
            attention_mask = attention_mask.to(device, dtype=torch.long)

            generation_options = dict(
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            generated_ids = model.generate(
                input_ids=context,
                attention_mask=attention_mask,
                **generation_options,
            )

            preds = _to_text(generated_ids)
            target = _to_text(question)

            # Progress message every 10th batch (including the first one).
            if iteration % 10 == 0:
                console.print(f'Completed {iteration}')

            predictions += preds
            actuals += target
    return predictions, actuals

In [18]:
def define_model(model_params):
    """
    Seed the RNGs, load tokenizer and model, and build the optimizer.

    Reads ``SEED``, ``MODEL``, ``TOKENIZER`` and ``LEARNING_RATE`` from
    ``model_params``. The model is moved to the module-level ``device``.

    Returns:
        tuple: ``(model, optimizer, tokenizer)``.
    """
    # Reproducibility: seed PyTorch and NumPy, and request deterministic cuDNN.
    seed = model_params["SEED"]
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # Tokenizer and model may come from different checkpoints/paths.
    tokenizer = T5Tokenizer.from_pretrained(model_params["TOKENIZER"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"]).to(device)

    # Adam over all model parameters with the configured learning rate.
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=model_params["LEARNING_RATE"],
    )

    return model, optimizer, tokenizer

def prepare_data(train_dataset, val_dataset, model_params):
    """
    Wrap the training and validation datasets in PyTorch DataLoaders.

    Batch sizes come from ``TRAIN_BATCH_SIZE`` / ``VALID_BATCH_SIZE`` in
    ``model_params``. Only the training loader shuffles.

    Returns:
        tuple: ``(train_loader, val_loader)``.
    """
    console.log(f"[Data]: Reading data...\n")

    train_batch_size = model_params["TRAIN_BATCH_SIZE"]
    val_batch_size = model_params["VALID_BATCH_SIZE"]

    # Settings shared by both loaders: load in the main process, pinned memory.
    shared_options = {'num_workers': 0, 'pin_memory': True}

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=train_batch_size, shuffle=True, **shared_options
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=val_batch_size, shuffle=False, **shared_options
    )

    return train_loader, val_loader


def training_loop(model, optimizer, tokenizer, model_params,
                  train_loader, val_loader, save=False, output_dir="./outputs/"):
    """
    Fine-tune the model, optionally save it, then write validation predictions.

    Args:
        model, optimizer, tokenizer: objects as returned by ``define_model``.
        model_params: dict providing ``TRAIN_EPOCHS`` and ``VAL_EPOCHS``.
        train_loader, val_loader: loaders passed to ``train`` / ``validate``.
        save: when True, the model and tokenizer are saved under
            ``<output_dir>/model_files`` and the per-batch loss history under
            ``<output_dir>/loss_history.npy``.
        output_dir: directory receiving every artefact, including
            ``predictions.csv``. It is created if it does not exist.

    Returns:
        None. Results are written to ``output_dir``.
    """
    console.log(f'[Initiating Fine Tuning]...\n')

    # predictions.csv is always written below, so the directory must exist
    # even when save=False.
    os.makedirs(output_dir, exist_ok=True)

    pad_token = tokenizer.pad_token_id
    loss_history = []
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        epoch_losses = train(model, optimizer, epoch, train_loader, pad_token, device)
        # float() accepts both plain numbers and 0-dim loss tensors (any device),
        # so the history is always a NumPy-convertible list of Python floats.
        loss_history.extend(float(batch_loss) for batch_loss in epoch_losses)

    # Saving the model after training
    if save:
        console.log(f"[Saving Model]...\n")
        path = os.path.join(output_dir, "model_files")
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)
        # Stored next to the other artefacts instead of a hard-coded ./outputs/.
        np.save(os.path.join(output_dir, "loss_history.npy"), np.array(loss_history))

    # Evaluation: each validation epoch overwrites predictions.csv.
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(model, epoch, val_loader, tokenizer, device)
        final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
        final_df.to_csv(os.path.join(output_dir, 'predictions.csv'))
  

## Hyperparameters

In [19]:
# False: start from the pretrained "t5-small" checkpoint.
# True: load the model from the local './model' directory instead.
ours = False

model_params = dict(
    MODEL='./model' if ours else "t5-small",  # checkpoint name or local path
    TOKENIZER="t5-small",                     # tokenizer checkpoint
    TRAIN_BATCH_SIZE=8,                       # training batch size
    VALID_BATCH_SIZE=8,                       # validation batch size
    TRAIN_EPOCHS=2,                           # number of training epochs
    VAL_EPOCHS=1,                             # number of validation epochs
    LEARNING_RATE=1e-4,                       # learning rate
    MAX_SOURCE_TEXT_LENGTH=512,               # max length of source text
    MAX_TARGET_TEXT_LENGTH=50,                # max length of target text
    SEED=42,                                  # seed for reproducibility
)

## Create Dataloader


In [20]:
# Number of samples to keep from each split.
# None keeps every sample (`[:None]` is the full slice); an int N keeps the
# first N. The previous value of -1 silently dropped the last sample of each
# split, because `[:-1]` excludes the final element.
tr_sz = None
val_sz = None

train_dataset = Dataset(tr_ids[:tr_sz], tr_contexts[:tr_sz], tr_attention_masks[:tr_sz], tr_questions[:tr_sz])
val_dataset = Dataset(val_ids[:val_sz], val_contexts[:val_sz], val_attention_masks[:val_sz], val_questions[:val_sz])

In [21]:
# Load model, optimizer and tokenizer as configured in model_params, then wrap
# the two datasets in DataLoaders.
model, optimizer, tokenizer = define_model(model_params)
train_loader, val_loader = prepare_data(train_dataset, val_dataset, model_params)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [22]:
import gc
# NOTE: the DataLoaders created above still reference the datasets, so
# deleting these names alone would not release the underlying data.
#del train_dataset, val_dataset
# Force a garbage-collection pass; the value shown as the cell output is
# gc.collect()'s return value (number of unreachable objects found).
gc.collect()

416

# Train

In [None]:
# Fine-tune the model; save=True also stores the model files and loss history,
# and validation predictions are written to ./outputs/predictions.csv.
training_loop(model, optimizer, tokenizer, model_params,
            train_loader, val_loader, save=True, output_dir="./outputs/")

# Question Generation

In [None]:
# Generate questions for the validation loader with the default decoding settings.
cont, preds, targ = question_generation(model, val_loader, tokenizer)

In [None]:
# Same run with repetition_penalty=10.5 and temperature=0.7; overwrites cont/preds/targ.
cont, preds, targ = question_generation(model, val_loader, tokenizer, repetition_penalty=10.5, temperature=0.7)

In [None]:
# Same run with repetition_penalty=10.5 and temperature=0.3; overwrites cont/preds/targ.
cont, preds, targ = question_generation(model, val_loader, tokenizer, repetition_penalty=10.5, temperature=0.3)

# Evaluation

In [41]:
def evaluate_model(dataloader, weights_path='iarfmoose/t5-base-question-generator'):
    """
    Score a question-generation model with BLEU and METEOR on ``dataloader``.

    Both metrics are computed per batch and averaged over the number of batches.

    Args:
        dataloader: iterable of ``((ids, context, attention_mask), question)`` batches.
        weights_path: checkpoint name or path passed to
            ``T5ForConditionalGeneration.from_pretrained``.

    Returns:
        tuple[float, float]: ``(mean BLEU, mean METEOR)`` over all batches.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    # The inputs are moved to `device` below, so the model must live there too.
    model = T5ForConditionalGeneration.from_pretrained(weights_path)
    model = model.to(device)
    model.eval()
    # NOTE(review): the tokenizer is always 't5-small', whatever weights_path
    # is - confirm its vocabulary matches the evaluated checkpoint.
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    meteor_metric = load_metric("meteor")
    bleu_metric = load_metric("bleu")

    def _decode(token_rows):
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    tot_bleu = 0
    tot_meteor = 0
    n_batches = 0

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        for X, y in dataloader:
            # The sample ids are not needed for scoring.
            _, context, attention_mask = X

            question = y.to(device, dtype=torch.long)
            context = context.to(device, dtype=torch.long)
            attention_mask = attention_mask.to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=context,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=2,
                temperature=0.7,
                repetition_penalty=10.5,
                length_penalty=1.5,
                early_stopping=True,
                do_sample=False,
            )

            preds_text = _decode(generated_ids)
            target_text = _decode(question)

            # BLEU takes tokenised input: one token list per prediction and a
            # list of reference token lists per prediction.
            results = bleu_metric.compute(
                predictions=[pred.split() for pred in preds_text],
                references=[[target.split()] for target in target_text],
            )
            tot_bleu += results["bleu"]

            # METEOR takes plain strings (one reference string per prediction).
            results = meteor_metric.compute(predictions=preds_text, references=target_text)
            tot_meteor += round(results["meteor"], 4)

            n_batches += 1

    if n_batches == 0:
        raise ValueError("evaluate_model received an empty dataloader")

    # Divide by the batch count, not by the last batch index.
    return tot_bleu / n_batches, tot_meteor / n_batches
    

In [None]:
# Score the default pretrained question generator on the validation loader;
# the cell output is the returned (BLEU, METEOR) pair.
evaluate_model(val_loader)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
