<h1>Fine-tuning T5 on Sentence Generation</h1>

In [None]:
# Shell escape: runs the `jt` command-line tool with theme "monokai".
# NOTE(review): presumably the jupyterthemes CLI, which only restyles the notebook UI — confirm it is installed before running.
!jt -t monokai

<h3>Import Libraries</h3>

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelWithLMHead, DistilBertTokenizer, DistilBertForMaskedLM, AutoModelForSeq2SeqLM

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()

import json

def display_df(df):
  """Print the first two columns of a dataframe as an ASCII table.

  Args:
    df: pandas DataFrame; column 0 is shown under "source_text" and
      column 1 under "target_text".

  Every non-None cell is converted with ``str`` before it is added, because
  rich tables reject values that are not strings/renderables (e.g. NaN or
  numbers). ``None`` is passed through unchanged, which rich renders as a
  blank cell.
  """

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for row in df.values.tolist():
    source_cell = row[0] if row[0] is None else str(row[0])
    target_cell = row[1] if row[1] is None else str(row[1])
    table.add_row(source_cell, target_cell)

  console.print(table)

# Module-level rich table that accumulates one (epoch, step, loss) row per logging
# event; `train` appends to it and re-prints it.
training_logger = Table(Column("Epoch", justify="center" ), 
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)

<h3>DataSetClass</h3>
<h4>Custom dataset class for loading the dataset and passing it to the model</h4>

In [None]:
class YourDataSetClass(Dataset):
    """
    Map-style dataset that tokenizes (source, target) text pairs on access.

    Args:
        dataframe: pandas DataFrame holding the text columns.
        tokenizer: tokenizer exposing ``batch_encode_plus``.
        source_len: fixed token length the source text is padded/truncated to.
        target_len: fixed token length the target text is padded/truncated to.
        source_text: name of the source column in ``dataframe``.
        target_text: name of the target column in ``dataframe``.
    """

    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Number of rows (text pairs) in the dataset."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string to exactly ``max_length`` ids; returns (input_ids, attention_mask)."""
        encoded = self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",  # the deprecated pad_to_max_length flag was redundant with this
            return_tensors='pt',
        )
        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, index):
        """
        Return the tokenized pair at *position* ``index``.

        Positional lookup (``.iloc``) is used because samplers hand out
        positions 0..len-1, which do not match the labels of a dataframe
        whose index was filtered/shuffled and not reset.

        Returns:
            dict with long tensors ``source_ids``, ``source_mask``,
            ``target_ids`` and ``target_ids_y`` (same values as ``target_ids``).
        """
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # collapse every run of whitespace into a single space
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        source_ids, source_mask = self._encode(source_text, self.source_len)
        target_ids, _ = self._encode(target_text, self.summ_len)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

<h3>Train method</h3>

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Run one training pass over ``loader``.

    Args:
        epoch: epoch number, used only for the log row.
        tokenizer: tokenizer providing ``pad_token_id``.
        model: model called with input ids, attention mask, decoder input ids and labels;
            the first element of its output is taken as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with keys 'target_ids', 'source_ids', 'source_mask'.
        optimizer: optimizer stepped once per batch.

    Every 10th step a row is appended to the module-level ``training_logger``
    table, which is then printed on the module-level ``console``.
    """

    model.train()
    for step, data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        # decoder inputs are the targets without their last token; labels are the targets shifted left by one
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # padding positions get label -100
        # NOTE(review): presumably so the loss ignores them — confirm against the model's loss documentation
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]

        if step % 10 == 0:
            # log the scalar value rather than the tensor repr (which includes device/grad_fn noise)
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

<h3>Validate method</h3>

In [None]:
def validate(epoch, tokenizer, model, device, loader):

    """
    Generate predictions for every batch in ``loader``.

    Args:
        epoch: unused by this function; kept for a signature parallel to ``train``.
        tokenizer: tokenizer used to decode ids back to text.
        model: model exposing ``eval()`` and ``generate(...)``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with keys 'target_ids', 'source_ids', 'source_mask'.

    Returns:
        (prompts, predictions, actuals): three lists of decoded strings with one
        entry per example across ALL batches (empty lists for an empty loader).
    """
    model.eval()
    prompts = []
    predictions = []
    actuals = []
    with torch.no_grad():
        for step, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )

            prompt = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in ids]
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if step % 10 == 0:
                console.print(f'Completed {step}')

            # accumulate prompts as well, so all three lists stay the same length
            prompts.extend(prompt)
            predictions.extend(preds)
            actuals.extend(target)

    return prompts, predictions, actuals

In [None]:
def predict(tokenizer, model, source_text_key, target_text_key, source_text, target_text, model_params):
    """
    Generate text for a single (source, target) pair and print the result.

    Args:
        tokenizer: tokenizer used for encoding and decoding.
        model: model exposing ``eval()`` and ``generate(...)``.
        source_text_key: column name under which ``source_text`` is stored.
        target_text_key: column name under which ``target_text`` is stored.
        source_text: input string.
        target_text: reference string, decoded back and reported as "Actuals".
        model_params: mapping; "VALID_BATCH_SIZE" is required,
            "MAX_SOURCE_TEXT_LENGTH" / "MAX_TARGET_TEXT_LENGTH" default to 64 when absent.

    Returns:
        (predictions, actuals): lists of decoded strings.

    Tensors are moved to the module-level ``device``.
    """
    model.eval()
    predictions = []
    actuals = []

    val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }

    # Build the one-row frame with the caller-supplied column names so the
    # dataset lookup below cannot fail for keys other than the defaults.
    data_src = pd.DataFrame([{source_text_key: source_text, target_text_key: target_text}])
    max_source_len = model_params.get("MAX_SOURCE_TEXT_LENGTH", 64)
    max_target_len = model_params.get("MAX_TARGET_TEXT_LENGTH", 64)
    data_loader = DataLoader(
        YourDataSetClass(data_src, tokenizer, max_source_len, max_target_len, source_text_key, target_text_key),
        **val_params
    )

    with torch.no_grad():
        for _, data in enumerate(data_loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            # pass the attention mask (as validate does), since the inputs are padded to a fixed length
            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                max_length=150,
                num_beams=2
              )

            print(f"Generated ids: {generated_ids}\n\n")

            prompt = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in ids]
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]

            predictions.extend(preds)
            actuals.extend(target)

            print(f"Predictions: {predictions[0]}\nActuals: {actuals[0]}\nPrompt: {prompt[0]}")
    return predictions, actuals

In [None]:
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/" ):

    """
    Fine-tune a T5 checkpoint on text pairs, save it, then generate on the held-out split.

    Args:
        dataframe: pandas DataFrame containing the ``source_text`` and ``target_text`` columns.
        source_text: name of the input-text column.
        target_text: name of the target-text column.
        model_params: mapping with keys "MODEL", "SEED", "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE",
            "TRAIN_EPOCHS", "VAL_EPOCHS", "LEARNING_RATE", "MAX_SOURCE_TEXT_LENGTH"
            and "MAX_TARGET_TEXT_LENGTH".
        output_dir: directory that receives ``model_files/``, ``predictions.csv`` and ``logs.txt``.

    Returns:
        (tokenizer, model) after training.
    """

    # Reproducibility: seed torch and numpy, and ask cuDNN for deterministic kernels
    seed = model_params["SEED"]
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # Tokenizer and model come from the same checkpoint; the model is moved to the module-level device
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"]).to(device)

    console.log(f"[Data]: Reading data...\n")

    # Keep only the two text columns and show a two-row preview
    dataframe = dataframe[[source_text,target_text]]
    display_df(dataframe.head(2))

    # 80% of the rows are sampled for training; the remainder is the validation split
    train_frame = dataframe.sample(frac=0.8, random_state=seed)
    val_frame = dataframe.drop(train_frame.index).reset_index(drop=True)
    train_frame = train_frame.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_frame.shape}")
    console.print(f"TEST Dataset: {val_frame.shape}\n")

    # Wrap both splits in the tokenizing dataset
    max_source_len = model_params["MAX_SOURCE_TEXT_LENGTH"]
    max_target_len = model_params["MAX_TARGET_TEXT_LENGTH"]
    training_set = YourDataSetClass(train_frame, tokenizer, max_source_len, max_target_len, source_text, target_text)
    val_set = YourDataSetClass(val_frame, tokenizer, max_source_len, max_target_len, source_text, target_text)

    # Training batches are shuffled; validation batches keep their order
    training_loader = DataLoader(
        training_set,
        batch_size=model_params["TRAIN_BATCH_SIZE"],
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=model_params["VALID_BATCH_SIZE"],
        shuffle=False,
        num_workers=0,
    )

    # Adam over all model parameters
    optimizer = torch.optim.Adam(params=model.parameters(), lr=model_params["LEARNING_RATE"])

    console.log(f'[Initiating Fine Tuning]...\n')
    epoch = 0
    while epoch < model_params["TRAIN_EPOCHS"]:
        train(epoch, tokenizer, model, device, training_loader, optimizer)
        epoch += 1

    # Persist model and tokenizer side by side
    console.log(f"[Saving Model]...\n")
    model_dir = os.path.join(output_dir, "model_files")
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Each validation epoch overwrites predictions.csv with its generations
    predictions_path = os.path.join(output_dir,'predictions.csv')
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        knowledge_sent, predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        columns = {'Knowledge Sentence': knowledge_sent, 'Generated Text':predictions,'Actual Text':actuals}
        pd.DataFrame(columns).to_csv(predictions_path)

    # Dump everything the recording console captured so far
    logs_path = os.path.join(output_dir,'logs.txt')
    console.save_text(logs_path)

    console.log(f"[Validation Completed.]\n")
    console.print(f"""[Model] Model saved @ {model_dir}\n""")
    console.print(f"""[Validation] Generation on Validation data saved @ {predictions_path}\n""")
    console.print(f"""[Logs] Logs saved @ {logs_path}\n""")

    return tokenizer, model


<h3>Load dataset</h3>

In [None]:
# Load the JSON export of the dataset.
# NOTE(review): `raw_dataset` is not referenced anywhere else in the visible notebook — confirm this cell is still needed.
with open('wizards-of-wikipedia-data-extraction/out.json', 'r') as json_file:
    raw_dataset = json.load(json_file)

In [None]:
# Load the pickled DataFrame that is later passed to T5Trainer, then print its first 500 rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
df = pd.read_pickle('wizards-of-wikipedia-data-extraction/out.pkl')
print(df[:500])

<h2>Run the model</h2>

In [None]:
# Run configuration for T5Trainer / predict.
# (leftover note from the author: distilgpt2)
model_path = '../models/t5-small'
# Column names of the dataset.
# NOTE(review): the T5Trainer and predict calls below repeat these as string literals instead of using these variables.
source_text_key = 'knowledge_sent'
target_text_key = 'human_sent'
model_params={
    "MODEL":model_path,             # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE":16,          # training batch size
    "VALID_BATCH_SIZE":2,          # validation batch size
    "TRAIN_EPOCHS":4,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":2e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":64,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":64,   # max length of target text
    "SEED": 42                     # set seed for reproducibility 
}

In [None]:
# Fine-tune on the first 1000 rows of `df`; artifacts are written under "outputs".
tokenizer, model = T5Trainer(dataframe=df[:1000], source_text="knowledge_sent", target_text="human_sent", model_params=model_params, output_dir="outputs")

In [None]:
# Earlier test inputs, kept for reference:
#test_src_text = "Science fiction (often shortened to SF or sci-fi) is a genre of speculative fiction, typically dealing with imaginative concepts such as futuristic science and technology, space travel, time travel, faster than light travel, parallel universes, and extraterrestrial life"
#test_target_text = "I think science fiction is an amazing genre for anything. Future science, technology, time travel, FTL trave;, they're all such interesting concepts"
#test_src_text = "Multan is a city and capital of Multan Division located in Punjab, Pakistan. Situated on the bank of the Chenab River, Multan is Pakistan's 7th largest city and is the major cultural and economic centre of Southern Punjab. Multan's history stretches deep into antiquity. The ancient city was site of the renowned Hindu Multan Sun Temple, and was besieged by Alexander the Great during the Mallian Campaign."

# Reload the tokenizer and model from the directory T5Trainer saved them to (local files only).
path = './outputs/model_files'
tokenizer = T5Tokenizer.from_pretrained(path, local_files_only=True)
#tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the saved T5 conditional-generation model and move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained(path, local_files_only=True)
#model = AutoModelForCausalLM.from_pretrained(model_params["MODEL"])
#model = AutoModelWithLMHead.from_pretrained(model_params["MODEL"])
model = model.to(device)

# Single-example smoke test; the target text is only echoed back by predict as "Actuals".
test_src_text = "Do you like cycling?"
test_target_text = "Of the three primary colors, Blue is my favorite"
predict(tokenizer, model, 'knowledge_sent', 'human_sent', test_src_text, test_target_text, model_params)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the checkpoint at `model_path`; this rebinds the notebook-level `tokenizer` and `model`.
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Encode a "summarize:"-prefixed passage, generate with default settings, and print each decoded output.
input_ids = tokenizer("summarize: Six tournaments have so far been played, and only the West Indies, who currently hold the title, has won the tournament on multiple occasions. The inaugural 2007 World Twenty20, was staged in South Africa, and won by India, who defeated Pakistan in the final at the Wanderers Stadium in Johannesburg. The 2009 tournament took place in England, and was won by the previous runner-up, Pakistan, who defeated Sri Lanka in the final at Lord's. The third tournament was held in 2010, hosted by the countries making up the West Indies cricket team.", return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)
for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer

# Rebind the notebook-level `tokenizer` and `model` to the checkpoint at `model_path`;
# `paraphrase` below reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelWithLMHead.from_pretrained(model_path)

def paraphrase(text, max_length=128):
  """Generate five beam-search candidates for ``text`` and return them decoded.

  Uses the notebook-level ``tokenizer`` and ``model``.

  Args:
    text: prompt string to encode.
    max_length: maximum generated length passed to ``model.generate``.

  Returns:
    list of decoded strings, one per generated sequence.
  """
  encoded_prompt = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)

  generation_options = dict(
    num_return_sequences=5,
    num_beams=5,
    max_length=max_length,
    no_repeat_ngram_size=2,
    repetition_penalty=3.5,
    length_penalty=1.0,
    early_stopping=True,
  )
  candidates = model.generate(input_ids=encoded_prompt, **generation_options)

  decoded = []
  for candidate in candidates:
    decoded.append(tokenizer.decode(candidate, skip_special_tokens=True, clean_up_tokenization_spaces=True))
  return decoded
  
# Ask for candidates for one "paraphrase:"-prefixed prompt and print each of them.
preds = paraphrase("paraphrase: What is the best framework for dealing with a huge text dataset?")

for pred in preds:
  print(pred)

<h3>T5 on fill-in-the-blank task</h3>

In [None]:
import torch
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

# Local checkpoint directory used for the fill-in-the-blank experiments.
T5_PATH = '../models/t5-base' # "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"

# NOTE: the second assignment unconditionally overrides the first, so this cell always selects the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # My environment uses CPU
DEVICE = 'cpu'

# Tokenizer, config and model all come from T5_PATH; the model is moved to DEVICE.
t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)
t5_config = T5Config.from_pretrained(T5_PATH)
t5_mlm = T5ForConditionalGeneration.from_pretrained(T5_PATH, config=t5_config).to(DEVICE)

# Example input text
#text = "Why was the tournament postponed? <extra_id_0> due to COVID-19. </s>"

In [None]:
def fill_mask(text):
    """
    Fill the ``<extra_id_N>`` sentinels in ``text`` using the notebook-level ``t5_mlm`` model.

    Args:
        text: input string containing sentinel tokens.

    Returns:
        list with one filled-in string per generated sequence (previously the
        list was built but never returned).
    """
    encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
    input_ids = encoded['input_ids'].to(DEVICE)

    print(f"INPUT TEXT: {text}")

    # Beam search with 200 beams, keeping only the best sequence of at most 20 tokens
    outputs = t5_mlm.generate(input_ids=input_ids,
                              num_beams=200, num_return_sequences=1, max_length=20)

    return [_filter(text, output) for output in outputs]

def _filter(text, output, end_token='<extra_id_1>'):
    """
    Substitute the generated fills for the ``<extra_id_N>`` sentinels in ``text``.

    Args:
        text: original input string containing sentinels starting at ``<extra_id_0>``.
        output: one generated id sequence from ``t5_mlm.generate``.
        end_token: ignored; kept only so existing calls keep working (the
            previous implementation overwrote it before use).

    Returns:
        ``text`` with each sentinel replaced by the text generated for it, or
        ``text`` unchanged when it contains no ``<extra_id_0>``.
    """
    first_mask = "<extra_id_0>"
    if first_mask not in text:
        return text

    # Skip the first two generated ids.
    # NOTE(review): assumes output[0] is the decoder start token and output[1] is <extra_id_0> — TODO confirm
    _txt = t5_tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)

    result = text[:text.index(first_mask)]
    remaining = _txt  # generated text that follows the sentinel currently being filled
    for i in range(100):
        cur_mask = "<extra_id_" + str(i) + ">"
        next_mask = "<extra_id_" + str(i + 1) + ">"

        # The fill for cur_mask runs up to the next sentinel in the generated
        # text, or to its end when the model produced no further sentinel.
        fill, _, remaining = remaining.partition(next_mask)
        result += fill

        suffix_start = text.index(cur_mask) + len(cur_mask)
        if next_mask not in text:
            # last sentinel of the input: append the rest of the original text
            result += text[suffix_start:]
            break
        # copy the original text between this sentinel and the next one
        result += text[suffix_start:text.index(next_mask)]

    print(f"RESULT: {result}")

    return result

In [None]:
# Interactive loop: read a line from the user and pass it to fill_mask until "exit" is typed.
while True:
    user_input = input(">> User:")
    if user_input == "exit":
        break
    fill_mask(user_input)

In [None]:
def fill_masks_helper(text, output):
    """
    Replace each ``<extra_id_N>`` sentinel in ``text`` with the text generated for it.

    Args:
        text: input string; returned unchanged when it has no ``<extra_id_0>``.
        output: one generated id sequence, decoded with the notebook-level ``t5_tokenizer``.

    Returns:
        the filled-in string. A sentinel the model did not emit is replaced by
        an empty string.
    """
    first_sentinel = "<extra_id_0>"
    if first_sentinel not in text:
        return text

    # everything before the first sentinel is copied verbatim
    filled = text[:text.index(first_sentinel)]

    decoded = t5_tokenizer.decode(output, skip_special_tokens=False, clean_up_tokenization_spaces=False)

    for idx in range(100):
        sentinel = f"<extra_id_{idx}>"
        following = f"<extra_id_{idx + 1}>"

        if sentinel in decoded:
            # the fill spans from just after this sentinel to the next one (or the end of the decoded text)
            fill_start = decoded.index(sentinel) + len(sentinel)
            fill_stop = decoded.index(following) if following in decoded else len(decoded)
            print(f"Text to fill mask: {decoded[fill_start:fill_stop]}")
            filled += decoded[fill_start:fill_stop]

        # position in the original text right after the sentinel just handled
        tail_start = text.index(sentinel) + len(sentinel)

        if following not in text:
            # no more sentinels in the input: append the remaining original text and finish
            return filled + text[tail_start:]

        # copy the original text lying between this sentinel and the next one
        filled += text[tail_start:text.index(following)]
        

In [None]:
def fill_mask(text):
    """
    Fill the ``<extra_id_N>`` sentinels in ``text`` and print every candidate.

    Uses the notebook-level ``t5_tokenizer``, ``t5_mlm`` and ``DEVICE``, and
    ``fill_masks_helper`` for the substitution.

    Args:
        text: input string containing sentinel tokens.

    Returns:
        list with one filled-in string per generated sequence (previously the
        list was built but never returned).
    """
    encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
    input_ids = encoded['input_ids'].to(DEVICE)

    print(f"INPUT TEXT: {text}")

    # Beam search with 10 beams, keeping the 3 best sequences of at most 6 tokens each
    outputs = t5_mlm.generate(input_ids=input_ids,
                              num_beams=10, num_return_sequences=3, max_length=6)

    results = []
    for output in outputs:
        result = fill_masks_helper(text, output)
        print(f"Result: {result}")
        results.append(result)
    return results

In [None]:
# Earlier inputs tried with fill_mask, kept for reference:
# text = "How many tournaments has Pakistan won? <extra_id_0> six <extra_id_1> . </s>"
# text = "why was the next edition of the tournament postponed to 2021?. It <extra_id_0> due to covid - 19 <extra_id_1> ."
# NOTE(review): `killme` (the context passage) is defined here but not included in the fill_mask call below.
killme = "The World Drivers' Championship, which became the FIA Formula One World Championship in 1981, has been one of the premier forms of racing around the world since its inaugural season in 1950."
# text = "What is the capital of england and the united kingdom? London is the capital of the United Kingdom. <extra_id_0> london <extra_id_1> ."
# The question and the masked answer template are joined with a single space and passed to fill_mask.
killme1 = "When did the Driver's Championship become the FIA Formula One World Championship?"
killme2 = "<extra_id_0> 1981 <extra_id_1> ."
fill_mask(f"{killme1} {killme2}")

In [None]:
# Scratch cell: split a masked string into the text before the first sentinel
# (prefix) and the text between the first and second sentinel (suffix).
# The previous version used `return` at module level (a SyntaxError) and relied
# on `text`, `result` and `prev_mask` left behind by other cells; the sample
# input is now defined here so the cell runs on a fresh kernel.
text = "When did the Driver's Championship become the FIA Formula One World Championship? <extra_id_0> 1981 <extra_id_1> ."
cur_mask = "<extra_id_0>"
next_mask = "<extra_id_1>"

if cur_mask not in text:
    # No sentinel at all: the whole text is the prefix and there is nothing left to parse
    continue_parsing = False
    cur_prefix = text
    cur_suffix = ""
else:
    # Prefix: everything before the current sentinel
    cur_prefix_index = text.index(cur_mask)
    cur_prefix = text[:cur_prefix_index]

    # Suffix: from right after the current sentinel up to the next sentinel, or to the end of the text
    cur_suffix_start_index = cur_prefix_index + len(cur_mask)
    continue_parsing = next_mask in text
    if continue_parsing:
        cur_suffix_end_index = text.index(next_mask)
    else:
        cur_suffix_end_index = len(text)

    print(f"suffix info: {cur_suffix_start_index}, {cur_suffix_end_index}")
    cur_suffix = text[cur_suffix_start_index:cur_suffix_end_index]

print(cur_prefix, "\n", cur_suffix)

<h3>Original fill-in-the-blanks code</h3>

In [None]:
import torch
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

# Local checkpoint directory used for the original fill-in-the-blank experiment.
T5_PATH = '../models/t5-base' # "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"

# Use CUDA when available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # My environment uses CPU

# Rebinds the notebook-level t5_tokenizer / t5_config / t5_mlm; the model is moved to DEVICE.
t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)
t5_config = T5Config.from_pretrained(T5_PATH)
t5_mlm = T5ForConditionalGeneration.from_pretrained(T5_PATH, config=t5_config).to(DEVICE)

In [None]:
# Input text with a single sentinel to fill
text = 'summarize: How many tournaments has Pakistan won? six <extra_id_0> . </s>'

encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
input_ids = encoded['input_ids'].to(DEVICE)

# Beam search with 200 beams, keeping the 5 best sequences of at most 5 tokens each
outputs = t5_mlm.generate(input_ids=input_ids, 
                          num_beams=200, num_return_sequences=5,
                          max_length=5)

# Split the input around the sentinel: text before it and text after it
_0_index = text.index('<extra_id_0>')
_result_prefix = text[:_0_index]
_result_suffix = text[_0_index+12:]  # 12 is the length of <extra_id_0>

# NOTE: this redefines `_filter` with a different signature than the earlier
# `_filter(text, output, ...)` cell; whichever cell ran last wins.
def _filter(output, end_token='<extra_id_1>'):
    """Return the input text with its sentinel replaced by the generated fill.

    The fill is the decoded output (minus its first two ids) up to
    ``end_token``, or the whole decoded string when ``end_token`` is absent.
    Reads the module-level ``_result_prefix`` and ``_result_suffix``.
    """
    # The first two generated ids are skipped.
    # NOTE(review): the original note called the first token <unk>; it is presumably the
    # decoder start token, followed by <extra_id_0> (id 32099) — TODO confirm
    _txt = t5_tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)
    if end_token in _txt:
        _end_token_index = _txt.index(end_token)
        return _result_prefix + _txt[:_end_token_index] + _result_suffix
    else:
        return _result_prefix + _txt + _result_suffix

# One filled-in string per generated sequence; displayed as the cell output
results = list(map(_filter, outputs))
results

<h3>Question Generation</h3>

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Rebinds the notebook-level `model_path`, `tokenizer` and `model` to the local "t5-base-e2e-qg" checkpoint.
model_path = "../models/t5-base-e2e-qg"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [None]:

# Prompt 1: a "summarize:"-prefixed passage (the one actually sent to the model below).
q1 = """summarize: Six tournaments have so far been played, and only the West Indies, who currently hold the title, has won the tournament on multiple occasions. The inaugural 2007 World Twenty20, was staged in South Africa, and won by India, who defeated Pakistan in the final at the Wanderers Stadium in Johannesburg. The 2009 tournament took place in England, and was won by the previous runner-up, Pakistan, who defeated Sri Lanka in the final at Lord's. The third tournament was held in 2010, hosted by the countries making up the West Indies cricket team."""

# Prompt 2: a "question: ... context: ..." pair. NOTE(review): defined but not used in this cell.
q2 = """question: What does increased oxygen concentrations in the patient’s
lungs displace? context: Hyperbaric (high-pressure) medicine uses special
oxygen chambers to increase the partial pressure of O 2 around the patient
and, when needed, the medical staff. Carbon monoxide poisoning, gas gangrene,
and decompression sickness (the ’bends’) are sometimes treated using these
devices. Increased O 2 concentration in the lungs helps to displace carbon
monoxide from the heme group of hemoglobin. Oxygen gas is poisonous to the
anaerobic bacteria that cause gas gangrene, so increasing its partial pressure
helps kill them. Decompression sickness occurs in divers who decompress too
quickly after a dive, resulting in bubbles of inert gas, mostly nitrogen and
helium, forming in their blood. Increasing the pressure of O 2 as soon as
possible is part of the treatment."""

# Encode q1, generate with default settings, and print each decoded output.
input_ids = tokenizer(q1, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)
for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

<h3>T5 For Question Answering</h3>

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Rebinds the notebook-level `model_path`, `tokenizer` and `model` to the local "t5-base" checkpoint.
model_path = "../models/t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [None]:
# Earlier prompts, kept for reference:
# q0 = "question: How many tournaments have been held so far? context: Six tournaments have so far been played, and only the West Indies, who currently hold the title, has won the tournament on multiple occasions. The inaugural 2007 World Twenty20, was staged in South Africa, and won by India, who defeated Pakistan in the final at the Wanderers Stadium in Johannesburg. The 2009 tournament took place in England, and was won by the previous runner-up, Pakistan, who defeated Sri Lanka in the final at Lord's. The third tournament was held in 2010, hosted by the countries making up the West Indies cricket team."

#q = "I like Mercedes."
#c = """Multan is a city and capital of Multan Division located in Punjab, Pakistan. Situated on the bank of the Chenab River, Multan is Pakistan's 7th largest city and is the major cultural and economic centre of Southern Punjab. Multan's history stretches deep into antiquity. The ancient city was site of the renowned Hindu Multan Sun Temple, and was besieged by Alexander the Great during the Mallian Campaign. Multan was one of the most important trading centres of medieval Islamic India, and attracted a multitude of Sufi mystics in the 11th and 12th centuries, earning the city the sobriquet "City of Saints". The city, along with the nearby city of Uch, is renowned for its large number of Sufi shrines dating from that era."""

# Step 1: build a "question: ... context: ..." prompt and generate an answer.
q = "Is Karachi liberal?"
c = "Karachi is Pakistan's most cosmopolitan city, linguistically, ethnically, and religiously diverse, as well as one of Pakistan's most secular and socially liberal cities."
question = f"question: {q} context: {c}"
# q1 = "ask_question: Six tournaments have so far been played, and only the West Indies, who currently hold the title, has won the tournament on multiple occasions. The inaugural 2007 World Twenty20, was staged in South Africa, and won by India, who defeated Pakistan in the final at the Wanderers Stadium in Johannesburg. The 2009 tournament took place in England, and was won by the previous runner-up, Pakistan, who defeated Sri Lanka in the final at Lord's. The third tournament was held in 2010, hosted by the countries making up the West Indies cricket team."
# q2 = "ask_question: Organised by cricket's governing body, the International Cricket Council (ICC), the tournament currently consists of 16 teams, comprising the top ten teams from the rankings at the given deadline and six other teams chosen through the T20 World Cup Qualifier."
input_ids = tokenizer(question, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)

# Print and collect every decoded answer.
decoded_outputs = []
for output in outputs:
    decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    print(decoded_output)
    decoded_outputs.append(decoded_output)
    

# Step 2: feed the first answer back as the hypothesis of an "mnli" prompt whose
# premise is the context followed by the question, and print the model's output.
# decoded_outputs[0] = "Mumbai"
inp = f"mnli hypothesis: {decoded_outputs[0]} . premise: {c} {q}"

input_ids = tokenizer(inp, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)

for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

<h4>No Answer Sanity Check</h4>

In [None]:
# Build a "qnli" prompt from the context `c` left behind by the previous cell and print the model's output.
# NOTE(review): in the visible code `c` is the Karachi sentence, while the question here is about Multan
# (the Multan context is commented out above) — confirm which context was intended.
inp = f"qnli question: {c} Which river is near Multan? The Chenab River . sentence: I also don't know what you're talking about"

input_ids = tokenizer(inp, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)

for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

<h4>Contradiction Check - For QA</h4>

In [None]:
# "mnli" prompt: the hypothesis is a candidate answer, the premise is a fact followed by the question.
# Uses the notebook-level `tokenizer` and `model`; prints each decoded output.
inp = f"mnli hypothesis: Daimler Benz . premise: Karl Benz founded Mercedes in 1901. Who founded Mercedes?"

input_ids = tokenizer(inp, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)

for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

<h3>Contradiction Check - for answer fragments</h3>

In [None]:
# Same "mnli" prompt shape, but here the premise also ends with the answer ("Karl Benz")
# while the hypothesis names a different person. Prints each decoded output.
inp = f"mnli hypothesis: Henry Ford. premise: Karl Benz founded Mercedes in 1901. Who founded Mercedes? Karl Benz"

input_ids = tokenizer(inp, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)

for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

In [None]:
# "summarize:" prompt made of a passage followed by an open-ended question; prints each decoded output.
t20_text = "Organised by cricket's governing body, the International Cricket Council (ICC), the tournament currently consists of 16 teams, comprising the top ten teams from the rankings at the given deadline and six other teams chosen through the T20 World Cup Qualifier"
inp = f"summarize: {t20_text}. What do you think about the qualifiers?"

input_ids = tokenizer(inp, return_tensors='pt', add_special_tokens=True).input_ids
outputs = model.generate(input_ids)

for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))