In [None]:
# Upgrade transformers to the newest available release.
# NOTE(review): no version is pinned, so the installed API depends on when this cell is run.
!pip install transformers -U
# langdetect supplies the `detect` function imported in the next cell.
!pip install langdetect

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch import optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import random
import json
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torchtext.data.metrics import bleu_score
from langdetect import detect
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; generate() calls it and reads the `.ents` of the result.
spacy_nlp = spacy.load("en_core_web_sm")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

In [None]:
# Global settings shared by the tokenizer wrapper, the training loop and generate().
config = {
    # -- prompt layout / special tokens ---------------------------------------
    "task": "ask_question: ",        # prefix placed in front of every encoded context
    "start_ans": "<extra_id_1>",     # marker inserted right before the answer span
    "end_ans": "<extra_id_1>",       # marker inserted right after the answer span
    "end_token": "</s>",             # appended to every context and question
    # -- fixed sequence lengths -----------------------------------------------
    "max_len_context": 512,
    "max_len_question": 20,
    # -- optimisation ---------------------------------------------------------
    "batch_size": 8,
    "epoch": 2,
    "learning_rate": 5e-4,
    "schedule_rate": 0.83,
    "period_decay": 3,
    "accumulation_step": 32,         # micro-batches accumulated per optimizer step
    # -- output ---------------------------------------------------------------
    "path_model": "t5_question_generation.pth",
}

# Generation settings registered on the T5 config under the "ask_question" task.
# The length limit and the prefix reuse the values declared in `config`.
config_ask_question = dict(
    early_stopping=True,
    max_length=config["max_len_question"],
    min_length=3,
    num_beams=4,
    prefix=config["task"],
)

# Start from the stock 't5-base' configuration and attach the task entry to it.
configT5_model = T5Config.from_pretrained('t5-base')
configT5_model.task_specific_params['ask_question'] = config_ask_question

In [None]:
#Tokenizer
class Tokenizer:
    """Wrapper around the pretrained 't5-base' tokenizer.

    Builds the two text layouts used in this notebook (answer-highlighted
    context and target question) and encodes each one to a fixed length
    taken from the global `config`.
    """

    def __init__(self,):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')

    def _encode_fixed_length(self, text, max_length):
        """Encode `text` with the wrapped tokenizer, padding up to `max_length`."""
        return self.tokenizer.encode_plus(text, max_length=max_length, pad_to_max_length=True)

    def encode_context(self, context, start_ans, end_ans):
        """
            Encode a context in which the answer span is highlighted.

            Layout of the encoded text:
                "<task><context before answer><start_ans>answer<end_ans><context after answer> <end_token>"

            context   : the passage text
            start_ans : character offset where the answer starts in `context`
            end_ans   : character offset just past the end of the answer

            return : the value produced by the tokenizer's encode_plus
                     (callers in this file read 'input_ids' and 'attention_mask')
        """
        pieces = (
            config['task'],
            context[:start_ans],
            config['start_ans'],
            context[start_ans:end_ans],
            config['end_ans'],
            context[end_ans:],
            " ",
            config['end_token'],
        )
        return self._encode_fixed_length("".join(pieces), config['max_len_context'])

    def encode_question(self, question):
        """
            Encode a target question.

            Layout of the encoded text: "question <end_token>"

            return : the value produced by the tokenizer's encode_plus
        """
        text = " ".join((question, config['end_token']))
        return self._encode_fixed_length(text, config['max_len_question'])

In [None]:
# dataset
class QDataset(Dataset):
    """Map-style dataset over a CSV of (context, question, answer span) rows.

    Each item is tokenized on access through the supplied tokenizer wrapper.
    The CSV must provide the columns 'context', 'question', 'start_ans'
    and 'end_answer'.
    """

    def __init__(self, filename, tokenizer):
        self.data = pd.read_csv(filename)
        self.tokenizer = tokenizer
        # Rows are shuffled once, right after loading.
        self.shuffle()

    def shuffle(self):
        """Replace the frame with a random permutation of all of its rows."""
        self.data = self.data.sample(frac=1)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return (context ids, context mask, question ids, question mask) for one row."""
        position = idx.tolist() if torch.is_tensor(idx) else idx
        row = self.data.iloc[position]

        encoded_context = self.tokenizer.encode_context(
            row['context'], row['start_ans'], row['end_answer']
        )
        encoded_question = self.tokenizer.encode_question(row['question'])

        return (
            encoded_context['input_ids'],
            encoded_context['attention_mask'],
            encoded_question['input_ids'],
            encoded_question['attention_mask'],
        )

In [None]:
# model
class QGModel(nn.Module):
    """T5 conditional-generation model wrapped for question generation.

    `forward` is used for training (it forwards the question ids as labels),
    `predict` is used for inference (it calls the model's `generate`).
    """

    def __init__(self, configT5):
        super().__init__()
        self.t5_model = T5ForConditionalGeneration.from_pretrained('t5-base', config=configT5)

    def forward(self, input_ids_ctx, attention_mask_ctx, input_ids_qt = None, attention_mask_qt = None):
        """Run the wrapped T5 model on an encoded context.

        input_ids_ctx      : token ids of the answer-highlighted context
        attention_mask_ctx : attention mask matching `input_ids_ctx`
        input_ids_qt       : token ids of the target question, passed as labels
        attention_mask_qt  : attention mask of the target question, passed as
                             the decoder attention mask

        Returns whatever the wrapped model returns.
        """
        # The target ids go in through `labels`; the older `lm_labels` keyword
        # is not accepted by current transformers releases.
        # NOTE(review): padded label positions are passed through unchanged -
        # confirm whether they should be masked out of the loss.
        return self.t5_model(
            input_ids=input_ids_ctx,
            attention_mask=attention_mask_ctx,
            decoder_attention_mask=attention_mask_qt,
            labels=input_ids_qt,
        )

    def predict(self, intput_ids_ctx, attention_mask=None, **generate_kwargs):
        """Generate question token ids for an encoded context.

        intput_ids_ctx  : token ids of the answer-highlighted context
        attention_mask  : optional attention mask for `intput_ids_ctx`
        generate_kwargs : extra keyword arguments forwarded to `generate`
                          (for example num_beams or max_length)
        """
        # The mask must be passed by keyword: the second positional parameter
        # of `generate` is not the attention mask.
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask
        return self.t5_model.generate(intput_ids_ctx, **generate_kwargs)

In [None]:
def evaluate(model, data_loader, tokenizer, device):
    """Generate a question for every validation example and score the result.

    model       : object exposing `eval()` and `predict(input_ids)`
    data_loader : iterable of batches; batch[0] holds the context ids and
                  batch[2] the question ids, each as a list of per-position
                  tensors that is stacked along dim 1
    tokenizer   : wrapper whose `.tokenizer.decode` turns ids back into text
    device      : device the id tensors are moved to

    Returns (score, predictions). `predictions` is a list of
    (ground_truth, predicted_question) string pairs. `score` is the corpus
    BLEU of the predictions against the ground truth on whitespace tokens,
    or 0 when there is nothing to score.
    """
    predictions = []
    model.eval()
    decode = tokenizer.tokenizer.decode

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids_ctx = torch.stack(batch[0], dim=1).to(device)
            input_ids_qt = torch.stack(batch[2], dim=1).to(device)
            output = model.predict(input_ids_ctx)

            for reference_ids, generated_ids in zip(input_ids_qt.tolist(), output.tolist()):
                # Drop padding / end-of-sequence markers so they neither end
                # up in the stored text nor inflate the metric.
                ground_truth = decode(reference_ids, skip_special_tokens=True)
                predicted_question = decode(generated_ids, skip_special_tokens=True)
                predictions.append((ground_truth, predicted_question))

    score = 0
    if predictions:
        candidates = [predicted.split() for _, predicted in predictions]
        # bleu_score expects a list of references per candidate.
        references = [[truth.split()] for truth, _ in predictions]
        score = bleu_score(candidates, references)
    return score, predictions

In [None]:
def trainEpoch(model, optimizer, train_data_loader, val_data_loader, tokenizer, device):
    model.train()
    optimizer.zero_grad()
    losses = []
    for step, batch in enumerate(tqdm(train_data_loader)):
        input_ids_ctx = torch.stack(batch[0], dim=1).to(device)
        attention_mask_ctx = torch.stack(batch[1], dim=1).to(device)
        input_ids_qt = torch.stack(batch[2], dim=1).to(device)
        attention_mask_qt = torch.stack(batch[3], dim=1).to(device)
        
        loss, _, _, _ = model(input_ids_ctx, attention_mask_ctx, input_ids_qt=input_ids_qt, attention_mask_qt=attention_mask_qt)
        loss /= config['accumulation_step']
        loss.backward()
        
        if (step + 1) % config['accumulation_step'] == 0:
            optimizer.step()
            optimizer.zero_grad()
            losses.append(loss.item())
    
    score, predictions = evaluate(model, val_data_loader, tokenizer, device)
    
    return losses, predictions

In [None]:
# global training
def globalTraining(model, optimizer, train_dataset, val_dataset, tokenizer, device):
    """Run the full training schedule and persist results after every epoch.

    For each of config['epoch'] epochs this calls trainEpoch(), prints the
    first and last logged loss, overwrites the checkpoint at
    config['path_model'] and writes `losses<N>.json` / `prediction<N>.json`
    (N is the 1-based epoch number) to the working directory.

    model, optimizer           : trained in place
    train_dataset, val_dataset : datasets wrapped in DataLoaders with
                                 config['batch_size']
    tokenizer, device          : passed through to trainEpoch()
    """
    model.to(device)
    n_epochs = config['epoch']

    # The loaders do not change between epochs; the training loader still
    # reshuffles on every pass because shuffle=True.
    train_data_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_data_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

    for epoch in range(n_epochs):
        epoch_number = epoch + 1
        losses, prediction = trainEpoch(model, optimizer, train_data_loader, val_data_loader, tokenizer, device)

        # An epoch may log no loss at all; do not crash before saving.
        first_loss = losses[0] if losses else None
        last_loss = losses[-1] if losses else None
        # Report the same 1-based epoch number that is used in the file names.
        print("Epoch {}/{}, loss = {} ----> {}".format(epoch_number, n_epochs, first_loss, last_loss))

        # The "optimizer_sate_dict" spelling is kept as is so the checkpoint
        # layout stays compatible with files that were already written.
        checkpoint = {
            "model_state_dict" : model.state_dict(),
            "optimizer_sate_dict" : optimizer.state_dict()
        }
        torch.save(checkpoint, config['path_model'])
        with open("losses{}.json".format(epoch_number), "w") as file:
            json.dump(losses, file)
        with open("prediction{}.json".format(epoch_number), "w") as file:
            json.dump(prediction, file)

# Training

In [None]:
# Build the tokenizer wrapper; its constructor loads the pretrained 't5-base' tokenizer.
tokenizer = Tokenizer()

In [None]:
# Read the squad2.0.csv file from the Kaggle input directory; QDataset shuffles the rows on load.
dataset = QDataset("/kaggle/input/squad20csv/squad2.0.csv", tokenizer)

In [None]:
# Keep 80000 examples for training and hold out every remaining row for
# validation. The validation size is derived from the dataset so the split
# does not fail when the CSV does not contain exactly 86562 rows.
train_size = 80000
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
# NOTE(review): `loader` is not referenced by any later cell shown here; globalTraining builds its own loaders.
loader = DataLoader(train_dataset, batch_size=8)

In [None]:
# Instantiate the T5 wrapper from the pretrained 't5-base' weights using the customised config.
model = QGModel(configT5_model)

In [None]:
# Take the learning rate and the decay period from `config` instead of the
# Adam default and a literal, so the declared hyper-parameters are the ones used.
optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
# NOTE(review): `scheduler.step()` is not called anywhere in the training code
# shown in this notebook, so this decay schedule currently has no effect.
scheduler = optim.lr_scheduler.StepLR(optimizer, config['period_decay'], config['schedule_rate'])

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Train for config['epoch'] epochs; a checkpoint and loss/prediction JSON files are written after each epoch.
globalTraining(model, optimizer, train_dataset, val_dataset, tokenizer, device)

# Generate questions from a context

In [None]:
# Tokenizer for the inference section (presumably repeated so this part can run without the training cells).
tokenizer = Tokenizer()

In [None]:
# Device for the inference section: GPU when CUDA is available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Build a fresh model on the target device, then restore the fine-tuned weights from a saved checkpoint.
model = QGModel(configT5_model).to(device)
# map_location remaps the stored tensors onto `device`.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load('/kaggle/input/questiongt5/t5_question_generation.pth', map_location=device)
# `state_dict` holds the whole checkpoint dict; only its 'model_state_dict' entry is used here.
model.load_state_dict(state_dict['model_state_dict'])

In [None]:
def generate(model, tokenizer, context, device, list_dict_answers = None):
    """Generate one question per answer span found in or supplied for `context`.

    Answer spans come from two sources: every named entity spaCy detects in
    `context`, followed by every span listed in `list_dict_answers`.

    model             : object exposing `predict(input_ids)`
    tokenizer         : wrapper exposing `encode_context` and `.tokenizer`
                        (with `tokenize` and `decode`)
    context           : English passage to ask questions about
    device            : device the encoded inputs are moved to
    list_dict_answers : optional list of dicts with the character offsets
                        'start_ans' and 'end_ans' of extra answers in `context`

    Returns a list of {"question": str, "answer": str} dicts, entities first.

    Raises ValueError when the context is too long or is not detected as English.
    """
    # check the length of context
    if len(tokenizer.tokenizer.tokenize(context)) >= config['max_len_context'] - 4:
        raise ValueError("context too long")

    # check whether the context is in english
    lang = detect(context)
    if lang != 'en':
        raise ValueError('context should be in english')

    # Collect every answer span as (start offset, end offset, answer text):
    # named entities first, then the spans supplied by the caller.
    spans = [(ent.start_char, ent.end_char, ent.text) for ent in spacy_nlp(context).ents]
    for item in list_dict_answers or []:
        start_ans, end_ans = item['start_ans'], item['end_ans']
        spans.append((start_ans, end_ans, context[start_ans:end_ans]))

    # Encode each span on its own and predict its question. Every span gets
    # its own encoding, so a caller-supplied answer can never reuse the
    # encoding of a previous entity.
    results = []
    for start_ans, end_ans, answer in spans:
        encoded = tokenizer.encode_context(context, start_ans, end_ans)
        input_ids = torch.LongTensor([encoded['input_ids']]).to(device)
        predicted_question = model.predict(input_ids).squeeze(0)
        results.append(
            {
                "question" : tokenizer.tokenizer.decode(predicted_question.tolist()),
                "answer" : answer
            }
        )

    return results

In [None]:
# Example passage used to demonstrate question generation.
context = "The study of chemical kinetics concerns the second and third questions—that is, the rate at which a reaction yields products and the molecular-scale means by which a reaction occurs. This chapter examines the factors that influence the rates of chemical reactions, the mechanisms by which reactions proceed, and the quantitative techniques used to describe the rates at which reactions occur."

In [None]:
# No extra answers are passed, so one question is generated per named entity spaCy finds in the passage.
generate(model, tokenizer, context, device)