In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
import jsonlines
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, get_linear_schedule_with_warmup, AdamW
from tqdm.notebook import tqdm_notebook
import time
import datetime

In [None]:
train_bool_df = pd.read_json('./BoolQ/train.jsonl', lines=True)
val_bool_df = pd.read_json('./BoolQ/val.jsonl', lines=True)

In [None]:
bool_df_all = train_bool_df.append(val_bool_df)
bool_df_all = bool_df_all.dropna()

In [None]:
true_df = bool_df_all[bool_df_all.label==True]
true_df = true_df.sample(frac=1).reset_index(drop=True).head(5000)

In [None]:
false_df = bool_df_all[bool_df_all.label==False]

In [None]:
bool_req = true_df.append(false_df)
bool_req = bool_req.sample(frac=1).reset_index(drop=True)

In [None]:
squad_full = pd.read_csv('./SQuAD_csv.csv')

In [None]:
squad_full = squad_full.sample(frac=1).reset_index(drop=True)

In [None]:
squad_req = squad_full.head(10000)

In [None]:
squad_req = squad_req.drop(["Unnamed: 0","id","answer_start"], axis=1)

In [None]:
t5_tok = T5Tokenizer.from_pretrained("t5-large")

### boolq prep

In [None]:
def input_target_text_boolq(df):
    """Build the model input and target strings for boolean question generation.

    Each row becomes one training pair:

    * input:  ``"boolqgen answer: <label> context: <passage>"``
    * target: ``"<question>?"``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing ``label``, ``passage`` and ``question`` columns.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(input_text_arr, target_text_arr)``, both in the row order of ``df``.
    """
    prefix = 'boolqgen'

    input_text_arr = []
    target_text_arr = []

    # Iterate over the column values directly instead of looking rows up by
    # index label, so the row order is used regardless of what the index holds.
    for label, passage, question in zip(df["label"], df["passage"], df["question"]):
        # The label is interpolated rather than concatenated: a boolean label
        # would make `str + bool` raise TypeError.
        input_text_arr.append(f"{prefix} answer: {label} context: {passage}")
        target_text_arr.append(f"{question}?")

    return (input_text_arr, target_text_arr)

In [None]:
bool_input_text, bool_target_text = input_target_text_boolq(bool_req)

In [None]:
bool_df = pd.DataFrame(list(zip(bool_input_text, bool_target_text)), columns = ['input_text', 'target_text'])

In [None]:
bool_inp_text = bool_df.input_text.values
bool_target_text = bool_df.target_text.values

In [None]:
bool_inp_ids = []
bool_inp_att_mask = []
for ctx in tqdm_notebook(bool_inp_text):
    tok = t5_tok.encode_plus(ctx, max_length = 512, truncation = True, 
                             return_tensors = 'pt', return_token_type_ids = False, padding = 'max_length',
                             return_attention_mask = True, add_special_tokens = True)
    bool_inp_ids.append(tok.input_ids)
    bool_inp_att_mask.append(tok.attention_mask)

In [None]:
bool_target_ids = []
bool_target_att_mask = []
for ctx in tqdm_notebook(bool_target_text):
    tok = t5_tok.encode_plus(ctx, max_length = 32, truncation = True, 
                             return_tensors = 'pt', return_token_type_ids = False, padding = 'max_length',
                             return_attention_mask = True, add_special_tokens = True)
    bool_target_ids.append(tok.input_ids)
    bool_target_att_mask.append(tok.attention_mask)

In [None]:
bool_inp_ids = torch.cat(bool_inp_ids, dim=0)
bool_inp_att_mask = torch.cat(bool_inp_att_mask, dim=0)
bool_target_ids = torch.cat(bool_target_ids, dim=0)
bool_target_att_mask = torch.cat(bool_target_att_mask, dim=0)

In [None]:
bool_tensor_dataset = torch.utils.data.TensorDataset(bool_inp_ids, bool_inp_att_mask, bool_target_ids, bool_target_att_mask)

In [None]:
train_size = int(len(bool_tensor_dataset)*.95)
val_size = len(bool_tensor_dataset) - train_size

bool_train_dataset, bool_val_dataset = torch.utils.data.random_split(bool_tensor_dataset, [train_size, val_size])

### squad prep

In [None]:
def input_target_text_squad(df):
    """Build the model input and target strings for SQuAD question generation.

    For each index label ``i`` of ``df`` the input is
    ``"qgen answer: <text[i]> context: <context[i]>"`` and the target is
    ``question[i]`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing ``text``, ``context`` and ``question`` columns.

    Returns
    -------
    tuple of (list, list)
        ``(input_text_arr, target_text_arr)``, ordered like ``df.index``.
    """
    prefix = 'qgen'

    # Prompts: task prefix, then the answer span, then the supporting context.
    input_text_arr = [
        f"{prefix} answer: {df.text[i]} context: {df.context[i]}"
        for i in df.index
    ]
    # Targets: the reference question for the same row.
    target_text_arr = [df.question[i] for i in df.index]

    return (input_text_arr, target_text_arr)

In [None]:
squad_input_text, squad_target_text = input_target_text_squad(squad_req)
squad_df = pd.DataFrame(list(zip(squad_input_text, squad_target_text)), columns = ['input_text', 'target_text'])

In [None]:
squad_inp_text = squad_df.input_text.values
squad_target_text = squad_df.target_text.values

In [None]:
squad_inp_ids = []
squad_inp_att_mask = []
for ctx in tqdm_notebook(squad_inp_text):
    tok = t5_tok.encode_plus(ctx, max_length = 512, truncation = True, 
                             return_tensors = 'pt', return_token_type_ids = False, padding = 'max_length',
                             return_attention_mask = True, add_special_tokens = True)
    squad_inp_ids.append(tok.input_ids)
    squad_inp_att_mask.append(tok.attention_mask)

In [None]:
squad_target_ids = []
squad_target_att_mask = []
for ctx in tqdm_notebook(squad_target_text):
    tok = t5_tok.encode_plus(ctx, max_length = 32, truncation = True, 
                             return_tensors = 'pt', return_token_type_ids = False, padding = 'max_length',
                             return_attention_mask = True, add_special_tokens = True)
    squad_target_ids.append(tok.input_ids)
    squad_target_att_mask.append(tok.attention_mask)

In [None]:
squad_inp_ids = torch.cat(squad_inp_ids, dim=0)
squad_inp_att_mask = torch.cat(squad_inp_att_mask, dim=0)
squad_target_ids = torch.cat(squad_target_ids, dim=0)
squad_target_att_mask = torch.cat(squad_target_att_mask, dim=0)

In [None]:
squad_tensor_dataset = torch.utils.data.TensorDataset(squad_inp_ids, squad_inp_att_mask, squad_target_ids, squad_target_att_mask)

In [None]:
train_size = int(len(squad_tensor_dataset)*.95)
val_size = len(squad_tensor_dataset) - train_size

squad_train_dataset, squad_val_dataset = torch.utils.data.random_split(squad_tensor_dataset, [train_size, val_size])

### modelling

In [None]:
# Merge the BoolQ and SQuAD splits into one training set and one validation set.
# NOTE(review): this cell previously also added `boolans_train_dataset` and
# `boolans_val_dataset`, but no cell in this notebook creates those names, so it
# raised NameError on a fresh kernel. Add them back to the lists below once
# their preparation cells exist.
train_dataset = torch.utils.data.ConcatDataset([bool_train_dataset, squad_train_dataset])
val_dataset = torch.utils.data.ConcatDataset([bool_val_dataset, squad_val_dataset])

In [None]:
BATCH_SIZE = 16

train_loader = torch.utils.data.DataLoader(train_dataset, sampler = torch.utils.data.RandomSampler(train_dataset),
                                      batch_size = BATCH_SIZE)

val_loader = torch.utils.data.DataLoader(val_dataset, sampler = torch.utils.data.SequentialSampler(val_dataset),
                                     batch_size = BATCH_SIZE)

In [None]:
t5_model = T5ForConditionalGeneration.from_pretrained("t5-large")

In [None]:
t5_model.parallelize()

In [None]:
optimizer = AdamW(t5_model.parameters(), lr=5e-5, eps=1e-8)

In [None]:
EPOCHS = 5

total_steps = len(train_loader)*EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
def format_time(elapsed):
    """Render a duration as text.

    ``elapsed`` is a number of seconds. It is rounded to a whole second and
    returned in the string form of ``datetime.timedelta`` (for example
    ``'1:01:01'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Seed the CPU generator as well as the CUDA ones: `torch.cuda.manual_seed_all`
# alone leaves the CPU RNG (used by RandomSampler to shuffle batches) unseeded.
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# One dict per epoch: average losses and wall-clock durations.
t5_prefixed_training_stats = []

# Padded label positions are replaced by -100 so the loss skips them.
pad_token_id = t5_tok.pad_token_id

for epoch in tqdm_notebook(range(EPOCHS)):

    print("============ EPOCH {} / {} ===========".format(epoch+1, EPOCHS))

    #================TRAINING=================#

    print('....TRAINING....')

    # Restart the timer every epoch; otherwise epochs after the first would
    # also count the previous validation pass as training time.
    t0 = time.time()

    # running sum of the per-batch training loss..
    total_prefixed_train_loss = 0

    # train mode: layers such as dropout are active..
    t5_model.train()

    for nth_batch, batch in enumerate(train_loader):
        # logging the progress..
        if (nth_batch+1)%500 == 0:
            print("\nProcessed {} of {} batches".format(nth_batch+1, len(train_loader)))

        # batch layout: (input ids, input mask, target ids, target mask)..
        prefixed_inp_ids = batch[0].to('cuda:0')
        prefixed_inp_mask = batch[1].to('cuda:0')
        labels = batch[2].to('cuda:0')
        # targets are padded to a fixed length; without this mask the loss
        # would also be computed on the padding positions..
        labels = labels.masked_fill(labels == pad_token_id, -100)

        # clear previously calculated gradients..
        t5_model.zero_grad()

        # forward pass (passing `labels` makes the model return the loss)..
        train_out = t5_model(input_ids = prefixed_inp_ids,
                             attention_mask = prefixed_inp_mask,
                             labels = labels)
        loss = train_out.loss

        # adding to total loss..
        total_prefixed_train_loss += loss.item()  # '.item()' gives value from tensor

        # backward pass to calculate gradients..
        loss.backward()

        # clipping norm of gradients to '1' to prevent exploding gradients problem..
        torch.nn.utils.clip_grad_norm_(t5_model.parameters(), 1.0)  # '_' is inplace operator

        # update parameters and take a step using computed gradient..
        optimizer.step()

        # update learning rate..
        scheduler.step()

    # average out total loss..
    prefixed_avg_train_loss = total_prefixed_train_loss / len(train_loader)

    # total train time for this epoch..
    total_prefixed_train_time = format_time(time.time()-t0)

    print("\n>>>>>Average training loss {}".format(str(prefixed_avg_train_loss)))
    print("\n>>>>>Training epoch took {}".format(total_prefixed_train_time))

    #===============VALIDATING================#

    print("\n.....VALIDATING......")

    t0 = time.time()

    # eval mode: layers such as dropout are disabled..
    t5_model.eval()

    # running sum of the per-batch validation loss..
    total_prefixed_eval_loss = 0

    for nth_batch, batch in enumerate(val_loader):
        # logging the progress..
        if (nth_batch+1)%30 == 0:
            print("\nProcessed {} of {} batches".format(nth_batch+1, len(val_loader)))

        # same batch layout and label masking as in training..
        prefixed_inp_ids_val = batch[0].to('cuda:0')
        prefixed_inp_mask_val = batch[1].to('cuda:0')
        labels_val = batch[2].to('cuda:0')
        labels_val = labels_val.masked_fill(labels_val == pad_token_id, -100)

        # no gradients are needed for validation, so skip building the graph..
        with torch.no_grad():
            val_out = t5_model(input_ids = prefixed_inp_ids_val,
                               attention_mask = prefixed_inp_mask_val,
                               labels = labels_val)

        total_prefixed_eval_loss += val_out.loss.item()

        # NOTE(review): a call to `qgen(logits, labels)` used to follow here, but
        # no `qgen` is defined anywhere in this notebook, so it raised NameError
        # on the first validation batch. Restore it once the helper exists.

    prefixed_avg_val_loss = total_prefixed_eval_loss / len(val_loader)

    # total validation time..
    total_prefixed_val_time = format_time(time.time()-t0)

    print("\n>>>>>Average validation loss {}".format(str(prefixed_avg_val_loss)))
    print("\n Validation epoch took {}".format(total_prefixed_val_time))

    # keep a per-epoch record so the run can be inspected afterwards..
    t5_prefixed_training_stats.append({
        'epoch': epoch + 1,
        'train_loss': prefixed_avg_train_loss,
        'val_loss': prefixed_avg_val_loss,
        'train_time': total_prefixed_train_time,
        'val_time': total_prefixed_val_time,
    })
    
        

In [None]:
# Directory that receives both the fine-tuned weights and the tokenizer files.
QGEN_OUTPUT_DIR = 'text_qgen'

# Undo the model-parallel placement and move the model to the CPU before saving.
t5_model.deparallelize()
t5_model.to('cpu')

t5_model.save_pretrained(QGEN_OUTPUT_DIR)
t5_tok.save_pretrained(QGEN_OUTPUT_DIR)