In [None]:
# Pinned training stack: transformers, torchtext/torch/pytorch-lightning, tokenizers, sentencepiece.
!pip install --quiet transformers==4.1.1
!pip install --quiet torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94
# Evaluation packages; NOTE(review): these two installs are not version-pinned.
!pip install --quiet evaluate
!pip install --quiet rouge_score

[K     |████████████████████████████████| 1.5 MB 29.0 MB/s 
[K     |████████████████████████████████| 880 kB 56.3 MB/s 
[K     |████████████████████████████████| 2.9 MB 61.6 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 6.9 MB 28.0 MB/s 
[K     |████████████████████████████████| 776.8 MB 18 kB/s 
[K     |████████████████████████████████| 816 kB 71.1 MB/s 
[K     |████████████████████████████████| 596 kB 71.0 MB/s 
[K     |████████████████████████████████| 141 kB 55.4 MB/s 
[K     |████████████████████████████████| 829 kB 70.3 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu113 requires torch==1.12.1, but you have torch 1.7.1 which is incompatible.
torchaudio 0.12.1+cu113 requires torch

In [None]:

import argparse
import glob
import os
import json
import time
import logging
import random 
import re
from itertools import chain
from string import punctuation
import requests  
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


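'''
Names imported from transformers:
AdamW - the optimizer.
T5ForConditionalGeneration - the T5 model, which generates output text conditioned on the input text.
T5Tokenizer - the T5 tokenizer, chosen because it is fast.
get_linear_schedule_with_warmup - a learning-rate scheduler; the model is trained without a learning-rate schedule.
'''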
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

from gc import collect

import evaluate

from sklearn.model_selection import KFold

import string

In [None]:
def prepare_input_qa(row, dataset):
  """Build the question-answering prompt string for one example.

  `row` is indexed by column name. `dataset` selects the layout: "squad"
  uses a single `context` column, "hotpotqa" uses `context1` and `context2`.
  Any other `dataset` value yields None.
  """
  if dataset == "squad":
    template = "question: %s context: %s </s>"
    columns = ("question", "context")
  elif dataset == "hotpotqa":
    template = "question: %s context1: %s context2: %s </s>"
    columns = ("question", "context1", "context2")
  else:
    return None
  return template % tuple(row[column] for column in columns)

def prepare_input_qg(row, dataset):
  """Build the question-generation prompt string for one example.

  `row` is indexed by column name. `dataset` selects the layout: "squad"
  uses a single `context` column, "hotpotqa" uses `context1` and `context2`.
  Any other `dataset` value yields None.
  """
  if dataset == "squad":
    template = "answer: %s context: %s </s>"
    columns = ("answer", "context")
  elif dataset == "hotpotqa":
    template = "answer: %s context1: %s context2: %s </s>"
    columns = ("answer", "context1", "context2")
  else:
    return None
  return template % tuple(row[column] for column in columns)


def extract_questions_and_answers(factoid_path: Path, all_articles: bool = False):
  """Flatten a SQuAD-format JSON file into one DataFrame row per answer.

  Args:
    factoid_path: path of the JSON file; its top-level "data" list holds
      articles, each with "paragraphs" -> "qas" -> "answers".
    all_articles: when False (the default, matching the previous behaviour)
      only the first entry of "data" is read; pass True to read every entry.

  Returns:
    DataFrame with columns question, context, answer, answer_start and
    answer_end. Questions whose "answers" list is empty contribute no rows.
  """
  # Explicit UTF-8 so the result does not depend on the platform's locale encoding.
  with factoid_path.open(encoding="utf-8") as json_file:
    data = json.load(json_file)

  # NOTE(review): by default only data['data'][0] is used, so the other
  # entries of "data" are ignored -- confirm whether that is intended.
  articles = data['data'] if all_articles else data['data'][:1]

  data_rows = []
  for article in articles:
    for paragraph in article['paragraphs']:
      context = paragraph['context']
      for question_and_answers in paragraph['qas']:
        question_text = question_and_answers['question']
        for answer in question_and_answers['answers']:
          answer_text = answer['text']
          answer_start = answer['answer_start']
          data_rows.append({
              "question" : question_text,
              "context"  : context,
              "answer" : answer_text,
              "answer_start" : answer_start,
              # End index of the answer inside the paragraph.
              "answer_end" : answer_start + len(answer_text)
          })
  return pd.DataFrame(data_rows)
  

def get_squad():
  """Download the SQuAD v1.1 and v2.0 train/dev files and build one DataFrame.

  The four JSON files are saved under squad/, flattened with
  extract_questions_and_answers, concatenated (dev 1.1, train 1.1, train 2.0,
  dev 2.0), de-duplicated on (question, context), and given the `input_qg`
  and `input_qa` prompt columns.

  Raises:
    requests.HTTPError: if a download returns an error status.
  """
  url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

  # Do not rely on a separate cell having created the directory.
  os.makedirs("squad", exist_ok=True)

  for file in ["train-v2.0.json", "dev-v2.0.json", "train-v1.1.json", "dev-v1.1.json"]:
    with requests.get(f'{url}{file}', stream=True, timeout=60) as res:
      # Fail here instead of saving an HTML error page as JSON.
      res.raise_for_status()
      with open(f'squad/{file}', "wb") as f:
        # 1 MiB chunks; the former 4-byte chunks meant one write per 4 bytes.
        for chunk in res.iter_content(chunk_size=1 << 20):
          f.write(chunk)

  frames = [
      extract_questions_and_answers(Path("squad/" + name))
      for name in ("dev-v1.1.json", "train-v1.1.json", "train-v2.0.json", "dev-v2.0.json")
  ]

  # pd.concat keeps the same row order and index labels as the chained
  # DataFrame.append calls it replaces (append was removed in pandas 2.0).
  df = pd.concat(frames)

  df.drop_duplicates(subset=['question', 'context'], inplace=True)

  df["input_qg"] = df.apply(lambda row: prepare_input_qg(row, "squad"), axis=1)
  df["input_qa"] = df.apply(lambda row: prepare_input_qa(row, "squad"), axis=1)

  return df


def filter_context(row):
  """Keep only the context paragraphs that belong to a supporting fact.

  `row["supporting_facts"]` is a sequence whose items start with a title and
  `row["context"]` is a sequence of (title, sentences) pairs. A paragraph is
  kept when its title contains, or is contained in, a supporting-fact title.
  `row["context"]` is replaced by the list of kept sentence lists, ordered by
  first appearance of each title in the supporting facts; `row` is returned.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order. The previous
  # list(set(...)) ordering could change between interpreter runs (string
  # hashing is randomised), which swapped context1/context2 downstream.
  supporting_titles = list(dict.fromkeys(fact[0] for fact in row["supporting_facts"]))

  row["context"] = [
      paragraph[1]
      for title in supporting_titles
      for paragraph in row["context"]
      if title in paragraph[0] or paragraph[0] in title
  ]
  return row

def seperate_context(row):
  """Join the first two context paragraphs into `context1` and `context2`.

  Each of the first two entries of `row["context"]` is a sequence of strings
  that is joined with single spaces. `row` is modified and returned.
  """
  paragraphs = row["context"]
  for position, column in enumerate(("context1", "context2")):
    row[column] = " ".join(paragraphs[position])
  return row

def get_hotpotqa(n_samples=1917, random_state=0):
  """Download the HotpotQA v1.1 training set and build a sampled DataFrame.

  Args:
    n_samples: number of rows to sample from the full file (default 1917,
      the value previously hard-coded).
    random_state: seed for the row sample (default 0, as before).

  Returns:
    DataFrame of the sampled rows with `context1` / `context2` (the supporting
    paragraphs joined into strings), the original `context` column restored,
    and the `input_qg` / `input_qa` prompt columns.

  Raises:
    requests.HTTPError: if the download returns an error status.
  """
  # Do not rely on a separate cell having created the directory.
  os.makedirs("hotpotqa", exist_ok=True)

  with requests.get("http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json", stream=True, timeout=60) as res:
    # Fail here instead of saving an HTML error page as JSON.
    res.raise_for_status()
    with open('hotpotqa/hotpot_train_v1.1.json', "wb") as f:
      # 1 MiB chunks; the former 4-byte chunks meant one write per 4 bytes.
      for chunk in res.iter_content(chunk_size=1 << 20):
        f.write(chunk)

  df = pd.read_json("hotpotqa/hotpot_train_v1.1.json")

  df = df.sample(n_samples, random_state = random_state)
  # filter_context overwrites "context"; keep the originals to restore them below.
  original_contexts = list(df["context"])
  df = df.apply(filter_context, axis = 1)
  df = df.apply(seperate_context, axis = 1)
  df["context"] = original_contexts
  df["input_qg"] = df.apply(lambda row: prepare_input_qg(row, "hotpotqa"), axis=1)
  df["input_qa"] = df.apply(lambda row: prepare_input_qa(row, "hotpotqa"), axis=1)
  return df
  

def create_folds(df : pd.DataFrame, dataset_name : str, n_splits : int = 7, random_state : int = 33):
  """Write K-fold train/val/test CSV files for `df`.

  For fold k (numbered from 1) the files go to
  <dataset_name>_folds/<dataset_name>_fold<k>/{train,val,test}.csv. The
  validation set is carved out of the training part and sized to match the
  test part. The `input_qa` column is dropped from the test file.

  Args:
    df: frame to split; must contain an `input_qa` column.
    dataset_name: prefix used for the output directories.
    n_splits: number of folds (default 7, the value previously hard-coded).
    random_state: seed for the K-fold shuffle (default 33, as before) and
      for the train/validation split, which previously was unseeded and
      therefore differed on every run.
  """
  kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

  for fold_number, (train_index, test_index) in enumerate(kf.split(df), start=1):
    train, test = df.iloc[train_index,:], df.iloc[test_index,:]
    fold_dir = dataset_name + "_folds/" + dataset_name + "_fold" + str(fold_number)
    # exist_ok so re-running the cell does not fail on existing directories.
    os.makedirs(fold_dir, exist_ok=True)
    train, val = train_test_split(
        train,
        test_size = test.shape[0]/train.shape[0],
        random_state = random_state,
    )
    train.to_csv(fold_dir + "/train.csv")
    val.to_csv(fold_dir + "/val.csv")
    # Non in-place drop: `test` is a slice of `df`, and mutating it in place
    # triggers pandas' SettingWithCopyWarning.
    test.drop(columns = ["input_qa"]).to_csv(fold_dir + "/test.csv")

In [None]:
class SquadDataset(Dataset):
  """Dataset that tokenizes question-generation examples on access.

  Each item pairs the prompt stored in the `input_qg` column (model input)
  with the text of the `question` column (generation target).
  """

  def __init__(
      self,
      data:pd.DataFrame,
      tokenizer:T5Tokenizer,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,

      ):
    self.data, self.tokenizer = data, tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def _tokenize(self, text, max_length, truncation):
    """Tokenize `text` to exactly `max_length` ids, returned as "pt" tensors."""
    return self.tokenizer(
      text,
      max_length=max_length,
      padding='max_length',
      truncation=truncation,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

  def __getitem__(self, index: int):
    """Return the raw texts plus flattened input ids, attention mask and labels."""
    example = self.data.iloc[index]

    source = self._tokenize(example["input_qg"], self.source_max_token_len, "only_first")
    target = self._tokenize(example['question'], self.target_max_token_len, True)

    # Target ids equal to 0 are replaced by -100 in the labels.
    target_ids = target['input_ids']
    labels = target_ids.masked_fill(target_ids == 0, -100)

    return {
        "question": example['question'],
        "input_text": example["input_qg"],
        "input_ids": source["input_ids"].reshape(-1),
        "attention_mask": source['attention_mask'].reshape(-1),
        "labels": labels.reshape(-1),
    }

In [None]:
class SquadDataModule(pl.LightningDataModule):
  """Lightning data module wrapping a training frame and a held-out frame.

  `test_df` backs both the validation and the test dataloader.

  Args:
    train_df: rows used for training.
    test_df: rows used for validation and testing.
    tokenizer: tokenizer handed to SquadDataset.
    batch_size: batch size of the train and validation loaders (the test
      loader always uses 1).
    source_max_token_len: padded/truncated length of the model input.
    target_max_token_len: padded/truncated length of the target.
    num_workers: worker processes per DataLoader (default 4, the value
      previously hard-coded).
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer:T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      num_workers: int = 4,
      ):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len
    self.num_workers = num_workers

  def setup(self, stage=None):
    """Build the train and held-out datasets.

    `stage` is accepted (and ignored) because Lightning's setup hook is
    called with a stage argument; a bare `setup()` call still works.
    """
    self.train_dataset = SquadDataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

    self.test_dataset = SquadDataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=self.num_workers
        )

  def val_dataloader(self):
    """Unshuffled loader over the held-out dataset."""
    return DataLoader(
        self.test_dataset,
        batch_size=self.batch_size,
        num_workers=self.num_workers
        )

  def test_dataloader(self):
    """Unshuffled, batch-size-1 loader over the same held-out dataset."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=self.num_workers
        )

In [None]:
class SquadModel(pl.LightningModule):
  """Lightning wrapper around a pretrained T5ForConditionalGeneration.

  The checkpoint name is read from the module-level MODEL_NAME when the
  model is instantiated.
  """

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return its (loss, logits) pair."""
    result = self.model(input_ids, attention_mask=attention_mask, labels=labels)
    return result.loss, result.logits

  def _run_batch(self, batch, metric_name):
    """Forward one batch, log its loss as `metric_name`, return (loss, logits, labels)."""
    input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
    loss, logits = self(input_ids, attention_mask, labels)
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss, logits, labels

  def training_step(self, batch, batch_idx):
    """Log `train_loss` and return the loss together with logits and labels."""
    loss, logits, labels = self._run_batch(batch, "train_loss")
    return {"loss": loss, "predictions":logits, "labels": labels}

  def validation_step(self, batch, batch_idx):
    """Log and return `val_loss`."""
    loss, _, _ = self._run_batch(batch, "val_loss")
    return loss

  def test_step(self, batch, batch_idx):
    """Log and return `test_loss`."""
    loss, _, _ = self._run_batch(batch, "test_loss")
    return loss

  def configure_optimizers(self):
    """AdamW over all parameters with lr=0.0001; no scheduler is returned."""
    return AdamW(self.parameters(), lr=0.0001)

In [None]:
def generate_question(context, trained_model, tokenizer, dataset_name):
  """Generate one question for the prompt stored in `context["input_qg"]`.

  Args:
    context: mapping / row holding the `input_qg` prompt.
    trained_model: object exposing the seq2seq network as `.model`.
    tokenizer: tokenizer used both to encode the prompt and decode the output.
    dataset_name: "squad" or "hotpotqa"; selects the length limits.

  Returns:
    The decoded generation(s) concatenated into one string.

  Raises:
    ValueError: if `dataset_name` is not a known dataset (previously this
      surfaced as an UnboundLocalError further down).
  """
  # (source_max_len, target_max_len) per dataset. The squad source length is
  # 387 to agree with the value cross_val trains with; it used to be 386 here.
  length_limits = {
      "squad": (387, 34),
      "hotpotqa": (528, 100),
  }
  if dataset_name not in length_limits:
    raise ValueError("Unknown dataset_name: %r (expected 'squad' or 'hotpotqa')" % (dataset_name,))
  source_max_len, target_max_len = length_limits[dataset_name]

  source_encoding=tokenizer(
      context["input_qg"],
      max_length=source_max_len,
      padding="max_length",
      truncation="only_first",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  # Put the inputs on the model's device so generation also works when the
  # model has been moved off the CPU.
  device = trained_model.model.device

  generated_ids = trained_model.model.generate(
      input_ids=source_encoding["input_ids"].to(device),
      attention_mask=source_encoding["attention_mask"].to(device),
      num_beams=1,  # greedy search
      max_length=target_max_len,
      repetition_penalty=2.5,
      early_stopping=True,
      use_cache=True)

  preds = [
      tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for generated_id in generated_ids
  ]

  return "".join(preds)


In [None]:
def calculate_scores(test:pd.DataFrame):
  """Average per-row BLEU, ROUGE-1/2/L and METEOR of generated questions.

  Each row's `t5_questions` value (prediction) is scored against its
  `question` value (reference). Every per-row score is rounded to two
  decimals and scaled by 100 before averaging; each average is rounded to
  two decimals.

  Returns:
    Tuple (bleu, rouge1, rouge2, rougeL, meteor).

  Raises:
    ValueError: if `test` has no rows (previously a ZeroDivisionError).
  """
  if len(test) == 0:
    raise ValueError("calculate_scores needs at least one row to score")

  rouge = evaluate.load('rouge')
  bleu = evaluate.load("bleu")
  meteor = evaluate.load("meteor")

  def as_percent(score):
    return round(score, 2) * 100

  bleu_scores = []
  rouge1_scores = []
  rouge2_scores = []
  rougeL_scores = []
  meteor_scores = []

  for i in range(len(test)):
    row = test.iloc[i, :]
    predictions = [row["t5_questions"]]
    references = [row["question"]]

    bleu_scores.append(as_percent(bleu.compute(predictions=predictions, references=references)["bleu"]))

    # One ROUGE call yields all three variants; it used to be recomputed
    # separately for each of them.
    rouge_result = rouge.compute(predictions=predictions, references=references)
    rouge1_scores.append(as_percent(rouge_result["rouge1"]))
    rouge2_scores.append(as_percent(rouge_result["rouge2"]))
    rougeL_scores.append(as_percent(rouge_result["rougeL"]))

    meteor_scores.append(as_percent(meteor.compute(predictions=predictions, references=references)["meteor"]))

  def average(scores):
    return round(sum(scores) / len(scores), 2)

  return (
      average(bleu_scores),
      average(rouge1_scores),
      average(rouge2_scores),
      average(rougeL_scores),
      average(meteor_scores),
  )

In [None]:
def cross_val(dataset_name : str, start_fold : int = 7):
  """Train, generate and score on the prepared folds of one dataset.

  For every fold from `start_fold` up to the number of entries in
  <dataset_name>_folds/, this fits a fresh SquadModel on the fold's
  train/val CSVs, reloads checkpoints/bestcheckpoint.ckpt, generates a
  question for every test row, saves them to test_with_t5_questions.csv,
  appends the metrics to t5_questions_eval.txt, and deletes the checkpoint.

  Relies on the module-level MODEL_NAME, BATCH_SIZE and `trainer`.

  Args:
    dataset_name: "squad" or "hotpotqa".
    start_fold: first fold number to process. The default of 7 is the value
      that used to be hard-coded; pass 1 to process every fold.

  Raises:
    ValueError: if `dataset_name` is not a known dataset (previously an
      UnboundLocalError once the data module was built).
  """
  if dataset_name == "squad":
    source_max_token_len=387
    target_max_token_len=34
  elif dataset_name == "hotpotqa":
    source_max_token_len=528
    target_max_token_len=100
  else:
    raise ValueError("Unknown dataset_name: %r (expected 'squad' or 'hotpotqa')" % (dataset_name,))

  n_folds = len(os.listdir(dataset_name + "_folds"))

  for fold in range(start_fold, n_folds + 1):
    pl.seed_everything(0)
    # Release memory left over from the previous fold before loading a new model.
    collect()
    torch.cuda.empty_cache()
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = SquadModel()
    path = dataset_name + "_folds/" + dataset_name + "_fold" + str(fold) + "/"
    train = pd.read_csv(path + "train.csv")
    val = pd.read_csv(path + "val.csv")
    test = pd.read_csv(path + "test.csv")

    data_module = SquadDataModule(train, val, tokenizer, batch_size=BATCH_SIZE, source_max_token_len=source_max_token_len, target_max_token_len=target_max_token_len)
    data_module.setup()

    # NOTE(review): the same module-level `trainer` (and its checkpoint
    # callback) is reused for every fold; presumably state from an earlier
    # fit carries over -- confirm before running more than one fold per session.
    trainer.fit(model, data_module)

    trained_model = SquadModel.load_from_checkpoint("checkpoints/bestcheckpoint.ckpt")
    trained_model.freeze()

    # Distinct index name: the inner loop used to reuse the fold counter `i`.
    questions = [
        generate_question(test.iloc[row_idx,:], trained_model, tokenizer, dataset_name)
        for row_idx in range(len(test))
    ]

    test["t5_questions"] = questions

    test.to_csv(path + "test_with_t5_questions.csv")

    bleu, rouge1, rouge2, rougeL, meteor = calculate_scores(test)

    # Append mode is kept; the trailing newline stops a repeated run from
    # gluing its first line onto the previous run's last line.
    with open(path + "t5_questions_eval.txt", "a") as f:
      f.write("Bleu: " + str(bleu) + "\nRouge1: " + str(rouge1) + "\nRouge2: " + str(rouge2) + "\nRougeL: " + str(rougeL) + "\nMeteor: " + str(meteor) + "\n")

    os.remove("checkpoints/bestcheckpoint.ckpt")


In [None]:
# Create the working directories. os.makedirs(..., exist_ok=True) is used
# instead of `!mkdir` so that re-running this cell does not fail when the
# directories already exist.
for directory in ("squad", "hotpotqa", "squad_folds", "hotpotqa_folds"):
  os.makedirs(directory, exist_ok=True)

In [None]:
# Download SQuAD, build the combined frame, and write its folds under squad_folds/.
df_squad = get_squad()
create_folds(df_squad, "squad")

In [None]:
# Download HotpotQA, build the sampled frame, and write its folds under hotpotqa_folds/.
df_hotpotqa = get_hotpotqa()
create_folds(df_hotpotqa, "hotpotqa")

In [None]:
# Run configuration, read as globals by the cells and functions around it.
# MODEL_NAME: pretrained checkpoint loaded by SquadModel and by the tokenizer in cross_val.
# BATCH_SIZE: batch size cross_val passes to SquadDataModule.
# N_EPOCHS:   max_epochs of the Trainer.
# DATASET:    dataset name passed to cross_val ("squad" or "hotpotqa").
MODEL_NAME ='t5-base' 
BATCH_SIZE = 8
N_EPOCHS = 2
DATASET = "hotpotqa"

In [None]:
# Unpack an existing hotpotqa_folds_eval.zip archive.
# NOTE(review): no earlier cell creates this archive (the zip cell comes
# later in the notebook), so it must already be in the working directory.
!unzip hotpotqa_folds_eval.zip

In [None]:
# Keep only the single best checkpoint (lowest val_loss), written to
# checkpoints/bestcheckpoint.ckpt, which cross_val reloads after fitting.
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="bestcheckpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

# The checkpoint callback is registered once, through `callbacks`. It used to
# be passed through `checkpoint_callback=` as well, a deprecated way of
# handing over a ModelCheckpoint instance.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

In [None]:
# Run the cross-validation loop for the dataset chosen in the configuration cell.
cross_val(DATASET)

In [None]:
# Archive the fold directory, including the generated questions and metric
# files written by cross_val. The squad variant is left commented out.
#!zip -r squad_folds_eval.zip squad_folds/
!zip -r hotpotqa_folds_eval.zip hotpotqa_folds/

In [None]:
# Character-length statistics (min / max / mean) of the SQuAD
# question-generation prompts. The prompts are stored in the "input_qg"
# column; the "input_text" column referenced before is never created by
# get_squad, so the lookup raised a KeyError.
input_len_list = [len(x) for x in df_squad["input_qg"]]
min_len = min(input_len_list)
max_len = max(input_len_list)
mean_len = sum(input_len_list) / len(input_len_list)
print("Min: " + str(min_len))
print("Max: " + str(max_len))
print("Mean: " + str(mean_len))

In [None]:
# Character-length statistics (min / max / mean) of the SQuAD
# question-answering prompts. get_squad already stores these prompts in the
# "input_qa" column, so they are read from there instead of being rebuilt
# row by row with an invalid "\s" escape in the end-of-sequence marker.
# The prompt lengths are the same as before.
qa_input = list(df_squad["input_qa"])
input_len_list = [len(x) for x in qa_input]
min_len = min(input_len_list)
max_len = max(input_len_list)
mean_len = sum(input_len_list) / len(input_len_list)
print("Min: " + str(min_len))
print("Max: " + str(max_len))
print("Mean: " + str(mean_len))

In [None]:
# Character-length statistics (min / max / mean) of the SQuAD questions.
output_len_list = list(map(len, df_squad["question"]))
min_len, max_len = min(output_len_list), max(output_len_list)
mean_len = sum(output_len_list) / len(output_len_list)
for label, value in (("Min: ", min_len), ("Max: ", max_len), ("Mean: ", mean_len)):
  print(label + str(value))

In [None]:
# Character-length statistics (min / max / mean) of the SQuAD answers.
output_len_list = list(map(len, df_squad["answer"]))
min_len, max_len = min(output_len_list), max(output_len_list)
mean_len = sum(output_len_list) / len(output_len_list)
for label, value in (("Min: ", min_len), ("Max: ", max_len), ("Mean: ", mean_len)):
  print(label + str(value))

In [None]:
# Character-length statistics (min / max / mean) of the HotpotQA
# question-generation prompts. The prompts are stored in the "input_qg"
# column; the "input_text" column referenced before is never created by
# get_hotpotqa, so the lookup raised a KeyError.
input_len_list = [len(x) for x in df_hotpotqa["input_qg"]]
min_len = min(input_len_list)
max_len = max(input_len_list)
mean_len = sum(input_len_list) / len(input_len_list)
print("Min: " + str(min_len))
print("Max: " + str(max_len))
print("Mean: " + str(mean_len))

In [None]:
# Character-length statistics (min / max / mean) of the HotpotQA
# question-answering prompts. get_hotpotqa already stores these prompts in
# the "input_qa" column, so they are read from there instead of being
# rebuilt row by row with an invalid "\s" escape in the end-of-sequence
# marker. The prompt lengths are the same as before.
qa_input = list(df_hotpotqa["input_qa"])
input_len_list = [len(x) for x in qa_input]
min_len = min(input_len_list)
max_len = max(input_len_list)
mean_len = sum(input_len_list) / len(input_len_list)
print("Min: " + str(min_len))
print("Max: " + str(max_len))
print("Mean: " + str(mean_len))

In [None]:
# Character-length statistics (min / max / mean) of the HotpotQA answers.
output_len_list = list(map(len, df_hotpotqa["answer"]))
min_len, max_len = min(output_len_list), max(output_len_list)
mean_len = sum(output_len_list) / len(output_len_list)
for label, value in (("Min: ", min_len), ("Max: ", max_len), ("Mean: ", mean_len)):
  print(label + str(value))

In [None]:
# Character-length statistics (min / max / mean) of the HotpotQA questions.
output_len_list = list(map(len, df_hotpotqa["question"]))
min_len, max_len = min(output_len_list), max(output_len_list)
mean_len = sum(output_len_list) / len(output_len_list)
for label, value in (("Min: ", min_len), ("Max: ", max_len), ("Mean: ", mean_len)):
  print(label + str(value))