In [1]:
# Install pinned versions of the libraries used below.
# NOTE(review): pinning torch==1.7.1 downgrades the preinstalled torch; pip's
# resolver output for this cell reports that the preinstalled torchvision and
# torchaudio builds expect a newer torch.
!pip install --quiet transformers==4.1.1
!pip install --quiet torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94

[K     |████████████████████████████████| 1.5 MB 5.1 MB/s 
[K     |████████████████████████████████| 2.9 MB 7.3 MB/s 
[K     |████████████████████████████████| 880 kB 4.2 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 6.9 MB 4.9 MB/s 
[K     |████████████████████████████████| 776.8 MB 17 kB/s 
[K     |████████████████████████████████| 816 kB 38.9 MB/s 
[K     |████████████████████████████████| 596 kB 35.0 MB/s 
[K     |████████████████████████████████| 829 kB 47.9 MB/s 
[K     |████████████████████████████████| 141 kB 23.8 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu113 requires torch==1.12.1, but you have torch 1.7.1 which is incompatible.
torchaudio 0.12.1+cu113 requires torch==1.

In [2]:
# Extract the per-fold CSV files for the dataset being evaluated.
# The commented-out line is the equivalent archive for the SQuAD folds.
!unzip hotpotqa_folds_eval.zip
#!unzip squad_folds_eval.zip

Archive:  hotpotqa_folds_eval.zip
   creating: hotpotqa_folds_eval/hotpotqa_folds_eval/
   creating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/
   creating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/
  inflating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/t5_questions_eval.txt  
  inflating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/test.csv  
  inflating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/test_with_t5_questions.csv  
  inflating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/test_with_t5_questions_and_answers.csv  
  inflating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/train.csv  
  inflating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold1/val.csv  
   creating: hotpotqa_folds_eval/hotpotqa_folds_eval/hotpotqa_folds/hotpotqa_fold2/
  inflating: hotpotqa_folds_eval/hotpotqa_folds_ev

In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random 
import re
from itertools import chain
from string import punctuation
import requests  
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

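'''
AdamW - the optimizer used for fine-tuning
T5ForConditionalGeneration - the T5 model that generates the answer conditioned on the input text
T5Tokenizer - the tokenizer that matches the T5 checkpoints
get_linear_schedule_with_warmup - learning-rate scheduler; imported, but the model is trained with a constant learning rate (no scheduler)
'''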
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

import gc

import string

In [14]:
def normalize_text(s):
    """Canonicalise an answer string for metric computation.

    The text is lower-cased, ASCII punctuation is stripped, the standalone
    articles "a", "an" and "the" are replaced by a space, and finally all
    runs of whitespace are collapsed to single spaces (leading/trailing
    whitespace is dropped).
    """
    import re
    import string

    lowered = s.lower()
    # Delete every character listed in string.punctuation.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    # Articles are matched as whole words only, so e.g. "theater" is kept.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())

def compute_exact_match(truth, prediction):
    """Return 1 if prediction and truth are equal after normalisation, else 0.

    Both arguments are converted with str() first, so non-string values
    (numbers, NaN) are compared by their string form.
    """
    normalized_truth = normalize_text(str(truth))
    normalized_prediction = normalize_text(str(prediction))
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(truth, prediction):
    """Token-level F1 between a predicted answer and the ground truth.

    Both values are converted with str(), normalised with normalize_text and
    split on whitespace. Overlap is counted as a multiset intersection, so a
    token that occurs k times in both answers contributes k matches. (Counting
    the overlap with plain sets while dividing by the token-list lengths made
    identical answers containing a repeated token score below 1.0.)

    Returns:
        1 or 0 when either side has no tokens (1 only if both are empty),
        0 when there is no overlap, otherwise the harmonic mean of token
        precision and recall as a float.
    """
    from collections import Counter

    pred_tokens = normalize_text(str(prediction)).split()
    truth_tokens = normalize_text(str(truth)).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Multiset intersection: each shared token counts min(count_pred, count_truth) times.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [5]:
class SquadDataset(Dataset):
  """Torch dataset that tokenizes (input text, answer) rows for T5 fine-tuning.

  Each row of `data` must provide an "input_qa" column (the model input text)
  and an "answer" column (the target text).
  """

  def __init__(
      self, 
      data : pd.DataFrame, 
      tokenizer : T5Tokenizer, 
      source_max_token_len: int = 396,
      target_max_token_len: int = 32
      ):
    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len
  
  def __len__(self):
    return len(self.data)

  def _encode(self, text, max_length):
    """Tokenize one text, padded and truncated to exactly `max_length` tokens."""
    return self.tokenizer(
        text,
        max_length = max_length,
        padding="max_length",
        # The input is a single sequence, not a pair. "only_second" only
        # truncates the second sequence of a pair, so over-long texts were
        # not being truncated; truncation=True enforces max_length.
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors = "pt"
    )

  def __getitem__(self, index : int):
    """Return the raw texts plus flattened input ids, attention mask and labels."""
    data_row = self.data.iloc[index]

    source_encoding = self._encode(data_row["input_qa"], self.source_max_token_len)
    target_encoding = self._encode(data_row["answer"], self.target_max_token_len)

    labels = target_encoding["input_ids"]
    # Replace padding ids with -100 (the label value the Hugging Face T5 loss
    # ignores) using the tokenizer's own pad id instead of a hard-coded 0.
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        input_text = data_row["input_qa"],
        answer_text = data_row["answer"],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding["attention_mask"].flatten(),
        labels=labels.flatten()
    )

In [6]:
class SquadDataModule(pl.LightningDataModule):
  """Lightning data module wrapping a training frame and a held-out frame.

  The held-out frame (`test_df`) backs both the validation and the test
  dataloaders, which iterate it one example at a time.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32
      ):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def _build_dataset(self, frame):
    """Wrap a dataframe in a SquadDataset using this module's tokenizer and limits."""
    return SquadDataset(
        frame,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )

  def _held_out_loader(self):
    """Unshuffled, batch-size-1 loader over the held-out dataset."""
    return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

  def setup(self, stage: str = None):
    """Create the training and held-out datasets (the stage argument is ignored)."""
    self.train_dataset = self._build_dataset(self.train_df)
    self.test_dataset = self._build_dataset(self.test_df)

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4
    )

  def val_dataloader(self):
    return self._held_out_loader()

  def test_dataloader(self):
    return self._held_out_loader()

In [7]:
class SquadModel(pl.LightningModule):
  """Lightning wrapper around a pretrained T5ForConditionalGeneration.

  The checkpoint name is read from the module-level MODEL_NAME when the
  model is instantiated.
  """

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return a (loss, logits) tuple."""
    output = self.model(
        input_ids = input_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    return output.loss, output.logits

  def _batch_loss(self, batch):
    """Forward one batch dict and return only its loss."""
    loss, _logits = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
    return loss

  def training_step(self, batch, batch_idx):
    loss = self._batch_loss(batch)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    loss = self._batch_loss(batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    loss = self._batch_loss(batch)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def configure_optimizers(self):
    """Optimise all parameters with AdamW at a fixed learning rate (no scheduler)."""
    return AdamW(self.parameters(), lr = 0.0001)



In [8]:
def generate_answer(row, trained_model, tokenizer, dataset_name, from_t5_questions):
  """Generate an answer string for a single example.

  Args:
    row: mapping/Series holding the question column ("t5_questions" or
      "question") and the context column(s): "context" for SQuAD,
      "context1" and "context2" for HotpotQA. The row is not modified.
    trained_model: object whose `.model` attribute exposes `generate()` and
      `device` (here: a SquadModel wrapping a Hugging Face T5 model).
    tokenizer: tokenizer used to encode the prompt and decode the output.
    dataset_name: "squad" or "hotpotqa"; selects the prompt template and the
      source/target length limits.
    from_t5_questions: if truthy, read the question from "t5_questions",
      otherwise from "question".

  Returns:
    The decoded answer text.

  Raises:
    ValueError: if `dataset_name` is not one of the supported datasets.
  """
  question_column = "t5_questions" if from_t5_questions else "question"

  # Build the prompt in a local variable so the caller's row is left untouched.
  if dataset_name == "squad":
    source_max_len = 421
    target_max_len = 22
    input_qa = "question: %s  context: %s </s>" % (row[question_column], row["context"])
  elif dataset_name == "hotpotqa":
    source_max_len = 628
    target_max_len = 42
    input_qa = "question: %s, context1: %s, context2: %s </s>"  % (row[question_column], row["context1"], row["context2"])
  else:
    # Previously an unknown name fell through and failed later on an unbound variable.
    raise ValueError("Unsupported dataset_name: %r (expected 'squad' or 'hotpotqa')" % (dataset_name,))

  source_encoding = tokenizer(
        input_qa,
        max_length = source_max_len,
        padding="max_length",
        # Single-sequence input: "only_second" would not truncate it at all.
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors = "pt"
    )

  # Run generation on whatever device the model lives on (CPU or GPU).
  device = trained_model.model.device

  generated_ids = trained_model.model.generate(
        input_ids=source_encoding["input_ids"].to(device),
        attention_mask=source_encoding["attention_mask"].to(device),
        num_beams=1,
        max_length=target_max_len,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

  preds = [
             tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             for generated_id in generated_ids 
    ]

  return "".join(preds)

In [5]:
def _make_fold_trainer():
  """Build a fresh (ModelCheckpoint, Trainer) pair for a single fold.

  The settings mirror the notebook's trainer configuration. A new pair per
  fold keeps checkpoint bookkeeping (best score, best path) and trainer
  progress from leaking from one fold into the next.
  """
  fold_checkpoint = ModelCheckpoint(
      dirpath="checkpoints",
      filename="bestcheckpoint",
      save_top_k=1,
      verbose=True,
      monitor="val_loss",
      mode="min"
  )
  fold_trainer = pl.Trainer(
      callbacks=[fold_checkpoint],
      max_epochs=N_EPOCHS,
      gpus=1,
      progress_bar_refresh_rate=30
  )
  return fold_checkpoint, fold_trainer


def cross_val(dataset_name : str, start_fold : int = 7):
  """Fine-tune and run inference on each fold of `dataset_name`.

  For every fold from `start_fold` up to the number of entries in
  "<dataset_name>_folds_eval/<dataset_name>_folds/", a new model is trained
  on the fold's train.csv, validated on val.csv, and the best checkpoint is
  used to answer every row of test_with_t5_questions.csv twice (once with the
  T5-generated question, once with the standard question). The answers are
  written to test_with_t5_questions_and_answers.csv in the fold directory.

  Args:
    dataset_name: "squad" or "hotpotqa".
    start_fold: 1-based index of the first fold to process. Defaults to 7,
      the value that was hard-coded in this function (presumably to resume
      an interrupted run); pass 1 to process every fold.

  Raises:
    ValueError: if `dataset_name` is not supported.
  """
  max_token_lens = {
      "squad": (421, 22),
      "hotpotqa": (628, 42),
  }
  if dataset_name not in max_token_lens:
    raise ValueError("Unsupported dataset_name: %r (expected 'squad' or 'hotpotqa')" % (dataset_name,))
  source_max_token_len, target_max_token_len = max_token_lens[dataset_name]

  folds_dir = dataset_name + "_folds_eval/" + dataset_name + "_folds/"
  n_folds = len(os.listdir(folds_dir))

  for fold in range(start_fold, n_folds + 1):
    pl.seed_everything(0)
    gc.collect()
    torch.cuda.empty_cache()
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = SquadModel()
    path = folds_dir + dataset_name + "_fold" + str(fold) + "/"
    train = pd.read_csv(path + "train.csv")
    val = pd.read_csv(path + "val.csv")
    test = pd.read_csv(path + "test_with_t5_questions.csv")
    train = train[["answer", "input_qa"]]
    val = val[["answer", "input_qa"]]

    data_module = SquadDataModule(train, val, tokenizer, batch_size=BATCH_SIZE, source_max_token_len=source_max_token_len, target_max_token_len=target_max_token_len)
    data_module.setup()

    # A trainer/checkpoint pair shared across folds remembers the best
    # val_loss of earlier folds, so a later fold may never save a checkpoint.
    fold_checkpoint, fold_trainer = _make_fold_trainer()
    fold_trainer.fit(model, data_module)

    # Ask the callback where it saved the best model instead of assuming the
    # file name (a leftover file would make Lightning pick a suffixed name).
    best_checkpoint_path = fold_checkpoint.best_model_path
    trained_model = SquadModel.load_from_checkpoint(best_checkpoint_path)
    trained_model.freeze()

    answers_from_t5_questions = []
    answers_from_standard_questions = []

    # Distinct loop variable: the fold index must not be shadowed here.
    for row_idx in range(len(test)):
      answers_from_t5_questions.append(generate_answer(test.iloc[row_idx,:], trained_model, tokenizer, dataset_name, True))
      answers_from_standard_questions.append(generate_answer(test.iloc[row_idx,:], trained_model, tokenizer, dataset_name, False))

    test["answers_from_t5_questions"] = answers_from_t5_questions
    test["answers_from_standard_questions"] = answers_from_standard_questions

    test.to_csv(path + "test_with_t5_questions_and_answers.csv")

    # Free disk space before the next fold is trained.
    os.remove(best_checkpoint_path)




In [10]:
def _mean_or_nan(values):
  """Arithmetic mean of `values`; NaN when the sequence is empty."""
  return sum(values) / len(values) if values else float("nan")


def answer_evaluation(dataset : str):
  """Score the generated answers of every fold and append a report per fold.

  For each fold directory under "<dataset>_folds_eval/<dataset>_folds", reads
  test_with_t5_questions_and_answers.csv, computes the mean exact-match and
  F1 of the answers produced from the standard questions and from the T5
  questions against the "answer" column, and appends the four numbers to
  t5_answers_eval.txt inside that fold directory.

  Entries that are not directories are skipped, and a fold with no rows is
  reported as "nan" instead of raising ZeroDivisionError.
  """
  path = dataset + "_folds_eval/" + dataset + "_folds"

  # Sorted so folds are processed in a stable order across runs.
  for directory in sorted(os.listdir(path)):
    fold_dir = path + "/" + directory
    if not os.path.isdir(fold_dir):
      continue

    df = pd.read_csv(fold_dir + "/" + "test_with_t5_questions_and_answers.csv")
    truth = list(df["answer"])
    predictions_from_standard = list(df["answers_from_standard_questions"])
    predictions_from_t5 = list(df["answers_from_t5_questions"])

    exact_match_from_standard = [compute_exact_match(t, p) for t, p in zip(truth, predictions_from_standard)]
    f1_scores_from_standard = [compute_f1(t, p) for t, p in zip(truth, predictions_from_standard)]
    exact_match_from_t5 = [compute_exact_match(t, p) for t, p in zip(truth, predictions_from_t5)]
    f1_scores_from_t5 = [compute_f1(t, p) for t, p in zip(truth, predictions_from_t5)]

    report = (
        "Exact Match Standard: " + str(_mean_or_nan(exact_match_from_standard)) + "\n"
        + "F1 Standard: " + str(_mean_or_nan(f1_scores_from_standard)) + "\n"
        + "Exact Match T5: " + str(_mean_or_nan(exact_match_from_t5)) + "\n"
        + "F1 T5: " + str(_mean_or_nan(f1_scores_from_t5)) + "\n"
    )

    # Append mode is kept, so re-running adds another report to the file;
    # the context manager closes the file even if the write fails.
    with open(fold_dir + "/" + "t5_answers_eval.txt", "a") as f:
      f.write(report)

  

In [7]:
# Run configuration shared by the cells in this notebook.
MODEL_NAME = "t5-base"   # pretrained checkpoint name passed to from_pretrained()
BATCH_SIZE = 8           # training batch size handed to the data module
N_EPOCHS = 2             # max_epochs for the Lightning trainer
DATASET = "hotpotqa"     # dataset name passed to cross_val / answer_evaluation

In [12]:
# Keep only the single best checkpoint (lowest val_loss) at
# checkpoints/bestcheckpoint.ckpt.
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="bestcheckpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

# Register the checkpoint callback once, through `callbacks`. Passing the
# same ModelCheckpoint instance via `checkpoint_callback=` as well is the
# deprecated form and registers the callback a second time.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: None, using: 0 TPU cores
INFO:lightning:TPU available: None, using: 0 TPU cores


In [14]:
# Fine-tune per fold and write the generated answers next to each fold's test data.
cross_val(DATASET)

Global seed set to 0
INFO:lightning:Global seed set to 0


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size 

Validation sanity check: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Training: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Validating: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
Epoch 0, global step 1370: val_loss reached 0.31717 (best 0.31717), saving model to "/content/checkpoints/bestcheckpoint.ckpt" as top 1
INFO:lightning:Epoch 0, global step 1370: val_loss reached 0.31717 (best 0.31717), saving model to "/content/checkpoints/bestcheckpoint.ckpt" as top 1
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions t

Validating: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
Epoch 1, step 2741: val_loss was not in top 1
INFO:lightning:Epoch 1, step 2741: val_loss was not in top 1
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClas

In [15]:
# Compute exact-match / F1 per fold and append them to each fold's t5_answers_eval.txt.
answer_evaluation(DATASET)