In [None]:
# Pin the exact library versions this notebook is written against.
# NOTE: the recorded output of this cell shows pip reporting a dependency
# conflict (the preinstalled torchvision requires torch==1.12.1, while
# torch 1.7.1 is installed here).
# NOTE(review): `evaluate` is imported in the next cell but is not installed
# here - confirm it is already available in the runtime.
!pip install --quiet transformers==4.1.1
!pip install --quiet torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94

[K     |████████████████████████████████| 1.5 MB 7.9 MB/s 
[K     |████████████████████████████████| 2.9 MB 40.8 MB/s 
[K     |████████████████████████████████| 880 kB 66.3 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 6.9 MB 8.4 MB/s 
[K     |████████████████████████████████| 776.8 MB 16 kB/s 
[K     |████████████████████████████████| 816 kB 52.1 MB/s 
[K     |████████████████████████████████| 829 kB 63.0 MB/s 
[K     |████████████████████████████████| 596 kB 56.8 MB/s 
[K     |████████████████████████████████| 141 kB 52.4 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu113 requires torch==1.12.1, but you have torch 1.7.1 which is incompatible.
torchaudio 0.12.1+cu113 requires torch==

In [None]:
import argparse
import glob 
import os
import json
import time
import logging
import random 
import re
from itertools import chain
from string import punctuation
import requests  
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


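'''
AdamW - the optimizer used for fine-tuning
T5ForConditionalGeneration - the T5 sequence-to-sequence model, fine-tuned here to generate a question from the given context(s)
T5Tokenizer - the SentencePiece-based tokenizer that matches the T5 checkpoints
get_linear_schedule_with_warmup - learning-rate scheduler helper; it is imported, but the model is trained with a constant learning rate (no scheduler)
'''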
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

from gc import collect

import evaluate

from sklearn.model_selection import KFold

import string

In [None]:
def prepare_input_qg(row, dataset):
  """Build the question-generation input text for one dataset row.

  Args:
    row: Mapping-like record (a dict or a DataFrame row). It must provide
      "context" for "squad", or "context1" and "context2" for "hotpotqa".
    dataset: Name of the source dataset: "squad" or "hotpotqa".

  Returns:
    The input string, ending with a literal " </s>" marker.

  Raises:
    ValueError: If `dataset` is not one of the supported names (previously the
      function fell through and silently returned None).
  """
  # NOTE(review): the training output shows the tokenizer warning that the
  # sequence "already has </s>"; the literal marker is kept so the produced
  # text stays unchanged - confirm whether it is still wanted.
  if dataset == "squad":
    # One-element tuple so a tuple-valued context is formatted, not unpacked.
    return "context: %s </s>" % (row["context"],)
  if dataset == "hotpotqa":
    return "context1: %s context2: %s </s>" % (row["context1"], row["context2"])
  raise ValueError(
      "Unsupported dataset %r (expected 'squad' or 'hotpotqa')" % (dataset,)
  )


def extract_questions_and_answers(factoid_path: Path):
  """Flatten a SQuAD-style JSON file into one row per (question, answer) pair.

  Args:
    factoid_path: Path to the JSON file. It is required: the previous
      `factoid_path = Path` default was the `Path` class itself, so omitting
      the argument raised a TypeError then as it does now.

  Returns:
    A DataFrame with the columns "question", "context", "answer",
    "answer_start" and "answer_end". Questions with an empty "answers" list
    produce no rows. The columns are present even when no rows are found.
  """
  # JSON is UTF-8 by specification; do not depend on the platform's locale.
  with factoid_path.open(encoding="utf-8") as json_file:
    data = json.load(json_file)

  # NOTE(review): only the first entry of data['data'] is read, exactly as
  # before - confirm that restricting to the first article is intentional.
  paragraphs = data['data'][0]['paragraphs']

  data_rows = []
  for paragraph in paragraphs:
    context = paragraph['context']
    for qa in paragraph['qas']:
      question = qa['question']
      for answer in qa['answers']:
        answer_text = answer['text']
        answer_start = answer['answer_start']
        data_rows.append({
            "question" : question,
            "context"  : context,
            "answer" : answer_text,
            "answer_start" : answer_start,
            # End index of the answer inside the paragraph text.
            "answer_end" : answer_start + len(answer_text)
        })

  # Explicit columns keep the schema stable for an empty result, so callers
  # that select or de-duplicate by column name do not fail.
  return pd.DataFrame(
      data_rows,
      columns=["question", "context", "answer", "answer_start", "answer_end"],
  )
  

def get_squad():
  """Download the SQuAD v1.1/v2.0 train and dev files and build one DataFrame.

  The four JSON files are saved under the local "squad/" directory (created if
  missing), parsed with `extract_questions_and_answers`, concatenated,
  de-duplicated on (question, context) and given an "input_qg" column built by
  `prepare_input_qg(row, "squad")`.

  Returns:
    The combined DataFrame.

  Raises:
    requests.HTTPError: If any of the downloads returns an error status.
  """
  url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

  # open(..., "wb") does not create parent directories, so a fresh runtime
  # would fail here without this.
  squad_dir = Path("squad")
  squad_dir.mkdir(parents=True, exist_ok=True)

  for file in ["train-v2.0.json", "dev-v2.0.json", "train-v1.1.json", "dev-v1.1.json"]:
    res = requests.get(f'{url}{file}', timeout=60)
    # Fail on HTTP errors instead of saving an error page as a .json file.
    res.raise_for_status()
    # The response body is already in memory (no stream=True), so write it in
    # one go rather than four bytes at a time.
    with open(f'squad/{file}', "wb") as f:
      f.write(res.content)

  factoid_path_train2 = Path("squad/train-v2.0.json")
  factoid_path_dev2 = Path("squad/dev-v2.0.json")
  factoid_path_train1 = Path("squad/train-v1.1.json")
  factoid_path_dev1 = Path("squad/dev-v1.1.json")

  dev_df1 = extract_questions_and_answers(factoid_path_dev1)
  train_df1 = extract_questions_and_answers(factoid_path_train1)
  train_df2 = extract_questions_and_answers(factoid_path_train2)
  dev_df2 = extract_questions_and_answers(factoid_path_dev2)

  # Same row order as the former nested DataFrame.append calls (which are
  # deprecated): dev v1.1, train v1.1, train v2.0, dev v2.0.
  df = pd.concat([dev_df1, train_df1, train_df2, dev_df2])

  # Rows sharing a question and context collapse to the first occurrence.
  df.drop_duplicates(subset=['question', 'context'], inplace=True)

  df["input_qg"] = df.apply(lambda row: prepare_input_qg(row, "squad"), axis=1)

  return df


def filter_context(row):
  """Replace row["context"] with the contexts named by the supporting facts.

  The first element of every supporting fact is compared with the first
  element of every context entry; on a substring match in either direction the
  context's second element is kept.
  (Presumably these are [title, sentences] pairs as in HotpotQA - confirm.)

  Args:
    row: Mapping-like record with "supporting_facts" and "context".

  Returns:
    The same `row`, with "context" overwritten by the list of kept contexts.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order. The previous
  # list(set(...)) ordered the titles by string hash, which changes between
  # interpreter runs and so made the context1/context2 order irreproducible.
  supporting_titles = list(dict.fromkeys(fact[0] for fact in row["supporting_facts"]))

  contexts = []
  for title in supporting_titles:
    for context in row["context"]:
      # NOTE(review): substring matching can select one context for several
      # titles, or several contexts for one title - confirm this is wanted.
      if title in context[0] or context[0] in title:
        contexts.append(context[1])

  row["context"] = contexts
  return row

def seperate_context(row):
  """Join the first two context entries into the "context1"/"context2" fields.

  Each of the first two items of row["context"] is joined with single spaces.
  Any further items are ignored; fewer than two items raises IndexError.

  Returns:
    The same `row`, with "context1" and "context2" set.
  """
  passages = row["context"]
  for field, position in (("context1", 0), ("context2", 1)):
    row[field] = " ".join(passages[position])
  return row

def get_hotpotqa(n_samples=1917, random_state=0):
  """Download the HotpotQA training file and build a sampled DataFrame.

  The file is saved under the local "hotpotqa/" directory (created if
  missing). A random sample of rows is taken, the contexts are reduced with
  `filter_context` and split into "context1"/"context2" by `seperate_context`,
  the original "context" column is restored, and an "input_qg" column is built
  by `prepare_input_qg(row, "hotpotqa")`.

  Args:
    n_samples: Number of rows to sample. Defaults to 1917, the value that was
      previously hard-coded.
    random_state: Seed passed to `DataFrame.sample`. Defaults to 0.

  Returns:
    The sampled DataFrame.

  Raises:
    requests.HTTPError: If the download returns an error status.
  """
  # open(..., "wb") does not create parent directories, so a fresh runtime
  # would fail here without this.
  Path("hotpotqa").mkdir(parents=True, exist_ok=True)

  res = requests.get("http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json", timeout=60)
  # Fail on HTTP errors instead of saving an error page as a .json file.
  res.raise_for_status()

  # The response body is already in memory (no stream=True), so write it in
  # one go rather than four bytes at a time.
  with open('hotpotqa/hotpot_train_v1.1.json', "wb") as f:
    f.write(res.content)

  df = pd.read_json("hotpotqa/hotpot_train_v1.1.json")

  df = df.sample(n_samples, random_state = random_state)
  # filter_context overwrites "context"; keep the originals to restore below.
  original_contexts = list(df["context"])
  df = df.apply(filter_context, axis = 1)
  df = df.apply(seperate_context, axis = 1)
  df["context"] = original_contexts
  df["input_qg"] = df.apply(lambda row: prepare_input_qg(row, "hotpotqa"), axis=1)
  return df

In [None]:
class SquadDataset(Dataset):
  """Torch dataset that tokenizes (input text, question) pairs.

  Every item comes from one row of `data`: the "input_qg" column is encoded as
  the model input and the "question" column as the target labels.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "T5Tokenizer",
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      ):
    """Store the data frame, the tokenizer and the two length limits.

    Args:
      data: Frame with at least the "input_qg" and "question" columns.
      tokenizer: Callable tokenizer applied to both texts.
      source_max_token_len: Length the input is padded/truncated to.
      target_max_token_len: Length the target is padded/truncated to.
    """
    self.data = data
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying data frame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Tokenize the row at position `index`.

    Returns:
      A dict with the raw "question" and "input_text" strings, the flattened
      "input_ids" and "attention_mask" of the input, and the flattened
      "labels" in which padding positions are replaced by -100.
    """
    data_row = self.data.iloc[index]

    source_encoding = self.tokenizer(
      data_row["input_qg"],
      max_length=self.source_max_token_len,
      padding='max_length',
      truncation="only_first",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

    target_encoding = self.tokenizer(
      data_row['question'],
      max_length=self.target_max_token_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

    # Ask the tokenizer for its padding id instead of assuming it is 0; fall
    # back to 0 (the value previously hard-coded) if none is exposed.
    pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
    if pad_token_id is None:
      pad_token_id = 0

    labels = target_encoding['input_ids']
    # Padding positions are set to -100 so they are left out of the loss
    # (presumably the ignore index of the model's loss - confirm).
    labels[labels == pad_token_id] = -100

    return dict(
        question=data_row['question'],
        input_text=data_row["input_qg"],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )

In [None]:
class SquadDataModule(pl.LightningDataModule):
  """Lightning data module that wraps the train and held-out frames.

  The held-out frame (`test_df`) backs both the validation and the test
  loader.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer:T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      ):
    """Store the frames, tokenizer, batch size and token length limits."""
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def setup(self, stage=None):
    """Build the train and held-out `SquadDataset` objects.

    Args:
      stage: Stage name supplied by Lightning when the Trainer invokes this
        hook itself. It is unused, but the parameter has to exist: without it
        the Trainer's own `setup(stage)` call raises a TypeError. It defaults
        to None so a manual `setup()` call keeps working.
    """
    self.train_dataset = SquadDataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

    self.test_dataset = SquadDataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

  def train_dataloader(self):
    """Return the shuffled training loader."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4
        )

  def val_dataloader(self):
    """Return the validation loader over the held-out dataset."""
    return DataLoader(
        self.test_dataset,
        batch_size=self.batch_size,
        num_workers=4
        )

  def test_dataloader(self):
    """Return the test loader: same held-out dataset, one example per batch."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=4
        )

In [None]:
class SquadModel(pl.LightningModule):
  """Lightning module that fine-tunes a pretrained T5 model.

  NOTE: the checkpoint name is read from the notebook-level MODEL_NAME
  constant, so that constant must be defined before the class is instantiated.
  """

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return its `(loss, logits)`."""
    output = self.model(
        input_ids,
        attention_mask=attention_mask,
        labels=labels)

    return output.loss, output.logits

  def _shared_step(self, batch, metric_name):
    """Run one batch through the model and log its loss as `metric_name`.

    Shared by the training, validation and test steps, which previously
    repeated these lines verbatim.

    Returns:
      A `(loss, logits, labels)` tuple.
    """
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss, outputs, labels

  def training_step(self, batch, batch_idx):
    """Log "train_loss" and return the loss with the logits and labels."""
    loss, outputs, labels = self._shared_step(batch, "train_loss")
    return {"loss": loss, "predictions":outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    """Log "val_loss" and return the loss."""
    loss, _, _ = self._shared_step(batch, "val_loss")
    return loss

  def test_step(self, batch, batch_idx):
    """Log "test_loss" and return the loss."""
    loss, _, _ = self._shared_step(batch, "test_loss")
    return loss

  def configure_optimizers(self):
    """Return AdamW over all parameters with a fixed lr of 0.0001 (no scheduler)."""
    optimizer = AdamW(self.parameters(), lr=0.0001)
    return optimizer


In [None]:
# Name of the pretrained checkpoint loaded by the tokenizer and by SquadModel.
MODEL_NAME ='t5-base' 
# Number of epochs, passed to the Trainer as max_epochs.
N_EPOCHS = 2
# Dataset to train on; the loading cell handles "squad" and "hotpotqa".
DATASET = "squad"
# Batch size handed to SquadDataModule (used by its train/validation loaders).
BATCH_SIZE = 8

In [None]:
# Load the selected dataset together with the token length limits used for it.
if DATASET == "squad":
  df = get_squad()
  source_max_len = 387
  target_max_len = 34
elif DATASET == "hotpotqa":
  df = get_hotpotqa()
  source_max_len = 528
  target_max_len = 100
else:
  # Without this branch an unsupported value only surfaced later as a
  # NameError on `df`.
  raise ValueError(f"Unsupported DATASET {DATASET!r} (expected 'squad' or 'hotpotqa')")


# Hold out 5% of the rows for validation; fixed seed for a repeatable split.
train, val = train_test_split(df, test_size = 0.05, random_state = 0)

In [None]:
# Keep only the single best checkpoint, judged by the lowest "val_loss".
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="bestcheckpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

logger = TensorBoardLogger("training-logs", name="squad")

trainer = pl.Trainer(
    logger = logger,
    # Register the checkpoint callback once, through `callbacks`. It was also
    # passed via `checkpoint_callback=`, which is deprecated for ModelCheckpoint
    # instances and registered the same callback a second time.
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS,
    # Use one GPU when available; otherwise fall back to the CPU default.
    gpus=1 if torch.cuda.is_available() else None,
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: None, using: 0 TPU cores
INFO:lightning:TPU available: None, using: 0 TPU cores


In [None]:
# Fix the global random seed before the model is built (Lightning reports
# "Global seed set to 0").
pl.seed_everything(0)
# Load the pretrained tokenizer for MODEL_NAME; SquadModel reads the same
# MODEL_NAME constant when it loads the pretrained weights.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = SquadModel()

Global seed set to 0
INFO:lightning:Global seed set to 0
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# The length limits are the `source_max_len` / `target_max_len` values set in
# the dataset-loading cell (the former `*_max_token_len` names were never
# defined and raised a NameError on a fresh kernel).
data_module = SquadDataModule(
    train,
    val,
    tokenizer,
    batch_size=BATCH_SIZE,
    source_max_token_len=source_max_len,
    target_max_token_len=target_max_len,
)

data_module.setup()

trainer.fit(model, data_module)

# Reload the best checkpoint recorded by the callback. Fall back to the
# configured file name if the callback did not record a path.
best_checkpoint_path = checkpoint_callback.best_model_path or "checkpoints/bestcheckpoint.ckpt"
trained_model = SquadModel.load_from_checkpoint(best_checkpoint_path)

# Freeze the reloaded model.
trained_model.freeze()


  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
INFO:lightning:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Training: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Validating: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
Epoch 0, global step 1820: val_loss reached 1.41054 (best 1.41054), saving model to "/content/checkpoints/bestcheckpoint.ckpt" as top 1
INFO:lightning:Epoch 0, global step 1820: val_loss reached 1.41054 (best 1.41054), saving model to "/content/checkpoints/bestcheckpoint.ckpt" as top 1
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions t

Validating: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
Epoch 1, step 3641: val_loss was not in top 1
INFO:lightning:Epoch 1, step 3641: val_loss was not in top 1
Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClas

In [None]:
# Load the TensorBoard notebook extension and point it at "training-logs",
# the directory the TensorBoardLogger was configured with.
%load_ext tensorboard
%tensorboard --logdir ./training-logs

In [None]:
# Export the wrapped model and the tokenizer into directories named after
# DATASET ("t5_<DATASET>_qg" and "t5_tokenizer_<DATASET>_qg").
# NOTE(review): `model` is the instance that was passed to trainer.fit, so this
# saves the weights as they are at the end of training, which is not
# necessarily the best-val_loss checkpoint - confirm this is intended.
model.model.save_pretrained("t5_" + DATASET + "_qg")
tokenizer.save_pretrained("t5_tokenizer_" + DATASET + "_qg")

('t5_tokenizer_hotpot_qa/tokenizer_config.json',
 't5_tokenizer_hotpot_qa/special_tokens_map.json',
 't5_tokenizer_hotpot_qa/spiece.model',
 't5_tokenizer_hotpot_qa/added_tokens.json')