In [None]:
!nvidia-smi

Mon Jun 14 10:05:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install --quiet transformers==4.1.1
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94

[K     |████████████████████████████████| 1.5MB 25.7MB/s 
[K     |████████████████████████████████| 2.9MB 45.7MB/s 
[K     |████████████████████████████████| 901kB 52.5MB/s 
[K     |████████████████████████████████| 1.1MB 26.7MB/s 
[?25h

In [None]:
!pip install git+https://github.com/PyTorchLightning/pytorch-lightning

Collecting git+https://github.com/PyTorchLightning/pytorch-lightning
  Cloning https://github.com/PyTorchLightning/pytorch-lightning to /tmp/pip-req-build-0uw_65wc
  Running command git clone -q https://github.com/PyTorchLightning/pytorch-lightning /tmp/pip-req-build-0uw_65wc
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tensorboard!=2.5.0,>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/64/21/eebd23060763fedeefb78bc2b286e00fa1d8abda6f70efa2ee08c28af0d4/tensorboard-2.4.1-py3-none-any.whl (10.6MB)
[K     |████████████████████████████████| 10.6MB 30.4MB/s 
[?25hCollecting torchmetrics>=0.3.2
[?25l  Downloading https://files.pythonhosted.org/packages/3b/e8/513cd9d0b1c83dc14cd8f788d05cd6a34758d4fd7e4f9e5ecd5d7d599c95/torchmetrics-0.3.2-py3-none-any.whl (274kB)
[K     |██████████████

In [None]:
!rm -rf BioASQ
!rm -rf checkpoints
!rm -rf lightning_logs

In [None]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import(
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
pl.seed_everything(42)

Global seed set to 42


42

In [None]:
!gdown --id 1BUJMqTFFVVnoJaO3EV97y79dv75927hz

Downloading...
From: https://drive.google.com/uc?id=1BUJMqTFFVVnoJaO3EV97y79dv75927hz
To: /content/QA.zip
5.48MB [00:00, 20.8MB/s]


In [None]:
!unzip -q QA.zip

In [None]:
def extract_questions_and_context(factoid_path: Path):
  """Flatten a SQuAD-style JSON file into (question, context) rows.

  Only ``data["data"][0]["paragraphs"]`` is read. Every entry of a paragraph's
  ``"qas"`` list produces one row that pairs the entry's ``"question"`` with
  the paragraph's ``"context"``.

  Args:
    factoid_path: Path of the JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the columns ``question`` and ``context``; the
    columns are present even when the file yields no rows.
  """
  # JSON text is UTF-8; do not rely on the platform's default encoding.
  with factoid_path.open(encoding="utf-8") as json_file:
    data=json.load(json_file)
  paragraphs=data["data"][0]["paragraphs"]

  data_rows=[]
  for paragraph in paragraphs:
    context=paragraph["context"]
    for question_and_answers in paragraph["qas"]:
      data_rows.append({
          "question":question_and_answers["question"],
          "context":context,
      })
  # Naming the columns keeps the schema stable for an empty result.
  return pd.DataFrame(data_rows,columns=["question","context"])


In [None]:
extract_questions_and_context(Path("BioASQ/BioASQ-train-factoid-4b.json")).head()

Unnamed: 0,question,context
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h..."
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...


In [None]:
# Collect every training file; sorting keeps the row order stable between runs.
factoid_paths = sorted(Path("BioASQ/").glob("BioASQ-train-*"))
dfs = []

for factoid_path in factoid_paths:
  dfs.append(extract_questions_and_context(factoid_path))

# Each per-file frame starts its index at 0; renumber so labels stay unique.
df=pd.concat(dfs,ignore_index=True)

In [None]:
df.head()


(12988, 2)

In [None]:
df.shape

(12988, 2)

In [None]:
MODEL_NAME="t5-base"

In [None]:
tokenizer=T5Tokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




In [None]:
class BioQCDataset(Dataset):
  """Dataset of (context -> question) pairs for question generation.

  Each item encodes the row's ``context`` as the model input and the row's
  ``question`` as the target labels.
  """

  def __init__(
      self,
      data:pd.DataFrame,
      tokenizer:"T5Tokenizer",
      source_max_token_len: int = 396,
      target_max_token_len: int = 32
  ):
    """Store the frame, the tokenizer and the two padding lengths.

    Args:
      data: Frame with ``context`` and ``question`` columns.
      tokenizer: Tokenizer called on both the source and the target text.
      source_max_token_len: ``max_length`` used when encoding the context.
      target_max_token_len: ``max_length`` used when encoding the question.
    """
    self.tokenizer= tokenizer
    self.data=data
    self.source_max_token_len=source_max_token_len
    self.target_max_token_len=target_max_token_len

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self,index: int):
    """Encode row ``index`` (positional) and return the tensors plus raw text."""
    data_row=self.data.iloc[index]

    # Use the tokenizer given to this dataset, not a module-level global.
    source_encoding=self.tokenizer(
        data_row["context"],
        max_length=self.source_max_token_len,
        padding="max_length",
        truncation="only_first",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    target_encoding=self.tokenizer(
        data_row["question"],
        max_length=self.target_max_token_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    labels=target_encoding["input_ids"]
    # Padding positions become -100 so the loss skips them
    # (Hugging Face ignores label positions set to -100).
    labels[labels==self.tokenizer.pad_token_id]=-100

    return dict(
        question=data_row["question"],
        context=data_row["context"],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding["attention_mask"].flatten(),
        labels=labels.flatten()
    )

In [None]:
# Fix the split seed so the held-out 5% does not depend on prior RNG use.
train_df,val_df= train_test_split(df,test_size=0.05,random_state=42)

In [None]:
class BioQCDataModule(pl.LightningDataModule):
  """Data module that serves ``BioQCDataset`` loaders to the trainer.

  The validation and test loaders both read from ``test_df``.
  """

  def __init__(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    tokenizer: T5Tokenizer,
    batch_size: int = 8,
    source_max_token_len: int = 396,
    target_max_token_len: int = 32
  ):
    """Store the frames, tokenizer and sizes; datasets are built in ``setup``.

    Args:
      train_df: Rows used for the training dataset.
      test_df: Rows used for the validation and test dataset.
      tokenizer: Tokenizer forwarded to each ``BioQCDataset``.
      batch_size: Batch size of the training loader.
      source_max_token_len: Forwarded to ``BioQCDataset``.
      target_max_token_len: Forwarded to ``BioQCDataset``.
    """
    super().__init__()
    self.batch_size=batch_size
    self.train_df=train_df
    self.test_df=test_df
    self.tokenizer=tokenizer
    self.source_max_token_len=source_max_token_len
    self.target_max_token_len=target_max_token_len

  def setup(self,stage=None):
    """Build the training and evaluation datasets.

    ``stage`` is accepted and ignored: Lightning passes it when it invokes
    this hook itself, and a plain ``setup()`` call keeps working.
    """
    self.train_dataset= BioQCDataset(
      self.train_df,
      self.tokenizer,
      self.source_max_token_len,
      self.target_max_token_len
    )
    self.test_dataset=BioQCDataset(
      self.test_df,
      self.tokenizer,
      self.source_max_token_len,
      self.target_max_token_len
    )

  def train_dataloader(self):
    """Return the shuffled training loader."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=2
    )

  def val_dataloader(self):
    """Return the validation loader (one item per batch, over ``test_dataset``)."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )

  def test_dataloader(self):
    """Return the test loader (one item per batch, over ``test_dataset``)."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )

In [None]:
BATCH_SIZE=8
N_EPOCHS=6
data_module=BioQCDataModule(train_df,val_df,tokenizer,batch_size=BATCH_SIZE)
data_module.setup()

In [None]:
class BioQCModel(pl.LightningModule):
  """Lightning module wrapping the pretrained T5 model named by ``MODEL_NAME``."""

  def __init__(self):
    super().__init__()
    self.model=T5ForConditionalGeneration.from_pretrained(MODEL_NAME,return_dict=True)

  def forward(self,input_ids,attention_mask,labels=None):
    """Run the wrapped model and return its ``(loss, logits)`` pair."""
    result=self.model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
    return result.loss,result.logits

  def _logged_loss(self,batch,metric_name):
    """Forward ``batch``, log the loss under ``metric_name`` and return it."""
    loss,_=self(batch["input_ids"],batch["attention_mask"],batch["labels"])
    self.log(metric_name,loss,prog_bar=True,logger=True)
    return loss

  def training_step(self,batch,batch_idx):
    """Return the training loss, logged as ``train_loss``."""
    return self._logged_loss(batch,"train_loss")

  def validation_step(self,batch,batch_idx):
    """Return the validation loss, logged as ``val_loss``."""
    return self._logged_loss(batch,"val_loss")

  def test_step(self,batch,batch_idx):
    """Return the test loss, logged as ``test_loss``."""
    return self._logged_loss(batch,"test_loss")

  def configure_optimizers(self):
    """Optimise every parameter with AdamW at a learning rate of 1e-4."""
    return AdamW(self.parameters(),lr=0.0001)

In [None]:
model= BioQCModel()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint
checkpoint_callback= ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

In [None]:
trainer=pl.Trainer(
    callbacks=[checkpoint_callback],
    # Take the epoch budget from N_EPOCHS rather than repeating the literal.
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
trainer.fit(model,data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
Restored all states from the checkpoint file at None


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 42




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [None]:
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.012231427244842052}
--------------------------------------------------------------------------------



[{'test_loss': 0.012231427244842052}]

In [None]:
trainer.save_checkpoint("QCNew.ckpt")

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!cp "/content/QCNew.ckpt" -r "/content/gdrive/MyDrive/Colab Notebooks"

In [None]:
trained_model=BioQCModel.load_from_checkpoint("/content/gdrive/MyDrive/Colab Notebooks/QCNew.ckpt")
trained_model.freeze()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def extract_questions_and_context(factoid_path: Path):
  """Flatten a SQuAD-style JSON file into (question, context) rows.

  Only ``data["data"][0]["paragraphs"]`` is read. Every entry of a paragraph's
  ``"qas"`` list produces one row that pairs the entry's ``"question"`` with
  the paragraph's ``"context"``.

  NOTE(review): this name is also defined in an earlier cell and the later
  definition wins; consider keeping a single copy.

  Args:
    factoid_path: Path of the JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the columns ``question`` and ``context``; the
    columns are present even when the file yields no rows.
  """
  # JSON text is UTF-8; do not rely on the platform's default encoding.
  with factoid_path.open(encoding="utf-8") as json_file:
    data=json.load(json_file)
  paragraphs=data["data"][0]["paragraphs"]

  data_rows=[]
  for paragraph in paragraphs:
    context=paragraph["context"]
    for question_and_answers in paragraph["qas"]:
      data_rows.append({
          "question":question_and_answers["question"],
          "context":context,
      })
  # Naming the columns keeps the schema stable for an empty result.
  return pd.DataFrame(data_rows,columns=["question","context"])

In [None]:
def generate_question(question):
  """Generate question text from the ``"context"`` entry of ``question``.

  Despite the parameter name, only ``question["context"]`` is read. It is
  encoded with the module-level ``tokenizer`` and decoded from the output of
  ``trained_model.model.generate``.

  Returns:
    The decoded generations concatenated into one string.
  """
  encoded=tokenizer(
      question["context"],
      max_length=396,
      padding="max_length",
      truncation="only_first",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )
  generation_kwargs=dict(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      num_beams=1,
      max_length=80,  # output-length cap; tune per dataset
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )
  generated_ids=trained_model.model.generate(**generation_kwargs)

  decoded=[]
  for sequence_ids in generated_ids:
    decoded.append(
        tokenizer.decode(sequence_ids,skip_special_tokens=True,clean_up_tokenization_spaces=True)
    )
  return "".join(decoded)


In [None]:
q2=extract_questions_and_context(Path("BioASQ/BioASQ-test-factoid-4b-1.json"))
sample=q2.iloc[12]
print(sample["question"])
print(sample["context"])
generate_question(sample)

Aleglitazar is agonist of which receptor?
The effect of aleglitazar on the pharmacokinetics and pharmacodynamics of S- and R-warfarin in healthy male subjects. : Aleglitazar acts through balanced activation of peroxisome proliferator-activated receptors α and γ; warfarin is a commonly prescribed anticoagulant. Given the extent of cardiovascular disease in patients with type 2 diabetes, cotreatment with aleglitazar and warfarin is likely in this population. This open-label, randomized, 2-period, crossover study in 12 healthy male subjects investigated the potential for drug-drug interactions between warfarin and aleglitazar (final data drawn from 11 white subjects). The primary objective was to investigate the effect of aleglitazar on the pharmacokinetic properties of S-warfarin and on the pharmacodynamics of the racemic mixture; the secondary objectives included the effect of aleglitazar on R-warfarin pharmacokinetics and of racemic warfarin on aleglitazar pharmacokinetics. Subjects we

'Aleglitazar is an antidote of which drug?'

In [None]:
sample=q2.iloc[1]
print(sample["question"])
print(sample["context"])
generate_question(sample)

To which family does the Zika virus belong?
Zika virus emergence in mosquitoes in southeastern Senegal, 2011. BACKGROUND: Zika virus (ZIKV; genus Flavivirus, family Flaviviridae) is maintained in a zoonotic cycle between arboreal Aedes spp. mosquitoes and nonhuman primates in African and Asian forests. Spillover into humans has been documented in both regions and the virus is currently responsible for a large outbreak in French Polynesia. ZIKV amplifications are frequent in southeastern Senegal but little is known about their seasonal and spatial dynamics. The aim of this paper is to describe the spatio-temporal patterns of the 2011 ZIKV amplification in southeastern Senegal. METHODOLOGY/FINDINGS: Mosquitoes were collected monthly from April to December 2011 except during July. Each evening from 18:00 to 21:00 hrs landing collections were performed by teams of 3 persons working simultaneously in forest (canopy and ground), savannah, agriculture, village (indoor and outdoor) and barren 

'To which family does the Zika virus belong?'

In [None]:
sample=q2.iloc[7]
print(sample["question"])
print(sample["context"])
generate_question(sample)

What is the lipid droplet used for in the cell?
GRAF1a is a brain-specific protein that promotes lipid droplet clustering and growth, and is enriched at lipid droplet junctions. Lipid droplets are found in all cell types. Normally present at low levels in the brain, they accumulate in tumours and are associated with neurodegenerative diseases. However, little is known about the mechanisms controlling their homeostasis in the brain. We found that GRAF1a, the longest GRAF1 isoform (GRAF1 is also known as ARHGAP26), was enriched in the brains of neonates. Endogenous GRAF1a was found on lipid droplets in oleic-acid-fed primary glial cells. Exclusive localization required a GRAF1a-specific hydrophobic segment and two membrane-binding regions, a BAR and a PH domain. Overexpression of GRAF1a promoted lipid droplet clustering, inhibited droplet mobility and severely perturbed lipolysis following the chase of cells overloaded with fatty acids. Under these conditions, GRAF1a concentrated at the 

'Which is the major lipid droplet protein in humans?'

# ***Question Answering Module***

In [None]:
# Disabled draft kept as a bare string literal, so none of it executes.
# The draft builds one row per answer and then fills a "question" column by
# calling generate_question on every row instead of using the dataset's questions.
'''def extract_questions_and_answers(factoid_path: Path):
  with factoid_path.open() as json_file:
    data=json.load(json_file)
  questions=data["data"][0]["paragraphs"]

  data_rows= []
  for question in questions:
    context=question["context"]
    for question_and_answers in question["qas"]:
      #question=question_and_answers["question"]
      answers=question_and_answers["answers"]

      for answer in answers:
        answer_text=answer["text"]
        answer_start=answer["answer_start"]
        answer_end=answer_start+len(answer_text)

        data_rows.append({
            #"question":question,
            "context":context,
            "answer_text":answer_text,
            "answer_start":answer_start,
            "answer_end":answer_end
        })
  df=pd.DataFrame(data_rows)
  questions=[]
  for i in range(0,df.shape[0]):
    sample=df.iloc[i]
    questions.append(generate_question(sample))
  
  df["question"]=questions
  return df
'''

In [None]:
def extract_questions_and_answers(factoid_path: Path):
  """Flatten a SQuAD-style JSON file into one row per annotated answer.

  Only ``data["data"][0]["paragraphs"]`` is read. For every paragraph, every
  entry of ``"qas"`` and every entry of that entry's ``"answers"`` yields a row.

  Args:
    factoid_path: Path of the JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the columns ``question``, ``context``,
    ``answer_text``, ``answer_start`` and ``answer_end``; the columns are
    present even when the file yields no rows.
  """
  # JSON text is UTF-8; do not rely on the platform's default encoding.
  with factoid_path.open(encoding="utf-8") as json_file:
    data=json.load(json_file)
  paragraphs=data["data"][0]["paragraphs"]

  data_rows=[]
  for paragraph in paragraphs:
    context=paragraph["context"]
    for question_and_answers in paragraph["qas"]:
      question=question_and_answers["question"]
      for answer in question_and_answers["answers"]:
        answer_text=answer["text"]
        answer_start=answer["answer_start"]
        data_rows.append({
            "question":question,
            "context":context,
            "answer_text":answer_text,
            "answer_start":answer_start,
            # The end offset is the start plus the answer's character length.
            "answer_end":answer_start+len(answer_text)
        })
  # Naming the columns keeps the schema stable for an empty result.
  return pd.DataFrame(
      data_rows,
      columns=["question","context","answer_text","answer_start","answer_end"]
  )


In [None]:
factoid_paths2 = sorted(list(Path("BioASQ/").glob("BioASQ-train-*")))

In [None]:
dfs2 = []

for factoid_path in factoid_paths2:
  dfs2.append(extract_questions_and_answers(factoid_path))

# Each per-file frame starts its index at 0; renumber so labels stay unique.
df2=pd.concat(dfs2,ignore_index=True)

In [None]:
df2.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [None]:
MODEL_NAME="t5-base"
tokenizer=T5Tokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




In [None]:
class BioQADataset(Dataset):
  """Dataset of (question + context -> answer) examples for question answering.

  Each item encodes the row's ``question`` and ``context`` together as the
  model input and the row's ``answer_text`` as the target labels.
  """

  def __init__(
      self,
      data:pd.DataFrame,
      tokenizer:"T5Tokenizer",
      source_max_token_len: int = 396,
      target_max_token_len: int = 32
  ):
    """Store the frame, the tokenizer and the two padding lengths.

    Args:
      data: Frame with ``question``, ``context`` and ``answer_text`` columns.
      tokenizer: Tokenizer called on both the source pair and the target text.
      source_max_token_len: ``max_length`` used when encoding the source pair.
      target_max_token_len: ``max_length`` used when encoding the answer.
    """
    self.tokenizer= tokenizer
    self.data=data
    self.source_max_token_len=source_max_token_len
    self.target_max_token_len=target_max_token_len

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self,index: int):
    """Encode row ``index`` (positional) and return the tensors plus raw text."""
    data_row=self.data.iloc[index]

    # Use the tokenizer given to this dataset, not a module-level global.
    source_encoding=self.tokenizer(
        data_row["question"],
        data_row["context"],
        max_length=self.source_max_token_len,
        padding="max_length",
        # Truncate only the context so the question is never cut.
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    target_encoding=self.tokenizer(
        data_row["answer_text"],
        max_length=self.target_max_token_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    labels=target_encoding["input_ids"]
    # Padding positions become -100 so the loss skips them
    # (Hugging Face ignores label positions set to -100).
    labels[labels==self.tokenizer.pad_token_id]=-100

    return dict(
        question=data_row["question"],
        context=data_row["context"],
        answer_text=data_row["answer_text"],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding["attention_mask"].flatten(),
        labels=labels.flatten()
    )


In [None]:
class BioQADataModule(pl.LightningDataModule):
  """Data module that serves ``BioQADataset`` loaders to the trainer.

  The validation and test loaders both read from ``test_df``.
  """

  def __init__(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    tokenizer: T5Tokenizer,
    batch_size: int = 8,
    source_max_token_len: int = 396,
    target_max_token_len: int = 32
  ):
    """Store the frames, tokenizer and sizes; datasets are built in ``setup``.

    Args:
      train_df: Rows used for the training dataset.
      test_df: Rows used for the validation and test dataset.
      tokenizer: Tokenizer forwarded to each ``BioQADataset``.
      batch_size: Batch size of the training loader.
      source_max_token_len: Forwarded to ``BioQADataset``.
      target_max_token_len: Forwarded to ``BioQADataset``.
    """
    super().__init__()
    self.batch_size=batch_size
    self.train_df=train_df
    self.test_df=test_df
    self.tokenizer=tokenizer
    self.source_max_token_len=source_max_token_len
    self.target_max_token_len=target_max_token_len

  def setup(self,stage=None):
    """Build the training and evaluation datasets.

    ``stage`` is accepted and ignored: Lightning passes it when it invokes
    this hook itself, and a plain ``setup()`` call keeps working.
    """
    self.train_dataset= BioQADataset(
      self.train_df,
      self.tokenizer,
      self.source_max_token_len,
      self.target_max_token_len
    )
    self.test_dataset=BioQADataset(
      self.test_df,
      self.tokenizer,
      self.source_max_token_len,
      self.target_max_token_len
    )

  def train_dataloader(self):
    """Return the shuffled training loader."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=2
    )

  def val_dataloader(self):
    """Return the validation loader (one item per batch, over ``test_dataset``)."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )

  def test_dataloader(self):
    """Return the test loader (one item per batch, over ``test_dataset``)."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )

In [None]:
class BioQAModel(pl.LightningModule):
  """Lightning module wrapping the pretrained T5 model named by ``MODEL_NAME``."""

  def __init__(self):
    super().__init__()
    self.model=T5ForConditionalGeneration.from_pretrained(MODEL_NAME,return_dict=True)

  def forward(self,input_ids,attention_mask,labels=None):
    """Run the wrapped model and return its ``(loss, logits)`` pair."""
    result=self.model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
    return result.loss,result.logits

  def _logged_loss(self,batch,metric_name):
    """Forward ``batch``, log the loss under ``metric_name`` and return it."""
    loss,_=self(batch["input_ids"],batch["attention_mask"],batch["labels"])
    self.log(metric_name,loss,prog_bar=True,logger=True)
    return loss

  def training_step(self,batch,batch_idx):
    """Return the training loss, logged as ``train_loss``."""
    return self._logged_loss(batch,"train_loss")

  def validation_step(self,batch,batch_idx):
    """Return the validation loss, logged as ``val_loss``."""
    return self._logged_loss(batch,"val_loss")

  def test_step(self,batch,batch_idx):
    """Return the test loss, logged as ``test_loss``."""
    return self._logged_loss(batch,"test_loss")

  def configure_optimizers(self):
    """Optimise every parameter with AdamW at a learning rate of 1e-4."""
    return AdamW(self.parameters(),lr=0.0001)

In [None]:
# Fix the split seed so the held-out 5% does not depend on prior RNG use.
train_df,val_df= train_test_split(df2,test_size=0.05,random_state=42)

In [None]:
BATCH_SIZE=8
N_EPOCHS=6
data_module=BioQADataModule(train_df,val_df,tokenizer,batch_size=BATCH_SIZE)
data_module.setup()

In [None]:
model= BioQAModel()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint
#from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
checkpoint_callback= ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)


In [None]:
trainer=pl.Trainer(
    callbacks=[checkpoint_callback],
    # Take the epoch budget from N_EPOCHS rather than repeating the literal.
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
trainer.fit(model,data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
Restored all states from the checkpoint file at None


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 42




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [None]:
trainer.test()

NameError: ignored

In [None]:
trainer.save_checkpoint("QANew.ckpt")

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!cp "/content/QANew.ckpt" -r "/content/gdrive/MyDrive/Colab Notebooks"

In [None]:
trained_model1=BioQCModel.load_from_checkpoint("/content/gdrive/MyDrive/Colab Notebooks/QCNew.ckpt")
trained_model1.freeze()
       

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
trained_model2=BioQAModel.load_from_checkpoint("/content/gdrive/MyDrive/Colab Notebooks/QANew.ckpt")
trained_model2.freeze()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def generate_question(question):
  """Generate question text from the ``"context"`` entry of ``question``.

  Despite the parameter name, only ``question["context"]`` is read. It is
  encoded with the module-level ``tokenizer`` and decoded from the output of
  ``trained_model1.model.generate``.

  Returns:
    The decoded generations concatenated into one string.
  """
  encoded=tokenizer(
      question["context"],
      max_length=396,
      padding="max_length",
      truncation="only_first",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )
  generation_kwargs=dict(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      num_beams=1,
      max_length=80,  # output-length cap; tune per dataset
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )
  generated_ids=trained_model1.model.generate(**generation_kwargs)

  decoded=[]
  for sequence_ids in generated_ids:
    decoded.append(
        tokenizer.decode(sequence_ids,skip_special_tokens=True,clean_up_tokenization_spaces=True)
    )
  return "".join(decoded)


In [None]:
def generate_answer(sample,question):
  """Generate answer text for ``question`` using ``sample["context"]``.

  The question and the context are encoded together with the module-level
  ``tokenizer`` (only the context may be truncated). The result is decoded
  from the output of ``trained_model2.model.generate``.

  Returns:
    The decoded generations concatenated into one string.
  """
  encoded=tokenizer(
      question,
      sample["context"],
      max_length=396,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )
  generation_kwargs=dict(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      num_beams=1,
      max_length=80,  # output-length cap; tune per dataset
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )
  generated_ids=trained_model2.model.generate(**generation_kwargs)

  decoded=[]
  for sequence_ids in generated_ids:
    decoded.append(
        tokenizer.decode(sequence_ids,skip_special_tokens=True,clean_up_tokenization_spaces=True)
    )
  return "".join(decoded)

In [None]:
q=extract_questions_and_context(Path("BioASQ/BioASQ-test-factoid-4b-1.json"))
sample=q.iloc[12]
sample["context"]

'The effect of aleglitazar on the pharmacokinetics and pharmacodynamics of S- and R-warfarin in healthy male subjects. : Aleglitazar acts through balanced activation of peroxisome proliferator-activated receptors α and γ; warfarin is a commonly prescribed anticoagulant. Given the extent of cardiovascular disease in patients with type 2 diabetes, cotreatment with aleglitazar and warfarin is likely in this population. This open-label, randomized, 2-period, crossover study in 12 healthy male subjects investigated the potential for drug-drug interactions between warfarin and aleglitazar (final data drawn from 11 white subjects). The primary objective was to investigate the effect of aleglitazar on the pharmacokinetic properties of S-warfarin and on the pharmacodynamics of the racemic mixture; the secondary objectives included the effect of aleglitazar on R-warfarin pharmacokinetics and of racemic warfarin on aleglitazar pharmacokinetics. Subjects were randomized to single-dose warfarin on 

In [None]:
question=generate_question(sample)

In [None]:
question

'Aleglitazar is an antidote of which drug?'

In [None]:
generate_answer(sample,question)

'S-warfarin'

In [None]:
sample2=q.iloc[2]
sample2["context"]

'CAGEr: precise TSS data retrieval and high-resolution promoterome mining for integrative analyses. Cap analysis of gene expression (CAGE) is a high-throughput method for transcriptome analysis that provides a single base-pair resolution map of transcription start sites (TSS) and their relative usage. Despite their high resolution and functional significance, published CAGE data are still underused in promoter analysis due to the absence of tools that enable its efficient manipulation and integration with other genome data types. Here we present CAGEr, an R implementation of novel methods for the analysis of differential TSS usage and promoter dynamics, integrated with CAGE data processing and promoterome mining into a first comprehensive CAGE toolbox on a common analysis platform. Crucially, we provide collections of TSSs derived from most published CAGE datasets, as well as direct access to FANTOM5 resource of TSSs for numerous human and mouse cell/tissue types from within R, greatly

In [None]:
question2=generate_question(sample2)

In [None]:
question2

'Which toolbox is used for promoterome mining using CAGE data?'

In [None]:
generate_answer(sample2,question2)

'CAGEr'

In [None]:
sample3=q.iloc[7]
sample3["context"]

'GRAF1a is a brain-specific protein that promotes lipid droplet clustering and growth, and is enriched at lipid droplet junctions. Lipid droplets are found in all cell types. Normally present at low levels in the brain, they accumulate in tumours and are associated with neurodegenerative diseases. However, little is known about the mechanisms controlling their homeostasis in the brain. We found that GRAF1a, the longest GRAF1 isoform (GRAF1 is also known as ARHGAP26), was enriched in the brains of neonates. Endogenous GRAF1a was found on lipid droplets in oleic-acid-fed primary glial cells. Exclusive localization required a GRAF1a-specific hydrophobic segment and two membrane-binding regions, a BAR and a PH domain. Overexpression of GRAF1a promoted lipid droplet clustering, inhibited droplet mobility and severely perturbed lipolysis following the chase of cells overloaded with fatty acids. Under these conditions, GRAF1a concentrated at the interface between lipid droplets. Although GRAF

In [None]:
question3=generate_question(sample3)

In [None]:
question3

'Which is the protein implicated in neurodegenerative diseases?'

In [None]:
generate_answer(sample3,question3)

'GRAF1'