In [5]:
# !pip install transformers
!pip install evaluate
!pip install rouge
!pip install torch


import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast, AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')
# !ls "/content/drive/My Drive/Colab Notebooks"

In [6]:
# !cd "/content/drive/My Drive/Colab Notebooks"
model_path = "Clinical-T5-Large/"

In [7]:
TOKENIZER = AutoTokenizer.from_pretrained(model_path)
MODEL = T5ForConditionalGeneration.from_pretrained(model_path)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 512   # Question Length
T_LEN = 512    # Target Length
BATCH_SIZE = 4
DEVICE = "cuda:0"
MODEL = MODEL.to(device=DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# from datasets import load_dataset

# dataset = load_dataset("qiaojin/PubMedQA", "pqa_unlabeled")
from datasets import load_dataset

dataset = load_dataset("starmpcc/Asclepius-Synthetic-Clinical-Notes")

Downloading readme:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/402M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/158114 [00:00<?, ? examples/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'note', 'question', 'answer', 'task'],
        num_rows: 158114
    })
})

In [10]:
# def create_dataframe(context_list, question_list, answer_list):
#     if len(context_list) != len(question_list) or len(question_list) != len(answer_list):
#         raise ValueError("All lists must have the same length.")

#     context_input_list = [' '.join(i['contexts']) for i in context_list]

#     data = {
#         'context': context_input_list,
#         'question': question_list,
#         'answer': answer_list
#     }

#     df = pd.DataFrame(data)
#     return df

df = pd.DataFrame(dataset['train'])

# Filter the dataset for rows with tasks 'summarization' or 'question answering'
filtered_data = df[df['task'].isin(['Summarization', 'Question Answering'])]

# Create a new DataFrame with the specified columns
filtered_data = filtered_data[['note', 'question', 'answer']]
filtered_data.columns = ['context', 'question', 'answer']

# import ace_tools as tools; tools.display_dataframe_to_user(name="Filtered Clinical Notes DataFrame", dataframe=filtered_data)

filtered_data = filtered_data.reset_index(drop=True)

filtered_data.head()

Unnamed: 0,context,question,answer
0,Hospital Course Summary:\n\nAdmission Date: [I...,What were the key improvements in the patient'...,"During the hospital course, the patient's medi..."
1,Discharge Summary:\n\nPatient: 52-year-old mal...,How did the patient's treatment for dysphagia ...,"During the patient's hospital stay, treatment ..."
2,Discharge Summary:\n\nPatient Name: [REDACTED]...,"Can you provide a summary of the treatment, ho...",The 45-year-old female patient with a history ...
3,DISCHARGE SUMMARY:\n\nPatient Name: X\nMedical...,"Based on the given discharge summary, can you ...",The patient with a multifocal invasive mammary...
4,Hospital Course:\n\nThe patient is a 78-year-o...,What are the key findings and diagnosis of the...,The key findings of the patient include abnorm...


In [11]:
# data_df = create_dataframe(dataset['train']['context'], dataset['train']['question'], dataset['train']['long_answer'])

In [12]:
# data_df.head()
data_df = filtered_data
data_df.head()

Unnamed: 0,context,question,answer
0,Hospital Course Summary:\n\nAdmission Date: [I...,What were the key improvements in the patient'...,"During the hospital course, the patient's medi..."
1,Discharge Summary:\n\nPatient: 52-year-old mal...,How did the patient's treatment for dysphagia ...,"During the patient's hospital stay, treatment ..."
2,Discharge Summary:\n\nPatient Name: [REDACTED]...,"Can you provide a summary of the treatment, ho...",The 45-year-old female patient with a history ...
3,DISCHARGE SUMMARY:\n\nPatient Name: X\nMedical...,"Based on the given discharge summary, can you ...",The patient with a multifocal invasive mammary...
4,Hospital Course:\n\nThe patient is a 78-year-o...,What are the key findings and diagnosis of the...,The key findings of the patient include abnorm...


In [13]:
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]

        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [14]:
# Dataloader

train_data, val_data = train_test_split(filtered_data, test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, data_df, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [15]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0
epochs = 5

for epoch in range(epochs):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1

    #Evaluation
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        val_loss += outputs.loss.item()
        val_batch_count += 1

    print(f"{epoch+1}/{epochs} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")


Training batches:   0%|▏                                                            | 19/7959 [00:17<2:00:10,  1.10it/s]


KeyboardInterrupt: 

In [None]:
MODEL.save_pretrained("Clinical_T5_qa_model_asclepius")
TOKENIZER.save_pretrained("Clinical_T5_qa_tokenizer_asclepius")
# /content/drive/My Drive/Colab Notebooks/

# Saved files
# """('qa_tokenizer/tokenizer_config.json',
#  'qa_tokenizer/special_tokens_map.json',
#  'qa_tokenizer/spiece.model',
# 'qa_tokenizer/added_tokens.json',
# 'qa_tokenizer/tokenizer.json')"""

In [18]:
# MODEL.save_model("/content/drive/My Drive/Colab Notebooks/Clinical_T5_qa_model_actual")

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.8 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.8 kB 640.0 kB/s eta 0:00:01
     -------------------------------------- 43.8/43.8 kB 533.0 kB/s eta 0:00:00
Collecting filelock (from transformers)
  Using cached filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Using cached huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ---------------------------------------- 61.0/61.0 kB 1.6 MB/s eta 0:00:00
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Download

In [3]:

from transformers import AutoModelForDocumentQuestionAnswering
# model2 = T5ForConditionalGeneration.from_pretrained("Clinical_T5_qa_model_asclepius")
# OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 512   # Question Length
T_LEN = 512    # Target Length
BATCH_SIZE = 4
DEVICE = "cuda:0"
model2 = T5ForConditionalGeneration.from_pretrained("../Clinical_T5_qa_model_asclepius").to(DEVICE)
tokenizer2 = AutoTokenizer.from_pretrained("../Clinical_T5_qa_tokenizer_asclepius")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


NameError: name 'T5ForConditionalGeneration' is not defined

In [6]:
def predict_answer(context, question, ref_answer=None):
    inputs = tokenizer2(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = model2.generate(input_ids=input_ids, attention_mask=attention_mask)

    predicted_answer = tokenizer2.decode(outputs.flatten(), skip_special_tokens=True)

    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer],
                            references=[ref_answer])

        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer,
            "Predicted Answer: ": predicted_answer,
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [7]:
context = data_df.iloc[0]['context']
question = data_df.iloc[0]['question']
answer = data_df.iloc[0]['answer']

predict_answer(context, question, answer)

NameError: name 'data_df' is not defined

In [8]:
context = """ We may use longitudinal analyses to examine long-term symptom outcomes for patients in the immediate group.
Analysis of variance may be used to examine changes in PROMIS scores concerning session attendance and/or the results of the tangibility exercise.
A conventional content approach will be used to qualitatively analyze the semi-structured exit interview data.
Study Timelines:
  Duration of an individual subject’s participation: 16 weeks.
  Estimated timeline to enroll all subjects: 1 year.
  Estimated timeline to complete the study (complete primary analyses): 1.5 years.
Study Endpoints:
  Primary Endpoint: Change in PROMIS-29 physical health summary score from before to after intervention in all 48 subjects.
  Secondary Endpoints:
    Changes in all other PROMIS scores from before to after intervention in all 48 subjects.
    Changes in all PROMIS scores between the immediate and wait-list control groups.
  Primary or secondary safety endpoints: N/A.
Recordings:
  This research involves: None of the above (Audio, photographs, video recordings with or without audio).
  """

question = "What are the Study Endpoints?"

answer = 'I dont know'

predict_answer(context, question, answer)

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Context: 
  We may use longitudinal analyses to examine long-term symptom outcomes for patients in the immediate group.
Analysis of variance may be used to examine changes in PROMIS scores concerning session attendance and/or the results of the tangibility exercise.
A conventional content approach will be used to qualitatively analyze the semi-structured exit interview data.
Study Timelines:
  Duration of an individual subject’s participation: 16 weeks.
  Estimated timeline to enroll all subjects: 1 year.
  Estimated timeline to complete the study (complete primary analyses): 1.5 years.
Study Endpoints:
  Primary Endpoint: Change in PROMIS-29 physical health summary score from before to after intervention in all 48 subjects.
  Secondary Endpoints:
    Changes in all other PROMIS scores from before to after intervention in all 48 subjects.
    Changes in all PROMIS scores between the immediate and wait-list control groups.
  Primary or secondary safety endpoints: N/A.
Recordings:
  This

{'Reference Answer: ': 'I dont know',
 'Predicted Answer: ': 'The Study Endpoints for a patient with PROMIS-29 physical health',
 'BLEU Score: ': {'google_bleu': 0.0}}