# Generate sections of radiology reports from a given table containing medical tags

1. The input tables for this notebook can be found in the "Data" folder on the main Git page
2. The input and output paths should be changed to your desired paths in order to get the results 

## Initialization

In [None]:
# Install the third-party packages this notebook imports below
# (-q keeps the pip output quiet).
!pip install transformers -q
!pip install wandb -q
!pip install sentencepiece -q

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from nltk.translate.bleu_score import sentence_bleu

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

In [None]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'This notebook is running with device={device}')


This notebook is running with device=cuda


In [None]:
# Mount Google Drive under /content/drive so the CSV inputs and outputs used
# below can be read and written; force_remount=True remounts even if a mount
# already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Login to wandb to log the model run and all the parameters.
# The command prompts for the API key of your wandb profile.
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Methods 

In [None]:
def save_model(model, EPOCH, file_name, LOSS, optimizer=None, save_dir=None):
    """Save a training checkpoint to disk (PC / Drive).

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss', written to
    ``<save_dir>/<file_name>E<EPOCH>.pt``.

    Parameters:
        model: the model whose state_dict is saved
        EPOCH: the current epoch (also used in the output file name)
        file_name: prefix of the output file name
        LOSS: the current loss
        optimizer: optimizer whose state_dict is saved; when omitted, the
            notebook-level ``optimizer`` variable is used
        save_dir: output directory; defaults to the project's
            "saved models" folder on Drive (change it to your folders)

    Returns:
        The path the checkpoint was written to.

    Raises:
        ValueError: if no optimizer is passed and no global one exists.
    """
    import os

    if optimizer is None:
        # Backward compatibility: the original relied on a global optimizer.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise ValueError('save_model needs an optimizer (argument or global)')

    if save_dir is None:
        save_dir = '/content/drive/My Drive/Colab Notebooks/Final project - Zebra/saved models/'
    os.makedirs(save_dir, exist_ok=True)

    # The path was previously never defined, which made every call fail.
    PATH = os.path.join(save_dir, file_name + 'E' + str(EPOCH) + '.pt')
    torch.save({
        'epoch': EPOCH,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': LOSS,
    }, PATH)
    return PATH


def load_model(PATH, model, optimizer, train=True, map_location=None):
    """Load a checkpoint written by ``save_model`` from PC / Drive.

    Parameters:
        PATH: the path of the checkpoint file to load
        model: model which the data from the file will be loaded into
        optimizer: optimizer which the data from the file will be loaded
            into; pass None to skip restoring the optimizer state
        train: boolean status deciding whether the model is put in
            train mode (True) or eval mode (False)
        map_location: forwarded to ``torch.load``; when omitted, the device
            of the model's first parameter is used so that a checkpoint
            saved on GPU can still be loaded on a CPU-only machine

    Returns:
        (epoch, loss) as stored in the checkpoint.
    """
    if map_location is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            map_location = first_param.device

    checkpoint = torch.load(PATH, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    if train:
        model.train()
    else:
        model.eval()

    # Previously these values were read and then thrown away.
    return checkpoint['epoch'], checkpoint['loss']

In [None]:

class CustomDataset(Dataset):
    """Dataset that reads a dataframe and tokenizes it for the dataloader.

    The dataframe must expose a ``ctext`` column (model input) and a ``text``
    column (target). Each item is a dict of four long tensors:
    'source_ids', 'source_mask', 'target_ids' and 'target_mask'.

    Parameters:
        dataframe: table holding the ``text`` and ``ctext`` columns
        tokenizer: tokenizer exposing ``batch_encode_plus`` (this notebook
            passes a T5Tokenizer)
        source_len: length the encoded input is padded / truncated to
        summ_len: length the encoded target is padded / truncated to
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def _encode(self, raw_text, max_length):
        """Normalize whitespace and tokenize one string to a fixed length."""
        cleaned = ' '.join(str(raw_text).split())
        # padding='max_length' + truncation=True is the explicit replacement
        # for the deprecated pad_to_max_length=True flag.
        encoded = self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops only the batch axis, even for a length-1 sequence.
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, index):
        # Positional access keeps working when the frame's index was not reset.
        source_ids, source_mask = self._encode(self.ctext.iloc[index], self.source_len)
        target_ids, target_mask = self._encode(self.text.iloc[index], self.summ_len)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_mask': target_mask.to(dtype=torch.long),
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over ``loader`` and update the model after every batch.

    Parameters:
        epoch: index of the current epoch (only used in the printed message)
        tokenizer: tokenizer whose ``pad_token_id`` marks padding in targets
        model: the model to train
        device: CPU or GPU
        loader: the data for training
        optimizer: optimizer for training

    Returns:
        None. The loss is logged to wandb every 10 batches and printed
        every 500 batches.
    """
    model.train()

    for step, batch in enumerate(loader):
        def on_device(key):
            return batch[key].to(device, dtype=torch.long)

        ids = on_device('source_ids')
        mask = on_device('source_mask')
        decoder_attention_mask = on_device('target_mask')

        # Padding positions in the labels are replaced by -100 in place
        # (the label value the loss skips; see the T5 documentation).
        lm_labels = on_device('target_ids')
        lm_labels.masked_fill_(lm_labels == tokenizer.pad_token_id, -100)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            wandb.log({"Training Loss": loss.item()})
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

In [None]:
def validate(tokenizer, model, device, loader):
    """Evaluate the given model on the given data.

    For every batch the model generates text (2 beams, at most 175 tokens),
    and the generated, target and input ids are decoded back to strings.
    Batches of any size are handled; previously only batches of exactly two
    samples were kept, so a trailing smaller batch was silently dropped.

    Parameters:
        tokenizer: tokenizer used for decoding the ids
        model: the model to evaluate
        device: CPU or GPU
        loader: the data for evaluating

    Returns:
        predictions - the generated text
        actuals - the ground truth
        inputs - the input text
        BLEUs - the sentence-level BLEU score between each prediction and
            its ground truth
    """

    def decode_all(sequences):
        # Decode every id sequence of a batch into plain text.
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in sequences
        ]

    model.eval()
    predictions = []
    actuals = []
    inputs = []
    BLEUs = []
    with torch.no_grad():
        for step, data in enumerate(loader):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=175,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )

            preds = decode_all(generated_ids)
            targets = decode_all(y)
            sources = decode_all(ids)
            # One score per sample, whatever the batch size is.
            scores = [
                sentence_bleu([target.split(' ')], pred.split(' '))
                for target, pred in zip(targets, preds)
            ]

            if step % 100 == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(targets)
            inputs.extend(sources)
            BLEUs.extend(scores)

    return predictions, actuals, inputs, BLEUs

## Main 

In [None]:
# WandB – start a new run for this experiment
wandb.init(project="transformers_report_generation_new", entity='zebra-idc')

# Hyperparameters and inputs, stored on the wandb config so they are logged with the run
config = wandb.config
hyperparameters = {
    'TRAIN_BATCH_SIZE': 4,    # input batch size for training
    'VALID_BATCH_SIZE': 2,    # input batch size for testing
    'TRAIN_EPOCHS': 1,        # number of epochs to train
    'LEARNING_RATE': 1e-4,    # learning rate
    'SEED': 42,               # random seed (default: 42)
    'MAX_LEN': 512,           # length of the encoded model input
    'SUMMARY_LEN': 150,       # length of the encoded target text
}
for hp_name, hp_value in hyperparameters.items():
    setattr(config, hp_name, hp_value)

# Reproducibility: seed numpy and pytorch, and ask cuDNN for deterministic kernels
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
torch.backends.cudnn.deterministic = True

# Tokenizer used for encoding and decoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def _load_report_table(csv_path):
    """Read a reports CSV and return a frame with 'ctext' (input) and 'text' (target).

    Rows with an empty 'report_impression' or 'report_findings' cell are
    dropped. The model input is the medical tags joined with the findings.
    """
    table = pd.read_csv(csv_path, encoding='latin-1')
    table = table.dropna(subset=['report_impression', 'report_findings'])
    # NOTE(review): a row with a missing 'chexbert medical tags' value yields a
    # NaN input, which CustomDataset turns into the string 'nan' - confirm
    # whether such rows should be dropped as well.
    return pd.DataFrame({
        # The column must be named 'ctext': the previous rename targeted a
        # column that had already been dropped, so 'ctext' never existed.
        'ctext': table['chexbert medical tags'] + '. ' + table['report_findings'],
        'text': table['report_impression'],
    }).reset_index(drop=True)


# Import the input table for the model
# Need to change it for each input we add to the model
df = _load_report_table('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/chexbert&pert reports - train.csv')
#print(df.head())

# Split Datasets to train and validation
# Defining the train size. So 97% of the data will be used for training and the rest will be used for validation.
train_size = 0.97
train_dataset = df.sample(frac=train_size, random_state=config.SEED)
val_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Add pre defined 30 instances (removed from the train set)
holdout_df = _load_report_table('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/chexbert&pert reports - test.csv') #change the input path to yours

# Concat the pre defined 30 instances with the validation set
val_dataset = pd.concat([holdout_df, val_dataset], ignore_index=True)

#print shapes ("FULL" is the table read from the train file, before the split)
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

# Wrap both splits in CustomDataset so a DataLoader can batch them
training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# Dataloader settings: training batches are shuffled, validation batches keep their order
train_params = dict(
    batch_size=config.TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_params = dict(
    batch_size=config.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# Dataloaders for training and validation
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

# The model: pretrained t5-base with its language-model head, moved to the selected device
model = T5ForConditionalGeneration.from_pretrained("t5-base", output_attentions=True).to(device)

# The optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

# Let wandb track the model
wandb.watch(model, log="all")

# Training: one call to train() per epoch
print('--------Training---------')
for epoch in range(config.TRAIN_EPOCHS):
    train(
        epoch=epoch,
        tokenizer=tokenizer,
        model=model,
        device=device,
        loader=training_loader,
        optimizer=optimizer,
    )


# Validation: generate text for the validation set and store it next to the ground truth
print('--------Validation---------')
predictions, actuals, inputs, BLEUs = validate(
    tokenizer=tokenizer,
    model=model,
    device=device,
    loader=val_loader,
)
result_columns = {
    'Input Text': inputs,
    'Actual Text': actuals,
    'Generated Text': predictions,
    'BLEU Validation': BLEUs,
}
final_df = pd.DataFrame(result_columns)  # one row per validated sample
final_df.to_csv('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/prediction_styling_history_addedInformation.csv') #change the output path to yours
print('Output Files generated for review')

#save_model(model,config.TRAIN_EPOCHS,'findings chexpert_',0.1) # Save the model 

