In [None]:

!pip install transformers -q
!pip install wandb -q
!pip install sentencepiece -q

[K     |████████████████████████████████| 2.9 MB 13.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 54.6 MB/s 
[K     |████████████████████████████████| 636 kB 70.0 MB/s 
[K     |████████████████████████████████| 895 kB 65.5 MB/s 
[K     |████████████████████████████████| 56 kB 6.2 MB/s 
[K     |████████████████████████████████| 1.7 MB 13.8 MB/s 
[K     |████████████████████████████████| 139 kB 67.5 MB/s 
[K     |████████████████████████████████| 180 kB 70.7 MB/s 
[K     |████████████████████████████████| 97 kB 9.8 MB/s 
[K     |████████████████████████████████| 63 kB 2.5 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2 MB 13.0 MB/s 
[?25h

In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from nltk.translate.bleu_score import sentence_bleu

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

In [None]:
# Pick the compute device: use the GPU when PyTorch can see one, otherwise the CPU
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Mount Google Drive under /content/drive; every dataset and checkpoint path used
# later in this notebook lives below that mount point.
from google.colab import drive
# force_remount=True is passed so the mount is requested again even if one already exists
# (presumably to recover from a stale mount - confirm against the Colab docs).
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Login to wandb to log the model run and all the parameters.
# The leading `!` runs `wandb login` as a shell command from the notebook.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33myanivroth[0m (use `wandb login --relogin` to force relogin)


In [None]:
def save_model(model,EPOCH,file_name,LOSS,optimizer=None,
               save_dir='/content/drive/My Drive/Colab Notebooks/Final project - Zebra/saved models/'):
  """Write a training checkpoint to `<save_dir>/<file_name>E<EPOCH>.pt`.

  The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
  'optimizer_state_dict' and 'loss', which is the layout `load_model` reads.

  Args:
    model: module whose `state_dict()` is stored.
    EPOCH: epoch number; stored in the checkpoint and appended to the file name.
    file_name: file-name prefix placed before 'E<EPOCH>.pt'.
    LOSS: loss value stored alongside the weights.
    optimizer: optimizer whose `state_dict()` is stored. When omitted, the
      module-level variable named `optimizer` is used, which is what this
      function always did before the parameter existed.
    save_dir: directory that receives the checkpoint file.

  Raises:
    NameError: if no optimizer is passed and no module-level `optimizer` exists.
  """
  import os

  if optimizer is None:
    # Backward compatibility: earlier callers relied on a global `optimizer`.
    optimizer = globals().get('optimizer')
    if optimizer is None:
      raise NameError("name 'optimizer' is not defined; pass optimizer=... to save_model")

  PATH = os.path.join(save_dir, file_name+'E'+str(EPOCH)+'.pt')
  torch.save({
              'epoch': EPOCH,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': LOSS,
              }, PATH)

def load_model(model,optimizer,PATH,train=True,map_location='cpu'):
  """Restore model and optimizer state from a checkpoint written by `save_model`.

  Args:
    model: module that receives checkpoint['model_state_dict'].
    optimizer: optimizer that receives checkpoint['optimizer_state_dict'].
    PATH: path of the checkpoint file.
    train: when True the model is put in training mode, otherwise in eval mode.
    map_location: forwarded to `torch.load`. The default 'cpu' lets a checkpoint
      saved from a GPU run be opened on a CPU-only runtime; `load_state_dict`
      then copies the values into the model's existing parameters.

  Returns:
    A tuple `(epoch, loss)` with the values stored in the checkpoint.
  """
  # NOTE: torch.load unpickles the file - only open checkpoints from a trusted source.
  checkpoint = torch.load(PATH, map_location=map_location)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

  if train:
    model.train()
  else:
    model.eval()

  # Previously these two values were read and then discarded.
  return checkpoint['epoch'], checkpoint['loss']

In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):
    """Dataset that tokenizes (source, target) text pairs taken from a dataframe.

    The dataframe must expose a `ctext` column (model input) and a `text` column
    (target). Each item is tokenized on access, padded/truncated to a fixed length,
    and returned as a dict of 1-D long tensors.

    Args:
        dataframe: pandas DataFrame with `ctext` and `text` columns.
        tokenizer: object providing `batch_encode_plus` (a Hugging Face tokenizer
            in this notebook).
        source_len: fixed token length of the encoded source.
        summ_len: fixed token length of the encoded target.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def _encode(self, raw_text, max_length):
        """Collapse whitespace in `raw_text` and tokenize it to exactly `max_length` tokens."""
        cleaned = ' '.join(str(raw_text).split())
        # padding='max_length' replaces the deprecated pad_to_max_length=True;
        # truncation=True makes the cut-off at max_length explicit instead of
        # relying on the tokenizer's fallback behavior.
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        # Positional lookup: works even when the dataframe index is not 0..n-1.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # squeeze(0) drops only the batch dimension added by batch_encode_plus.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)
        target_mask = target['attention_mask'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_mask': target_mask.to(dtype=torch.long)
        }

In [None]:
# This function creates reduced inputs, using a threshold to set how many tokens are kept

def validate_attention(tokenizer, model, device, loader, threshold,
                       marker_token_id=377, marker_span=5, protected_token_id=3):
    """Shorten every input in `loader` by dropping its lowest-attention tokens.

    For each batch the model is run once and the first entry of
    `outputs.encoder_attentions` is averaged over axes 1 and 2, giving one score
    per input token (assumes the Hugging Face layout
    (batch, heads, query, key) - TODO confirm for the model in use).
    Each instance is then reduced by `_reduce_instance`.

    Args:
        tokenizer: provides `pad_token_id` and `decode`.
        model: callable returning an object with `encoder_attentions`.
        device: device the batch tensors are moved to.
        loader: iterable of dicts with 'source_ids', 'source_mask',
            'target_ids' and 'target_mask' tensors.
        threshold: fraction of the unprotected, non-padding tokens to keep.
        marker_token_id: token id that marks the end of the protected prefix.
            The default 377 is, per the original author's note, the token of the
            word FINDINGS.
        marker_span: number of positions, starting at the marker, that are
            protected together with everything before it (the original note
            says the marker word is 5 tokens long).
        protected_token_id: token id whose occurrences are never dropped.

    Returns:
        A list with one decoded string per instance, in loader order. An
        instance that cannot be reduced (for example because the marker token
        does not occur exactly once) contributes an empty string.
    """
    model.eval()
    reduced_inputs = []
    with torch.no_grad():
        for counter, data in enumerate(loader, 0):
            if counter % 500 == 0:
                print(counter)
            y = data['target_ids'].to(device, dtype=torch.long)
            # Padding positions in the labels are replaced by -100; masked_fill
            # builds a new tensor, so the batch held by the loader is not modified.
            y = y.masked_fill(y == tokenizer.pad_token_id, -100)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)
            decoder_attention_mask = data['target_mask'].to(device, dtype=torch.long)
            outputs = model(input_ids=ids, attention_mask=mask,
                            decoder_attention_mask=decoder_attention_mask, labels=y)

            ids = ids.cpu().numpy()

            # Attention weights of the first encoder layer.
            attention = outputs.encoder_attentions[0].cpu().numpy()
            # One score per token: average over axes 1 and 2.
            mean = np.mean(attention, axis=(1, 2))

            for i in range(attention.shape[0]):
                reduced_inputs.append(
                    _reduce_instance(tokenizer, ids[i], mean[i], threshold,
                                     marker_token_id, marker_span, protected_token_id)
                )

    return reduced_inputs


def _reduce_instance(tokenizer, ids_i, scores_i, threshold,
                     marker_token_id, marker_span, protected_token_id):
    """Drop the lowest-scoring tokens of one instance and decode what remains ('' on failure)."""
    # The original relied on int(np.argwhere(...)) raising unless there was exactly
    # one match; the same rule is applied here explicitly.
    marker_positions = np.flatnonzero(ids_i == marker_token_id)
    if marker_positions.size != 1:
        return ''

    try:
        # Protect the prefix up to and including the marker word by giving it score 1.
        scores_i[:int(marker_positions[0]) + marker_span] = 1
        protected_count = np.count_nonzero(scores_i == 1)
        # Occurrences of the protected token id are also given score 1.
        scores_i[ids_i == protected_token_id] = 1
        order = np.argsort(scores_i)  # token positions, lowest score first

        seq_len = ids_i.shape[0]
        # Non-zero ids are treated as real tokens (assumes the pad id is 0 - TODO confirm).
        real_tokens = np.count_nonzero(ids_i)
        kept_unprotected = int(threshold * (real_tokens - protected_count))
        # Skip the first (seq_len - real_tokens) sorted positions (presumably the
        # padding, which is expected to score lowest), then drop everything up to
        # the tokens that are kept.
        drop = order[seq_len - real_tokens:seq_len - protected_count - kept_unprotected]

        kept_ids = np.delete(ids_i, drop, axis=0)
        # Remove the protected prefix from the output, keeping the marker word itself.
        kept_ids = kept_ids[protected_count - marker_span:]
        return tokenizer.decode(kept_ids, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False)
    except Exception:
        # Best-effort: a failing instance yields an empty string. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
        return ''

                    

In [None]:
# Load the pretrained t5-base tokenizer and model, then restore the fine-tuned checkpoint
PATH = '/content/drive/My Drive/Colab Notebooks/Final project - Zebra/saved models/findings chexpert_E1.pt'

tokenizer = T5Tokenizer.from_pretrained("t5-base")
# output_attentions=True makes the model return its attention weights,
# which validate_attention reads from `outputs.encoder_attentions`.
model = T5ForConditionalGeneration.from_pretrained("t5-base", output_attentions=True).to(device)
# load_model restores the optimizer state as well, so an optimizer has to exist.
optimizer = torch.optim.Adam(lr=1e-4, params=model.parameters())

load_model(model, optimizer, PATH, train=False)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [None]:
# Sequence-length limits are stored on the wandb config object
config = wandb.config
config.MAX_LEN = 512
config.SUMMARY_LEN = 150

# shuffle is False so the loader yields rows in dataframe order
train_params = dict(batch_size=4, shuffle=False, num_workers=0)


def _load_report_frame(csv_path):
    """Read a reports CSV and return a two-column frame: `ctext` (input) and `text` (target).

    The input is the 'chexpert medical tags' column followed by '. ' and the
    'report_findings' column; the target is the 'report_impression' column.
    """
    raw = pd.read_csv(csv_path, encoding='latin-1')
    tags_and_findings = raw['chexpert medical tags'] + '. ' + raw['report_findings']
    return (
        raw.assign(**{'chexpert medical tags': tags_and_findings})
        [['chexpert medical tags', 'report_impression']]
        .reset_index(drop=True)
        .rename(columns={'chexpert medical tags': 'ctext', 'report_impression': 'text'})
    )


# The 30 held-out examples
df = _load_report_frame('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/30 examples.csv')
examples_dataset = df.copy()
examples_set = CustomDataset(examples_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
examples_loader = DataLoader(examples_set, **train_params)

# The main dataset (the file with the 30 examples removed)
df = _load_report_frame('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/chexbert tags and reports fixed input remove 30.csv')
main_dataset = df.copy()
main_set = CustomDataset(main_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
main_loader = DataLoader(main_set, **train_params)

In [None]:
# Create reduced-input columns for the 30 examples at three thresholds: 0.75, 0.5, 0.25
three_quarters, half, quarter = [
    validate_attention(tokenizer, model, device, examples_loader, threshold=fraction)
    for fraction in (0.75, 0.5, 0.25)
]

examples_df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/30 examples.csv',
    encoding='latin-1',
)
examples_df = examples_df.assign(**{
    'attention_0.75%': three_quarters,
    'attention_0.5%': half,
    'attention_0.25%': quarter,
})

# validate_attention returns '' for instances it could not reduce; store those as NaN
examples_df[examples_df==''] = np.nan

examples_df.to_csv('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/examples_attention.csv')


In [None]:
# Create reduced-input columns for the main dataset at three thresholds: 0.75, 0.5, 0.25
three_quarters, half, quarter = [
    validate_attention(tokenizer, model, device, main_loader, threshold=fraction)
    for fraction in (0.75, 0.5, 0.25)
]

main_df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/chexbert tags and reports fixed input remove 30.csv',
    encoding='latin-1',
)
main_df = main_df.assign(**{
    'attention_0.75%': three_quarters,
    'attention_0.5%': half,
    'attention_0.25%': quarter,
})

# validate_attention returns '' for instances it could not reduce; store those as NaN
main_df[main_df==''] = np.nan
main_df.to_csv('/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/findings_attention.csv')


