In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from nltk.translate.bleu_score import sentence_bleu
import re


In [None]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU
from torch import cuda
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
# Mount Google Drive under /content/drive so the CSV paths used further down resolve.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Keywords that signal each medical tag when they appear in a report sentence.
# Maps tag name -> list of lower-case keywords.
key_words = {
  'Cardiomegaly': [
    'heart size', 'cardiomegaly', 'cardiac silhouette', 'chf', 'heart failure', 'cardiac enlargement',
  ],
  'Atelectasis': ['atelectasis', 'collapsed', 'atelectases', 'collapse'],
  'Pleural Effusion': ['pleural effusion', 'pleural fluid', 'effusions', 'effusion'],
  'Pneumothorax': ['pneumothorax'],
  'Edema': ['edema', 'vascular congestion'],
  'Consolidation': ['consolidation', 'consolidations'],
  'Lung Opacity': [
    'opacification', 'pulmonary infiltrates', 'opacities', 'opacity', 'scarring',
    'biventricular decompensation',
  ],
  'Pneumonia': ['infectious', 'infection', 'pneumonia'],
  'Support Devices': [
    'tube', 'pacemaker', 'internal jugular', 'pacer', 'tubes', 'picc', 'icd', 'leads', 'catheter',
  ],
  'Fracture': ['fractures', 'fracture'],
  'Enlarged Cardiomediastinum': [
    'cardiomediastinal silhouette', 'mediastinum', 'mediastinal', 'aortic contour', 'contour',
  ],
  'Lung Lesion': ['mass', 'nodules', 'nodular', 'lesion'],
  'No Finding': ['no'],
  'Pleural Other': ['fibrosis'],
}

In [None]:
# This function creates a list of medical tags from the raw tag string
def split_tags (tags):
  """Turn a raw tag string into a list of bare tag names.

  The status words ``positive``, ``negative`` and ``uncertain`` are deleted
  together with the space in front of them (and the space behind them when
  there is one); what is left is split on commas.

  Args:
    tags: the raw tag string.

  Returns:
    A list of strings, one per comma-separated piece (always at least one
    element, which may be the empty string).
  """
  # Order matters: for each status word the space-delimited form is removed
  # before the form without a trailing space.
  status_markers = (
    " positive ", " positive",
    " negative ", " negative",
    " uncertain ", " uncertain",
  )
  remaining = tags
  for marker in status_markers:
    remaining = re.sub(marker, '', remaining)
  return re.split(",", remaining)

In [None]:
# This function is used to create the added-information prefix. It searches each sentence of the impression for a keyword that represents one of the existing medical tags.
# It returns True if there is a sentence without any connection to the medical tags.
def cutting_added_info(impression, tags):
  """Tell whether an impression contains a sentence unrelated to its tags.

  The impression is split on '.', each piece is lower-cased, and a piece is
  considered explained when it is ignorable (shorter than 5 characters, or
  ending in the character '1') or when any keyword of any given tag is found
  in it. Keywords come from the module-level ``key_words`` mapping and are
  applied with ``re.search``, i.e. they are treated as regular expressions.

  Args:
    impression: the impression text.
    tags: iterable of tag names; each must be a key of ``key_words``.

  Returns:
    True as soon as one piece is not explained, False when every piece is.
    Note that a piece is only ever explained while iterating over a keyword,
    so with no tags (or only tags without keywords) the result is always True.

  Raises:
    KeyError: if a tag that gets looked up is missing from ``key_words``.
  """
  # Raw string: '\.' in a plain literal is an invalid escape sequence.
  for fragment in re.split(r'\.', impression):
    sentence = fragment.lower()
    # Does not depend on the tag or keyword, so evaluate it once per sentence.
    # Presumably the trailing '1' targets a list marker left over from
    # splitting "1." -- TODO confirm the intent.
    ignorable = len(sentence) < 5 or sentence[-1] == '1'
    explained = any(
      ignorable or re.search(key_word, sentence) is not None
      for tag in tags
      for key_word in key_words[tag]
    )
    if not explained:
      return True
  return False

 

In [None]:
# Load the tagged reports, keep only rows that have both an impression and a
# findings text, and renumber the rows 0..n-1.
df = (
  pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/Final project - Zebra/Data/chexbert medical tags and reports after remove 30.csv',
    encoding='latin-1',
  )
  .dropna(subset=['report_impression', 'report_findings'])
  .reset_index(drop=True)
)

In [None]:
# Each function below adds a column to the dataframe recording the existence/content of one prefix

def added_information(data):
  """Add an 'added_information' column with the added-information prefix.

  For every row the 'chexbert medical tags' value is parsed with
  ``split_tags`` and the 'report_impression' text is checked with
  ``cutting_added_info``. The row is labelled '[with_added_information]' or
  '[without_added_information]' accordingly. Rows whose first parsed tag is
  the string 'nan' are left unlabelled (NaN).

  The number of '[with_added_information]' rows is printed.

  Args:
    data: DataFrame with 'chexbert medical tags' and 'report_impression'
      columns. It is modified in place.

  Returns:
    The same DataFrame object, with the 'added_information' column set.
  """
  labels = []
  counter = 0
  # Walk the column values rather than data.loc[i]: a positional counter is
  # only a valid label when the index happens to be exactly 0..n-1.
  for raw_tags, impression in zip(data['chexbert medical tags'], data['report_impression']):
    tags = split_tags(str(raw_tags))
    if tags[0] == 'nan':
      # No usable tags for this row: keep the cell empty.
      labels.append(float('nan'))
      continue
    if cutting_added_info(str(impression), tags):
      labels.append('[with_added_information]')
      counter += 1
    else:
      labels.append('[without_added_information]')

  print(counter)

  # One positional column assignment instead of growing the frame cell by cell.
  data['added_information'] = labels
  return data

def length(data, threshold=12.5):
  """Add a 'length' column marking each impression as '[short]' or '[long]'.

  The measure is the number of words in 'report_impression' divided by the
  number of tags parsed from 'chexbert medical tags' with ``split_tags``.
  Rows whose average is below ``threshold`` are '[short]', all others are
  '[long]'.

  Args:
    data: DataFrame with 'report_impression' and 'chexbert medical tags'
      columns. It is modified in place.
    threshold: words-per-tag cut-off. The default of 12.5 is described by the
      original author as the global average.

  Returns:
    The same DataFrame object, with the 'length' column set.
  """
  labels = []
  # Walk the column values rather than data.loc[i], so the result does not
  # depend on the frame having a default 0..n-1 index.
  for impression, raw_tags in zip(data['report_impression'], data['chexbert medical tags']):
    # Splits on every single whitespace character, so consecutive whitespace
    # contributes empty pieces to the count (unchanged from the original).
    words = len(re.split(r'\s', str(impression)))
    num_of_tags = len(split_tags(str(raw_tags)))
    # Compare the exact average: truncating it to an int first would label
    # averages between the threshold and the next whole number as short.
    avg = words / num_of_tags
    labels.append('[short]' if avg < threshold else '[long]')

  data['length'] = labels
  return data
    
def find_history_sections(df):
  """Add a 'history' column flagging impressions that refer to earlier exams.

  An impression is '[with_history]' when its text contains any of the marker
  substrings listed below, otherwise '[without_history]'. Matching is
  case-sensitive: only the spellings listed are recognised.

  Args:
    df: DataFrame with a 'report_impression' column. It is modified in place.

  Returns:
    The same DataFrame object, with the 'history' column set.
  """
  history_markers = ('prior', 'comparison', 'study', 'compared', 'Comparison', 'New', 'new')

  def has_history(impression):
    # str() so that missing values (NaN) are handled like any other text.
    text = str(impression)
    return any(marker in text for marker in history_markers)

  # Built from the column values and assigned in one go, so it works for any
  # index (not only 0..n-1) and avoids cell-by-cell writes.
  df['history'] = [
    '[with_history]' if has_history(impression) else '[without_history]'
    for impression in df['report_impression']
  ]
  return df


def find_numbered_sentences(df):
  """Add a 'numbered' column flagging impressions written as a numbered list.

  An impression is '[numbered]' when its text contains the substring '1.',
  otherwise '[non_numbered]'.

  Args:
    df: DataFrame with a 'report_impression' column. It is modified in place.

  Returns:
    The same DataFrame object, with the 'numbered' column set.
  """
  # Built from the column values and assigned in one go, so it works for any
  # index (not only 0..n-1) and avoids cell-by-cell writes.
  # str() so that missing values (NaN) are handled like any other text.
  df['numbered'] = [
    '[numbered]' if '1.' in str(impression) else '[non_numbered]'
    for impression in df['report_impression']
  ]
  return df

In [None]:
  # Add the four prefix columns one pass at a time:
  # added information -> length -> history -> numbering.
  new_data = added_information(df)
  new_data = length(new_data)
  new_data = find_history_sections(new_data)
  new_data = find_numbered_sentences(new_data)

In [None]:
# Save the labelled reports back to Drive; the row index is written as well (pandas default).
new_data.to_csv(
  path_or_buf='/content/drive/My Drive/Colab Notebooks/Final project - Zebra/data after remove 30 with prefix.csv',
  index=True,
)