In [33]:
#import torch
#from transformers import CLIPProcessor, CLIPModel
#from PIL import Image
from xml.dom import minidom
import os
import pickle
from collections import Counter
from nltk import ngrams
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
import re
import glob

[nltk_data] Downloading package stopwords to /Users/rhia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rhia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# CLIP Model on IU Data

# Map the Findings to Relevant Images

In [3]:
ctr=0
img_findings_dict={}

for filename in os.listdir('input/ecgen-radiology/'):
    docs=minidom.parse('input/ecgen-radiology/'+filename)
    img_ref=docs.getElementsByTagName("AbstractText")
    
    for i in img_ref:
        if i.getAttribute("Label")=='FINDINGS': 
            if i.firstChild:
                finding=str(i.firstChild.data)
                f_lst=finding.split()
                if 1<len(f_lst)<63:
                    img_ref=docs.getElementsByTagName("parentImage")
                    img_arr=[str(i.getAttribute("id")) for i in img_ref]
                    if finding not in img_findings_dict:
                        img_findings_dict[finding]=img_arr
                    else:
                        for i in img_arr:
                            img_findings_dict[finding].append(i)

In [4]:
print("There are {} unique findings in the dataset".format(len(img_findings_dict)))

There are 2507 unique findings in the dataset


# Find the Most 1-Gram used in Findings

Pre-processing:
- Remove Stop Words as defined by the standard NLTK english language and meaningless words in the dataset like 'within','xxxx','show'
- Remove punctuation
- Only consider nouns as flagged by the NLTK package

Future Steps:
- Consider making plural words singular

The 10 most used nouns in the dataset are: 'chest','stable','heart','cardiomegaly','ribs','clear','hyperexpansion', 'hyperinflation', 'vasculature','lungs.'

We wanted to develop as robust as possible a dataset for the first pass, so we consider the top 500 most common words.

In [5]:
stop_words=set(stopwords.words('english'))
additional_stop_words=['within','xxxx','show']

for w in additional_stop_words:
    stop_words.add(w)

findings_lst=[f for f in img_findings_dict]

findings_str_lst=[f.split() for f in findings_lst]
findings_tokens=[w for f in findings_str_lst for w in f]

no_stop_wrds_find_lst=[]
for w in findings_tokens:
    if w.lower() not in stop_words: no_stop_wrds_find_lst.append(w)

unique_words_lst=list(set(no_stop_wrds_find_lst))
pos_unique_word_lst=nltk.pos_tag(unique_words_lst)
noun_pos_lst=['NN','NNS','NNP','NNPS']
nouns_list=[w for w,pos in pos_unique_word_lst if pos in noun_pos_lst]

findings_str=" ".join(nouns_list)
rem_punc_str=re.sub(r'[^\w\s]', '', findings_str)

findings_str_lower=rem_punc_str.lower()

In [6]:
ngram_counts=Counter(ngrams(findings_str_lower.split(),1))
ngram_counts.most_common(500)

noun_500_common_lst=[w[0][0] for w in ngram_counts.most_common(500)]

# Identify Findings with Common Nouns

3 Findings with Common Nouns:

1. heart size normal. lungs are clear. xxxx are normal. no pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.

2. the heart size and pulmonary vascularity appear within normal limits. lungs are free of focal airspace disease. no pleural effusion or pneumothorax is seen. vp shunt tubing is identified. the bony structures, as visualized, appear unremarkable.

3. the heart size and pulmonary vascularity appear within normal limits. the lungs are free of focal airspace disease. no pleural effusion or pneumothorax is seen. picc line is in xxxx. the tip is in the upper right atrium.','the heart is normal in size and contour. the lungs are clear, without evidence of infiltrate. there is no pneumothorax or effusion. multiple punctate round xxxx xxxx over the abdomen on the lateral view. these may reside within, or outside of the patient.


Recognizing that there are frequent phrases in each finding, we move to parsing the findings at the end of a statement ('.') and then identify the statements with the common nouns.

In [7]:
def str_with_common_nouns(list_strings,target_wrds_lst):
    """
        Identify strings with most common nouns.
        
        @P: list_strings (list) - list of strings that may contain a common word
            target_wrds_lst (list) - list of target words/common words
            
        @R: str_with_common_wrds_lst (list) - list of strings with common words
    """
    
    strs_lower_lst=[s.lower() for s in list_strings]
    str_with_common_wrds_lst=[]

    for s in strs_lower_lst:
        if any(w in s for w in target_wrds_lst):
            str_with_common_wrds_lst.append(s)
    
    return str_with_common_wrds_lst

In [8]:
findings_common_nouns_lst=str_with_common_nouns(findings_lst,noun_500_common_lst)

# Identify Statements with Common Nouns

There are 12,151 statements that contain at least one of the common nouns.

Ex: 'no pneumothorax or pneumomediastinum'
    ' heart size is upper limit normal'
    
Recognizing that there are duplicate statements and non-seense information with 'xxxx,' we move to tidying the statements.

In [9]:
statement_lst=[f.split(".") for f in findings_lst]
statement_lst_flat=[s for f in statement_lst for s in f]
        
statement_common_nouns_lst=str_with_common_nouns(statement_lst_flat,noun_500_common_lst)
print("There are {} non-distinct statements that contains at least 1 of the common nouns".format(len(statement_common_nouns_lst)))

There are 12151 non-distinct statements that contains at least 1 of the common nouns


# Identify Unique Statements

- Strip white space
- Remove newline characters and duplicate whitespace

This results in 5,080 unique statements that could be used for further analysis.

Future Steps:
- Consider cases that are incredibly similar: 'clear lungs' and 'clear right lung'
- Remove incomplete statements: statements with 'xxxx'

In [10]:
#complete_statements=[]
#complete_flag=True

statements_lst=[]

for s in statement_common_nouns_lst:
    s=s.strip()
    s=" ".join(s.split())
    rem_punc_str=re.sub(r'[^\w\s]', '', s)
    
    s_str=s.split()
    #for w in s_str: 
        #if re.search("^xx.", w): complete_flag=False
    
    #if complete_flag==True: 
        #complete_statements.append(s)
        
    #complete_flag=True
    
    statements_lst.append(s)
    
#unique_statements_lst=list(set(complete_statements))
unique_statements_lst=list(set(statements_lst))

print("There are {} distinct statements that contains at least 1 of the common nouns".format(len(unique_statements_lst)))

There are 5080 distinct statements that contains at least 1 of the common nouns


In [11]:
unique_statements_lst

['prosthetic right humeral head',
 'the lungs are hyperexpanded the hemidiaphragms are flattened',
 'no focal alveolar consolidation, no definite pleural effusion seen',
 'no pneumothorax or significant pulmonary edema',
 'the heart and cardiomediastinal silhouette or normal in size and contour',
 'no pleural effusion or pneumothorax',
 'the left base opacities could represent early pneumonia or areas of atelectasis',
 'heart size, mediastinal contour, and pulmonary vascularity are similar to comparison exam and within normal limits',
 'the heart is not grossly enlarged',
 'dual-xxxx cardiac pacemaker is in stable, xxxx position',
 'heart and mediastinum are stable with ectasia of the aorta',
 'hyperexpansion, flattening of diaphragms, and increased ap diameter consistent with history of copd',
 'osteopenia and degenerative changes are identified',
 'findings may reflect focal airspace disease or adenopathy',
 'right knee',
 'the distribution xxxx pulmonary edema',
 'the heart is large

# Identify Dataset for CLIP Model

In [12]:
imgs_analysis={}

imgs_findings_analysis_dict={}

unique_statements_set=set(unique_statements_lst)

num_pics_ctr=0

for i in img_findings_dict:
    findings_arr=i.split(".")
    for f in findings_arr:
        f=f.strip()
        f=" ".join(f.split())
        f=re.sub(r'[^\w\s]', '', f)
        
        if f in unique_statements_set:
            imgs_findings_analysis_dict[i]=img_findings_dict[i]
            num_pics_ctr+=len(img_findings_dict[i])
            continue

print("There are {} unique findings and {} images in the dataset to be considered for further analysis".format(len(imgs_findings_analysis_dict),num_pics_ctr))

There are 53 unique findings and 122 images in the dataset to be considered for further analysis


# CLIP

In [29]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

trgt_imgs_list=[imgs_findings_analysis_dict[i] for i in imgs_findings_analysis_dict]
trgt_imgs_list_flat=[i for lst in trgt_imgs_list for i in lst]
trgt_imgs_list_set=set(trgt_imgs_list_flat)
    
images=[]
for file in glob.glob('input/NLMCXR_png/*.png'):
    file_split_1=file.split(".")
    file_split_2=file_split_1[0].split("/")
    img_indx=file_split_2[2]
    
    if img_indx in trgt_imgs_list_set: 
        images.append(Image.open(file))

inputs = processor(text=unique_statements_lst,images=images,return_tensors="pt",padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

AttributeError: 'list' object has no attribute 'type'

# References

- https://github.com/openai/CLIP

- https://www.geeksforgeeks.org/part-speech-tagging-stop-words-using-nltk-python/

- https://dida.do/blog/clip