# Pneumothorax Classifier Dataset Creation

**References:**
- https://www.kaggle.com/code/meaninglesslives/pneumothorax-classifier
- https://github.com/ajsanjoaquin/Pneumothorax/blob/master/Pneumothorax_code.ipynb

In [1]:
import pickle
import re
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; every file path used by the cells
# below lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Dictionary Mapping Findings to Images

In [2]:
# Deserialize the findings dictionary from Drive; its keys are the findings
# texts that the rest of the notebook iterates over.
# NOTE(review): pickle.load executes arbitrary code from the file -- only
# load pickles produced by a trusted source.
findings_pickle_file = open('/content/drive/MyDrive/iu_image/analysis/output/img_findings_dict.pickle', 'rb')
with findings_pickle_file:
    img_findings_dict = pickle.load(findings_pickle_file)

n_unique_findings = len(img_findings_dict)
print("There are {} unique findings in the dataset".format(n_unique_findings))

There are 2507 unique findings in the dataset


# Identify Pneumothorax-Negative Images

In [3]:
# Findings labelled negative because the word immediately before the first
# "pneumothorax" token is "no".
pneumothorax_neg_dict={}
# Findings that mention pneumothorax but did not match the rule above.
temp_dict={}

# Number of findings that contain the token "pneumothorax" at all.
findings_pneumothorax_ctr=0

for finding in img_findings_dict:
  # Normalise the text: lowercase, trim, drop punctuation, split into words.
  f=finding.lower()
  f=f.strip()
  f=re.sub(r'[^\w\s]', '', f)
  f=f.split()

  if 'pneumothorax' in f:
    findings_pneumothorax_ctr+=1
    # Only the first occurrence of the token is inspected.
    pneum_idx=f.index('pneumothorax')
    # Require a preceding word: when "pneumothorax" is the very first token,
    # f[pneum_idx-1] would be f[-1] (the LAST word of the finding), which
    # could wrongly mark the finding as negative if that word is "no".
    if pneum_idx>0 and f[pneum_idx-1]=='no':
      pneumothorax_neg_dict[finding]=img_findings_dict[finding]
    else:
      temp_dict[finding]=img_findings_dict[finding]

print("{} findings mention pneumothorax and {} say 'no pneumothorax' ".format(findings_pneumothorax_ctr,len(pneumothorax_neg_dict)))

1818 findings mention pneumothorax and 741 say 'no pneumothorax' 


In [4]:
# Second pass over the unmatched findings: label a finding negative when any
# of its sentences contains "pneumothorax" somewhere after the first "no".
for report_text in temp_dict:
  # Sentences are approximated by splitting the lowercased text on '.'.
  for sentence in report_text.lower().strip().split('.'):
    tokens = re.sub(r'[^\w\s]', '', sentence).split()

    if 'no' not in tokens:
      continue

    tokens_after_cue = tokens[tokens.index('no') + 1:]
    if 'pneumothorax' in tokens_after_cue:
      pneumothorax_neg_dict[report_text] = img_findings_dict[report_text]

print("{} say 'no pneumothorax' or 'no ... pneumothorax' ".format(len(pneumothorax_neg_dict)))

1603 say 'no pneumothorax' or 'no ... pneumothorax' 


In [5]:
# Same sentence-level scan as for "no", using "without" as the negation cue.
for report_text in temp_dict:
  sentences = report_text.lower().strip().split('.')

  for sentence in sentences:
    words = re.sub(r'[^\w\s]', '', sentence).split()

    # list.index raises ValueError when the cue is absent from the sentence.
    try:
      cue_pos = words.index('without')
    except ValueError:
      continue

    if 'pneumothorax' in words[cue_pos + 1:]:
      pneumothorax_neg_dict[report_text] = img_findings_dict[report_text]

print("{} say 'no pneumothorax' or 'no ... pneumothorax' or 'without ... pneumothorax' ".format(len(pneumothorax_neg_dict)))

1719 say 'no pneumothorax' or 'no ... pneumothorax' or 'without ... pneumothorax' 


In [6]:
# Same sentence-level scan, using "negative" as the negation cue: a finding
# is labelled negative if any sentence has "pneumothorax" after "negative".
for report_text in temp_dict:
  tokenised_sentences = (
    re.sub(r'[^\w\s]', '', sentence).split()
    for sentence in report_text.lower().strip().split('.')
  )

  cue_precedes_pneumothorax = any(
    'negative' in tokens
    and 'pneumothorax' in tokens[tokens.index('negative') + 1:]
    for tokens in tokenised_sentences
  )

  if cue_precedes_pneumothorax:
    pneumothorax_neg_dict[report_text] = img_findings_dict[report_text]

print("{} say 'no pneumothorax' or 'no ... pneumothorax' or 'without ... pneumothorax' or 'negative for ... pneumothorax' ".format(len(pneumothorax_neg_dict)))

1779 say 'no pneumothorax' or 'no ... pneumothorax' or 'without ... pneumothorax' or 'negative for ... pneumothorax' 


In [7]:
# Same sentence-level scan, using "clear" as the cue word.
# NOTE(review): any "pneumothorax" after "clear" in the same sentence counts,
# regardless of the words in between -- confirm this is the intended rule.
for report_text in temp_dict:
  lowered_report = report_text.lower().strip()

  for raw_sentence in lowered_report.split('.'):
    sentence_words = re.sub(r'[^\w\s]', '', raw_sentence).split()

    if 'clear' in sentence_words:
      first_cue_at = sentence_words.index('clear')
      if 'pneumothorax' in sentence_words[first_cue_at + 1:]:
        pneumothorax_neg_dict[report_text] = img_findings_dict[report_text]

print("{} say 'no pneumothorax' or 'no ... pneumothorax' or 'without ... pneumothorax' or 'negative for ... pneumothorax or 'clear of...' ".format(len(pneumothorax_neg_dict)))

1804 say 'no pneumothorax' or 'no ... pneumothorax' or 'without ... pneumothorax' or 'negative for ... pneumothorax or 'clear of...' 


In [8]:
# Export every finding that mentions pneumothorax but is not yet in
# pneumothorax_neg_dict, one finding per blank-line-separated entry, so the
# remaining cases can be labelled by hand.

# Number of findings written to the file.
ctr=0

# The context manager closes the file even if the loop raises part-way
# through (the bare open()/close() pair would leak the handle in that case).
with open("/content/drive/MyDrive/iu_image/analysis/output/manual_label_findings.txt", "w") as textfile:
  # Header line, followed by a blank line.
  textfile.write('Findings \n\n')

  for finding in img_findings_dict:
    if finding not in pneumothorax_neg_dict:
      # Normalise the text: lowercase, trim, drop punctuation, split into words.
      f=finding.lower()
      f=f.strip()
      f=re.sub(r'[^\w\s]', '', f)
      f=f.split()

      if 'pneumothorax' in f:
        ctr+=1
        # The original (un-normalised) finding text is what gets written.
        textfile.write(finding+"\n\n")

In [9]:
# Read the manually labelled findings file into a flat list of entries.
# A separator longer than one character is interpreted by pandas as a regex,
# which only the python engine supports; naming the engine explicitly avoids
# the ParserWarning raised when pandas falls back from the C engine.
# With read_csv defaults the first non-blank line is consumed as the column
# header rather than returned as data.
# NOTE(review): in practice each remaining non-blank line ends up as one
# single-column row -- confirm against the edited file.
man_label_df=pd.read_csv("/content/drive/MyDrive/iu_image/analysis/input/manual_label_findings_edited.txt", sep="\n\n", engine="python")
man_label_lst=man_label_df.values.tolist()
# Flatten the list of rows into a single list of cell values.
man_label_lst_flat=[item for lst in man_label_lst for item in lst]

  return func(*args, **kwargs)


In [10]:
# Findings manually labelled as pneumothorax-positive.
pneumothorax_pos_dict = {}

# Route each manual label to the dictionary that collects it.
label_to_dict = {
  'Negative': pneumothorax_neg_dict,
  'Positive': pneumothorax_pos_dict,
}

for labelled_entry in man_label_lst_flat:
  # Each entry is split on "**": field 0 is the findings text, field 1 the label.
  fields = labelled_entry.split("**")
  finding_text, label = fields[0], fields[1]

  # Entries with any other label are left out of both dictionaries.
  destination = label_to_dict.get(label)
  if destination is not None:
    destination[finding_text] = img_findings_dict[finding_text]

In [11]:
# Persist the positive and negative dictionaries as pickles on Drive,
# positive first.
pickle_targets = (
    ('/content/drive/MyDrive/iu_image/analysis/output/pneumothorax_pos_dict.pickle', pneumothorax_pos_dict),
    ('/content/drive/MyDrive/iu_image/analysis/output/pneumothorax_neg_dict.pickle', pneumothorax_neg_dict),
)

for pickle_path, label_dict in pickle_targets:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(label_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)