$$\textbf{Exploration of training labels}$$

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the per-image metadata/label table (one row per image file).
# The bracketed header names in this CSV contain commas, so pandas splits them
# into fragments such as 'OriginalImage[Width' and 'Height]'.
data_train = pd.read_csv('../data/Data_Entry_2017.csv')
# Sanity check: print (rows, columns), then preview the first rows.
print(data_train.shape)
data_train.head()

(112120, 12)


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [3]:
# The trailing 'Unnamed: 11' column is entirely NaN: print its null count to
# confirm, then keep every other column.
print(data_train['Unnamed: 11'].isna().sum())
# Index.difference returns the remaining labels sorted, which is why the
# columns come back in alphabetical order after this cell.
data_train = data_train.loc[:, data_train.columns.difference(['Unnamed: 11'])]
data_train.head()

112120


Unnamed: 0,Finding Labels,Follow-up #,Height],Image Index,OriginalImagePixelSpacing[x,OriginalImage[Width,Patient Age,Patient Gender,Patient ID,View Position,y]
0,Cardiomegaly,0,2749,00000001_000.png,0.143,2682,58,M,1,PA,0.143
1,Cardiomegaly|Emphysema,1,2729,00000001_001.png,0.143,2894,58,M,1,PA,0.143
2,Cardiomegaly|Effusion,2,2048,00000001_002.png,0.168,2500,58,M,1,PA,0.168
3,No Finding,0,2048,00000002_000.png,0.171,2500,81,M,2,PA,0.171
4,Hernia,0,2991,00000003_000.png,0.143,2582,81,F,3,PA,0.143


In [4]:
# Show the remaining column labels; note the 'Height]' and 'y]' fragments left
# over from the comma-split bracketed headers, which are renamed in the next cell.
data_train.columns

Index(['Finding Labels', 'Follow-up #', 'Height]', 'Image Index',
       'OriginalImagePixelSpacing[x', 'OriginalImage[Width', 'Patient Age',
       'Patient Gender', 'Patient ID', 'View Position', 'y]'],
      dtype='object')

In [5]:
# Make usable headers: map every raw CSV label (including the fragments left
# by the comma-split bracketed names) to a snake_case name.
# rename() returns a new frame that is assigned back, rather than mutating the
# column-selected frame in place; labels missing from the frame are ignored,
# so re-running this cell is harmless.
data_train = data_train.rename(
    columns={
        'Finding Labels': 'text_label',
        'Follow-up #': 'follow_up_number',
        'Height]': 'img_height_original',
        'Image Index': 'img_filename',
        'OriginalImagePixelSpacing[x': 'x_pixel_spacing_original',
        'OriginalImage[Width': 'img_width_original',
        'Patient Age': 'age',
        'Patient Gender': 'gender',
        'Patient ID': 'patient_id',
        'View Position': 'view_position',
        'y]': 'y_pixel_spacing_original',
    }
)
data_train.head()

Unnamed: 0,text_label,follow_up_number,img_height_original,img_filename,x_pixel_spacing_original,img_width_original,age,gender,patient_id,view_position,y_pixel_spacing_original
0,Cardiomegaly,0,2749,00000001_000.png,0.143,2682,58,M,1,PA,0.143
1,Cardiomegaly|Emphysema,1,2729,00000001_001.png,0.143,2894,58,M,1,PA,0.143
2,Cardiomegaly|Effusion,2,2048,00000001_002.png,0.168,2500,58,M,1,PA,0.168
3,No Finding,0,2048,00000002_000.png,0.171,2500,81,M,2,PA,0.171
4,Hernia,0,2991,00000003_000.png,0.143,2582,81,F,3,PA,0.143


In [6]:
# Inspect the distinct individual findings that occur in the label column.
# Multi-finding labels are '|'-separated, so split those into lists and leave
# single-finding labels as plain strings.
uniquelist = [
    label.split('|') if '|' in label else label
    for label in data_train['text_label'].unique().tolist()
]
# Flatten the mix of strings and lists into one list of finding names.
finallist = []
for entry in uniquelist:
    if isinstance(entry, list):
        finallist.extend(entry)
    else:
        finallist.append(entry)
set(finallist)

{'Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax'}

In [7]:
# Persist the cleaned metadata table to the current working directory.
# A negative protocol selects the highest pickle protocol available.
data_train.to_pickle('data_train.pkl', protocol = -1)

$$\textbf{Explore Images to exclude}$$

Images to exclude from training fall into four categories:
1. Images with a bounding box already drawn
2. Images showing lateral (side) or other non-frontal X-ray views
3. Rotated images
4. Images pre-selected by the dataset authors as being of low quality

In [8]:
# Category 1: the bounding-box list; its 'Image Index' column holds the image
# filenames that are later added to the blacklist.
bb_list = pd.read_csv('../data/BBox_List_2017.csv')
# Sanity check: print (rows, columns), then preview the first rows.
print(bb_list.shape)
bb_list.head()

(984, 9)


Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h],Unnamed: 6,Unnamed: 7,Unnamed: 8
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,,,
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,,,
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,,,
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,,,
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,,,


In [9]:
# Category 2: filenames of non-PA/AP (non-frontal) views.
# The file has no header row, so the single column is labelled 0.
non_pa_ap_view = pd.read_csv('../data/blacklist_non_PA_AP_view.csv', header= None)
# Sanity check: print (rows, columns), then preview the first rows.
print(non_pa_ap_view.shape)
non_pa_ap_view.head()

(56, 1)


Unnamed: 0,0
0,00000591_003.png
1,00001136_001.png
2,00001153_005.png
3,00001602_000.png
4,00001803_003.png


In [10]:
# Category 3: filenames of rotated images.
# The file has no header row, so the single column is labelled 0.
rotated_images = pd.read_csv('../data/blacklist_rotated_images.csv', header= None)
# Sanity check: print (rows, columns), then preview the first rows.
print(rotated_images.shape)
rotated_images.head()

(18, 1)


Unnamed: 0,0
0,00001255_007.png
1,00001814_001.png
2,00002180_000.png
3,00002815_003.png
4,00003693_005.png


In [11]:
# Category 4: filenames of other images listed as lower quality.
# The file has no header row, so the single column is labelled 0.
low_qual = pd.read_csv('../data/blacklist_other_images_with_lower_quality.csv', header= None)
# Sanity check: print (rows, columns), then preview the first rows.
print(low_qual.shape)
low_qual.head()

(1094, 1)


Unnamed: 0,0
0,00000032_013.png
1,00000032_023.png
2,00000032_024.png
3,00000032_055.png
4,00000032_058.png


Remove all of these images from both the training/validation set and the test set.

In [12]:
# The train/val split file is a bare list of filenames with no header row, so
# the single column is named directly at read time.
train_val = pd.read_csv(
    '../data/train_val_list.txt',
    header=None,
    names=['img_filename'],
)
print(train_val.shape)
train_val.head()

(86524, 1)


Unnamed: 0,img_filename
0,00000001_000.png
1,00000001_001.png
2,00000001_002.png
3,00000002_000.png
4,00000004_000.png


In [13]:
# The test split file is a bare list of filenames with no header row, so the
# single column is named directly at read time.
test = pd.read_csv(
    '../data/test_list.txt',
    header=None,
    names=['img_filename'],
)
print(test.shape)
test.head()

(25596, 1)


Unnamed: 0,img_filename
0,00000003_000.png
1,00000003_001.png
2,00000003_002.png
3,00000003_003.png
4,00000003_004.png


In [14]:
# Aggregate every blacklisted filename into one list: the bounding-box images
# first, then the three single-column blacklist files.
blacklist = [
    filename
    for source in (bb_list['Image Index'], non_pa_ap_view, rotated_images, low_qual)
    for filename in source.values.flatten().tolist()
]

# De-duplicate: the same image can appear in more than one source.
blacklist = list(set(blacklist))
print(blacklist[:5], '\n', len(blacklist))

['00013922_020.png', '00025664_037.png', '00018102_001.png', '00014253_010.png', '00023195_000.png'] 
 2041


In [15]:
# Keep only the rows whose filename is not blacklisted, in both splits.
train_val = train_val.loc[~train_val['img_filename'].isin(blacklist)]
test = test.loc[~test['img_filename'].isin(blacklist)]
print(train_val.shape, test.shape)

(85690, 1) (24389, 1)


In [16]:
# Attach the metadata and text labels to each surviving train/val filename.
# An inner join on the filename is the default merge behaviour, spelled out here.
train_val_filtered = pd.merge(train_val, data_train, how='inner', on='img_filename')
print(train_val_filtered.shape)
train_val_filtered.head()

(85690, 11)


Unnamed: 0,img_filename,text_label,follow_up_number,img_height_original,x_pixel_spacing_original,img_width_original,age,gender,patient_id,view_position,y_pixel_spacing_original
0,00000001_000.png,Cardiomegaly,0,2749,0.143,2682,58,M,1,PA,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,2729,0.143,2894,58,M,1,PA,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,2048,0.168,2500,58,M,1,PA,0.168
3,00000002_000.png,No Finding,0,2048,0.171,2500,81,M,2,PA,0.171
4,00000004_000.png,Mass|Nodule,0,2048,0.168,2500,82,M,4,AP,0.168


In [17]:
# Attach the metadata and text labels to each surviving test filename.
# An inner join on the filename is the default merge behaviour, spelled out here.
test_filtered = pd.merge(test, data_train, how='inner', on='img_filename')
print(test_filtered.shape)
test_filtered.head()

(24389, 11)


Unnamed: 0,img_filename,text_label,follow_up_number,img_height_original,x_pixel_spacing_original,img_width_original,age,gender,patient_id,view_position,y_pixel_spacing_original
0,00000003_000.png,Hernia,0,2991,0.143,2582,81,F,3,PA,0.143
1,00000003_001.png,Hernia,1,2048,0.168,2500,74,F,3,PA,0.168
2,00000003_002.png,Hernia,2,2500,0.168,2048,75,F,3,PA,0.168
3,00000003_003.png,Hernia|Infiltration,3,2991,0.143,2698,76,F,3,PA,0.143
4,00000003_004.png,Hernia,4,2048,0.168,2500,77,F,3,PA,0.168


In [18]:
# Persist both filtered splits to the current working directory.
# A negative protocol selects the highest pickle protocol available.
train_val_filtered.to_pickle('train_val_filtered.pkl', protocol= -1)
test_filtered.to_pickle('test_filtered.pkl', protocol= -1)

Use `train_val_filtered` and `test_filtered` as the inputs for subsequent modelling.

{'00007390_012.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00014387_011.png': array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00015839_001.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00022672_002.png': array([0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=uint8),
 '00027415_004.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)}

In [26]:
# Finding names in the fixed order that defines the positions of the label
# vector built by text_label_to_vector. 'No Finding' is not in this list, so
# it maps to an all-zero vector.
disease_list = ['Cardiomegaly', 'Atelectasis', 'Pleural_Thickening', 'Nodule', 'Effusion', 'Infiltration',
 'Fibrosis', 'Consolidation', 'Edema', 'Hernia', 'Pneumothorax', 'Emphysema', 'Mass', 'Pneumonia']

In [28]:
def text_label_to_vector(text_label, diseases=None):
    """Convert a '|'-separated label string into a multi-hot vector.

    Parameters
    ----------
    text_label : str
        One or more finding names joined by '|', e.g. 'Cardiomegaly|Effusion'.
    diseases : sequence of str, optional
        Class names that define the positions of the output vector. Defaults
        to the notebook-level ``disease_list``.

    Returns
    -------
    numpy.ndarray
        Array with one entry per class: 1 where that class appears in
        ``text_label`` and 0 otherwise. Labels that are not in ``diseases``
        (such as 'No Finding') set no position.
    """
    if diseases is None:
        diseases = disease_list
    # Membership is tested against the split tokens, not the raw string, so
    # only exact finding names count as present.
    present = set(text_label.split('|'))
    return np.array([int(name in present) for name in diseases])

In [29]:
# Add the multi-hot 'array_label' column to both splits, derived from each
# row's '|'-separated text label.
for frame in (train_val_filtered, test_filtered):
    frame['array_label'] = frame['text_label'].apply(text_label_to_vector)

In [30]:
# Preview the first rows to check the new 'array_label' column against 'text_label'.
train_val_filtered.head()

Unnamed: 0,img_filename,text_label,follow_up_number,img_height_original,x_pixel_spacing_original,img_width_original,age,gender,patient_id,view_position,y_pixel_spacing_original,array_label
0,00000001_000.png,Cardiomegaly,0,2749,0.143,2682,58,M,1,PA,0.143,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,00000001_001.png,Cardiomegaly|Emphysema,1,2729,0.143,2894,58,M,1,PA,0.143,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2,00000001_002.png,Cardiomegaly|Effusion,2,2048,0.168,2500,58,M,1,PA,0.168,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,00000002_000.png,No Finding,0,2048,0.171,2500,81,M,2,PA,0.171,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,00000004_000.png,Mass|Nodule,0,2048,0.168,2500,82,M,4,AP,0.168,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [31]:
# Save both splits again under the same filenames as the earlier save cell,
# overwriting those pickles with frames that now include 'array_label'.
# A negative protocol selects the highest pickle protocol available.
train_val_filtered.to_pickle('train_val_filtered.pkl', protocol= -1)
test_filtered.to_pickle('test_filtered.pkl', protocol= -1)