In [1]:
import tensorflow as tf
print(tf.__version__)
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

2.0.0
Found GPU at: /device:GPU:0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
from itertools import chain
import matplotlib.gridspec as gridspec
import matplotlib.ticker as ticker

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

sns.set_style('whitegrid')
%matplotlib
import warnings
warnings.filterwarnings('ignore')

# Load the TensorBoard notebook extension
%load_ext tensorboard

Using matplotlib backend: Qt5Agg


## Set path variables

In [3]:
import os
os.chdir("D:/datasets/NIH/")
DATA_DIR = os.getcwd()
#DATA_DIR 'D:\\datasets\\NIH'

## Read Data

In [26]:
data = pd.read_csv('Data_Entry_2017.csv')
data = data[data['Patient Age']<100] #removing datapoints which having age greater than 100

image_path = {os.path.basename(x): x for x in
             glob(os.path.join('..', DATA_DIR, 'images*', '*', '*.png'))}
print('Scans found: ', len(image_path), ', Total Headers', data.shape[0])

data['path'] = data['Image Index'].map(image_path.get)
data['Patient Age'] = data['Patient Age'].map(lambda x: int(x))
data.sample(3)

Scans found:  112120 , Total Headers 112104


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
97161,00025628_016.png,Consolidation|Pleural_Thickening,16,25628,76,F,AP,3056,2544,0.139,0.139,,D:\datasets\NIH\images_011\images\00025628_016...
91938,00022935_003.png,No Finding,3,22935,64,F,PA,2540,3016,0.139,0.139,,D:\datasets\NIH\images_010\images\00022935_003...
16012,00004262_000.png,No Finding,0,4262,70,F,PA,2048,2500,0.168,0.168,,D:\datasets\NIH\images_003\images\00004262_000...


In [27]:
data['Finding Labels'] = data['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

all_labels = np.unique(list(chain(*data['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x)>0]
print(f'All Labels ({(len(all_labels))}): {all_labels}')

All Labels (14): ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


In [28]:
for cnt_label in all_labels:
    if len(cnt_label)>1: # Leave out empty labels
        data[cnt_label] = data['Finding Labels'].map(lambda finding: 1.0 if cnt_label in finding else 0)
data.sample(3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
82022,00020185_002.png,,2,20185,73,M,AP,2992,2544,0.139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71634,00017645_010.png,Mass,10,17645,48,F,PA,2992,2991,0.143,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
93176,00023286_000.png,,0,23286,50,M,PA,2021,2021,0.194311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Retain at least 1k cases per disease
MIN_CASES = 1000
all_labels = [cnt_label for cnt_label in all_labels if data[cnt_label].sum()>MIN_CASES]

#print('Clean Labels ({})'.format(len(all_labels)), [(cnt_label, int(data[cnt_label].sum())) for cnt_label in all_labels])

# f' string formatting on print statement
print(f'Clean Labels ({(len(all_labels)), [(cnt_label, int(data[cnt_label].sum())) for cnt_label in all_labels]})')

Clean Labels ((13, [('Atelectasis', 11558), ('Cardiomegaly', 2776), ('Consolidation', 4667), ('Edema', 2302), ('Effusion', 13316), ('Emphysema', 2516), ('Fibrosis', 1686), ('Infiltration', 19891), ('Mass', 5779), ('Nodule', 6331), ('Pleural_Thickening', 3384), ('Pneumonia', 1430), ('Pneumothorax', 5301)]))


In [30]:
# Resample; weight is 0.04 + number of findings
sample_weights = data['Finding Labels'].map(lambda x: len(x.split('|')) if len(x)>0 else 0).values + 0.04

sample_weights /= sample_weights.sum()
data = data.sample(40000, weights=sample_weights)


In [31]:
# Create the disease vector
data['disease_vector'] = data.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])

In [32]:
data['disease_vector']

107089    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...
9903      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
5248      [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
14814     [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
69885     [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
                                ...                        
44556     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
104322    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
88048     [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...
20036     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
13680     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
Name: disease_vector, Length: 40000, dtype: object

## Split and prepare for model

In [33]:
# Train, Test
train_df, test_df = train_test_split(data,
                              test_size = 0.20,
                              random_state = 89,
                              stratify = data['Finding Labels'].map(lambda x: x[:4]))

print('Train', train_df.shape[0], 'Test', test_df.shape[0])

Train 32000 Test 8000


In [34]:
# Train, Validate
train_df, valid_df = train_test_split(train_df,
                              test_size = 0.10,
                              random_state = 89,
                              stratify = train_df['Finding Labels'].map(lambda x: x[:4]))

print('Train', train_df.shape[0], 'Valid', valid_df.shape[0])

Train 28800 Valid 3200


In [35]:
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    base_dir = os.path.dirname(in_df[path_col].values[0])
    df_gen = img_data_gen.flow_from_directory(base_dir,
                                              class_mode = 'sparse',
                                              **dflow_args)
    df_gen.filenames = in_df[path_col].values
    df_gen.classes = np.stack(in_df[y_col].values)
    df_gen.samples = in_df.shape[0]
    df_gen.n = in_df.shape[0]
    df_gen._set_index_array()
    df_gen.directory = ''
    #print(f'Inserting df: {in_df.shape[0]} images')
    return df_gen


In [36]:
from keras.applications.densenet import DenseNet121, preprocess_input


In [37]:
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image
IMG_SIZE = (224, 224)
core_idg = ImageDataGenerator()


In [38]:
# MINE
train_gen = flow_from_dataframe(core_idg,
                                train_df,
                                path_col='path',
                                y_col='disease_vector',
                                target_size = IMG_SIZE,
                                color_mode = 'rgb',
                                batch_size = 16)

valid_gen = flow_from_dataframe(core_idg,
                                valid_df,
                                path_col='path',
                                y_col = 'disease_vector',
                                target_size = IMG_SIZE,
                                color_mode = 'rgb',
                                batch_size = 32) # use larger batches for evaulation

test_X, test_Y = next(flow_from_dataframe(core_idg,
                                          valid_df,
                                          path_col = 'path',
                                          y_col = 'disease_vector',
                                          target_size = IMG_SIZE,
                                          color_mode = 'rgb',
                                          batch_size = 8000))

Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.


IndexError: list index out of range

In [20]:


t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize = (16, 16))
for (c_x, c_y, c_ax) in zip(t_x, t_y, m_axs.flatten()):
    c_ax.imshow(c_x[:,:,0])
    c_ax.set_title(', '.join([n_class for n_class, n_score in zip(all_labels, c_y) 
                             if n_score>0.5]))
    c_ax.axis('off')



IndexError: list index out of range