# Capstone - Pneumonia Detection Challenge - Modeling using CNN

### Here are the insights from CNN model experiment:

- A CNN model is tried on this problem statement to gain insight into the challenges of using traditional CNN models for complex real-world problems, and to compare the results
- Here CNN is applied on data using: (a) traditional pixel array approach (b) data generated using Image Generators (preprocessing required for converting DICOM to PNG files)
- Experiments are run on both grayscale and RGB data. Better results are observed with grayscale data
- Image generators speed up data pre-processing and are useful for augmentation
- Freezing the layers is a good approach to reduce processing time
- DenseNet (with RGB data only) gives good results
- CNN results, when compared with the object detection model YOLO, indicate that object detection models are better suited to this problem, giving more accurate and more visible results
- Object detection models provide more justification to the results by displaying bounding boxes. CNN results are more of a black box and hence less convincing (specifically in medical fields)

Import Necessary Packages

In [None]:
!pip install pydicom

Collecting pydicom
[?25l  Downloading https://files.pythonhosted.org/packages/f4/15/df16546bc59bfca390cf072d473fb2c8acd4231636f64356593a63137e55/pydicom-2.1.2-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 6.7MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.1.2


In [None]:
import os
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from matplotlib.patches import Rectangle
import seaborn as sns
import pydicom as dcm
%matplotlib inline
import cv2

import keras
import tensorflow as tf
import tensorflow.keras
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout , BatchNormalization, MaxPooling2D, GlobalAveragePooling2D, ZeroPadding2D
from tensorflow.keras.applications import DenseNet201
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from keras.callbacks import ReduceLROnPlateau

In [None]:
# Mount Google Drive under /content/drive (Colab only) so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder so the relative file names used below resolve.
os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge')
%pwd

'/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge'

In [None]:
# Load the train and test CSV files from the current working directory.
train_df = pd.read_csv('train_labels.csv')
test_df = pd.read_csv('test_labels.csv')

In [None]:
# Row/column count followed by the first rows of the training table.
print(train_df.shape)
train_df.head()

(30227, 9)


Unnamed: 0,patientId,x,y,width,height,path,Age,Gender,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,stage2_train_images/0004cfab-14fd-4e49-80ba-63...,51,F,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,stage2_train_images/00313ee0-9eaa-42f4-b0ab-c1...,48,F,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,stage2_train_images/00322d4d-1c29-4943-afc9-b6...,19,M,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,stage2_train_images/003d8fa0-6bf1-40ed-b54c-ac...,28,M,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,stage2_train_images/00436515-870c-4b36-a041-de...,32,F,1


# Modeling with CNN - using Pixel Arrays

In [None]:
os.chdir('/content/drive/My Drive/RSNA_PneumoniaDetectionChallenge')
%pwd%

'/content/drive/My Drive/RSNA_PneumoniaDetectionChallenge'

Results using RGB Pixel arrays

In [None]:
from pydicom.pixel_data_handlers.util import apply_color_lut

# Column 5 of train_df is the relative DICOM path (see the head() output above).
dicom_paths = train_df.iloc[:, 5]
resize_img_rgb = []

# For every image: read the DICOM pixels, map them through the 'PET' colour
# palette to get a 3-channel image, then shrink it to 150x150.
for dicom_path in dicom_paths:
  pixels = dcm.dcmread(dicom_path).pixel_array
  coloured = apply_color_lut(pixels, palette='PET')
  resize_img_rgb.append(cv2.resize(coloured, (150, 150)))

KeyboardInterrupt: ignored

In [None]:
# Number of images actually loaded (the loading loop above may have been stopped early).
len(resize_img_rgb)

13571

In [None]:
# Shape of a single resized RGB image.
resize_img_rgb[0].shape

(150, 150, 3)

In [None]:
# Stack the list of per-image arrays into a single NumPy array.
x_all_rgb = np.array(resize_img_rgb)

In [None]:
#x_all_rgb.tofile('x_all_rgb.csv', sep=',')

KeyboardInterrupt: ignored

In [None]:
# Shape of the stacked RGB image array.
print(x_all_rgb.shape)

(13571, 150, 150, 3)


In [None]:
# One-hot encode Target for exactly the images that were loaded. The loading loop
# can stop early, so slice by the array length instead of a hard-coded row count.
# dtype='float32' keeps the labels numeric on pandas versions where get_dummies
# returns booleans by default.
y_train = pd.get_dummies(train_df['Target'].iloc[:len(x_all_rgb)], dtype='float32')

In [None]:
# Shape and first rows of the one-hot label frame.
print(y_train.shape)
y_train.head()

(13571, 2)


Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1


In [None]:
# CNN for 150x150 RGB inputs: five Conv -> (Dropout) -> BatchNorm -> MaxPool stages
# followed by a small dense classifier.
model = Sequential()
model.add(Conv2D(32, (3, 3), strides=1, padding='same', activation='relu', input_shape=(150, 150, 3)))
model.add(BatchNormalization())
model.add(MaxPool2D((2, 2), strides=2, padding='same'))
# Remaining stages as (filters, dropout rate); None means the stage has no Dropout layer.
for filters, drop_rate in [(64, 0.1), (64, None), (128, 0.2), (256, 0.2)]:
    model.add(Conv2D(filters, (3, 3), strides=1, padding='same', activation='relu'))
    if drop_rate is not None:
        model.add(Dropout(drop_rate))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding='same'))
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
# y_train is one-hot over two mutually exclusive classes, so use softmax with
# categorical cross-entropy; two independent sigmoids would allow both (or
# neither) class to be predicted for the same image.
model.add(Dense(units=2, activation='softmax'))
model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 150, 150, 32)      896       
_________________________________________________________________
batch_normalization_10 (Batc (None, 150, 150, 32)      128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 75, 75, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 75, 75, 64)        18496     
_________________________________________________________________
dropout_8 (Dropout)          (None, 75, 75, 64)        0         
_________________________________________________________________
batch_normalization_11 (Batc (None, 75, 75, 64)        256       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 38, 38, 64)       

In [None]:
# Cast to float32 rather than Python float (float64): it halves the memory of
# this large image array.
x_all_rgb = x_all_rgb.astype('float32')

In [None]:
# Train the CNN on the in-memory RGB arrays and keep the per-epoch history.
history = model.fit(
    x_all_rgb,
    y_train,
    epochs=12,
    batch_size=500,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12

KeyboardInterrupt: ignored

In [None]:
# DenseNet201 backbone initialised with ImageNet weights, without its classifier
# head; pooling='avg' applies global average pooling to the final feature maps.
densenet = tf.keras.applications.DenseNet201(
    include_top=False,
    weights='imagenet',
    input_shape=[150, 150, 3],
    pooling='avg',
)
# trainable=True leaves every backbone layer trainable, so the whole network is
# fine-tuned; set it to False to freeze the backbone when compute is limited.
densenet.trainable = True

In [None]:
# DenseNet backbone plus a 2-way classification head. The labels are one-hot over
# two mutually exclusive classes, so the head uses softmax (probabilities sum
# to 1) instead of two independent sigmoids.
modelD = tf.keras.Sequential([
            densenet,
            tf.keras.layers.Dense(2, activation='softmax')
            ])

In [None]:
# Low learning rate because the pretrained backbone is being fine-tuned.
# `metrics` is passed as a list: Keras documents it as a list, and newer
# releases reject a bare string.
modelD.compile(
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False),
            loss = tf.keras.losses.BinaryCrossentropy(),
            metrics = ['accuracy']
            )

In [None]:
# Fine-tune the DenseNet model on the in-memory RGB arrays.
history = modelD.fit(
    x_all_rgb,
    y_train,
    epochs=12,
    batch_size=64,
)

Epoch 1/12

KeyboardInterrupt: ignored

Results using Grayscale images

In [None]:
# Read every DICOM listed in column 5 of train_df (the relative file path) and
# keep a 150x150 single-channel copy of its pixel array.
resize_img = []

for dicom_path in train_df.iloc[:, 5]:
  pixels = dcm.dcmread(dicom_path).pixel_array
  resize_img.append(cv2.resize(pixels, (150, 150)))

KeyboardInterrupt: ignored

In [None]:
# Number of images actually loaded (the loading loop above may have been stopped early).
len(resize_img)

13102

In [None]:
# Shape of a single resized grayscale image.
resize_img[0].shape

(150, 150)

In [None]:
# Stack the list of per-image arrays into a single NumPy array.
x_all = np.array(resize_img)

In [None]:
#x_all.tofile('x_all.csv')

KeyboardInterrupt: ignored

In [None]:
# Add the trailing channel axis expected by Conv2D. -1 lets NumPy infer the image
# count, so the cell works however many images were loaded and is safe to re-run.
x_all = x_all.reshape(-1, 150, 150, 1)
print(x_all.shape)

(13102, 150, 150, 1)


In [None]:
# One-hot encode Target for exactly the images that were loaded. The loading loop
# can stop early, so slice by the array length instead of a hard-coded row count.
# dtype='float32' keeps the labels numeric on pandas versions where get_dummies
# returns booleans by default.
y_train = pd.get_dummies(train_df['Target'].iloc[:len(x_all)], dtype='float32')

In [None]:
# Shape and first rows of the one-hot label frame.
print(y_train.shape)
y_train.head()

(13102, 2)


Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1


In [None]:
# CNN for 150x150 single-channel inputs: five Conv -> (Dropout) -> BatchNorm ->
# MaxPool stages followed by a small dense classifier.
model = Sequential()
model.add(Conv2D(32, (3, 3), strides=1, padding='same', activation='relu', input_shape=(150, 150, 1)))
model.add(BatchNormalization())
model.add(MaxPool2D((2, 2), strides=2, padding='same'))
# Remaining stages as (filters, dropout rate); None means the stage has no Dropout layer.
for filters, drop_rate in [(64, 0.1), (64, None), (128, 0.2), (256, 0.2)]:
    model.add(Conv2D(filters, (3, 3), strides=1, padding='same', activation='relu'))
    if drop_rate is not None:
        model.add(Dropout(drop_rate))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding='same'))
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
# y_train is one-hot over two mutually exclusive classes, so use softmax with
# categorical cross-entropy; two independent sigmoids would allow both (or
# neither) class to be predicted for the same image.
model.add(Dense(units=2, activation='softmax'))
model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 150, 150, 32)      320       
_________________________________________________________________
batch_normalization_10 (Batc (None, 150, 150, 32)      128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 75, 75, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 75, 75, 64)        18496     
_________________________________________________________________
dropout_8 (Dropout)          (None, 75, 75, 64)        0         
_________________________________________________________________
batch_normalization_11 (Batc (None, 75, 75, 64)        256       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 38, 38, 64)       

In [None]:
#x_all = x_all.astype(float)

In [None]:
# Train the CNN on the in-memory grayscale arrays and keep the per-epoch history.
history = model.fit(
    x_all,
    y_train,
    epochs=12,
    batch_size=500,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


# Modeling with CNN - Using Image Generators

In [None]:
# Store Target as strings (presumably because flow_from_dataframe, used below,
# wants string class labels - confirm against the Keras docs).
train_df['Target'] = train_df['Target'].astype(str)

In [None]:
os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge/')
%pwd%

'/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge'

In [None]:
# Example of a relative DICOM path as stored in the table.
train_df.path[0]

'stage2_train_images/0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm'

Converting DICOM to PNG files

In [None]:
t1 = 0
for f in train_df[0:3]['path']:   # remove "[:10]" to convert all images 
    ds = dcm.read_file(f) # read dicom image
    img = ds.pixel_array # get image array
    name = f.replace('stage2_train_images/', "").replace('.dcm','.png')
    #print(name)
    os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge/PNG_Train')
    cv2.imwrite(gray_image,img) # write png image
    os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge')  
    print(t1, end='-')
    t1=t1+1

%pwd%

SystemError: ignored

In [None]:
# First row of the test table.
test_df.loc[0,:]

patientId             2676fc9d-7ace-4896-b698-17fc68131851.dcm
path         stage2_test_images/2676fc9d-7ace-4896-b698-17f...
Age                                                         67
Gender                                                       M
Name: 0, dtype: object

In [None]:
# First three test image paths.
test_df[0:3]['path']

0    stage2_test_images/2676fc9d-7ace-4896-b698-17f...
1    stage2_test_images/25a9c27c-9351-4821-bd8d-658...
2    stage2_test_images/268c4efc-36b5-467d-a63c-b33...
Name: path, dtype: object

In [None]:
# Back to the project folder before converting the test images.
os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge')  
%pwd

'/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge'

In [None]:
t1 = 0
for f in test_df[:]['path']:   # remove "[:10]" to convert all images 
    #print(f.replace('stage2_test_images/', 'stage_1_test_images/'))
    ds = dcm.read_file(f.replace('stage2_test_images/', 'stage_1_test_images/')) # read dicom image
    
    img = ds.pixel_array # get image array
    #print(img)
    name = f.replace('stage2_test_images/', "").replace('.dcm','.png')
    #print(name)
    os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge/PNG_Test')
    cv2.imwrite(name, img) # write png image
    os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge')  
    #print(t1, end='-')
    t1=t1+1

%pwd

'/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge'

In [None]:
# Compare the number of converted PNGs with the number of source DICOM files.
project_dir = '/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge'
# Set the working directory up front; the directories are then listed by path,
# so a failed listing cannot leave the notebook inside a sub-folder.
os.chdir(project_dir)
print(len(os.listdir(os.path.join(project_dir, 'PNG_Train'))))
print(len(os.listdir(os.path.join(project_dir, 'stage_1_train_images'))))

22152


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-cb2cd7ee697c>", line 4, in <module>
    print(len(os.listdir()))
OSError: [Errno 5] Input/output error

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.6/genericpath.py", line 19, in exists
    os.stat(path)
FileNotFoundError: [Errno 2] No such file or directory: '<ipython-input-10-cb2cd7ee697c>'

During handling of the above exception, another exception occurred:

Tracebac

OSError: ignored

In [None]:
# Image generator that multiplies pixel values by 1/255; feature-wise centring
# and std-normalisation are explicitly disabled. (tf is imported at the top.)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    featurewise_std_normalization=False,
    featurewise_center=False,
    rescale=1./255,
)

In [None]:
# Work from the project folder so the relative PNG paths built below resolve.
os.chdir('/content/drive/MyDrive/RSNA_PneumoniaDetectionChallenge/')

In [None]:
# Derive each PNG path from its DICOM path. regex=False makes both replacements
# literal: otherwise the '.' in '.dcm' is a regex wildcard on pandas versions
# where str.replace defaults to regex=True.
png_names = (
    train_df['path']
    .str.replace('stage2_train_images/', '', regex=False)
    .str.replace('.dcm', '.png', regex=False)
)
train_df['png_path'] = 'PNG_Train/' + png_names
train_df.loc[0, 'png_path']

'PNG_Train/0004cfab-14fd-4e49-80ba-63a80b6bddd6.png'

In [None]:
# Grayscale batches of 224 images (resized to 150x150) read from the PNG paths
# in train_df, labelled by Target. With validate_filenames=True, rows whose
# file is missing are left out of the generator.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,
    x_col="png_path",
    y_col="Target",
    weight_col=None,
    color_mode='grayscale',
    target_size=(150, 150),
    batch_size=224,
    validate_filenames=True,
)

Found 25207 validated image filenames belonging to 2 classes.


  .format(n_invalid, x_col)


In [None]:
# Number of samples the grayscale generator will serve.
train_generator.n

25207

In [None]:
# Same source images as the grayscale generator, but decoded as 3-channel RGB
# (the flow_from_dataframe default, spelled out here) in batches of 64.
train_generator_rgb = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,
    x_col="png_path",
    y_col="Target",
    weight_col=None,
    color_mode='rgb',
    target_size=(150, 150),
    batch_size=64,
    validate_filenames=True,
)

Found 25207 validated image filenames belonging to 2 classes.


  .format(n_invalid, x_col)


In [None]:
# Number of samples the RGB generator will serve.
train_generator_rgb.n

25207

In [None]:
# CNN for 150x150 single-channel inputs: five Conv -> (Dropout) -> BatchNorm ->
# MaxPool stages followed by a small dense classifier.
model = Sequential()
model.add(Conv2D(32, (3, 3), strides=1, padding='same', activation='relu', input_shape=(150, 150, 1)))
model.add(BatchNormalization())
model.add(MaxPool2D((2, 2), strides=2, padding='same'))
# Remaining stages as (filters, dropout rate); None means the stage has no Dropout layer.
for filters, drop_rate in [(64, 0.1), (64, None), (128, 0.2), (256, 0.2)]:
    model.add(Conv2D(filters, (3, 3), strides=1, padding='same', activation='relu'))
    if drop_rate is not None:
        model.add(Dropout(drop_rate))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding='same'))
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
# The two classes are mutually exclusive and the generator's default categorical
# class mode yields one-hot labels, so use softmax with categorical cross-entropy
# instead of two independent sigmoids.
model.add(Dense(units=2, activation='softmax'))
# Recall is restricted to class index 1 so it reports recall for that class
# alone rather than an aggregate over both one-hot columns.
# NOTE(review): index 1 is assumed to be Target '1' - confirm with
# train_generator.class_indices.
model.compile(
    optimizer="rmsprop",
    loss='categorical_crossentropy',
    metrics=['accuracy', keras.metrics.Recall(class_id=1)],
)

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 150, 150, 32)      320       
_________________________________________________________________
batch_normalization_10 (Batc (None, 150, 150, 32)      128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 75, 75, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 75, 75, 64)        18496     
_________________________________________________________________
dropout_8 (Dropout)          (None, 75, 75, 64)        0         
_________________________________________________________________
batch_normalization_11 (Batc (None, 75, 75, 64)        256       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 38, 38, 64)       

*Results using Grayscale data - CNN*

In [None]:
# The generator already produces batches (batch_size=224 where it is created), so
# batch_size must not be passed to fit() as well: Keras documents it as
# unsupported for generator inputs.
history = model.fit(train_generator, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12

KeyboardInterrupt: ignored

In [None]:
# DenseNet201 backbone initialised with ImageNet weights, without its classifier
# head; pooling='avg' applies global average pooling to the final feature maps.
densenet = tf.keras.applications.DenseNet201(
    include_top=False,
    weights='imagenet',
    input_shape=[150, 150, 3],
    pooling='avg',
)
# trainable=True leaves every backbone layer trainable, so the whole network is
# fine-tuned; set it to False to freeze the backbone when compute is limited.
densenet.trainable = True

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# DenseNet backbone plus a 2-way classification head. The two classes are
# mutually exclusive, so the head uses softmax (probabilities sum to 1) instead
# of two independent sigmoids.
modelD = tf.keras.Sequential([
            densenet,
            tf.keras.layers.Dense(2, activation='softmax')
            ])

In [None]:
# Low learning rate because the pretrained backbone is being fine-tuned.
# `metrics` is passed as a list: Keras documents it as a list, and newer
# releases reject a bare string.
modelD.compile(
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False),
            loss = tf.keras.losses.BinaryCrossentropy(),
            metrics = ['accuracy']
            )

*Results using RGB data on DenseNet*

In [None]:
# Train DenseNet on the RGB generator. The generator sets the batch size (64 where
# it is created); the conflicting batch_size=32 that used to be passed here is
# dropped, because Keras documents batch_size as unsupported for generator inputs.
history = modelD.fit(train_generator_rgb, epochs=12)

Epoch 1/12

KeyboardInterrupt: ignored