In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns #for plotting
import matplotlib.pyplot as plt #for plotting
from matplotlib.image import imread #to read image files
import tensorflow as tf
from keras.callbacks import ReduceLROnPlateau

# **Understanding the data files**
The images are explored to analyze the following:
* folder structure in which images are arranged
* Number of images in each class
* Understand the visual difference between the chest xrays of normal and a pneumonia patient
* Dimensions of the images (range) to come up with the standard image size 
* Checking the color scale of the images (whether they are RGB or grayscale)

# **Exploring folder structure**

In [None]:
# Root folder of the chest X-ray dataset; list its entries to see the sub-folders.
data_dir = '../input/chest-xray-pneumonia/chest_xray'
os.listdir(data_dir)


In [None]:

# Build the train/test folder paths (trailing slash kept so file names can be appended directly).
train_path = f'{data_dir}/train/'
test_path = f'{data_dir}/test/'
os.listdir(test_path)

In [None]:
# List the entries of the training directory.
os.listdir(train_path)

There are 2 folders in test and train directories for each class of xrays - Normal, Pneumonia

# Reading sample images from each class

In [None]:
# File names found in the PNEUMONIA training folder.
train_pneumonia = os.listdir(train_path + 'PNEUMONIA')

In [None]:
# Full path of one hand-picked pneumonia X-ray from the training set.
sample_pneumonia = f'{train_path}PNEUMONIA/person1306_bacteria_3277.jpeg'
sample_pneumonia

The `imread` function is used to read in an image; it returns an array of numbers corresponding to the intensity of the colors (or of black and white)

In [None]:
# Load the sample pneumonia image and display the resulting pixel array.
imread(sample_pneumonia)

The image array can be rendered with `imshow`; its dimensions are available through the `shape` attribute of the array (checked in a later section)

In [None]:
# Render the pneumonia sample. For a 2-D (single-channel) array matplotlib applies
# its default pseudo-colour map, so request a gray colormap explicitly to show the
# X-ray as it really looks. `cmap` is ignored for RGB(A) input, so this is safe
# whichever kind of image the file turns out to be.
plt.imshow(imread(sample_pneumonia), cmap='gray')

The above figure is a sample X-ray from a pneumonia patient

In [None]:
# First entry returned when listing the NORMAL training folder.
os.listdir(train_path + 'NORMAL')[0]

In [None]:
# Full path of one hand-picked normal X-ray from the training set.
sample_normal = f'{train_path}NORMAL/NORMAL2-IM-0569-0001.jpeg'

In [None]:
# Render the normal sample. For a 2-D (single-channel) array matplotlib applies
# its default pseudo-colour map, so request a gray colormap explicitly to show the
# X-ray as it really looks. `cmap` is ignored for RGB(A) input, so this is safe
# whichever kind of image the file turns out to be.
plt.imshow(imread(sample_normal), cmap='gray')

#  The number of images in each folder

In [None]:

# Number of entries in the NORMAL training folder.
len(os.listdir(train_path + 'NORMAL'))

In [None]:
# Number of entries in the PNEUMONIA training folder.
len(os.listdir(train_path + 'PNEUMONIA'))

# Checking the dimension of the images

In [None]:
# Array shape of the sample pneumonia image.
imread(sample_pneumonia).shape

In [None]:
# Array shape of the sample normal image.
imread(sample_normal).shape

In [None]:

dim1 = []
dim2 = []

# First attempt, left commented out for reference: it unpacks img.shape into
# three names (two dimensions plus a colour-channel count) for every file in
# the NORMAL training folder.
#for file in os.listdir(train_path + 'NORMAL'):
 #   img = imread(train_path + 'NORMAL/'+ file)
 #   d1, d2, color = img.shape
 #   dim1.append(d1)
 #   dim2.append(d2)


The above code raised an error when we tried to unpack a third value intended to capture the color channels. This indicates that the images have only two dimensions and are therefore not RGB

In [None]:
# Record the two spatial dimensions of every image in the NORMAL training folder.
dim1 = []  # first array dimension (rows) of each image
dim2 = []  # second array dimension (columns) of each image

normal_dir = train_path + 'NORMAL/'
for file in os.listdir(normal_dir):
    img = imread(normal_dir + file)
    # shape[:2] yields (rows, cols) for 2-D grayscale and 3-D colour arrays alike,
    # so a stray multi-channel file no longer aborts the loop with a ValueError.
    d1, d2 = img.shape[:2]
    dim1.append(d1)
    dim2.append(d2)

# Show a short preview instead of every value. The previous bare `dim1` line
# rendered nothing, because only the last expression of a cell is displayed.
dim1[:5], dim2[:5]

In [None]:
# Largest first-dimension (row count) value collected in dim1.
max(dim1)

In [None]:
# Largest second-dimension (column count) value collected in dim2.
max(dim2)

In [None]:
# Joint distribution of the collected image dimensions (dim1 vs dim2).
# seaborn >= 0.12 accepts x and y only as keyword arguments; the positional
# form raises TypeError there, so they are passed by name.
sns.jointplot(x=dim1, y=dim2)

The mean values of the image dimensions are checked to decide the standard input image size to be given to the CNN model

In [None]:
# Average of the first-dimension values collected in dim1.
np.mean(dim1)

In [None]:
# Average of the second-dimension values collected in dim2.
np.mean(dim2)

The average values of the dimensions are selected as the standard image size for modeling. We might have to revisit this if the model runs out of memory because of the large image size

In [None]:
# Candidate standard image size.
# NOTE(review): this tuple is not referenced by the data generators in this
# notebook, which pass target_size=(250, 250) — confirm whether it is still needed.
image_shape = (1400,1400) 

# Checking color scale (range) of the images

The min and max values of the pixels of the image are checked to see whether the images are normalized 

In [None]:
# Smallest pixel value in the sample pneumonia image.
imread(sample_pneumonia).min()

In [None]:
# Largest pixel value in the sample pneumonia image.
imread(sample_pneumonia).max()

The min and max values are 0 and 255 respectively, indicating that it is a grayscale (black and white) image and that the values are not normalised

# Balancing the data in the classes
In order to address the imbalance in the data, we randomly select 1341 pneumonia images to balance with the 1341 normal images in the train dataset

In [None]:
 import os
import random

train_pneumonia_files = os.listdir(train_path + 'PNEUMONIA')
random.seed(4)
images_list = random.sample(train_pneumonia_files, k=1341)
print(images_list)


# Image data generator/Data Augmentation
This is used to generate a huge data collection by randomly generating image files after transforming the original images. Since CNN works better with large amount of data, this is extremely useful to increase the input data variation to the model.
The images are randomly transformed with respect to some criteria which we specify in the ImageDataGenerator class:
* rescaling (normalizing) the image so that the pixel values lie between 0 and 1 instead of 0 and 255
* randomly rotating the image by up to 10 degrees
* shifting the image horizontally by up to 10% of its width
* shifting the image vertically by up to 10% of its height
* shearing the image (shear intensity of up to 0.2)
* zooming into or out of the image by up to 20%
* flipping the images horizontally (note: `horizontal_flip` is not set in the generator below, so no flipping is actually applied)

In [None]:

import tensorflow as tf
# Take the generator from the Keras bundled with TensorFlow so it comes from the
# same package as the tf.keras model it feeds; mixing the standalone `keras`
# package with `tf.keras` can pair incompatible versions.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmenting generator for training: rescale pixels to [0, 1] and apply random
# rotation, width/height shifts, shear and zoom.
train_datagen = ImageDataGenerator(rescale = 1./255,
                                   rotation_range = 10,
                                   width_shift_range=0.1,
                                   height_shift_range=0.1,
                                   shear_range = 0.2,
                                   zoom_range = 0.2)

# Stream the training folder as 250x250 grayscale batches of 32 with binary labels.
# train_path holds the same '.../chest_xray/train/' location that was previously
# repeated here as a literal.
training_set = train_datagen.flow_from_directory(train_path,
                                                 target_size = (250, 250),
                                                 batch_size = 32,
                                                 color_mode = 'grayscale',
                                                 class_mode = 'binary')

In [None]:
# Test images are only rescaled to [0, 1]; no augmentation is applied.
test_flow_options = dict(
    target_size=(250, 250),
    batch_size=32,
    color_mode='grayscale',
    class_mode='binary',
)
test_datagen = ImageDataGenerator(rescale=1./255)
test_set = test_datagen.flow_from_directory(
    '../input/chest-xray-pneumonia/chest_xray/test/',
    **test_flow_options,
)

In [None]:
# Build the whole network in a single expression so the cell is idempotent:
# re-running it recreates the model, whereas the previous one-`add`-per-cell
# layout appended duplicate layers whenever a cell was executed twice.
cnn = tf.keras.models.Sequential([
    # Three conv/pool stages; the first declares the 250x250 single-channel input.
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', input_shape=[250, 250, 1]),
    tf.keras.layers.MaxPool2D(pool_size=2, strides=2),
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=2, strides=2),
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=2, strides=2),
    # Classifier head: flatten, then a 512-unit dense layer with dropout on both sides.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit, matching the generators' class_mode='binary' labels.
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric.
cnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Multiply the learning rate by 0.2 (never below 1e-5) when val_loss has not
# improved for 3 epochs.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2,
                              patience = 3, min_lr = 0.00001)

# NOTE(review): the test set doubles as validation data here, so the learning-rate
# schedule is steered by the same images used for the final score — consider a
# separate validation split.
# Keep the returned History object so the per-epoch metrics stay available
# (e.g. history.history['val_loss']) instead of being discarded.
history = cnn.fit(x = training_set, validation_data = test_set, epochs = 15, callbacks=[reduce_lr])

In [None]:
# Model.evaluate accepts generators directly; evaluate_generator is deprecated in
# TF 2.x and no longer available in newer Keras versions.
# NOTE: training_set applies random augmentation, so this score is measured on
# augmented images and can vary slightly between runs.
cnn.evaluate(training_set)
# 95.55% training accuracy

In [None]:
# Model.evaluate accepts generators directly; evaluate_generator is deprecated in
# TF 2.x and no longer available in newer Keras versions.
cnn.evaluate(test_set)
# 91.02% accuracy - for test dataset

In [None]:
# Print the layer-by-layer summary of the model.
cnn.summary()

In [None]:
# NOTE(review): load_model is imported but not called in the visible cells.
from tensorflow.keras.models import load_model
# Save the trained model to an .h5 file at a path relative to the working directory.
cnn.save('pneumonia_detector2.h5')