# Check Data Availability

In [1]:
import os
import glob

# Locate the dataset: one sub-folder per class under the cell_images root.
images_dir = '../data/cell_images'
malaria_dir = os.path.join(images_dir, 'Parasitized')
healthy_dir = os.path.join(images_dir, 'Uninfected')

# glob returns files in arbitrary, filesystem-dependent order; sorting makes
# the lists (and everything sampled from them later) stable across runs.
malaria_img = sorted(glob.glob(os.path.join(malaria_dir, '*.png')))
healthy_img = sorted(glob.glob(os.path.join(healthy_dir, '*.png')))

# Quick availability check: both counts should be non-zero.
print('Malaria images : %d' %len(malaria_img))
print('Healthy images : %d' %len(healthy_img))

Malaria images : 13754
Healthy images : 13754


In [None]:
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm

# Fraction of the full dataset to work with (keeps exploration fast).
data_frac = 0.1

# One row per image path with its class label, then a shuffled random sample.
# random_state pins the sample so re-running the notebook yields the same rows.
df = pd.DataFrame({
    'img': malaria_img + healthy_img,
    'label': ['malaria']*len(malaria_img) + ['healthy']*len(healthy_img)
}).sample(frac=data_frac, random_state=1).reset_index(drop=True)

img_num = df.shape[0]

dim1 = []
dim2 = []

# Decode every sampled image exactly once and record its first two shape
# entries (image.shape[0] -> dim1, image.shape[1] -> dim2).
for img_path in tqdm(df['img'], desc='Reading image sizes'):
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise IOError('Could not read image: %s' % img_path)
    dim1.append(image.shape[0])
    dim2.append(image.shape[1])

df['dim1'] = dim1
df['dim2'] = dim2

df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

# Show up to 16 sampled images in a 4x4 grid, titled with their class label.
plt.figure(1, figsize=(10,10))
# Spacing is a property of the whole figure, so set it once rather than per subplot.
plt.subplots_adjust(hspace=0.1,wspace=0.5)

for i in range(min(16, len(df))):
    plt.subplot(4,4,i+1)
    # OpenCV decodes to BGR channel order while matplotlib expects RGB;
    # convert so the cells are displayed in their true colours.
    plt.imshow(cv2.cvtColor(cv2.imread(df.img[i]), cv2.COLOR_BGR2RGB))
    plt.title(df.label[i])
    plt.xticks([])
    plt.yticks([])

In [None]:
# Bar chart of how many sampled images belong to each class.
plt.figure(figsize=(8,4))
sns.countplot(x='label', data=df)
# '%.2f' prints the fraction with two decimals (e.g. 0.10); the previous
# '%2f' was a width specifier and rendered it as 0.100000.
plt.title('Number of Each Class in %.2f of Data' %(data_frac))
plt.show()

In [None]:
# Joint plot of the two recorded image dimensions (dim1 = shape[0],
# dim2 = shape[1]) for the sampled images, coloured by class label.
p = sns.jointplot(data=df,x='dim1',y='dim2', hue='label', s=100, height=10)
p.fig.suptitle("Image Dimension Distribution")
# Make the first collection on the joint axes fully opaque
# (presumably the scatter points — confirm if more artists are added).
p.ax_joint.collections[0].set_alpha(1)
p.fig.tight_layout()

In [None]:
# Summary statistics of the sampled frame's numeric columns (dim1, dim2).
df.describe()

As we can see from the plot and statistics above, the first and second dimensions of the images vary and are approximately normally distributed. We need to standardize the dimensions by resizing the images. We choose (130, 130) as the new size since it is close to both the median and the mean values.

# Image Augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

split = 0.2
resize = (130,130)
seed = 1

# Training pipeline: rescale pixels to [0, 1] and augment with horizontal flips.
train_datagen = ImageDataGenerator(validation_split = split,
                             rescale=1./255,
                             horizontal_flip=True
                            )
# Validation pipeline: same validation_split so that the 'training' and
# 'validation' subsets partition the same DataFrame rows (subsets are chosen
# by row position), but without augmentation so metrics are deterministic.
val_datagen = ImageDataGenerator(validation_split = split,
                                 rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# subset='training' restricts this generator to the rows outside the
# validation split; without it the generator would iterate the entire frame.
train_generator = train_datagen.flow_from_dataframe(
    df,
    directory = None,
    x_col = 'img',
    y_col = 'label',
    target_size = resize,
    class_mode = 'binary',
    batch_size = 32,
    shuffle = True,
    seed = seed,
    subset = 'training'
)

# subset='validation' yields only the held-out rows, so validation data no
# longer overlaps the training data.
validation_generator = val_datagen.flow_from_dataframe(
    df,
    directory = None,
    x_col = 'img',
    y_col = 'label',
    target_size = resize,
    class_mode = 'binary',
    batch_size = 32,
    shuffle = False,
    subset = 'validation'
)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D,MaxPool2D,Dropout,Flatten,Dense,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Small CNN for binary classification of 130x130 RGB cell images:
# three conv -> max-pool -> dropout stages followed by a dense head that
# ends in a single sigmoid unit.
model = Sequential([
    # Stage 1: 16 filters
    Conv2D(16, (3, 3), activation='relu', input_shape=(130, 130, 3)),
    MaxPool2D(2, 2),
    Dropout(0.2),

    # Stage 2: 32 filters
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D(2, 2),
    Dropout(0.3),

    # Stage 3: 64 filters
    Conv2D(64, (3, 3), activation='relu'),
    MaxPool2D(2, 2),
    Dropout(0.3),

    # Classifier head
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),

    Dense(1, activation='sigmoid'),
])

In [None]:
# Length of the training generator; the training cell passes this same value
# as steps_per_epoch.
len(train_generator)

In [None]:
# Binary classification: sigmoid output paired with binary cross-entropy.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
# Stop training once validation loss has not improved for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss',patience=2)

# Model.fit accepts generators directly; fit_generator is deprecated and has
# been removed from current Keras releases.
history = model.fit(train_generator,
                    steps_per_epoch=len(train_generator),
                    epochs=5,
                    validation_data=validation_generator,
                    validation_steps=len(validation_generator),
                    callbacks=[early_stop])

In [None]:
# EarlyStopping may end training before the requested number of epochs, so
# derive the x-axis from the epochs actually recorded; a hard-coded count
# would make plt.plot fail on mismatched lengths.
epochs = len(history.history['loss'])
epochRange = range(1, epochs + 1)

# The accuracy metric is stored as 'accuracy' by current Keras and as 'acc'
# by older releases; use whichever key this run produced.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
val_acc_key = 'val_' + acc_key

plt.plot(epochRange,history.history[acc_key])
plt.plot(epochRange,history.history[val_acc_key])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train','Validation'],loc='upper left')
plt.show()

plt.plot(epochRange,history.history['loss'])
plt.plot(epochRange,history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train','Validation'],loc='upper left')
plt.show()

import pickle

# Persist the preprocessed data next to the raw data.
preprocessed_dir = '../data/preprocessed'
# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(preprocessed_dir, exist_ok=True)
# NOTE(review): `preprocessed` is not defined in any cell visible here —
# confirm which object is meant to be saved, otherwise this raises NameError.
with open(os.path.join(preprocessed_dir, 'preprocessed_data.pickle'),'wb') as preprocessed_file:
    pickle.dump(preprocessed, preprocessed_file)