In [7]:
import logging
import os
import zipfile

# LOGGING CONFIG ##############################
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')
###############################################

In [9]:
import numpy as np
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt

In [11]:
'''
using the chest x-ray dataset from https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia/
check if the chest x-ray dataset exists
unzip the chest x-ray dataset if it exists
'''
if not os.path.exists('chest_xray'):
    if os.path.exists('chest-xray-pneumonia.zip'):
        logging.info('unzipping the dataset file')
        os.system('unzip chest-xray-pneumonia.zip')
        logging.info('unzipping is done')
        os.system('rm chest-xray-pneumonia.zip')
    else:
        logging.warning('please download the dataset from https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia/')


In [13]:
def create_training_data(dir, data_type, main, sub, size):
    """Load, grayscale and resize every image under ``dir/data_type/<category>``.

    Labelling rule: images found in the first category of ``main`` get label 0.
    Images of any other category get label 1 when ``sub[0]`` occurs in the
    file name and label 2 otherwise.

    Args:
        dir: root folder of the dataset (the name shadows the builtin but is
            kept so existing keyword callers keep working).
        data_type: sub-folder of ``dir`` to read, e.g. 'train'.
        main: category folder names; position 0 is the label-0 class.
        sub: sub-category markers looked up in file names; only ``sub[0]``
            is consulted.
        size: side length, in pixels, of the square resized images.

    Returns:
        A tuple ``(training_data, occurrence)``. ``training_data`` is a list
        of ``[image_array, label]`` pairs and ``occurrence`` is a
        three-element list counting the images labelled 0, 1 and 2.
    """
    training_data = []
    occurrence = [0, 0, 0]
    for cat in main:
        logging.info('reading images for category {}'.format(cat))
        path = os.path.join(dir, data_type, cat)
        # kept separate from the per-image label so it is never overwritten
        cat_label = main.index(cat)
        # os.listdir order is arbitrary; sorting makes the result reproducible
        for img in sorted(os.listdir(path)):
            # read and resize image
            try:
                img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
                # imread reports an unreadable file by returning None
                if img_array is None:
                    logging.warning('error reading {}'.format(img))
                    continue
                img_array = cv2.resize(img_array, (size, size))
            except Exception:
                # Exception (not a bare except) lets Ctrl-C interrupt the long read
                logging.warning('error reading {}'.format(img))
                continue

            # assign label for virus and bacteria
            if cat_label == 0:
                label = 0
            elif sub[0] in img:
                label = 1
            else:
                label = 2

            # append to training data
            occurrence[label] += 1
            training_data.append([img_array, label])

    logging.info('reading images done')
    return training_data, occurrence

            
# where the dataset lives and how it is read
directory = 'chest_xray'
data_type = 'train'   # either 'train' or 'test'
main_category = ['NORMAL', 'PNEUMONIA']
sub_category = ['bacteria', 'virus']
image_size = 400      # images are resized to image_size x image_size pixels

# build the labelled image list; labels are 0: normal, 1: bacterial, 2: viral
training_data, occurrence = create_training_data(
    dir=directory,
    data_type=data_type,
    main=main_category,
    sub=sub_category,
    size=image_size,
)

2020-05-13 15:13:22 INFO     reading images for category NORMAL
2020-05-13 15:13:47 INFO     reading images for category PNEUMONIA
2020-05-13 15:14:07 INFO     reading images done


In [15]:
import random

# fixed seed so the shuffled order is identical on every run
SHUFFLE_SEED = 42

# number of images with labels 0, 1 and 2
print('Normal:', occurrence[0], 'Bacterial:', occurrence[1], 'Viral:', occurrence[2])

# weight of each label is its share of all images; the total is computed once,
# and an empty dataset is reported explicitly instead of as ZeroDivisionError
total_images = sum(occurrence)
if total_images == 0:
    raise ValueError('no images were loaded, cannot compute label weights')
weight = [float(count) / total_images for count in occurrence]
print('Normal:', weight[0], 'Bacterial:', weight[1], 'Viral:', weight[2])

# shuffle the training data so the images are no longer grouped by category;
# a dedicated seeded generator keeps the order reproducible without touching
# the global random state
random.Random(SHUFFLE_SEED).shuffle(training_data)

Normal: 1341 Bacterial: 2530 Viral: 1345
Normal: 0.2570935582822086 Bacterial: 0.48504601226993865 Viral: 0.25786042944785276


In [16]:
# split the [image, label] pairs into a feature list and a parallel label list
X = [features for features, _ in training_data]
y = [label for _, label in training_data]

In [17]:
# tensorflow works on numpy arrays rather than python lists
# the leading -1 lets numpy infer that dimension from the data,
# the trailing 1 is the single channel of a gray scale image
X = np.reshape(np.array(X), (-1, image_size, image_size, 1))


In [18]:
# save the training data
import pickle

# open() does not create missing folders, so make sure the target exists
os.makedirs('trainings', exist_ok=True)

# `with` closes each file even when pickling raises part-way through
with open('trainings/X.pickle', 'wb') as pickle_out:
    pickle.dump(X, pickle_out)

with open('trainings/y.pickle', 'wb') as pickle_out:
    pickle.dump(y, pickle_out)