I decided to use a CNN to predict whether a person has pneumonia based on chest X-ray imaging. The data comes from Kaggle: [Chest X-Ray Images (Pneumonia)](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia)


In [1]:
import os
from time import time
import numpy as np
import random 
from skimage.io import imread
from skimage.transform import resize

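Define a function that reads every `.jpeg` image of one dataset split as a grayscale `float32` array, labels it (0 = NORMAL, 1 = PNEUMONIA), and returns the images and labels in a shuffled order.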

In [2]:
conditions = ["NORMAL", "PNEUMONIA"]


def _stack_images(images):
    """Pack a list of image arrays into one NumPy array.

    When every image has the same shape (or the list is empty) this is a
    plain ``np.array(images)``. When shapes differ, a 1-D ``object`` array
    holding one image per element is built explicitly, because
    ``np.array`` refuses ragged input on recent NumPy versions.
    """
    if len({img.shape for img in images}) <= 1:
        return np.array(images)
    ragged = np.empty(len(images), dtype=object)
    # Element-wise assignment stores each image as-is; a slice assignment
    # could try to broadcast the images into the container instead.
    for i, img in enumerate(images):
        ragged[i] = img
    return ragged


def read_images(set_name, root="chest_xray", seed=None):
    """Load, label and shuffle all ``.jpeg`` images of one dataset split.

    Parameters
    ----------
    set_name : str
        Name of the split sub-directory, e.g. ``"train"``, ``"test"`` or
        ``"val"``.
    root : str, optional
        Directory containing the split folders. Each split is expected to
        hold one sub-directory per entry of ``conditions``.
    seed : int or None, optional
        Seed for the shuffle. ``None`` (default) uses the global ``random``
        state, so the order differs between runs unless that state was
        seeded by the caller.

    Returns
    -------
    x : numpy.ndarray
        The images as ``float32`` arrays, read with ``as_gray=True`` and
        stored without an added channel axis. Shape is ``(n, H, W)`` when
        all images share one size, otherwise a 1-D ``object`` array of
        length ``n``.
    y : numpy.ndarray
        Labels aligned with ``x``: 0 for ``"NORMAL"``, 1 otherwise.

    Raises
    ------
    FileNotFoundError
        If a condition directory for the requested split does not exist.
    """
    start = time()
    all_images = []
    labels = []
    files_seen = 0
    for condition in conditions:
        label = 0 if condition == "NORMAL" else 1
        path = os.path.join(root, set_name, condition)
        # Sorted so that a seeded shuffle gives the same result regardless
        # of the order in which the OS lists the directory.
        for image_name in sorted(os.listdir(path)):
            if image_name.endswith(".jpeg"):
                image = imread(os.path.join(path, image_name), as_gray=True)
                all_images.append(image.astype(np.float32))
                labels.append(label)
            # Progress counter: counts every directory entry, not only
            # the images that were actually loaded.
            if files_seen % 1000 == 0:
                print(files_seen)
            files_seen += 1
    x = _stack_images(all_images)
    y = np.array(labels)
    # Apply one random permutation to both arrays so that images and
    # labels stay aligned.
    rng = random if seed is None else random.Random(seed)
    order = rng.sample(range(len(y)), k=len(y))
    x = x[order]
    y = y[order]
    print("Finished " + set_name + " import. Took %s" % str(time() - start))
    return x, y
    

Import training data.

In [3]:
# Import train sets
# np.save does not create missing parent directories; make sure the output
# folder exists before starting the slow image import so the work is not
# lost to a FileNotFoundError at save time.
os.makedirs("Data", exist_ok=True)

train_x, train_y = read_images("train")

np.save("Data/train_x.npy", train_x)
np.save("Data/train_y.npy", train_y)

# Drop the references so the arrays can be freed before the next split
# is loaded.
del train_x, train_y

0
1000
2000
3000
4000
5000
Finished train import. Took 183.1788845062256


Import test data.

In [None]:
# Import test sets
# np.save does not create missing parent directories; make sure the output
# folder exists before starting the image import.
os.makedirs("Data", exist_ok=True)

test_x, test_y = read_images("test")

np.save("Data/test_x.npy", test_x)
np.save("Data/test_y.npy", test_y)

# Drop the references so the arrays can be freed before the next split
# is loaded.
del test_x, test_y

0


Import validation data.

In [None]:
# Import val sets
# np.save does not create missing parent directories; make sure the output
# folder exists before starting the image import.
os.makedirs("Data", exist_ok=True)

val_x, val_y = read_images("val")

np.save("Data/val_x.npy", val_x)
np.save("Data/val_y.npy", val_y)

# Drop the references so the arrays can be freed once they are saved.
del val_x, val_y