# Creation of the dataset

You need to downlad the publicly available NIH dataset, "cell_images.zip" and put it in your directory https://lhncbc.nlm.nih.gov/publication/pub9932

In [2]:
from imutils import paths
import pandas as pd 
import numpy as np
import random
import shutil
import glob
import os


IMAGE_DIR='./cell_images/*/*.png'
# initialize the base path to the *new* directory that will contain
# our images after computing the training and testing split
BASE_PATH = "malaria"
# derive the training, validation, and testing directories
TRAIN_PATH = os.path.sep.join([BASE_PATH, "training"])
VAL_PATH = os.path.sep.join([BASE_PATH, "validation"])
TEST_PATH = os.path.sep.join([BASE_PATH, "testing"])
# define the amount of data that will be used training
TRAIN_SPLIT = 0.8
# the amount of validation data will be a percentage of the
# *training* data
VAL_SPLIT = 0.1

In [3]:
# open csv containing name of misclassified data files

df = pd.read_csv('./All_images.csv')
false_par = df['False_parasitized'].dropna().values
false_un = df['False_uninfected'].dropna().values
false_labels = np.append(false_par,false_un)

In [4]:
# remove mislcassified data 

imagePaths = glob.glob(IMAGE_DIR)
for image_path in imagePaths:
    image_name = image_path.split('/')[-1]
    if image_name in false_labels :
        imagePaths.remove(image_path)
print('Nb images after removals: ', len(imagePaths))

Nb images after removals:  26229


## Creation of a training,validation and testing set 

In [5]:

random.seed(7)
random.shuffle(imagePaths)

# compute the training and testing split
i = int(len(imagePaths) * TRAIN_SPLIT)
trainPaths = imagePaths[:i]
testPaths = imagePaths[i:]
# we'll be using part of the training data for validation
i = int(len(trainPaths) * VAL_SPLIT)
valPaths = trainPaths[:i]
trainPaths = trainPaths[i:]

# define the datasets that we'll be building
datasets = [
    ("training", trainPaths, TRAIN_PATH),
    ("validation", valPaths, VAL_PATH),
    ("testing", testPaths, TEST_PATH)
]

## Saving the data under the new directory

In [6]:
# loop over the datasets
for (dType, imagePaths, baseOutput) in datasets:
    # show which data split we are creating
    print("[INFO] building '{}' split".format(dType))
    # if the output base output directory does not exist, create it
    if not os.path.exists(baseOutput):
        print("[INFO] 'creating {}' directory".format(baseOutput))
        os.makedirs(baseOutput)
    # loop over the input image paths
    for inputPath in imagePaths:
        # extract the filename of the input image along with its
        # corresponding class label
        filename = inputPath.split(os.path.sep)[-1]
        label = inputPath.split(os.path.sep)[-2]
        # build the path to the label directory
        labelPath = os.path.sep.join([baseOutput, label])
        # if the label output directory does not exist, create it
        if not os.path.exists(labelPath):
                print("[INFO] 'creating {}' directory".format(labelPath))
                os.makedirs(labelPath)
        # construct the path to the destination image and then copy
        # the image itself
        p = os.path.sep.join([labelPath, filename])
        shutil.copy2(inputPath, p)

[INFO] building 'training' split
[INFO] 'creating malaria_new3/training' directory
[INFO] 'creating malaria_new3/training/Parasitized' directory
[INFO] 'creating malaria_new3/training/Uninfected' directory
[INFO] building 'validation' split
[INFO] 'creating malaria_new3/validation' directory
[INFO] 'creating malaria_new3/validation/Parasitized' directory
[INFO] 'creating malaria_new3/validation/Uninfected' directory
[INFO] building 'testing' split
[INFO] 'creating malaria_new3/testing' directory
[INFO] 'creating malaria_new3/testing/Uninfected' directory
[INFO] 'creating malaria_new3/testing/Parasitized' directory


In [7]:
print(len(glob.glob(BASE_PATH + '/training/Parasitized/*.png')))
print(len(glob.glob(BASE_PATH +'/training/Uninfected/*.png')))
print(len(glob.glob(BASE_PATH +'/validation/Parasitized/*.png')))
print(len(glob.glob(BASE_PATH +'/validation/Uninfected/*.png')))
print(len(glob.glob(BASE_PATH +'/testing/Parasitized/*.png')))
print(len(glob.glob(BASE_PATH +'/testing/Uninfected/*.png')))

9389
9496
1067
1031
2709
2537
