# Save a dataset of images into an HDF5 file (with h5py)

In [1]:
# Mount Google Drive under the relative folder `drive` so that the project
# files can be reached through ordinary file paths (Colab-only helper).
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


## Import libraries

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 14 14:43:21 2019

@author: hippolyte
"""

import os
import h5py
import imgaug.augmenters as iaa
import numpy as np
import cv2
import datetime

from matplotlib import pyplot as plt
from keras.utils import to_categorical
from glob import glob
from skimage import exposure

Using TensorFlow backend.


## Global variables

In [0]:
################################################################################
### MODIFIABLE VARIABLES
################################################################################
# root folder of the project on the mounted drive (keep the trailing slash)
ROOT_PATH = 'drive/My Drive/master1/medical_image_recognition/'
# name of the dataset folder, used under datasets/, arrays/ and models/
DATASET_NAME = 'chest_xray'
# glob patterns of the image files looked up in every label folder
IMAGE_EXTENSIONS = ['*.jpg', '*.jpeg']

################################################################################
### DON'T CHANGE THESE VARIABLES
################################################################################
DATASET_PATH = ROOT_PATH + 'datasets/' + DATASET_NAME + '/'
ARRAY_PATH   = ROOT_PATH + 'arrays/'   + DATASET_NAME + '/'
MODEL_PATH   = ROOT_PATH + 'models/'   + DATASET_NAME + '/'
# create the directories to save arrays and models if they don't exist
os.makedirs(ARRAY_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

# get directories and labels
# only real sub-directories are kept: a stray file next to them
# (e.g. a `.DS_Store`) must not be taken for a split or for a label
DIRECTORIES = sorted(d for d in os.listdir(DATASET_PATH)
                     if os.path.isdir(os.path.join(DATASET_PATH, d)))
# the labels are read from the first split; the other splits are expected
# to contain the same label folders
LABELS = sorted(l for l in os.listdir(DATASET_PATH + DIRECTORIES[0])
                if os.path.isdir(os.path.join(DATASET_PATH, DIRECTORIES[0], l)))
# helpers for labels
NUM_LABELS = len(LABELS)
# label name -> integer code, following the alphabetical order of LABELS
LABEL_MAPPING = {label: code for code, label in enumerate(LABELS)}
# helper for paths: split name -> path of the split folder (with trailing slash)
PATHS = {directory: DATASET_PATH + directory + '/' for directory in DIRECTORIES}

In [4]:
# show the detected splits, the labels and the label -> code mapping,
# one per line
print(DIRECTORIES, LABELS, LABEL_MAPPING, sep='\n')

['test', 'train', 'val']
['NORMAL', 'PNEUMONIA']
{'NORMAL': 0, 'PNEUMONIA': 1}


## Functions

### Helpers

In [0]:
def label_code_to_str(label_code: int) -> str:
    """
    Translate an integer label code back into its label name.

    The lookup is the reverse of `LABEL_MAPPING` (label name -> code).

    :param label_code: code of the label to look for
    :return: the first label of `LABEL_MAPPING` whose code equals `label_code`
    :raises ValueError: if no label has this code
    """
    not_found = object()
    name = next((candidate
                 for candidate, candidate_code in LABEL_MAPPING.items()
                 if label_code == candidate_code),
                not_found)

    if name is not_found:
        raise ValueError('Couldn\'t find the code {} in labels'.format(label_code))

    return name

def process_image(image_path: str,
                  height: int=224,
                  width: int=224,
                  channels: int=3,
                  pixel_regularization: bool=True) -> np.ndarray:
    """
    Read an image with OpenCV, resize it and scale its pixels to [0, 1].

    The channel order is left as returned by `cv2.imread` (no BGR -> RGB
    conversion is done).

    :param image_path: path of the image file to read
    :param height: number of rows of the returned image
    :param width: number of columns of the returned image
    :param channels: number of channels a single-channel image is expanded to
    :param pixel_regularization: if True, apply
        `skimage.exposure.equalize_adapthist` (CLAHE) after the scaling
    :return: the processed image as an array of shape (height, width, depth)
    :raises ValueError: if the image can't be read
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread doesn't raise on a missing/unreadable file: it returns
        # None, which would only fail later with an obscure resize error
        raise ValueError('Couldn\'t read the image {}'.format(image_path))

    # cv2.resize takes the target size as (width, height), not (rows, cols)
    img = cv2.resize(img, (width, height))

    # if the image is in grayscale,
    # we should change its scale by adding the same img to each channel
    # http://me.umn.edu/courses/me5286/vision/Notes/2015/ME5286-Lecture3.pdf
    if img.ndim == 2 or img.shape[2] == 1:
        print('{} is in grayscale, changed dimensions.'.format(image_path))
        # keep a real array (a list of arrays would break `astype` below)
        single_channel = img.reshape(img.shape[0], img.shape[1], 1)
        img = np.repeat(single_channel, channels, axis=2)

    # convert the format used by default in cv2 to be consistent with tensorflow
    #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # normalize image pixels
    img = img.astype(np.float32) / 255.

    # do pixel regularization
    if pixel_regularization:
        img = exposure.equalize_adapthist(img)

    return img

def process_image_skimage(image_path: str,
                          height: int=224,
                          width: int=224,
                          channels: int=3,
                          pixel_regularization: bool=False) -> np.ndarray:
    """
    Read an image with scikit-image, resize it and scale its pixels to [0, 1].

    Variant of `process_image` that relies on `skimage.io.imread` and
    `skimage.transform.resize` instead of OpenCV.

    :param image_path: path of the image file to read
    :param height: number of rows of the returned image
    :param width: number of columns of the returned image
    :param channels: number of channels a 2-D (grayscale) image is expanded to
    :param pixel_regularization: if True, apply
        `skimage.exposure.equalize_adapthist` (CLAHE) after the resize
    :return: the processed image as a float array
    """
    # local imports: only `skimage.exposure` is imported at the top of the file
    import skimage.io # import imread, imsave
    import skimage.transform # import resize

    img = skimage.io.imread(image_path)
    # with the default `preserve_range=False`, resize converts integer images
    # to floats in [0, 1]; dividing by 255 afterwards would normalize twice
    img = skimage.transform.resize(img, (height, width), anti_aliasing=False)

    # if the image is in grayscale,
    # we should change its scale by adding the same img to each channel
    # http://me.umn.edu/courses/me5286/vision/Notes/2015/ME5286-Lecture3.pdf
    if img.ndim == 2:
        img = np.stack([img] * channels, axis=-1)

    if pixel_regularization:
        img = exposure.equalize_adapthist(img)

    return img

### See the differences between images

In [6]:
# NOTE(review): this cell is disabled on purpose: the whole comparison code is
# wrapped in a bare string literal, so nothing below is executed (the notebook
# only echoes the string as the cell output).
# Before re-enabling it, check the stale parts of the snippet:
#   - `process_imag2` is not defined in the visible part of this notebook
#     (presumably an old name of `process_image_skimage` -- TODO confirm);
#   - `channels=224` looks like a copy/paste of the height/width value;
#   - the last two subplots both show `transformed_ima22`.
"""
for label in LABELS:
    # encode the label into one hot encoder
    label_encoded = to_categorical(LABEL_MAPPING[label],
                                   num_classes=NUM_LABELS)

    # get the path of the images
    path = PATHS['val'] + label + '/'

    # get a list of lists with all the images per extensions
    images = [glob(path + e) for e in IMAGE_EXTENSIONS]

    # make a flat list out of list of lists
    images = [item for sublist in images for item in sublist]
    
    ## test
    import matplotlib.pyplot as plt
    raw_image = cv2.imread(images[0])
    raw_image = cv2.resize(raw_image, (224, 224))
    
    
    transformed_image = process_image(images[0],
                                      height=224,
                                      width=224,
                                      channels=224,
                                      pixel_regularization=False)
    
    transformed_imag2 = process_image(images[0],
                                      height=224,
                                      width=224,
                                      channels=224,
                                      pixel_regularization=True)
    
    
    transformed_ima21 = process_imag2(images[0],
                                      height=224,
                                      width=224,
                                      channels=224,
                                      pixel_regularization=False)
    
    transformed_ima22 = process_imag2(images[0],
                                      height=224,
                                      width=224,
                                      channels=224,
                                      pixel_regularization=True)
    
    
    fig, axs = plt.subplots(2, 3, constrained_layout=True, figsize=(10,10))
    axs[0,0].imshow(raw_image, cmap='gray')
    axs[0,0].set_title('Raw image')
    axs[0,0].axis('off')
    axs[0,1].imshow(transformed_image, cmap='gray')
    axs[0,1].set_title('transformed image')
    axs[0,1].axis('off')
    axs[0,2].imshow(transformed_imag2, cmap='gray')
    axs[0,2].set_title('transformed CLAHE image')
    axs[0,2].axis('off')
    axs[1,0].imshow(transformed_ima21, cmap='gray')
    axs[1,0].set_title('transformed image 2')
    axs[1,0].axis('off')
    axs[1,1].imshow(transformed_ima22, cmap='gray')
    axs[1,1].set_title('transformed CLAHE image 2')
    axs[1,1].axis('off')
    axs[1,2].imshow(transformed_ima22, cmap='gray')
    axs[1,2].set_title('transformed CLAHE image 2')
    axs[1,2].axis('off')
    plt.show()
"""

"\nfor label in LABELS:\n    # encode the label into one hot encoder\n    label_encoded = to_categorical(LABEL_MAPPING[label],\n                                   num_classes=NUM_LABELS)\n\n    # get the path of the images\n    path = PATHS['val'] + label + '/'\n\n    # get a list of lists with all the images per extensions\n    images = [glob(path + e) for e in IMAGE_EXTENSIONS]\n\n    # make a flat list out of list of lists\n    images = [item for sublist in images for item in sublist]\n    \n    ## test\n    import matplotlib.pyplot as plt\n    raw_image = cv2.imread(images[0])\n    raw_image = cv2.resize(raw_image, (224, 224))\n    \n    \n    transformed_image = process_image(images[0],\n                                      height=224,\n                                      width=224,\n                                      channels=224,\n                                      pixel_regularization=False)\n    \n    transformed_imag2 = process_image(images[0],\n                     

### Main

In [0]:
def save_images_dir(directory: str,
                    saving_path: str,
                    height: int=224,
                    width: int=224,
                    channels: int=3,
                    pixel_regularization=True):
    """
    Process every image of a dataset split and store them in one HDF5 file.

    The file is named `<saving_path><directory>(<height>,<width>,<channels>)`
    followed by `pr` when pixel regularization is on, and `.h5`.
    Every image is written as a dataset `x<i>` and its one-hot encoded label
    as a dataset `y<i>`, `i` being a counter shared by all the labels.
    An existing file with the same name is overwritten.

    :param directory: name of the split to save, a key of `PATHS`
    :param saving_path: prefix of the output file (usually a folder path
        ending with a slash)
    :param height: number of rows of the stored images
    :param width: number of columns of the stored images
    :param channels: number of channels of the stored images
    :param pixel_regularization: forwarded to `process_image`
    :raises IndexError: if a label folder has no image matching
        `IMAGE_EXTENSIONS`
    """
    shape_str = '({},{},{})'.format(height, width, channels)
    shape_str += 'pr' if pixel_regularization else ''
    filename = saving_path + directory + shape_str + '.h5'
    print('{}: saving images {} into {}'.format(directory, shape_str, filename))

    # h5py doesn't create the missing parent folders of the file
    output_dir = os.path.dirname(filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # some helper variables
    start = datetime.datetime.now()
    image_count = 0

    with h5py.File(filename, 'w') as file:
        for label in LABELS:
            # encode the label into one hot encoder
            label_encoded = to_categorical(LABEL_MAPPING[label],
                                           num_classes=NUM_LABELS)

            # get the path of the images
            path = PATHS[directory] + label + '/'

            # gather the images of every extension; glob returns the files in
            # an arbitrary order and overlapping patterns could match a file
            # twice, so de-duplicate and sort to get a reproducible numbering
            images = sorted({image_path
                             for extension in IMAGE_EXTENSIONS
                             for image_path in glob(path + extension)})

            if not images:
                raise IndexError('There is no images with {} in {}'.format(
                    IMAGE_EXTENSIONS, path))

            # process all the images with compression
            # here we choose the best method (9) at expense of speed
            # https://ss64.com/bash/gzip.html
            for image_path in images:
                # process the image before to save it
                image = process_image(image_path,
                                      height=height,
                                      width=width,
                                      channels=channels,
                                      pixel_regularization=pixel_regularization)

                # image
                file.create_dataset(name='x' + str(image_count),
                                    data=image,
                                    shape=(height, width, channels),
                                    maxshape=(height, width, channels),
                                    compression='gzip',
                                    compression_opts=9)

                # label
                file.create_dataset(name='y' + str(image_count),
                                    data=label_encoded,
                                    shape=(NUM_LABELS,),
                                    maxshape=(None,),
                                    compression='gzip',
                                    compression_opts=9)

                image_count += 1

            print('{} images with {}({}) label.'.format(len(images),
                                                        label,
                                                        LABEL_MAPPING[label]))

    # total_seconds() also counts the whole days, unlike the `seconds` field
    elapsed = datetime.datetime.now() - start
    print('{} images in {} seconds.'.format(image_count,
                                            int(elapsed.total_seconds())))

## Let's save the images!

In [8]:
# shape of the images stored in the HDF5 files
HEIGHT = 224
WIDTH = 224
CHANNELS = 3
# apply the adaptive histogram equalization while processing the images
DO_PIXEL_REGULARIZATION = True

# write one HDF5 file per split found in the dataset folder
for directory in DIRECTORIES:
    save_images_dir(directory,
                    ARRAY_PATH,
                    HEIGHT,
                    WIDTH,
                    CHANNELS,
                    DO_PIXEL_REGULARIZATION)

test: saving images (224,224,3)pr into drive/My Drive/master1/medical_image_recognition/arrays/chest_xray/test(224,224,3)pr.h5


  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))


234 images with NORMAL(0) label.
390 images with PNEUMONIA(1) label.
624 images in 64 seconds.
train: saving images (224,224,3)pr into drive/My Drive/master1/medical_image_recognition/arrays/chest_xray/train(224,224,3)pr.h5
1346 images with NORMAL(0) label.
3875 images with PNEUMONIA(1) label.
5221 images in 560 seconds.
val: saving images (224,224,3)pr into drive/My Drive/master1/medical_image_recognition/arrays/chest_xray/val(224,224,3)pr.h5
8 images with NORMAL(0) label.
8 images with PNEUMONIA(1) label.
16 images in 1 seconds.
