<a href="https://colab.research.google.com/github/mmalinas/DeepLearning/blob/master/save_breastcancerimages_h5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import glob
from random import shuffle
import h5py
import numpy as np
import cv2
import math
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def normalize_and_write_data_into_h5_file(dest_filepath, filepaths_list, dataset_name, n_px, n_channels = 3):
    
    '''
        This function converts images to numpy arrays and writes the array data into a h5 file.
        
        dest_filepath - the name of the file with full path that is being created
        filepaths_list - source image file paths which is being converted to numpy arrays
        n_px - number of pixels - will be used as image's height and width
        n_channels - 3 for rgb
    '''
    
    data_shape = (len(filepaths_list), n_px * n_px * n_channels)
    
    with h5py.File(dest_filepath, 'a') as f:
        
        f.require_dataset(dataset_name, data_shape, np.float32)
        
        for i in range(len(filepaths_list)):
            #if (i+1) % 512 == 0:
            #    print('{}/{} files converted'.format((i+1), len(filepaths_list)))
            filepath = filepaths_list[i]
            img = cv2.imread(filepath)
            img = cv2.resize(img, (n_px, n_px), interpolation=cv2.INTER_CUBIC)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
            #Normalize the image - convert the each pixel value between 0 and 1
            img = img / 255
            #Reshape the image - roll it up into a column vector
            img = img.ravel()
            
            #img[None] makes it a proper array instead of rank 1 array
            f[dataset_name][i, ...] = img[None]

In [0]:
def write_labels_into_h5_file(dest_filepath, labels):
    
    dataset_name = "labels"
    
    with h5py.File(dest_filepath, 'a') as f:
        f.create_dataset(dataset_name, (len(labels),), np.int8)
        f[dataset_name][...] = labels

In [0]:
def convert_images_to_data_in_h5_file(src_img_filepath_pattern, dest_h5_file_path, n_px, 
                                      n_channels = 3):
    
    #Returns a list of filepaths matching the pattern given as parameter
    src_filepaths = glob.glob(src_img_filepath_pattern, recursive=True)
    
    #Create Labels based upon the substring contained in the filename
    labels = [0 if 'class0' in filepath else 1 for filepath in src_filepaths]
    
    #The zip(source_filepaths, labels) combines each element of source_filepaths list 
    #with each element of labels list forming a pair (tuple). t is the list which contains these tuples
    
    t = list(zip(src_filepaths, labels))

    class0_tuples = []
    class1_tuples = []

    #separate class 0 and class 1 tuples
    for t_tuple in t:
      if 'class0' in t_tuple[0]:
        class0_tuples.append(t_tuple)
      else:
        class1_tuples.append(t_tuple)

    shuffle(class0_tuples)
    shuffle(class1_tuples)
    class0_tuples_downsample = class0_tuples[0:len(class1_tuples)]

    training_final_index = math.ceil(0.8*len(class1_tuples))
    validation_final_index = training_final_index + math.ceil(0.1*len(class1_tuples))
    
    training_class0 = class0_tuples[0:training_final_index]
    validation_class0 = class0_tuples[training_final_index:validation_final_index]
    test_class0 = class0_tuples[validation_final_index:]

    training_class1 = class1_tuples[0:training_final_index]
    validation_class1 = class1_tuples[training_final_index:validation_final_index]
    test_class1 = class1_tuples[validation_final_index:]

    print('Creating file training_class0')
    dest_file_path_training_class0 = dest_h5_file_path + 'training_class0' + '.h5'
    src_filepaths_training_class0, labels_training_class0 = zip(*training_class0)
    normalize_and_write_data_into_h5_file(dest_file_path_training_class0, src_filepaths_training_class0, 'training_class0', n_px)
    write_labels_into_h5_file(dest_file_path_training_class0, labels_training_class0)

    print('Creating file validation_class0')
    dest_file_path_validation_class0 = dest_h5_file_path + 'validation_class0' + '.h5'
    src_filepaths_validation_class0, labels_validation_class0 = zip(*validation_class0)
    normalize_and_write_data_into_h5_file(dest_file_path_validation_class0, src_filepaths_validation_class0, 'validation_class0', n_px)
    write_labels_into_h5_file(dest_file_path_validation_class0, labels_validation_class0)

    print('Creating file test_class0')
    dest_file_path_test_class0 = dest_h5_file_path + 'test_class0' + '.h5'
    src_filepaths_test_class0, labels_test_class0 = zip(*test_class0)
    normalize_and_write_data_into_h5_file(dest_file_path_test_class0, src_filepaths_test_class0, 'test_class0', n_px)
    write_labels_into_h5_file(dest_file_path_test_class0, labels_test_class0)

    print('Creating file training_class1')
    dest_file_path_training_class1 = dest_h5_file_path + 'training_class1' + '.h5'
    src_filepaths_training_class1, labels_training_class1 = zip(*training_class1)
    normalize_and_write_data_into_h5_file(dest_file_path_training_class1, src_filepaths_training_class1, 'training_class1', n_px)
    write_labels_into_h5_file(dest_file_path_training_class1, labels_training_class1)

    print('Creating file validation_class1')
    dest_file_path_validation_class1 = dest_h5_file_path + 'validation_class1' + '.h5'
    src_filepaths_validation_class1, labels_validation_class1 = zip(*validation_class1)
    normalize_and_write_data_into_h5_file(dest_file_path_validation_class1, src_filepaths_validation_class1, 'validation_class1', n_px)
    write_labels_into_h5_file(dest_file_path_validation_class1, labels_validation_class1)

    print('Creating file test_class1')
    dest_file_path_test_class1 = dest_h5_file_path + 'test_class1' + '.h5'
    src_filepaths_test_class1, labels_test_class1 = zip(*test_class1)
    normalize_and_write_data_into_h5_file(dest_file_path_test_class1, src_filepaths_test_class1, 'test_class1', n_px)
    write_labels_into_h5_file(dest_file_path_test_class1, labels_test_class1)

In [6]:
src_filepath_pattern = '/content/drive/My Drive/breast-histopathology-images/8*/*/*'
dest_filepath = '/content/drive/My Drive/breast-normalized-dataset_'
n_px = 50
n_channels = 3

tic = time.process_time()
convert_images_to_data_in_h5_file(src_filepath_pattern, dest_filepath, n_px, n_channels)
toc = time.process_time()
print('Time taken for creating the h5 file is', (toc-tic)*1000, 'ms')

Creating file training_class0
Creating file validation_class0
Creating file test_class0
Creating file training_class1
Creating file validation_class1
Creating file test_class1
Time taken for creating the h5 file is 29954.363139999998 ms
