# This creates a new .h5 file with placeholders for training

HDF5 (aka H5) is a memory-mapped file format. The Python package h5py makes it easy to store and manipulate existing data in the form of NumPy arrays. This makes reading data on Colab faster and will accelerate training.

In [None]:
import h5py
from PIL import Image

# Output file and the number of sample slots reserved for every split.
fileName = 'data.h5'
numOfSamples = 10000

# Pre-allocate one uint8 ('u1') image dataset and one uint8 label dataset
# per split; "w" creates the file, truncating it if it already exists.
with h5py.File(fileName, "w") as out:
  for split in ("train", "dev", "test"):
    out.create_dataset("X_" + split, (numOfSamples, 256, 256, 3), dtype='u1')
    out.create_dataset("Y_" + split, (numOfSamples, 1, 1), dtype='u1')

Load your data into these placeholders using Python dictionary-style access. Here we load an image into our X_train placeholder.

In [None]:
import numpy as np

# Open in append mode ("a") so the placeholders created earlier are kept.
with h5py.File(fileName, "a") as out:
    # X_train_1.jpg is a 256 x 256 RGB image.
    with Image.open("X_train_1.jpg") as img:
        # Write into slot 0 of the existing dataset. Assigning to
        # out['X_train'] itself would try to create a new dataset under a
        # name that is already taken and fail.
        out['X_train'][0] = np.asarray(img)

For PyTorch (not relevant here but good to know), you will have to write your own .h5 Dataset that will be used by PyTorch DataLoader.

In [None]:
import torch
import numpy as np
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from PIL import Image
import h5py
 
class dataset_h5(torch.utils.data.Dataset):
    """Map-style dataset that serves (input, label) pairs from an HDF5 file.

    The HDF5 handle is opened lazily, on first sample access, instead of
    being held open from ``__init__``. An open ``h5py.File`` cannot be
    pickled, so a dataset holding one cannot be handed to ``DataLoader``
    worker processes that are started with the "spawn" method; opening on
    demand lets each process create its own handle.

    Args:
        in_file: Path to the .h5 file.
        transform: Optional callable applied to every input sample before
            it is returned.
        x_key: Name of the HDF5 dataset holding the inputs.
        y_key: Name of the HDF5 dataset holding the labels.
    """

    def __init__(self, in_file, transform=None,
                 x_key='X_train', y_key='Y_train'):
        super().__init__()

        self.in_file = in_file
        self.transform = transform
        self.x_key = x_key
        self.y_key = y_key
        self._file = None  # opened on demand, see the ``file`` property

        # Read the length once through a short-lived handle. This still
        # fails early on a missing file or dataset, and it keeps len()
        # (which the DataLoader's sampler calls in the parent process)
        # from leaving a handle open before the workers are started.
        with h5py.File(in_file, 'r') as handle:
            self._length = handle[x_key].shape[0]

    @property
    def file(self):
        """Return the read-only HDF5 handle, opening it on first use."""
        if self._file is None:
            self._file = h5py.File(self.in_file, 'r')
        return self._file

    def __getitem__(self, index):
        """Return ``((x, y), index)`` for the sample stored at ``index``."""
        handle = self.file
        x = handle[self.x_key][index, ...]
        y = handle[self.y_key][index, ...]

        # Preprocess the input sample only; the label is returned as read.
        if self.transform is not None:
            x = self.transform(x)

        return (x, y), index

    def __len__(self):
        """Return the number of samples (first dimension of ``x_key``)."""
        return self._length

    def __getstate__(self):
        """Return picklable state with the open HDF5 handle left out."""
        state = self.__dict__.copy()
        state['_file'] = None
        return state

    def close(self):
        """Close the HDF5 handle if one is open; safe to call repeatedly."""
        if self._file is not None:
            self._file.close()
            self._file = None

# Example settings; replace them with whatever your training setup needs.
# ToTensor turns an H x W x C uint8 array into a C x H x W float tensor
# scaled to [0, 1].
transform = transforms.ToTensor()
bshuffle = True

dataset = dataset_h5("PATH_TO_YOUR_.h5_FILE", transform=transform)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    drop_last=True,
    shuffle=bshuffle,
    num_workers=1,
)