In [1]:
import numpy as np
import pandas as pd
import glob
import h5py
import os
import cv2

import time
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# loading data into np.array 

In [2]:
# Load the label table (one row per image id with its breed name).
df = pd.read_csv("Datasets/labels.csv")

# Give every distinct breed an integer id, numbered in order of first appearance.
breeds = df.breed.unique()
dict_breeds = dict(zip(breeds, range(len(breeds))))

# Derive the integer label column with a direct per-value dictionary lookup.
# Series.map yields an integer column straight away, whereas copying the
# string column and running DataFrame.replace on it depends on pandas
# implicitly downcasting an object column, which newer pandas deprecates.
df["breed_id"] = df["breed"].map(dict_breeds)

print("Shape\t\t: {}".format(df.shape))
print("Unique labels\t: {}".format(len(breeds)))

Shape		: (10222, 3)
Unique labels	: 120


In [3]:
# Tunable settings for the notebook.
EPOCHS, BATCH_SIZE = 50, 32
LR = 0.001
# The first two entries are used as the image resize target and in the
# names of the saved dataset files.
INPUT_SHAPE = (64, 64, 3)

In [4]:
data = []
labels = []

# Collect every regular file below the training directory. glob returns paths
# in arbitrary order, so sort them to make the sample order (and therefore the
# arrays saved later) reproducible from run to run.
image_files_train = sorted(
    f for f in glob.glob("Datasets/train" + "/**/*", recursive=True) if not os.path.isdir(f)
)
print("{} files found!".format(len(image_files_train)))

# Build the image-id -> breed_id lookup once instead of scanning the whole
# label table for every image. keep="first" mirrors taking the first matching
# row should an id ever appear more than once.
breed_id_by_image = (
    df.drop_duplicates(subset="id", keep="first").set_index("id")["breed_id"].to_dict()
)

# cv2.resize takes its target as (width, height), while the stacked array is
# indexed (sample, row, column, channel). Swapping the two entries makes the
# stacked array's shape equal INPUT_SHAPE for non-square targets as well.
target_size = (INPUT_SHAPE[1], INPUT_SHAPE[0])

# create ground-truth label from the image path
print("loading images")
t = time.time()
for img in image_files_train:
    img_file = os.path.basename(img)
    name = img_file.split(".")[0]

    # skip files that have no record in the given labels
    if name not in breed_id_by_image:
        print("LABEL NOT FOUND: {}".format(name))
        continue

    # cv2.imread signals an unreadable/corrupt file by returning None rather
    # than raising; skip it instead of crashing inside cv2.resize.
    # NOTE(review): cv2.imread yields channels in BGR order - confirm that
    # whatever consumes the saved arrays expects BGR.
    image = cv2.imread(img)
    if image is None:
        print("UNREADABLE IMAGE: {}".format(img))
        continue

    image = cv2.resize(image, target_size)
    data.append(img_to_array(image))
    labels.append([breed_id_by_image[name]])
print(f'Time taken to load images: {time.time()-t}')

# pre-processing (normalisation)
print("pre-processing")
t = time.time()
data = np.array(data, dtype=np.float16) / 255.0
# Refuse to store labels as uint8 if any id would not fit, instead of letting
# the cast wrap around silently.
labels = np.array(labels, dtype=np.int64)
if labels.size and labels.max() > np.iinfo(np.uint8).max:
    raise ValueError("breed ids exceed the uint8 range; use a wider label dtype")
labels = labels.astype(np.uint8)
print(f'Time taken to pre-processing: {time.time()-t}')

print('x_data shape:', data.shape)
print('y_data shape:', labels.shape)

10222 files found!
loading images
Time taken to load images: 42.437124490737915
pre-processing
Time taken to pre-processing: 1.4311761856079102
x_data shape: (10222, 64, 64, 3)
y_data shape: (10222, 1)


In [5]:
# Re-cast the labels to uint8 (the loading cell already produces uint8, so
# this leaves the values unchanged and only makes a fresh copy).
labels = np.array(labels, dtype="uint8")

In [6]:
# Report shape, dtype and in-memory size of the feature and label arrays.
for heading, array in (("X Info", data), ("Y Info", labels)):
    print(heading)
    print("-  shape:\t", array.shape)
    print("-  dtype:\t", array.dtype)
    print("- nbytes:\t", array.nbytes)

X Info
-  shape:	 (10222, 64, 64, 3)
-  dtype:	 float16
- nbytes:	 251215872
Y Info
-  shape:	 (10222, 1)
-  dtype:	 uint8
- nbytes:	 10222


### loading and saving with .npz


In [7]:
# File name encodes the first two entries of INPUT_SHAPE, e.g. "..._64x64.npz".
saving_name = f"preprocessed_data_{INPUT_SHAPE[0]}x{INPUT_SHAPE[1]}.npz"

# Write both arrays into one compressed archive under the keys "X" and "Y",
# and report how long the write took.
t = time.time()
np.savez_compressed(f"./Datasets/{saving_name}", X=data, Y=labels)
print(f'Time taken to save compressed data: {time.time()-t}')

Time taken to save compressed data: 23.550722122192383


In [8]:
t = time.time()
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open; arrays are only read when indexed. Use it as a context manager so
# the handle is closed once both arrays have been materialised.
with np.load("./Datasets/"+saving_name) as loaded:
    print(loaded.files)
    loaded_X = loaded["X"]
    loaded_Y = loaded["Y"]
print(f'Time taken to load compressed data: {time.time()-t}')



['X', 'Y']
Time taken to load compressed data: 1.8103985786437988


In [9]:
# Report shape, dtype and in-memory size of the arrays read back from disk.
for heading, array in (("X Info", loaded_X), ("Y Info", loaded_Y)):
    print(heading)
    print("-  shape:\t", array.shape)
    print("-  dtype:\t", array.dtype)
    print("- nbytes:\t", array.nbytes)

X Info
-  shape:	 (10222, 64, 64, 3)
-  dtype:	 float16
- nbytes:	 251215872
Y Info
-  shape:	 (10222, 1)
-  dtype:	 uint8
- nbytes:	 10222


In [10]:
# Round-trip check: the arrays read back must match the in-memory originals.
# Labels: same dtype and identical values.
assert loaded_Y.dtype == labels.dtype
assert np.all(labels == loaded_Y)

# Images: same dtype, equal within floating-point tolerance, and finally
# equal element for element.
assert loaded_X.dtype == data.dtype
assert np.allclose(data, loaded_X)
assert np.all(data == loaded_X)

### loading and saving with h5 format

In [11]:
def generate_h5_data(data,labels,filename):
    """Write images and labels to ``./<filename>.h5`` in the working directory.

    The file is opened in write mode, so an existing file of the same name is
    replaced. Two datasets are created:

    * ``image`` - ``data``, gzip-compressed at level 9
    * ``label`` - ``labels``, stored without compression

    Args:
        data: Array-like of samples.
        labels: Array-like of labels; must have the same length as ``data``.
        filename: Base file name without the ``.h5`` extension.

    Raises:
        AssertionError: If ``filename`` is not a string or the lengths differ.
    """
    assert isinstance(filename, str), "filename must be a str"
    assert len(data) == len(labels), "data and labels must have the same length"

    filepath = f'./{filename}.h5'
    # The context manager closes the file on success and on error. It also
    # avoids referencing an unbound handle when opening the file itself fails,
    # which would otherwise mask the real error.
    with h5py.File(filepath, 'w') as h5data:
        h5data.create_dataset('image', data=data, compression="gzip", compression_opts=9)
        h5data['label'] = labels
        
def load_h5_data(h5filepath):
    """Read the ``image`` and ``label`` datasets from an HDF5 file.

    Both datasets are read fully into memory, and the file is closed before
    returning. The file's keys and the two array shapes are printed as a
    quick sanity check.

    Args:
        h5filepath: Path of the HDF5 file to read.

    Returns:
        Tuple ``(data, labels)`` holding the contents of the ``image`` and
        ``label`` datasets respectively.

    Raises:
        KeyError: If the file has no ``image`` or ``label`` dataset.
    """
    with h5py.File(h5filepath, "r") as h5file:
        # Datasets are looked up by name, so the file may hold any number of
        # additional top-level entries in any order.
        print("The keys are: ", h5file.keys())
        # Slicing with [:] reads the whole dataset into memory.
        data = h5file['image'][:]
        labels = h5file['label'][:]
        print("The shape of x_field",data.shape)
        print("The shape of y_field",labels.shape)
    return data, labels

In [12]:
# Base name only: generate_h5_data appends the ".h5" extension itself.
# Note that this rebinds saving_name, which the .npz cells set to a name
# ending in ".npz".
saving_name = f"preprocessed_data_{INPUT_SHAPE[0]}x{INPUT_SHAPE[1]}"

# Write the HDF5 file and report how long it took.
t = time.time()
generate_h5_data(data=data, labels=labels, filename=saving_name)
print(f'Time taken to save compressed data: {time.time()-t}')

Time taken to save compressed data: 21.737980842590332


In [13]:
# Read both arrays back from the HDF5 file and time the read.
print("loading from h5 file")
t = time.time()
loaded_X, loaded_Y = load_h5_data(f"{saving_name}.h5")
print(f'Time taken to load compressed data: {time.time()-t}')

loading from h5 file
The keys are:  <KeysViewHDF5 ['image', 'label']>
The shape of x_field (10222, 64, 64, 3)
The shape of y_field (10222, 1)
Time taken to load compressed data: 2.6273491382598877


In [14]:
# Report shape, dtype and in-memory size of the arrays read back from disk.
for heading, array in (("X Info", loaded_X), ("Y Info", loaded_Y)):
    print(heading)
    print("-  shape:\t", array.shape)
    print("-  dtype:\t", array.dtype)
    print("- nbytes:\t", array.nbytes)

X Info
-  shape:	 (10222, 64, 64, 3)
-  dtype:	 float16
- nbytes:	 251215872
Y Info
-  shape:	 (10222, 1)
-  dtype:	 uint8
- nbytes:	 10222


In [15]:
# Round-trip check: the arrays read back must match the in-memory originals.
# Labels: same dtype and identical values.
assert loaded_Y.dtype == labels.dtype
assert np.all(labels == loaded_Y)

# Images: same dtype, equal within floating-point tolerance, and finally
# equal element for element.
assert loaded_X.dtype == data.dtype
assert np.allclose(data, loaded_X)
assert np.all(data == loaded_X)