In [None]:
import numpy as np
import pandas as pd
import glob
import h5py
import os
import cv2
import math
import time
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical

from tqdm import tqdm

# Data type evaluation
* [numpy.iinfo()](https://numpy.org/doc/stable/reference/generated/numpy.iinfo.html)
* [numpy.finfo()](https://numpy.org/doc/stable/reference/generated/numpy.finfo.html)

In [None]:
# Inspect the storage size and value range of the unsigned 8-bit integer type.
intInfo = np.iinfo(np.uint8)
# This report describes an integer type, so the header must not say "float".
print("int info")
print("- bits\t\t", f"{intInfo.bits} bits")
print("- range\t\t",f"{intInfo.min} to {intInfo.max}")

In [None]:
# Inspect the storage size, value range and decimal precision of half-precision floats.
floatInfo = np.finfo(np.float16)
print("float info")
# Each row is printed as "<caption> <detail>", exactly like separate print calls would.
for caption, detail in (
    ("- bits\t\t", f"{floatInfo.bits} bits"),
    ("- range\t\t", f"{floatInfo.min} to {floatInfo.max}"),
    ("- precision\t", f"{floatInfo.precision} decimal digits "),
):
    print(caption, detail)

# Loading data into NumPy arrays

In [None]:
# Load the image-id -> breed table from the csv file.
df = pd.read_csv("Datasets/labels.csv")

# Create a dictionary of unique breeds with their respective integer ids.
# The breed list is sorted, as required by evaluation on the test set.
breeds = sorted(df.breed.unique())
dict_breeds = dict(zip(breeds, range(len(breeds))))

# Add the integer class id as a new column.
# Series.map does one dictionary lookup per row and returns an integer column
# directly, instead of copying the string column and relying on
# DataFrame.replace to swap every value and down-cast the object column.
df["breed_id"] = df.breed.map(dict_breeds)

print("Shape\t\t: {}".format(df.shape))
print("Unique labels\t: {}".format(len(breeds)))

In [None]:
# Display the breed -> id mapping to check that the ids follow the sorted breed order
dict_breeds

In [None]:
# Preview the first rows to confirm that the breed_id column was added and that
# its values agree with the alphabetical ordering of the breed names
df.head()

In [None]:
# Shape every image is resized to; the loading cells pass the first two entries
# to load_img as target_size.
INPUT_SHAPE = (224,224,3)

In [None]:
data = []
labels = []

# collect all files from directory into a list
image_files_train = [f for f in glob.glob("Datasets/train" + "/**/*", recursive=True) if not os.path.isdir(f)]
print("{} files found!".format(len(image_files_train)))

# Build the image-id -> breed_id lookup once, instead of scanning the whole
# DataFrame for every image. drop_duplicates keeps the first row per id, which
# is the row a per-image `.iloc[0]` lookup would have selected.
breed_id_by_image = df.drop_duplicates(subset="id").set_index("id")["breed_id"].to_dict()

# create ground-truth label from the image path
print("loading images")
t = time.time()
for img in tqdm(image_files_train):
    img_file = os.path.basename(img)
    name = img_file.split(".")[0]

    # skip image files that have no record in the given labels
    if name not in breed_id_by_image:
        print("LABEL NOT FOUND: {}".format(name))
        continue

    # read the image, resized to the configured input resolution
    image_ = load_img(img, target_size=(INPUT_SHAPE[0], INPUT_SHAPE[1]))
    image = np.asarray(image_)
    data.append(image)

    # record the respective unique breed id
    labels.append([breed_id_by_image[name]])

# One-hot encode the labels. num_classes is passed explicitly so the encoding
# always has one column per known breed, even when the highest breed id does
# not occur among the loaded images (or when no image was loaded at all).
labels = to_categorical(labels, num_classes=len(breeds))
print(f'Time taken to load images: {time.time()-t}')

# pack into compact arrays; normalisation is intentionally deferred
print("pre-processing")
t = time.time()
# convert data type to uint8 and do preprocessing later
data = np.array(data, dtype=np.uint8)
labels = np.array(labels, dtype=np.uint8)
print(f'Time taken to pre-processing: {time.time()-t}')

print('x_data shape:', data.shape)
print('y_data shape:', labels.shape)

In [None]:
# Spot-check the pixel values of one loaded training image (index 50)
data[50]

In [None]:
# Spot-check the one-hot label of the same training sample (index 50)
labels[50]

In [None]:
def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        ``"0B"`` for zero; otherwise ``"<value> <unit>"`` where the value is
        rounded to two decimals, e.g. ``convert_size(1536) == "1.5 KB"``.
        Values below one byte are reported in ``B`` and values beyond the
        largest known unit are reported in ``YB``.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got {}".format(size_bytes))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    value = float(size_bytes)
    i = 0
    # Repeated division by 1024 is exact for floats (power of two) and avoids
    # the rounding pitfalls of floor(log(x, 1024)) at unit boundaries. The
    # index is clamped so it can neither go negative nor run off the tuple.
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    return "%s %s" % (round(value, 2), size_name[i])

def nparray_info(header, nparr):
    """Print a short report about an array under the given header.

    Args:
        header: Title line printed before the report.
        nparr: Object exposing ``shape``, ``dtype`` and ``nbytes`` attributes.

    The report lists the shape, the dtype and the size in bytes, the latter
    both as a raw number and formatted by ``convert_size``.
    """
    print(header)

    dims = nparr.shape
    print("-  shape\t", dims)

    element_type = nparr.dtype
    print("-  dtype\t", element_type)

    total_bytes = nparr.nbytes
    readable = convert_size(total_bytes)
    print("- nbytes\t", f"{total_bytes} ({readable})")
    

In [None]:
# print out shape, dtype and data size of the in-memory training images and labels
nparray_info("Images", data)
print()
nparray_info("Labels", labels)

### loading and saving with .npz
* [np.save()](https://numpy.org/doc/stable/reference/generated/numpy.save.html)
* [np.savez()](https://numpy.org/doc/stable/reference/generated/numpy.savez.html)
* [np.savez_compressed()](https://numpy.org/doc/stable/reference/generated/numpy.savez_compressed.html)
* [np.lib.format](https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html)





In [None]:
# Destination of the compressed training archive; the file name embeds the
# first two entries of INPUT_SHAPE.
saving_path = "./Datasets/preprocessed_data_u{}x{}.npz".format(*INPUT_SHAPE[:2])

In [None]:
# Write the training images and labels into a single compressed .npz archive
# and report how long the write takes.
print("Saving to npz file")
# ensure directory is created before save data file
os.makedirs(os.path.dirname(saving_path), exist_ok=True)
t = time.time()
# arrays are stored under the keys "X" (images) and "Y" (one-hot labels)
np.savez_compressed(saving_path, X=data, Y=labels)
print(f'Time taken to save compressed data: {time.time()-t}')

In [None]:
# test the loading of npz file
print("Loading from npz file")
t = time.time()
# np.load on an .npz archive returns an NpzFile that keeps the file open.
# The with-block closes it as soon as both arrays have been read into memory.
with np.load(saving_path) as loaded:
    print(loaded.files)
    loaded_X = loaded["X"]
    loaded_Y = loaded["Y"]
print(f'Time taken to load compressed data: {time.time()-t}')

In [None]:
# print out shape, dtype and data size of the arrays read back from the .npz file
nparray_info("Images (npz)", loaded_X)
print()
nparray_info("Labels (npz)", loaded_Y)

In [None]:
# ensure correctness of data: the round trip through the .npz file must
# preserve dtype, shape and every single value
assert labels.dtype == loaded_Y.dtype
assert np.array_equal(labels, loaded_Y)

assert data.dtype == loaded_X.dtype
# np.array_equal checks the shape as well as the values. An additional
# np.allclose check would be implied by exact equality and would only allocate
# further full-size temporary arrays.
assert np.array_equal(data, loaded_X)

In [None]:
# Spot-check the reloaded one-hot label of sample 50
loaded_Y[50]

In [None]:
# check the data: show the first N reloaded images titled with their breed name
N = 20

plt.figure(figsize=(20,20))
for i in range(N):
    # 5 images per row; ceil keeps the grid large enough when N is not a multiple of 5
    plt.subplot(math.ceil(N / 5), 5, i + 1)
    # the position of the 1 in the one-hot label is the breed id
    plt.title(breeds[np.argmax(loaded_Y[i])])
    # The images are stored as uint8 in 0..255 and are shown as they are.
    # Casting them to float would make imshow clip the values to the [0, 1]
    # float range and render almost every pixel white.
    plt.imshow(loaded_X[i])

## Repeat the same steps for the test dataset, which has no labels

In [None]:
# Resolution used when loading the test images; alternative resolutions are kept
# below for reference.
# NOTE(review): presumably this should match the resolution used for the training set - confirm.
#INPUT_SHAPE = (128,128,3)
#INPUT_SHAPE = (192,192,3)
INPUT_SHAPE = (224,224,3)

In [None]:
data_test = []

# collect all files from directory into a list
# NOTE(review): glob returns files in arbitrary order and the file names are not
# stored with the pixel data - confirm that downstream code can still match
# predictions to image ids.
image_files_test = [f for f in glob.glob("Datasets/test" + "/**/*", recursive=True) if not os.path.isdir(f)]
print("{} files found!".format(len(image_files_test)))

# the test set has no labels, so only the pixel data is collected
print("loading images")
t = time.time()
for img in tqdm(image_files_test):
    image_ = load_img(img,target_size=(INPUT_SHAPE[0], INPUT_SHAPE[1]))
    image = np.asarray(image_)
    data_test.append(image)

print(f'Time taken to load images: {time.time()-t}')

# pre-processing (normalisation)
print("pre-processing")
t = time.time()
# scale the pixel values by 1/255 as float16; dividing in place avoids
# allocating a second full-size copy of the array
data_test = np.array(data_test, dtype=np.float16)
data_test /= 255.0
print(f'Time taken to pre-processing: {time.time()-t}')

print('x_data shape:', data_test.shape)

In [None]:
# Spot-check the scaled pixel values of one loaded test image (index 50)
data_test[50]

In [None]:
# print out shape, dtype and data size of the in-memory test images
nparray_info("Images", data_test)

### loading and saving with .npz
* [np.save()](https://numpy.org/doc/stable/reference/generated/numpy.save.html)
* [np.savez()](https://numpy.org/doc/stable/reference/generated/numpy.savez.html)
* [np.savez_compressed()](https://numpy.org/doc/stable/reference/generated/numpy.savez_compressed.html)
* [np.lib.format](https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html)

In [None]:
# saving: the archive name embeds the first two entries of INPUT_SHAPE
saving_path = "./Datasets/preprocessed_data_test_{}x{}.npz".format(INPUT_SHAPE[0], INPUT_SHAPE[1])

print("Saving to npz file")
# ensure directory is created before save data file
os.makedirs(os.path.dirname(saving_path), exist_ok=True)
t = time.time()
# the test images are stored under the single key "X"
np.savez_compressed(saving_path, X=data_test)
print(f'Time taken to save compressed data: {time.time()-t}')

In [None]:
# test the loading of npz file
print("Loading from npz file")
t = time.time()
# np.load on an .npz archive returns an NpzFile that keeps the file open.
# The with-block closes it as soon as the array has been read into memory.
with np.load(saving_path) as loaded:
    print(loaded.files)
    loaded_X_test = loaded["X"]
print(f'Time taken to load compressed data: {time.time()-t}')

In [None]:
# print out shape, dtype and data size of the test images read back from the .npz file
nparray_info("Images (npz)", loaded_X_test)

In [None]:
# ensure correctness of data: the round trip through the .npz file must
# preserve dtype, shape and every single value
assert data_test.dtype == loaded_X_test.dtype
# np.array_equal checks the shape as well as the values. An additional
# np.allclose check would be implied by exact equality and would only allocate
# further full-size temporary arrays.
assert np.array_equal(data_test, loaded_X_test)

In [None]:
# Spot-check the first pixel row of reloaded test image 50
loaded_X_test[50][0]

In [None]:
# check the data: show the first N reloaded test images
N = 20

plt.figure(figsize=(20,20))
for i in range(N):
    # 5 images per row; ceil keeps the grid large enough when N is not a multiple of 5
    plt.subplot(math.ceil(N / 5), 5, i + 1)
    # The test images were saved as float16; they are converted to float32 for
    # display (presumably because imshow does not accept float16 - confirm).
    plt.imshow(loaded_X_test[i].astype('float32'))