In [1]:
# Upload dogs_vs_cats_config.py from the local machine into the Colab runtime.
# The module is imported as `config` by the dataset-building cell further down.
from google.colab import files
uploaded = files.upload()

Saving dogs_vs_cats_config.py to dogs_vs_cats_config.py


In [3]:
# Upload aspectawarepreprocessor.py from the local machine into the Colab runtime.
# NOTE(review): the `import aspectawarepreprocessor` line in the dataset-building
# cell is commented out and AspectAwarePreprocessor is defined inline in this
# notebook, so this upload looks unused -- confirm before removing it.
from google.colab import files
uploaded = files.upload()

Saving aspectawarepreprocessor.py to aspectawarepreprocessor.py


In [5]:
# Upload hdf5datasetwriter.py from the local machine into the Colab runtime.
# NOTE(review): the `import hdf5datasetwriter` line in the dataset-building
# cell is commented out and HDF5DatasetWriter is defined inline in this
# notebook, so this upload looks unused -- confirm before removing it.
from google.colab import files
uploaded = files.upload()

Saving hdf5datasetwriter.py to hdf5datasetwriter.py


In [7]:
# Mount Google Drive under /content/drive. The training images are listed from
# /content/drive/MyDrive/train below, and the generated HDF5 files are written
# under /content/drive/MyDrive/dataset.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!ls /content/drive/MyDrive/train

cat.0.jpg      cat.11334.jpg  cat.1418.jpg  cat.2752.jpg  cat.4086.jpg
cat.10000.jpg  cat.11335.jpg  cat.1419.jpg  cat.2753.jpg  cat.4087.jpg
cat.10001.jpg  cat.11336.jpg  cat.141.jpg   cat.2754.jpg  cat.4088.jpg
cat.10002.jpg  cat.11337.jpg  cat.1420.jpg  cat.2755.jpg  cat.4089.jpg
cat.10003.jpg  cat.11338.jpg  cat.1421.jpg  cat.2756.jpg  cat.408.jpg
cat.10004.jpg  cat.11339.jpg  cat.1422.jpg  cat.2757.jpg  cat.4090.jpg
cat.10005.jpg  cat.1133.jpg   cat.1423.jpg  cat.2758.jpg  cat.4091.jpg
cat.10006.jpg  cat.11340.jpg  cat.1424.jpg  cat.2759.jpg  cat.4092.jpg
cat.10007.jpg  cat.11341.jpg  cat.1425.jpg  cat.275.jpg   cat.4093.jpg
cat.10008.jpg  cat.11342.jpg  cat.1426.jpg  cat.2760.jpg  cat.4094.jpg
cat.10009.jpg  cat.11343.jpg  cat.1427.jpg  cat.2761.jpg  cat.4095.jpg
cat.1000.jpg   cat.11344.jpg  cat.1428.jpg  cat.2762.jpg  cat.4096.jpg
cat.10010.jpg  cat.11345.jpg  cat.1429.jpg  cat.2763.jpg  cat.4097.jpg
cat.10011.jpg  cat.11346.jpg  cat.142.jpg   cat.2764.jpg  cat.4098.jpg
cat.100

In [12]:
# import packages
import imutils
import cv2

class AspectAwarePreprocessor:
    """Resize images to a fixed size without distorting their aspect ratio.

    An image is first scaled (aspect ratio preserved) so that it covers the
    target ``width`` x ``height`` box, then center-cropped to that box, and
    finally resized to exactly ``(width, height)`` to absorb any rounding.
    """

    def __init__(self, width, height, inter = cv2.INTER_AREA):
        """Store the target size and the interpolation method.

        Args:
            width: target output width in pixels.
            height: target output height in pixels.
            inter: OpenCV interpolation flag passed to every resize call.
        """
        # store the target image width, height, and interpolation
        # method used when resizing
        self.width = width
        self.height = height
        self.inter = inter

    def preprocess(self, image):
        """Return ``image`` scaled and center-cropped to the target size.

        Args:
            image: array whose first two ``shape`` entries are
                ``(height, width)``.

        Returns:
            The result of ``cv2.resize`` to ``(self.width, self.height)``
            applied to the center crop.
        """
        # grab the dimensions of the image and then initialize
        # the deltas to use when cropping
        (h, w) = image.shape[:2]
        dW = 0
        dH = 0

        # if the image is relatively narrower than the target box, resize
        # along the width so the height is the dimension left over to crop.
        # Aspect ratios are compared by cross-multiplication; for a square
        # target this is exactly the `w < h` test. Comparing only `w < h`
        # is wrong for non-square targets: the resized image can end up
        # smaller than the target along the cropped axis, which makes the
        # delta negative and the slice below keep only a trailing strip.
        if w * self.height < h * self.width:
            image = imutils.resize(image, width = self.width, inter = self.inter)
            dH = max(0, int((image.shape[0] - self.height) / 2.0))

        # otherwise the image is relatively wider than (or matches) the
        # target box, so resize along the height and crop along the width
        else:
            image = imutils.resize(image, height = self.height, inter = self.inter)
            dW = max(0, int((image.shape[1] - self.width) / 2.0))

        # now that our image has been resized, we need to
        # re-grab the width and height, followed by performing
        # the center crop (deltas are clamped at 0 above, so the
        # slice bounds can never wrap around)
        (h, w) = image.shape[:2]
        image = image[dH : h - dH, dW : w - dW]

        # finally, resize the image to the provided spatial
        # dimensions to ensure our output image is always a fixed size
        # (the crop can be off by a pixel because of integer rounding)
        return cv2.resize(image, (self.width, self.height), interpolation = self.inter)
        

In [13]:
# import packages
import h5py
import os

class HDF5DatasetWriter:
    """Buffered writer that stores rows and their labels in an HDF5 file.

    Rows and labels are accumulated in an in-memory buffer and written to
    the file in slices once the buffer holds at least ``bufSize`` rows.
    The writer can be used as a context manager, in which case the
    remaining buffer is flushed and the file is closed on exit:

        with HDF5DatasetWriter(dims, outputPath) as writer:
            writer.add(rows, labels)
    """

    # class-level default so close() can always consult the flag
    _closed = False

    def __init__(self, dims, outputPath, dataKey = "images", bufSize = 100):
        """Create the output file and its datasets.

        Args:
            dims: shape of the data dataset; ``dims[0]`` is also used as
                the length of the ``"labels"`` dataset.
            outputPath: path of the HDF5 file to create.
            dataKey: name of the dataset that holds the rows.
            bufSize: number of buffered rows that triggers a flush.

        Raises:
            ValueError: if ``outputPath`` already exists.
        """
        # check to see if the output path exists, and if so, raise an exception
        if os.path.exists(outputPath):
            raise ValueError("The supplied 'outputPath' already "
                "exists and cannot be overwritten. Manually delete "
                "the file before continuing", outputPath)

        # open the HDF5 database for writing and create two datasets:
        # one to store images/features and another to store the class labels
        self.db = h5py.File(outputPath, "w")
        self.data = self.db.create_dataset(dataKey, dims, dtype = "float")
        self.labels = self.db.create_dataset("labels", (dims[0],), dtype = "int")

        # store the buffer size, then initialize the buffer itself
        # along with the index into the datasets
        self.bufSize = bufSize
        self.buffer = {"data": [], "labels": []}
        self.idx = 0
        self._closed = False

    def __enter__(self):
        """Return the writer itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Flush and close the file; never suppresses an exception."""
        self.close()
        return False

    def add(self, rows, labels):
        """Buffer ``rows`` and ``labels``, flushing once the buffer is full."""
        # add the rows and labels to the buffer
        self.buffer["data"].extend(rows)
        self.buffer["labels"].extend(labels)

        # check to see if the buffer needs to be flushed to disk
        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    def flush(self):
        """Write the buffered rows/labels at the current index and reset."""
        # nothing buffered: avoid a pointless zero-length slice write
        if len(self.buffer["data"]) == 0:
            return

        # write the buffers to disk then reset the buffer
        i = self.idx + len(self.buffer["data"])
        self.data[self.idx : i] = self.buffer["data"]
        self.labels[self.idx : i] = self.buffer["labels"]
        self.idx = i
        self.buffer = {"data" : [], "labels": []}

    def storeClassLabels(self, classLabels):
        """Store the class label names in a ``"label_names"`` dataset."""
        # create a dataset to store the actual class label names
        # then store the class labels
        dt = h5py.special_dtype(vlen = str)
        labelSet = self.db.create_dataset("label_names", (len(classLabels),), dtype = dt)
        labelSet[:] = classLabels

    def close(self):
        """Flush any buffered entries and close the file.

        Safe to call more than once. The file is closed even when the
        final flush raises, so the handle is never leaked.
        """
        if self._closed:
            return

        try:
            # check to see if there are any other entries in the buffer
            # that need to be flushed to disk
            if len(self.buffer["data"]) > 0:
                self.flush()
        finally:
            # close the dataset
            self.db.close()
            self._closed = True

In [14]:
# import packages
import dogs_vs_cats_config as config
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# import aspectawarepreprocessor
# import hdf5datasetwriter
from imutils import paths
import numpy as np
import progressbar
import json
import cv2
import os

# grab the paths to the images, take the class name from each file name
# (the text before the first "." in the base name), and encode the class
# names as integer labels
trainPaths = list(paths.list_images(config.IMAGES_PATH))
trainLabels = [p.split(os.path.sep)[-1].split(".")[0] for p in trainPaths]
le = LabelEncoder()
trainLabels = le.fit_transform(trainLabels)

# perform stratified sampling from the training set to build
# the testing split from the training data
split = train_test_split(trainPaths, trainLabels,
    test_size = config.NUM_TEST_IMAGES, stratify = trainLabels,
    random_state = 42)
(trainPaths, testPaths, trainLabels, testLabels) = split

# perform another stratified sampling, this time to build the validation dataset
split = train_test_split(trainPaths, trainLabels,
    test_size = config.NUM_VAL_IMAGES, stratify = trainLabels,
    random_state = 42)
(trainPaths, valPaths, trainLabels, valLabels) = split

# construct a list pairing the training, validation, and testing image paths
# along with their corresponding labels and output HDF5 files
datasets = [
    ("train", trainPaths, trainLabels, config.TRAIN_HDF5),
    ("val", valPaths, valLabels, config.VAL_HDF5),
    ("test", testPaths, testLabels, config.TEST_HDF5)
]

# initialize the image preprocessor and the lists of RGB channel averages
aap = AspectAwarePreprocessor(256, 256)
(R, G, B) = ([], [], [])

# loop over the dataset tuples
# (the loop variable is named `imagePaths` so it does not shadow the
# `paths` module imported from imutils)
for (dType, imagePaths, labels, outputPath) in datasets:
    # create HDF5 writer
    print("[INFO] building {}...".format(outputPath))
    writer = HDF5DatasetWriter((len(imagePaths), 256, 256, 3), outputPath)

    # make sure the HDF5 file is closed even if an image fails to process
    try:
        # initialize the progress bar
        widgets = ["Building Dataset: ", progressbar.Percentage(), " ",
            progressbar.Bar(), " ", progressbar.ETA()]
        pbar = progressbar.ProgressBar(maxval = len(imagePaths), widgets = widgets).start()

        # loop over the image paths
        for (i, (path, label)) in enumerate(zip(imagePaths, labels)):
            # load the image; cv2.imread signals an unreadable file by
            # returning None, so fail here with the offending path instead
            # of with an opaque error inside the preprocessor
            image = cv2.imread(path)
            if image is None:
                raise IOError("Unable to read image: {}".format(path))
            image = aap.preprocess(image)

            # if we are building the training dataset, then compute the mean of
            # each channel in the image, then update the respective lists
            # (cv2.imread yields channels in B, G, R order)
            if dType == "train":
                (b, g, r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)

            # add the image and label # to the HDF5 dataset
            writer.add([image], [label])
            pbar.update(i)

        pbar.finish()
    finally:
        # close the HDF5 writer (this also flushes any buffered rows)
        writer.close()

# construct a dictionary of averages, then serialize the means to a JSON file
print("[INFO] serializing means...")
D = {"R": np.mean(R), "G": np.mean(G), "B": np.mean(B)}
with open(config.DATASET_MEAN, "w") as f:
    f.write(json.dumps(D))

                                                                               Building Dataset: N/A% |                                       | ETA:  --:--:--

[INFO] building /content/drive/MyDrive/dataset/train.hdf5...


Building Dataset: 100% |#######################################| Time:  0:10:03
Building Dataset: N/A% |                                       | ETA:  --:--:--

[INFO] building /content/drive/MyDrive/dataset/val.hdf5...


Building Dataset: 100% |#######################################| Time:  0:10:24
Building Dataset: N/A% |                                       | ETA:  --:--:--

[INFO] building /content/drive/MyDrive/dataset/test.hdf5...


Building Dataset: 100% |#######################################| Time:  0:10:38


[INFO] serializing means...
