In [1]:
from tensorflow.keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
from keras.utils import to_categorical
import numpy as np
import os
import multiprocessing
from keras.datasets import mnist
import matplotlib.pyplot as plt

In [2]:
def get_file_names(s):
    # retrieves all the filenames in a list of strings
    path = './image_data/PetImages/{}'.format(s)
    vals = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            if os.path.getsize(path + '/'+ filename) == 0:
                continue
            vals.append(filename)
    return sorted(vals)

In [3]:
def get_cat_filepath(img_name):
    # Returns the filepath of a given string
    return './image_data/PetImages/cat/{}'.format(img_name)

In [4]:
def get_dog_train_filepath(img_name):
    # Returns the filepath of a given string
    return './image_data/PetImages/dogs_train/{}'.format(img_name)

In [5]:
def get_dog_test_filepath(img_name):
    # Returns the filepath of a given string
    return './image_data/PetImages/dogs_test/{}'.format(img_name)

In [6]:
# First thing to check is to see how the images' pixels average values look
def tonp(list_of_images, size=(200, 200)):
    # for img in list_of_images:
    path = get_cat_filepath(list_of_images)
    # Transforming all the images to size 400x400
    current_img = image.load_img(path, target_size=size, color_mode='grayscale')
    # makes a matrix
    img_ts = image.img_to_array(current_img)
    # converts to a vector
    img_ts = img_ts.ravel()
    current_img.close()
    try:
        # Brings all the new vectors into one giant array
        full_mat = np.concatenate((full_mat, img_ts))
    except UnboundLocalError:
        full_mat = img_ts
    return full_mat

In [7]:
# First thing to check is to see how the images' pixels average values look
def tonp_dog_train(list_of_images, size=(200, 200)):
    # for img in list_of_images:
    path = get_dog_train_filepath(list_of_images)
    # Transforming all the images to size 400x400
    current_img = image.load_img(path, target_size=size, color_mode='grayscale')
    # makes a matrix
    img_ts = image.img_to_array(current_img)
    # converts to a vector
    img_ts = img_ts.ravel()
    current_img.close()
    try:
        # Brings all the new vectors into one giant array
        full_mat = np.concatenate((full_mat, img_ts))
    except UnboundLocalError:
        full_mat = img_ts
    return full_mat

In [8]:
# First thing to check is to see how the images' pixels average values look
def tonp_dog_test(list_of_images, size=(200, 200)):
    # for img in list_of_images:
    path = get_dog_test_filepath(list_of_images)
    # Transforming all the images to size 400x400
    current_img = image.load_img(path, target_size=size, color_mode='grayscale')
    # makes a matrix
    img_ts = image.img_to_array(current_img)
    # converts to a vector
    img_ts = img_ts.ravel()
    current_img.close()
    try:
        # Brings all the new vectors into one giant array
        full_mat = np.concatenate((full_mat, img_ts))
    except UnboundLocalError:
        full_mat = img_ts
    return full_mat

In [9]:
def display_image_np(np_array):
    # The functiton takes in an np_array to display the image
    # This will display the image in grayscale
    plt.imshow(np_array, vmin=0, vmax=255, cmap='Greys_r')
    plt.axis('off')
    plt.grid(True)
    plt.show()
    plt.show()

In [10]:
def split_data():
    cat_filenames = get_file_names('cat')
    cat_filenames = cat_filenames[1:len(cat_filenames)]
    cat_filenames = cat_filenames[:len(cat_filenames) - 1]
    dog_filenames = get_file_names('dogs_train')
    dog_test_filenames = get_file_names('dogs_test')
    # Testing CNN prediction on raw data
    pool = multiprocessing.Pool()
    # Will get the data in a matrix for the cata data
    raw_data = pool.map(tonp, cat_filenames)
    # Needs to do this for each of the train and test data of the dog images
    dog_train_data = pool.map(tonp_dog_train, dog_filenames)
    dog_test_data = pool.map(tonp_dog_test, dog_test_filenames)
    # Returns the array in array([[]])
    dog_train_data = np.asarray(dog_train_data).reshape(len(dog_train_data), 200, 200, 1)
    dog_test_data = np.asarray(dog_test_data).reshape(len(dog_test_data), 200, 200, 1)
    raw_data = np.asarray(raw_data)
    # Splits the data by 70% for the cat data
    lower_split = int(np.ceil(len(raw_data) * .7))
    X_train = raw_data[:lower_split].reshape(lower_split, 200, 200, 1)
    X_test = raw_data[lower_split:].reshape(len(raw_data) - lower_split, 200, 200 ,1)
    # Creates the corresponding labels for each image
    dog_y_train = np.array([0 for _ in range(len(dog_train_data))])
    y_train = np.array([1 for _ in range(len(X_train))])
    # Does the same for the testing data
    dog_y_test = np.array([0 for _ in range(len(dog_test_data))])
    y_test = np.array([1 for _ in range(len(X_test))])
    # Joins everything together and has everything split into training and testing data
    X_train = np.concatenate((X_train, dog_train_data))
    X_test = np.concatenate((X_test, dog_test_data))
    y_train = np.concatenate((y_train, dog_y_train))
    y_test = np.concatenate((y_test, dog_y_test))
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    return X_train, y_train, X_test, X_test

In [11]:
cat_filenames = get_file_names('cat')
cat_filenames = cat_filenames[1:len(cat_filenames)]
cat_filenames = cat_filenames[:len(cat_filenames) - 1]

In [12]:
dog_filenames = get_file_names('dogs_train')
dog_test_filenames = get_file_names('dogs_test')

In [15]:
# Testing CNN prediction on raw data
pool = multiprocessing.Pool()
raw_data = pool.map(tonp, cat_filenames[:10])

In [16]:
dog_train_data = pool.map(tonp_dog_train, dog_filenames)
dog_test_data = pool.map(tonp_dog_test, dog_test_filenames)

In [17]:
dog_train_data = np.asarray(dog_train_data).reshape(len(dog_train_data), 200, 200, 1)
dog_test_data = np.asarray(dog_test_data).reshape(len(dog_test_data), 200, 200, 1)

In [18]:
raw_data

[array([166., 168., 172., ...,   0.,   0.,   0.], dtype=float32),
 array([101.,  98., 100., ...,  37.,  46.,  47.], dtype=float32),
 array([255., 255., 255., ..., 255., 254., 231.], dtype=float32),
 array([39., 40., 40., ..., 43., 50., 29.], dtype=float32),
 array([ 31.,  22.,  13., ..., 169., 156., 149.], dtype=float32),
 array([223., 223., 223., ..., 210., 213., 213.], dtype=float32),
 array([111., 112., 112., ...,  78.,  78.,  78.], dtype=float32),
 array([ 8.,  8.,  8., ..., 11.,  9., 11.], dtype=float32),
 array([121., 120., 115., ..., 162., 164., 161.], dtype=float32),
 array([60., 60., 62., ..., 32., 21., 23.], dtype=float32)]

In [19]:
raw_data = np.asarray(raw_data)
raw_data

array([[166., 168., 172., ...,   0.,   0.,   0.],
       [101.,  98., 100., ...,  37.,  46.,  47.],
       [255., 255., 255., ..., 255., 254., 231.],
       ...,
       [  8.,   8.,   8., ...,  11.,   9.,  11.],
       [121., 120., 115., ..., 162., 164., 161.],
       [ 60.,  60.,  62., ...,  32.,  21.,  23.]], dtype=float32)

In [20]:
lower_split = int(np.ceil(len(raw_data) * .7))
X_train = raw_data[:lower_split].reshape(lower_split, 200, 200, 1)
X_test = raw_data[lower_split:].reshape(len(raw_data) - lower_split, 200, 200 ,1)

In [21]:
dog_y_train = np.array([0 for _ in range(len(dog_train_data))])
y_train = np.array([1 for _ in range(len(X_train))])

In [22]:
y_train[2] = 0
y_train

array([1, 1, 0, 1, 1, 1, 1])

In [19]:
dog_y_test = np.array([0 for _ in range(len(dog_test_data))])
y_test = np.array([1 for _ in range(len(X_test))])
y_test

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
X_train = np.concatenate((X_train, dog_train_data))
X_test = np.concatenate((X_test, dog_test_data))
y_train = np.concatenate((y_train, dog_y_train))
y_test = np.concatenate((y_test, dog_y_test))

In [26]:
X_train

array([[[[166.],
         [168.],
         [172.],
         ...,
         [206.],
         [204.],
         [202.]],

        [[166.],
         [168.],
         [172.],
         ...,
         [206.],
         [204.],
         [203.]],

        [[166.],
         [168.],
         [172.],
         ...,
         [208.],
         [205.],
         [203.]],

        ...,

        [[124.],
         [125.],
         [126.],
         ...,
         [  2.],
         [  2.],
         [  2.]],

        [[123.],
         [123.],
         [124.],
         ...,
         [  1.],
         [  1.],
         [  1.]],

        [[121.],
         [122.],
         [123.],
         ...,
         [  0.],
         [  0.],
         [  0.]]],


       [[[101.],
         [ 98.],
         [100.],
         ...,
         [126.],
         [126.],
         [126.]],

        [[ 97.],
         [ 97.],
         [ 98.],
         ...,
         [127.],
         [126.],
         [129.]],

        [[ 98.],
         [ 97.],
      

In [21]:
print(X_train.shape, y_train.shape)
X_test.shape, y_test.shape

(12751, 200, 200, 1) (12751,)


((4750, 200, 200, 1), (4750,))

In [22]:
model = Sequential()
# input_shape = (height, width, 1 if it's grayscale)
model.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(200, 200, 1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fa186491050>