In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from PIL import Image
import glob
import os
import pandas as pd
import multiprocessing 
from tensorflow.keras.preprocessing import image
import numpy as np
from skimage.filters import threshold_otsu
from skimage.filters import threshold_local
from skimage.filters import gaussian
from skimage.transform import resize
from skimage.filters import sobel, roberts
from skimage import io, color
from skimage.restoration import denoise_bilateral
from skimage.segmentation import slic
from skimage.color import label2rgb
from skimage.feature import canny
from skimage.segmentation import slic
from skimage.color import label2rgb
from skimage import data
import scipy.ndimage as ndi
import gc

In [2]:
def get_file_names(s):
    # retrieves all the filenames in a list of strings
    path = './image_data/PetImages/{}'.format(s)
    vals = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            if os.path.getsize(path + '/'+ filename) == 0:
                continue
            vals.append(filename)
    return sorted(vals)

In [3]:
def display_img(s, filename):
    # Will display any image with the string and filepath provided
    path = './image_data/PetImages/{}/{}'.format(s, filename)
    plt.imshow(mpimg.imread(path))
    plt.show()

In [4]:
cat_filenames = get_file_names('cat')
dogtrain_filenames = get_file_names('dogs_train')
dogtest_filenames = get_file_names('dogs_test')

In [5]:
cat_filenames = cat_filenames[1: len(cat_filenames)]
cat_filenames = cat_filenames[:len(cat_filenames) - 1]

In [6]:
# Passing the name cat and the name of the file will display thte image
display_img('cat', cat_filenames[1])

IndexError: list index out of range

Every image has either a cat or a dog in them. There are different variations of cats and dogs. Different angles, lighting, and other different features.

In [None]:
print('The number of cat images are: {}'.format(len(cat_filenames)))

In [None]:
def display_image_np(np_array):
    # The functiton takes in an np_array to display the image
    # This will display the image in grayscale
    plt.imshow(np_array, vmin=0, vmax=255, cmap='Greys_r')
    plt.axis('off')
    plt.grid(True)
    plt.show()
    plt.show()

In [None]:
def get_cat_filepath(img_name):
    # Returns the filepath of a given string
    return './image_data/PetImages/cat/{}'.format(img_name)

In [None]:
def get_dog_train_filepath(img_name):
    # Returns the filepath of a given string
    return './image_data/PetImages/dogs_train/{}'.format(img_name)

In [None]:
def get_dog_test_filepath(img_name):
    # Returns the filepath of a given string
    return './image_data/PetImages/dogs_test/{}'.format(img_name)

In [None]:
# First thing to check is to see how the images' pixels average values look
def tonp(func, list_of_images, size=(500, 500)):
    # for img in list_of_images:
    path = func(list_of_images)
    # Transforming all the images to size 400x400
    current_img = image.load_img(path, target_size=size, color_mode='grayscale')
    # makes a matrix
    img_ts = image.img_to_array(current_img)
    # converts to a vector
    img_ts = [img_ts.ravel()]
    current_img.close()
    try:
        # Brings all the new vectors into one giant array
        full_mat = np.concatenate((full_mat, img_ts))
    except UnboundLocalError:
        full_mat = img_ts
    return full_mat

In [None]:
def get_mean_img(data):
    
    # Creates a multiprocessing pool to speed up iteration
    a_pool = multiprocessing.Pool()
    # returns a list of numpy array with the values of pixels for each image
    result = a_pool.map(tonp, data)
    # Returns the mean of each image in a (1, 90,000) vector
    res = np.mean(result, axis=0)
    # reshape the image back into a matrix
    mean_img = np.array(res).reshape((500,500))
    # Displays what the average image looks like
    display_image(mean_img)
    return mean_img

# Transformations made to the data

Grayscaling was done to the data 

In [None]:
def turn_gray(np_matrix):
    # Converts the np_matrix into a grayscale image
    return color.rgb2gray(np_matrix)

In [None]:
def wrapper_to_grayscale(args):
    return to_grayscale(*args)

In [None]:
def to_grayscale(func, data):
    # takes an image string name and then converts it into an np vector
    # Takes in the string name and a function argument
    sample_cat = tonp(func, data) 
    # takes the sample cat and then reshapes it into a 500x500 matrix
    grayscale_cat = turn_gray(sample_cat[0].reshape((500,500)))
    # Displays the iamges using the display_image fucntion
    display_image_np(grayscale_cat)
    return grayscale_cat

In [None]:
def get_transformed_image(np_array):
    im = np_array.reshape(500, 500).astype('uint')
    hist = ndi.histogram(im, min=0, max=255, bins=256)
    # Create a cumulative distribution function
    cdf = hist.cumsum() / hist.sum()
    im_equalized = cdf[im] * 255
#     This will provide the graphs  
#     tr = get_transformed_image(im_equalized)
#     fig = plt.figure(figsize=(10, 10))
#     ax1 = fig.add_subplot(221, xlabel='# Of pixels', ylabel='# Of pixels', title='Transformed Image')
#     ax1.imshow(dogtest_transformed_images[1])
#     ax2 = fig.add_subplot(222,  sharex=ax1, sharey=ax1, xlabel='# Of pixels', title='Original Image')
#     ax2.imshow(tr)
#     plt.show()
    # Transforming the image by removing the noise and preserving the edges
#     denoised_image = ndi.gaussian_filter(im_equalized, sigma=3)
    denoised_image = denoise_bilateral(im_equalized, multichannel=False)
    del im_equalized
    gc.collect()
    # Sobel is used here to be able to detect the edges of the image
    sob = sobel(denoised_image)
    return sob

In [None]:
cat_iterators = []
for cat in cat_filenames:
    cat_iterators.append((get_cat_filepath, cat))

In [None]:
dogtrain_iterators = []
for dog in dogtrain_filenames:
    dogtrain_iterators.append((get_dog_train_filepath, dog))
dogtest_iterators = []
for dog in dogtest_filenames:
    dogtest_iterators.append((get_dog_test_filepath, dog))

In [None]:
# multiprocessing to speed up the program
a_mult = multiprocessing.Pool()
# maps each iterated value onto the function to_grayscale
cat_img = a_mult.map(wrapper_to_grayscale, cat_iterators[:100])
dog_train = a_mult.map(wrapper_to_grayscale, dogtrain_iterators[:100])
dog_test = a_mult.map(wrapper_to_grayscale, dogtest_iterators[:100])

In [None]:
gc.collect()
cat_transformed_images = a_mult.map(get_transformed_image, cat_img)

In [None]:
del cat_img
gc.collect()

In [None]:
gc.collect()
dogtrain_transformed_images = a_mult.map(get_transformed_image, dog_train)

In [None]:
del dog_train
gc.collect()

In [None]:
gc.collect()
dogtest_transformed_images = a_mult.map(get_transformed_image, dog_test)

In [None]:
del dog_test
gc.collect()

In [None]:
img_num = 1
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(221, xlabel='# Of pixels', ylabel='# Of pixels', title='Transformed Image')
ax1.imshow(dogtest_transformed_images[img_num])
ax2 = fig.add_subplot(222,  sharex=ax1, sharey=ax1, xlabel='# Of pixels', title='Original Image')
ax2.imshow(mpimg.imread(get_dog_test_filepath(dogtest_filenames[img_num])))
plt.show()

#### Exploratory Data Analysis

In [None]:
# The EDA can begin by checking the variance of the images
cat_var_all = ndi.variance(cat_transformed_images[0])
cat_var_all

In [None]:
dogtrain_var_all = ndi.variance(dogtrain_transformed_images[0])
dogtrain_var_all

In [None]:
dogtest_var_all = ndi.variance(dogtest_transformed_images[0])
dogtest_var_all

In [None]:
# do stats on labels
len_cat_images = len(cat_transformed_images)
len_dog_images = len(dogtrain_transformed_images) + len(dogtest_transformed_images)

In [None]:
sns.barplot(x=['Cat', 'Dog'], y=[len_cat_images, len_dog_images])
plt.xlabel('Categories')
plt.ylabel('Amount Of Images')
plt.title('The number of image data per category')
plt.show()

PCA for the images

In [None]:
cat_vector = []
for i in range(len(cat_transformed_images)):
    cat_vector.append(cat_transformed_images[i].ravel())
cat_vector

In [None]:
dogtrain_transformed_images[0].ravel()

In [None]:
dogtrain_vector = []
for i in range(len(dogtrain_transformed_images)):
    dogtrain_vector.append(dogtrain_transformed_images[i].ravel())
dogtrain_vector

In [None]:
dogtest_vector = []
for i in range(len(dogtest_transformed_images)):
    dogtest_vector.append(dogtest_transformed_images[i].ravel())
dogtest_vector

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(cat_vector)
cat_pca = pca.transform(cat_vector[:100])

In [None]:
X_cat_pca = list(map(lambda x: x[0], cat_pca))
y_cat_pca = list(map(lambda x: x[1], cat_pca))

In [None]:
dogtrain_pca = pca.transform(dogtrain_vector[:100])

In [None]:
X_dogtrain_pca = list(map(lambda x: x[0], dogtrain_pca))
y_dogtrain_pca = list(map(lambda x: x[1], dogtrain_pca))

In [None]:
dogtest_pca = pca.transform(dogtest_vector)

In [None]:
X_dogtest_pca = list(map(lambda x: x[0], dogtest_pca))
y_dogtest_pca = list(map(lambda x: x[1], dogtest_pca))

In [None]:
cat_df = {'X': X_cat_pca, 'y': y_cat_pca, 'Category': ['Cat'] * 100}
cat_df = pd.DataFrame(cat_df)
dogtrain_df = {'X': X_dogtrain_pca, 'y': y_dogtrain_pca, 'Category': ['Dog'] * 100}
dogtrain_df = pd.DataFrame(dogtrain_df)
dogtest_df = {'X': X_dogtest_pca, 'y': y_dogtest_pca, 'Category': ['Dog'] * 100}
dogtest_df = pd.DataFrame(dogtest_df)

In [None]:
final_df = cat_df.append([dogtrain_df, dogtest_df])
final_df.tail()

In [None]:
plt.figure(figsize=(10, 10))
sns.scatterplot(x='X', y='y', hue='Category', data=final_df)
plt.xlabel('First Component')
plt.ylabel('Second Component')
plt.title('Sample Data of 100 Cats and Dogs Images')
plt.plot()
plt.show()

The images appear to be cluttered together. Some overlaps and some values of dogs being spread across the map.