# Preprocessing

In [21]:
import os
from tensorflow.keras.preprocessing import image
import numpy as np
import multiprocessing 
import random
import pandas as pd
import multiprocessing
import gc
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [22]:
def get_file_names(s):
    # retrieves all the filenames in a list of strings
    path = './transformed_images/{}'.format(s)
    vals = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            if os.path.getsize(path + '/'+ filename) == 0 or filename == '.DS_Store':
                continue
            vals.append(filename)
    return sorted(vals)

In [23]:
def tonp(func, list_of_images, size=(500, 500)):
    # for img in list_of_images:
    path = func(list_of_images)
    # Transforming all the images to size 400x400
    current_img = image.load_img(path, target_size=size, color_mode='grayscale')
    # makes a matrix
    img_ts = image.img_to_array(current_img)
    # converts to a vector
    img_ts = [img_ts.ravel()]
    current_img.close()
    try:
        # Brings all the new vectors into one giant array
        full_mat = np.concatenate((full_mat, img_ts))
    except UnboundLocalError:
        full_mat = img_ts
    return full_mat

In [24]:
def tonp_wrapper(args):
    return tonp(*args)

In [25]:
def get_cat_filepath(img_name):
    # Returns the filepath of a given string
    return './transformed_images/Cat/{}'.format(img_name)

In [26]:
def get_dog_train_filepath(img_name):
    # Returns the filepath of a given string
    return './transformed_images/DogTrain/{}'.format(img_name)

In [27]:
def get_dog_test_filepath(img_name):
    # Returns the filepath of a given string
    return './transformed_images/DogTest/{}'.format(img_name)

In [28]:
def display_image_np(np_array):
    # The functiton takes in an np_array to display the image
    # This will display the image in grayscale
    plt.imshow(np_array, vmin=0, vmax=255, cmap='Greys_r')
    plt.axis('off')
    plt.grid(True)
    plt.show()
    plt.show()

In [29]:
def set_up_data(cat_filenames, dogtrain_filenames, dogtest_filenames, sample_amount=5000):
    cat_data = []
    dogtrain_data = []
    dogtest_data = []
    # for i in range(len(cat_filenames)):
    for i in range(sample_amount):
        cat_data.append(tonp(get_cat_filepath, cat_filenames[i]))
    # for i in range(len(dogtrain_filenames)):
    for i in range(4000):
        dogtrain_data.append(tonp(get_dog_train_filepath, dogtrain_filenames[i]))
    # for i in range(len(dogtest_filenames)):
    for i in range(1000):
        dogtest_data.append(tonp(get_dog_test_filepath, dogtest_filenames[i]))
    dog_data = np.concatenate((dogtest_data, dogtrain_data))
    del dogtest_data
    del dogtrain_data
    gc.collect()
    sample_cat = random.sample(cat_data, sample_amount)
    cat_label = np.array([1 for _ in range(len(cat_data))])
    dog_label = np.array([0 for _ in range(len(dog_data))])
    all_data_label = np.concatenate((cat_label[:sample_amount], dog_label))
    all_data = np.concatenate((sample_cat, dog_data))
    del sample_cat
    del dog_data
    gc.collect()
    split_limit = int(np.floor(0.7 * len(all_data)))
    random_index = random.sample(range((len(all_data))), split_limit)
    test_idx = set(np.arange(0, len(all_data))) - set(random_index)
    X_train = [all_data[i] for i in random_index]
    y_train = np.asarray([all_data_label[i] for i in random_index])
    X_test = [all_data[i] for i in test_idx]
    y_test = np.asarray([all_data_label[i] for i in test_idx])
    del cat_data
    gc.collect()
    return X_train, y_train, X_test, y_test

In [30]:
cat_filenames = get_file_names('Cat')
dogtrain_filenames = get_file_names('DogTrain')
dogtest_filenames = get_file_names('DogTest')

In [31]:
X_train, y_train, X_test, y_test = set_up_data(cat_filenames, dogtrain_filenames, dogtest_filenames)
num_classes = 2

In [32]:
X_train = np.asarray(X_train).reshape(np.array(X_train).shape[0], 500, 500, 1)
X_test = np.asarray(X_test).reshape(np.array(X_test).shape[0], 500, 500, 1)

In [33]:
temp_y = y_test

In [34]:
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

# Modeling

In [35]:
print(X_train.shape, y_train.shape)
X_test.shape, y_test.shape

(7000, 500, 500, 1) (7000, 2)


((3000, 500, 500, 1), (3000, 2))

In [36]:
# building a linear stack of layers with the sequential model
model = Sequential()
# hidden layer
model.add(Conv2D(25, kernel_size=(3,3), padding='valid',
                 activation='relu', input_shape=(500,500,1)))
model.add(MaxPool2D(pool_size=(1,1)))
# flatten output of conv
model.add(Flatten())
# hidden layer
model.add(Dense(100, activation='relu'))
# output layer
model.add(Dense(2, activation='softmax'))

# compiling the sequential model
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f97b8151390>

In [41]:
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [42]:
y_pred

array([1, 0, 1, ..., 0, 0, 0])

In [43]:
# 73% f1_score
f1_score(temp_y, y_pred)

0.7325

In [20]:
# building a linear stack of layers with the sequential model
model = Sequential()
# hidden layer
model.add(Conv2D(25, kernel_size=(3,3), padding='valid',
                 activation='relu', input_shape=(400,400,1)))
# output layer
model.add(MaxPool2D(pool_size=(1,1)))
# flatten output of conv
model.add(Flatten())
# hidden layer
model.add(Dense(100, activation='relu'))
# output layer
model.add(Dense(2, activation='softmax'))

# compiling the sequential model
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test))

Epoch 1/3


ValueError: in user code:

    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/sequential.py:375 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:425 call
        inputs, training=training, mask=mask)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:560 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:259 assert_input_compatibility
        ' but received input with shape ' + display_shape(x.shape))

    ValueError: Input 0 of layer dense_2 is incompatible with the layer: expected axis -1 of input shape to have value 6200100 but received input with shape (None, 3960100)


In [17]:
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [18]:
y_pred

array([0, 0, 0, ..., 1, 1, 0])

In [19]:
f1_score(temp_y, y_pred)

0.6956842493544817

In [None]:
# building a linear stack of layers with the sequential model
model = Sequential()
# hidden layer
model.add(Conv2D(25, kernel_size=(3,3), padding='valid',
                 activation='relu', input_shape=(300,300,1)))
# output layer
model.add(MaxPool2D(pool_size=(1,1)))
# flatten output of conv
model.add(Flatten())
# hidden layer
model.add(Dense(100, activation='relu'))
# output layer
model.add(Dense(2, activation='softmax'))

# compiling the sequential model
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test))

In [None]:
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [None]:
y_pred

In [None]:
# Had a score of 71%
f1_score(temp_y, y_pred)

In [None]:
model = Sequential()
# input_shape = (height, width, 1 if it's grayscale)
model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=(300,300,1), padding='same'))
model.add(MaxPool2D())
model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dense(64, activation='sigmoid'))
model.add(Dense(2))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3)

In [None]:
y_pred = model.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
f1_score(y_pred, y_test)

0.51% accuracy for the first model.

In [51]:
import pandas as pd
index = ['Filters', 'kernel_size', 'padding', 'activation', 'input_shape', 'num_layers', 'f1_measure']
df = {'Model_1': [25, (3,3), 'valid', 'relu', (500, 500, 1), 3, 0.7325],
     'Model_2': [25, (3,3), 'valid', 'relu', (400, 400, 1), 3, 0.69],
     'Model_3': [25, (3,3), 'valid', 'relu', (300, 300, 1), 3, 0.71],
     'MOdel_4': [32, (3,3), None, 'relu', (300, 300, 1), 4, 0.51]}
pd.DataFrame(df, index=index)

Unnamed: 0,Model_1,Model_2,Model_3,MOdel_4
Filters,25,25,25,32
kernel_size,"(3, 3)","(3, 3)","(3, 3)","(3, 3)"
padding,valid,valid,valid,
activation,relu,relu,relu,relu
input_shape,"(500, 500, 1)","(400, 400, 1)","(300, 300, 1)","(300, 300, 1)"
num_layers,3,3,3,4
f1_measure,0.7325,0.69,0.71,0.51
