In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# CITATION: I relied heavily on the following source: https://towardsdatascience.com/image-detection-from-scratch-in-keras-f314872006c9

# this code block LOADS THE DATA

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import matplotlib.image as mpimg
# gc cleans deleted data from memory
import gc

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# build the list of training image paths: an equal number of dogs and cats, shuffled

TRAIN_DIR = "../input/training_set/training_set/"
# number of images taken from each class
IMAGES_PER_CLASS = 2000
# only files with these extensions are treated as images
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')
# fixed seed so the shuffled order (and therefore the later train/test split) is reproducible
SHUFFLE_SEED = 0


def _list_image_paths(directory):
    ''' Returns the sorted list of paths of the image files directly inside "directory".

        os.listdir returns bare file names in arbitrary order, so the directory is joined back on
        and the result is sorted. Entries without an image extension (e.g. OS metadata files) are
        dropped because cv2.imread cannot decode them. '''
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )


train_dog = _list_image_paths(TRAIN_DIR + "dogs/")
train_cat = _list_image_paths(TRAIN_DIR + "cats/")

train_images = train_dog[:IMAGES_PER_CLASS] + train_cat[:IMAGES_PER_CLASS]
# use a private, seeded generator so the global random state is left untouched
random.Random(SHUFFLE_SEED).shuffle(train_images)

# now we have training set, so don't need the rest of the images -- delete to save memory
del train_dog
del train_cat
gc.collect()

In [None]:
# this code block RESIZES IMAGES

# height (rows) and width (columns), in pixels, of every processed image
num_rows = 150
num_columns = 150
# channels refers to color, 1=grayscale, 3=color
channels = 3

def readAndProcessImages(list_of_images):
    ''' Reads, resizes and labels the images whose paths are given in "list_of_images".

        Returns a tuple of two numpy arrays:
        1) the resized images, each num_rows high and num_columns wide, read in color,
        2) the labels, which consist of 1s and 0s -- label 1 means it is a dog and 0 means it is a cat.

        The label is derived from the path: 1 if it contains 'dog', otherwise 0 if it contains 'cat'.
        A path containing neither still contributes an image but no label, so the two arrays only
        line up when every path names its class.

        Raises ValueError if a file cannot be read as an image. '''

    resized_training_images = []
    training_labels = []

    for image in list_of_images:
        # cv2.imread signals failure by returning None instead of raising
        pixels = cv2.imread(image, cv2.IMREAD_COLOR)
        if pixels is None:
            raise ValueError('could not read image file: {!r}'.format(image))

        # cv2.resize expects the target size as (width, height), i.e. (columns, rows)
        resized = cv2.resize(pixels, (num_columns, num_rows), interpolation=cv2.INTER_CUBIC)
        resized_training_images.append(resized)

        # get labels
        if 'dog' in image:
            training_labels.append(1)
        elif 'cat' in image:
            training_labels.append(0)

    return np.array(resized_training_images), np.array(training_labels)

In [None]:
# load and resize every image listed in train_images and build the matching array of 0/1 labels
training_imgs,labels = readAndProcessImages(train_images)

In [None]:
# inspect the pixel array of the first processed image
training_imgs[0]

In [None]:
# label of the first image: 1 = dog, 0 = cat
labels[0]

In [None]:
# test to make sure dogs have label 1 and cats have label 0
# look at first five images, each titled with its label

plt.figure(figsize=(20,10))
columns = 5
# plt.subplot needs integer grid dimensions; "/" gives a float in Python 3, so use floor division
rows = 5 // columns + 1
for i in range(columns):
    plt.subplot(rows, columns, i+1)
    # cv2.imread returns pixels in BGR order while imshow expects RGB, so convert before display
    plt.imshow(cv2.cvtColor(training_imgs[i], cv2.COLOR_BGR2RGB))
    plt.title('label = {}'.format(labels[i]))

print(labels[:5])

In [None]:
# sanity check on the class balance: with 2000 cat labels (0) and 2000 dog labels (1),
# the sorted labels must switch from 0 to 1 exactly between positions 1999 and 2000
sorted_labels = list(labels)
sorted_labels.sort()

# (expected result, first position to print, second position to print)
label_checks = (
    ('Here we should get 0 0', 0, 1998),
    ('Here we should get 0 1', 1999, 2000),
    ('Here we should get 1 1', 2001, -1),
)
for expectation, first_idx, second_idx in label_checks:
    print(expectation)
    print(sorted_labels[first_idx], sorted_labels[second_idx])

In [None]:
# report the dimensions of the image array and of the label array
# (only the shape tuples are iterated over, so no extra reference to the arrays is kept)
for caption, shape in (
        ('Shape of training set is ', training_imgs.shape),
        ('Shape of labels is ', labels.shape)):
    print(caption, shape)

In [None]:
# hold out part of the data as a test set
from sklearn.model_selection import train_test_split

# fraction of the images that goes to the test set; the remaining 80% stays in the training set
held_out_fraction = 0.20
# train_test_split partitions the data at random, so every call would normally give a different
# split; passing a fixed integer as random_state makes it return the same "random" split each time
split_seed = 0

# the 'PS' suffix on the new train/test arrays stands for post-split of the original set
training_imgs_PS, test_imgs_PS, label_train_PS, label_test_PS = train_test_split(
    training_imgs, labels, test_size=held_out_fraction, random_state=split_seed)

# report the sizes of the new training/test sets
for caption, size in (
        ('Shape of training set is ', training_imgs_PS.shape),
        ('Shape of test set is ', test_imgs_PS.shape),
        ('Shape of training labels is ', label_train_PS.shape[0]),
        ('Shape of test labels is ', label_test_PS.shape[0])):
    print(caption, size)

# the test set should contain 20% of the original images
print('Check that these two numbers are equal ', test_imgs_PS.shape[0], training_imgs.shape[0]*held_out_fraction)

In [None]:
# the pre-split arrays are no longer needed -- drop them and reclaim the memory
del training_imgs, labels
gc.collect()

# batch size = number of samples the model processes before its weights are updated
batch_size = 32

# number of samples in the training set and in the test set
num_train, num_test = len(training_imgs_PS), len(test_imgs_PS)

In [None]:
# model creation

# Keras building blocks for the model, plus the image preprocessing helpers
from keras import layers
from keras import models
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img

