# Cats & Dogs Classifier

This notebook uses a dataset of real-world images of cats and dogs. We train a model to distinguish the two classes and analyse its accuracy.

### Import libraries

In [1]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import RMSprop
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt
from shutil import copyfile
import tensorflow as tf
import zipfile
import random
import wget
import os

print("Libraries imported successfully!")

Libraries imported successfully!


### Downloading data

In [2]:
def download_wget(URLS, FILE_EXT, DIR_PATH):
    """Download each URL into DIR_PATH and unpack it when it is a zip archive.

    A URL is skipped when a file with the same base name is already listed in
    DIR_PATH. A zip archive is deleted once it has been extracted, so this
    check does not recognise a download that was already extracted.

    Args:
        URLS: iterable of download URLs; the file name is the text after the
            last '/'.
        FILE_EXT: archive type of the downloads; only '.zip' triggers
            extraction.
        DIR_PATH: existing directory that receives the downloads and the
            extracted files.
    """
    for url in URLS:
        target_file = url.split('/')[-1]
        if target_file in os.listdir(DIR_PATH):
            continue

        print('Downloading', url)
        wget.download(url, out=DIR_PATH)
        file_path = os.path.join(DIR_PATH, target_file)

        print('\n \nExtracting files from', file_path)
        if FILE_EXT == '.zip':
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(DIR_PATH)
            os.remove(file_path)


# Root folder for everything the notebook downloads. exist_ok makes the cell
# safe to re-run and removes the gap between checking and creating.
os.makedirs('Data', exist_ok=True)

In [3]:
DATA_URLS = ['https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip']
DIR_NAME = 'cats-and-dogs'
DATA_FILE_EXT = '.zip'
DATA_DIR_PATH = './Data/' + DIR_NAME

# makedirs also creates the parent 'Data' folder when it is missing, and
# exist_ok makes the cell safe to re-run.
os.makedirs(DATA_DIR_PATH, exist_ok=True)

download_wget(DATA_URLS, DATA_FILE_EXT, DATA_DIR_PATH)

Downloading https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip
100% [......................................................................] 824894548 / 824894548
 
Extracting files from ./Data/cats-and-dogs\kagglecatsanddogs_3367a.zip


### Dataset info

In [4]:
# Number of entries in each raw class folder: cats first, then dogs.
for class_dir in ('./Data/cats-and-dogs/PetImages/Cat/',
                  './Data/cats-and-dogs/PetImages/Dog/'):
    print(len(os.listdir(class_dir)))

12501
12501


We create directories and subdirectories for training and testing.

In [5]:
# Folder layout expected by flow_from_directory: one sub-folder per class
# inside a training and a testing folder. Parents are listed before children
# because os.mkdir only creates the last path component.
to_create = [
    './Data/cats-and-dogs/cats-v-dogs',
    './Data/cats-and-dogs/cats-v-dogs/training',
    './Data/cats-and-dogs/cats-v-dogs/testing',
    './Data/cats-and-dogs/cats-v-dogs/training/cats',
    './Data/cats-and-dogs/cats-v-dogs/training/dogs',
    './Data/cats-and-dogs/cats-v-dogs/testing/cats',
    './Data/cats-and-dogs/cats-v-dogs/testing/dogs']

for directory in to_create:
    try:
        os.mkdir(directory)
        print(directory, 'created')
    except FileExistsError:
        # Expected when the cell is re-run; not an error.
        print(directory, 'already exists')
    except OSError as error:
        # Genuine failure (missing parent, permissions, ...): show the reason.
        print(directory, 'failed:', error)


./Data/cats-and-dogs/cats-v-dogs failed
./Data/cats-and-dogs/cats-v-dogs/training failed
./Data/cats-and-dogs/cats-v-dogs/testing failed
./Data/cats-and-dogs/cats-v-dogs/training/cats failed
./Data/cats-and-dogs/cats-v-dogs/training/dogs failed
./Data/cats-and-dogs/cats-v-dogs/testing/cats failed
./Data/cats-and-dogs/cats-v-dogs/testing/dogs failed


The function **split_data** splits the data for training and testing. It takes:

- SOURCE: directory containing the files
- TRAINING: directory that a portion of the files will be copied to
- TESTING: directory that the remaining files will be copied to
- SPLIT_SIZE: fraction of the files (between 0 and 1) that goes to TRAINING

In [6]:
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE, SEED=None):
    """Randomly copy the files in SOURCE into a training and a testing folder.

    Zero-length files are reported and left out. The remaining files are
    shuffled; the first int(n * SPLIT_SIZE) are copied to TRAINING and the
    rest to TESTING. Files already present in the destination folders are not
    removed, so splitting twice with different shuffles mixes both splits.

    Args:
        SOURCE: directory containing the files to split.
        TRAINING: existing directory that receives the training portion.
        TESTING: existing directory that receives the testing portion.
        SPLIT_SIZE: fraction of the usable files that goes to TRAINING.
        SEED: optional seed for a reproducible split. With the default None
            the module-level random generator is used.
    """
    all_files = []

    # Sorted so that a given seed produces the same split regardless of the
    # order in which the directory is listed.
    for file_name in sorted(os.listdir(SOURCE)):
        # os.path.join works with or without a trailing separator on SOURCE.
        file_path = os.path.join(SOURCE, file_name)

        if os.path.getsize(file_path):
            all_files.append(file_name)
        else:
            print('{} is zero length, so ignoring'.format(file_name))

    n_files = len(all_files)
    split_point = int(n_files * SPLIT_SIZE)

    rng = random if SEED is None else random.Random(SEED)
    shuffled = rng.sample(all_files, n_files)

    train_set = shuffled[:split_point]
    test_set = shuffled[split_point:]

    for file_name in train_set:
        copyfile(os.path.join(SOURCE, file_name),
                 os.path.join(TRAINING, file_name))

    for file_name in test_set:
        copyfile(os.path.join(SOURCE, file_name),
                 os.path.join(TESTING, file_name))


CAT_SOURCE_DIR = "./Data/cats-and-dogs/PetImages/Cat/"
TRAINING_CATS_DIR = "./Data/cats-and-dogs/cats-v-dogs/training/cats/"
TESTING_CATS_DIR = "./Data/cats-and-dogs/cats-v-dogs/testing/cats/"
DOG_SOURCE_DIR = "./Data/cats-and-dogs/PetImages/Dog/"
TRAINING_DOGS_DIR = "./Data/cats-and-dogs/cats-v-dogs/training/dogs/"
TESTING_DOGS_DIR = "./Data/cats-and-dogs/cats-v-dogs/testing/dogs/"

split_size = .9
SPLIT_SEED = 42

# split_data shuffles with the random module and never empties its target
# folders. Seeding here makes a re-run of this cell draw the same split again
# (as long as the source folders are listed in the same order) instead of
# copying a different split on top of the previous one, which would put the
# same images in both the training and the testing folders.
# NOTE(review): folders filled by earlier unseeded runs must be emptied once
# by hand; this cell does not delete anything.
random.seed(SPLIT_SEED)
split_data(CAT_SOURCE_DIR, TRAINING_CATS_DIR, TESTING_CATS_DIR, split_size)
split_data(DOG_SOURCE_DIR, TRAINING_DOGS_DIR, TESTING_DOGS_DIR, split_size)

666.jpg is zero length, so ignoring
11702.jpg is zero length, so ignoring


Check number of images in each folder

In [7]:
# Report how many files each split/class folder contains.
for label, folder in (
        ('total training cats images:', './Data/cats-and-dogs/cats-v-dogs/training/cats/'),
        ('total training dogs images:', './Data/cats-and-dogs/cats-v-dogs/training/dogs/'),
        ('total testing cats images:', './Data/cats-and-dogs/cats-v-dogs/testing/cats/'),
        ('total testing dogs images:', './Data/cats-and-dogs/cats-v-dogs/testing/dogs/')):
    print(label, len(os.listdir(folder)))

total training cats images: 12375
total training dogs images: 12372
total testing cats images: 2375
total testing dogs images: 2372


### Model | Create a CNN

We define a **Keras model** to perform the classification.

In [8]:
# Three convolution + max-pooling stages followed by a dense classifier.
# The single sigmoid unit outputs one probability, matching the binary labels
# and the binary cross-entropy loss used below.
model = tf.keras.models.Sequential([
    # first convolution
    tf.keras.layers.Conv2D(32, (3,3), input_shape=(150, 150, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    # second convolution
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    # third convolution
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    # Flatten the results to feed into a DNN
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

# `learning_rate` replaces the deprecated `lr` argument. The metric is named
# 'acc' so the history contains the 'acc' / 'val_acc' keys read when plotting.
model.compile(optimizer=RMSprop(learning_rate=0.001), loss='binary_crossentropy', metrics=['acc'])

The training images are **normalized**, and besides the original images, the model is also trained on variations of them produced by zooming, horizontal flipping, rotating, etc.

With **flow_from_directory**, we point the generator at a base directory; the labels are taken from the names of its subdirectories.

In [9]:
TRAINING_DIR = './Data/cats-and-dogs/cats-v-dogs/training'

# rescale multiplies every pixel value by 1/255; the other options apply
# random rotations, shifts, shears, zooms and horizontal flips to the images.
augmentation_options = dict(
    rescale=1 / 255,
    rotation_range=40,
    width_shift_range=.2,
    height_shift_range=.2,
    shear_range=.2,
    zoom_range=.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Batches of 64 images resized to 150x150, with one binary label per image.
train_generator = train_datagen.flow_from_directory(
    TRAINING_DIR,
    target_size=(150, 150),
    batch_size=64,
    class_mode='binary',
)

VALIDATION_DIR = './Data/cats-and-dogs/cats-v-dogs/testing'

# Validation images are only rescaled, never augmented: random distortions
# would make the validation metrics noisy and not comparable between epochs.
validation_datagen = ImageDataGenerator(rescale=1 / 255)

validation_generator = validation_datagen.flow_from_directory(
    VALIDATION_DIR,
    batch_size=64,
    class_mode='binary',
    target_size=(150, 150))

Found 24745 images belonging to 2 classes.
Found 4747 images belonging to 2 classes.


We proceed to **train the model**:

In [11]:
# Model.fit accepts generators directly; fit_generator is deprecated.
# The returned History object holds the per-epoch metrics plotted below.
history = model.fit(train_generator,
                    epochs=15,
                    verbose=1,
                    validation_data=validation_generator)

Epoch 1/15

KeyboardInterrupt: 

**Plot** accuracy and loss

In [None]:
%matplotlib inline

#-----------------------------------------------------------
# Retrieve a list of list results on training and test data
# sets for each training epoch
#-----------------------------------------------------------
acc=history.history['acc']
val_acc=history.history['val_acc']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) # Get number of epochs

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r', "Training Accuracy")
plt.plot(epochs, val_acc, 'b', "Validation Accuracy")
plt.title('Training and validation accuracy')
plt.figure()

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', "Training Loss")
plt.plot(epochs, val_loss, 'b', "Validation Loss")


plt.title('Training and validation loss')

# Desired output. Charts with training and validation metrics. No crash :)