# **CONVOLUTIONAL NEURAL NETWORK (CNN)**

## Problem

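Train a CNN on the [Cats-vs-Dogs dataset](https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip).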

Note: The dataset is not pre-divided into training, validation, and testing subsets, so it has to be split during pre-processing.

## Initialize

In [1]:
import os
import shutil
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

from matplotlib import pyplot as plt

In [2]:
# set random seeds
np.random.seed(0)
tf.random.set_seed(0)

# show figures inline
%matplotlib inline

# TODO: use `os.path.join()` instead of `/`.

## Dataset

**Download the dataset**

In [3]:
DATA_URL = 'https://download.microsoft.com/download/3/E/1/' \
           '3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip'
SAVE_DIR = '../.tmp'
NAME_ZIP = 'cats-and-dogs.zip'
# derived from SAVE_DIR so the two locations cannot drift apart
ROOT_DIR = f'{SAVE_DIR}/cats-and-dogs'

# if not already there, download and extract the files
if not os.path.exists(ROOT_DIR):
    os.makedirs(SAVE_DIR, exist_ok=True)
    _zip_path = os.path.join(SAVE_DIR, NAME_ZIP)

    # Stream the archive to disk with the standard library. Unlike
    # `os.system('wget --no-check-certificate ...')`, a failed download raises
    # instead of passing silently, TLS certificates are verified, and no
    # external `wget` binary is required.
    with urllib.request.urlopen(DATA_URL) as _response, \
            open(_zip_path, 'wb') as _zip_file:
        shutil.copyfileobj(_response, _zip_file)

    # unzip the file; the context manager closes the archive even on errors
    try:
        with zipfile.ZipFile(_zip_path, 'r') as zip_ref:
            zip_ref.extractall(ROOT_DIR)
    except Exception:
        # do not leave a half-extracted folder behind: its mere existence
        # would make the guard above skip the download on the next run
        shutil.rmtree(ROOT_DIR, ignore_errors=True)
        raise

# see the file structure (only the first few files of every folder)
_MAX_FILES_SHOWN = 3
for root, dirs, files in os.walk(ROOT_DIR):
    # depth of `root` below ROOT_DIR; ROOT_DIR itself is level 0
    _rel_root = os.path.relpath(root, ROOT_DIR)
    level = 0 if _rel_root == os.curdir else _rel_root.count(os.sep) + 1
    indent = ' ' * 4 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 4 * (level + 1)
    for f in files[:_MAX_FILES_SHOWN]:
        print(f'{subindent}{f}')
    # print the ellipsis only when some files were actually left out
    if len(files) > _MAX_FILES_SHOWN:
        print(f'{subindent}...')

cats-and-dogs/
    MSR-LA - 3467.docx
    readme[1].txt
    PetImages/
        Cat/
            0.jpg
            1.jpg
            10.jpg
            ...
        Dog/
            0.jpg
            1.jpg
            10.jpg
            ...


In [4]:
# report how many files each raw class folder contains
for _label, _folder in (('cat', 'Cat'), ('dog', 'Dog')):
    print(f'# {_label} images:', len(os.listdir(f'{ROOT_DIR}/PetImages/{_folder}/')))

# cat images: 12501
# dog images: 12501


## Data Preparation
**Split the dataset**

In [5]:
# make one folder per (subset, class) pair; existing folders are left as they are
for subset in ('train', 'valid', 'test'):
    for class_name in ('cat', 'dog'):
        _subset_dir = f'{ROOT_DIR}/{subset}/{class_name}'
        os.makedirs(_subset_dir, exist_ok=True)
        print(f"directory '{_subset_dir}' is created")

directory '../.tmp/cats-and-dogs/train/cat' is created
directory '../.tmp/cats-and-dogs/train/dog' is created
directory '../.tmp/cats-and-dogs/valid/cat' is created
directory '../.tmp/cats-and-dogs/valid/dog' is created
directory '../.tmp/cats-and-dogs/test/cat' is created
directory '../.tmp/cats-and-dogs/test/dog' is created


In [6]:
def split_dataset(dir_source, dir_train, dir_valid, dir_test, valid_ratio, test_ratio):
    """
    Randomly splits the files of 'dir_source' into training, validation
    and testing subsets and copies them into 'dir_train', 'dir_valid'
    and 'dir_test', respectively.

    Entries that are not regular files and files of size zero are
    ignored (a message is printed for each of them). The file names are
    sorted before shuffling, so the result depends only on the state of
    NumPy's global random generator and not on the order in which the
    file system happens to list the folder.

    Args:
        dir_source: folder holding the files to split.
        dir_train: destination folder of the training files.
        dir_valid: destination folder of the validation files.
        dir_test: destination folder of the testing files.
        valid_ratio: validation-to-total ratio, between 0 and 1.
        test_ratio: test-to-total ratio, between 0 and 1.

    Raises:
        ValueError: if a ratio is outside [0, 1] or the two ratios
            together request more files than are available.
    """

    if not 0 <= valid_ratio <= 1 or not 0 <= test_ratio <= 1:
        raise ValueError(
            f"'valid_ratio' and 'test_ratio' must be in [0, 1]; "
            f"got {valid_ratio} and {test_ratio}.")

    # get all usable file names in the source folder
    filenames = []

    # sorted: `os.listdir` returns entries in arbitrary order, which would
    # make the seeded shuffle below differ between machines
    for fn in sorted(os.listdir(dir_source)):
        filepath = os.path.join(dir_source, fn)
        if not os.path.isfile(filepath):
            # sub-folders cannot be copied with `shutil.copy`
            print(f"'{fn}' is not a regular file. So, ignored.")
        elif os.path.getsize(filepath) > 0:
            filenames.append(fn)
        else:
            print(f"Size of '{fn}' is zero. So, ignored.")

    # determine the size of validation and testing subsets
    num_valid = int(len(filenames) * valid_ratio)
    num_test = int(len(filenames) * test_ratio)

    # remaining files are for training
    num_train = len(filenames) - num_valid - num_test
    if num_train < 0:
        # a negative count would silently produce wrong slices below
        raise ValueError(
            f"'valid_ratio' + 'test_ratio' must not exceed 1; "
            f"got {valid_ratio} + {test_ratio}.")

    # randomly shuffle the file names (uses NumPy's global random state)
    np.random.shuffle(filenames)

    # consecutive, non-overlapping slices of the shuffled list
    train_files = filenames[:num_train]
    valid_files = filenames[num_train:num_train + num_valid]
    test_files = filenames[num_train + num_valid:]

    # copy every subset into its destination folder
    for dst, subset_files in ((dir_train, train_files),
                              (dir_valid, valid_files),
                              (dir_test, test_files)):
        os.makedirs(dst, exist_ok=True)
        for fn in subset_files:
            shutil.copy(os.path.join(dir_source, fn), os.path.join(dst, fn))

In [7]:
VALID_RATIO = 0.1
TEST_RATIO = 0.1

# split the cat and dog images into training, validation and testing subsets
for class_name in ['cat', 'dog']:
    # source directory
    _dir_source = f'{ROOT_DIR}/PetImages/{class_name.capitalize()}'

    # train-valid-test directories
    _dir_train = f'{ROOT_DIR}/train/{class_name}'
    _dir_valid = f'{ROOT_DIR}/valid/{class_name}'
    _dir_test = f'{ROOT_DIR}/test/{class_name}'

    # Start from empty destination folders. Files left over from an earlier
    # run (other ratios, other seed, or a re-run in the same kernel) would
    # otherwise stay in place, so the same image could end up in both the
    # training and the validation/testing subset.
    for _dir in (_dir_train, _dir_valid, _dir_test):
        if os.path.isdir(_dir):
            shutil.rmtree(_dir)
        os.makedirs(_dir, exist_ok=True)

    # split the dataset
    split_dataset(_dir_source, _dir_train, _dir_valid, _dir_test, VALID_RATIO, TEST_RATIO)


Size of '666.jpg' is zero. So, ignored.
Size of '11702.jpg' is zero. So, ignored.


In [8]:
# show how many files ended up in each subset, broken down by class
for subset in ('train', 'valid', 'test'):
    print(subset)
    for class_name in ('cat', 'dog'):
        _class_dir = f'{ROOT_DIR}/{subset}/{class_name}'
        num_files = len(os.listdir(_class_dir))
        print(f'    {class_name}: {num_files} images.')

train
    cat: 10000 images.
    dog: 10000 images.
valid
    cat: 1250 images.
    dog: 1250 images.
test
    cat: 1250 images.
    dog: 1250 images.
