# **CONVOLUTIONAL NEURAL NETWORK (CNN)**

## Problem

Train a CNN on the [horses-or-humans dataset](http://www.laurencemoroney.com/horses-or-humans-dataset/).

Note: The dataset is not divided into train-test subsets, so it needs pre-processing.

## Initialize

In [1]:
import os
import zipfile
import shutil

import numpy as np

In [2]:
# set random seeds
np.random.seed(0)

## Dataset

**Helper functions**

In [3]:
def download_zip_file(data_url, name_zip, download_dir, unzip_dir):
    """
    Download and unzip a file
    :param data_url     : URL to download the dataset
                          (the file is fetched from f"{data_url}{name_zip}")
    :param name_zip     : name of the .zip file
    :param download_dir : directory to download the file
    :param unzip_dir    : directory to unzip the files

    If 'unzip_dir' already exists, nothing is downloaded or extracted.
    A failed download raises 'urllib.error.URLError' (or a subclass);
    an invalid archive raises 'zipfile.BadZipFile'.
    """
    # local import: keeps this cell runnable without touching the import cell
    import urllib.request

    # if already there, do nothing and return
    if os.path.exists(unzip_dir):
        return

    # create the download directory if not exists
    os.makedirs(download_dir, exist_ok=True)

    # download path for the zip file
    path_zip = os.path.join(download_dir, name_zip)

    # download the dataset
    # NOTE: done in-process instead of shelling out to 'wget', so no external
    # binary is needed, the URL is never interpreted by a shell, and a failed
    # download raises instead of leaving an empty file behind silently.
    # Unlike 'wget --no-check-certificate', TLS certificates are verified.
    with urllib.request.urlopen(f"{data_url}{name_zip}") as response, \
            open(path_zip, 'wb') as zip_out:
        shutil.copyfileobj(response, zip_out)
    print(f"'{name_zip}' downloaded to '{download_dir}'")

    # unzip the file (the context manager closes it even if extraction fails)
    with zipfile.ZipFile(path_zip, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)
    print(f"'{name_zip}' extracted to '{unzip_dir}'")

In [4]:
def file_structure(root_dir):
    """
    Print the directory tree under 'root_dir'.

    Each directory is printed as 'name/', indented by 4 spaces per
    nesting level. At most the first three files of a directory are
    listed; once the third one is printed, a summary line with the
    total file count follows.
    """
    for current, _subdirs, names in os.walk(root_dir):
        # nesting depth = number of separators left after removing the root
        depth = current.replace(root_dir, '').count(os.sep)
        pad = ' ' * (4 * depth)
        print(f'{pad}{os.path.basename(current)}/')

        inner_pad = pad + ' ' * 4
        # show only a short preview of the files
        for name in names[:3]:
            print(f'{inner_pad}{name}')
        if len(names) >= 3:
            print(f'{inner_pad}....(total {len(names)} files)')

In [5]:
def split_dataset(
        dir_source,
        dir_train,
        dir_valid=None,
        dir_test=None,
        valid_ratio=0.,
        test_ratio=0.
):
    """
    Reads the files available in 'dir_source'.
    Splits it by 'valid_ratio' (valid to total ratio), and
         'test_ratio' (test to total ratio).
    Puts the training, validation and testing files
         in 'dir_train', 'dir_valid' and 'dir_test' folders,
         respectively.

    Zero-sized files are reported and ignored; sub-directories are skipped.
    A ratio whose target directory is not given is ignored, so those files
         stay in the training subset instead of being dropped.
    The split is drawn with 'np.random.shuffle', so it follows the global
         NumPy seed.

    :raises ValueError      : if a ratio is negative or the ratios sum to
                              more than 1
    :raises FileExistsError : if a target directory already exists
    """

    # validate before touching the file system
    if valid_ratio < 0 or test_ratio < 0 or valid_ratio + test_ratio > 1 + 1e-9:
        raise ValueError(
            "'valid_ratio' and 'test_ratio' must be non-negative "
            "and sum to at most 1"
        )

    # get all the filenames in source folder
    # (sorted: os.listdir order is arbitrary, which would make the seeded
    #  shuffle below differ from machine to machine)
    filenames = []
    for fn in sorted(os.listdir(dir_source)):
        filepath = os.path.join(dir_source, fn)
        # only regular files can be copied
        if not os.path.isfile(filepath):
            continue
        # if file's size >0, append to the list
        if os.path.getsize(filepath) > 0:
            filenames.append(fn)
        else:
            print(f"Size of '{fn}' is zero. So, ignored.")

    # determine the size of the subsets
    # (a subset without a target directory gets no files)
    num_valid = int(len(filenames) * valid_ratio) if dir_valid else 0
    num_test = int(len(filenames) * test_ratio) if dir_test else 0
    num_train = len(filenames) - num_valid - num_test

    # randomly shuffle the file names
    np.random.shuffle(filenames)

    # function for copying all files
    def copy_all_files(src, dst, file_names):
        for file in file_names:
            shutil.copyfile(os.path.join(src, file), os.path.join(dst, file))

    #--- training subset ---#
    # make directory. if already there, raise an error
    os.makedirs(dir_train, exist_ok=False)
    # copy training files
    copy_all_files(dir_source, dir_train, filenames[:num_train])

    #--- validation subset ---#
    if dir_valid:
        # make directory. if already there, raise an error
        os.makedirs(dir_valid, exist_ok=False)
        # copy validation files
        copy_all_files(
            dir_source, dir_valid, filenames[num_train:num_train + num_valid]
        )

    #--- testing subset ---#
    if dir_test:
        # make directory. if already there, raise an error
        os.makedirs(dir_test, exist_ok=False)
        # copy test files
        copy_all_files(dir_source, dir_test, filenames[num_train + num_valid:])

## Data Preparation

In [6]:
data_url = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/'
root_dir = os.path.join('..', '.tmp')
# every subset of the dataset lives under this folder
data_dir = os.path.join(root_dir, 'horses-or-humans')

# download the train-validation dataset
name_zip_1 = 'horse-or-human.zip'
unzip_dir_1 = os.path.join(data_dir, 'train-valid')
download_zip_file(data_url, name_zip_1, root_dir, unzip_dir_1)

# split the dataset to train and validation subsets
for classname in ['horses', 'humans']:
    dir_train = os.path.join(data_dir, 'train', classname)
    # split_dataset raises if its target folders already exist, so skip
    # classes that were split by a previous run of this cell
    if os.path.exists(dir_train):
        continue
    split_dataset(
            dir_source=os.path.join(unzip_dir_1, classname),
            dir_train=dir_train,
            dir_valid=os.path.join(data_dir, 'valid', classname),
            valid_ratio=0.2,
    )

# download the test dataset
name_zip_2 = 'validation-horse-or-human.zip'
unzip_dir_2 = os.path.join(data_dir, 'test')
download_zip_file(data_url, name_zip_2, root_dir, unzip_dir_2)

'horse-or-human.zip' downloaded to '..\.tmp'
'horse-or-human.zip' extracted to '..\.tmp\horses-or-humans\train-valid'
'validation-horse-or-human.zip' downloaded to '..\.tmp'
'validation-horse-or-human.zip' extracted to '..\.tmp\horses-or-humans\test'


In [7]:
# print the directory tree of each subset
dataset_root = os.path.join('..', '.tmp', 'horses-or-humans')
for subset in ('train', 'valid', 'test'):
    file_structure(os.path.join(dataset_root, subset))

train/
    horses/
        horse01-0.png
        horse01-1.png
        horse01-2.png
        ....(total 400 files)
    humans/
        human01-00.png
        human01-01.png
        human01-02.png
        ....(total 422 files)
valid/
    horses/
        horse01-9.png
        horse03-5.png
        horse03-8.png
        ....(total 100 files)
    humans/
        human01-03.png
        human01-04.png
        human01-06.png
        ....(total 105 files)
test/
    horses/
        horse1-000.png
        horse1-105.png
        horse1-122.png
        ....(total 128 files)
    humans/
        valhuman01-00.png
        valhuman01-01.png
        valhuman01-02.png
        ....(total 128 files)
