# Michael Eng

# Initialize Parameters

In [1]:
import os

cwd = os.getcwd()
classes = ['dog', 'cat']
num_classes = len(classes)
learning_rate = 1E-5

# Re orginaize data.

In [None]:
import shutil

# sorts images into two new folders, dog and cat, in the train directory 
def split_images_by_class(directory, classes):
    file_list = os.listdir(directory)

    for c in classes:
        os.mkdir('%s/%s' %(directory, c))
    
    for filename in file_list:
        for c in classes:
            if c in filename:
                shutil.move('%s/%s' %(directory, filename), '%s/%s/%s' %(directory, c, filename))
                
# splits data from train into a new directory, test or val by a ratio
def redistribute_images(directory, classes, dir1='/train/', dir2='/test/', train_test_ratio=0.9):
    if not os.path.exists(directory + dir1[:len(dir1) - 1]):
        os.mkdir(directory + dir1[:len(dir1) - 1])        
    if not os.path.exists(directory + dir2[:len(dir1) - 1]):
        os.mkdir(directory + dir2[:len(dir1) - 1])
            
    for c in classes:
        if not os.path.exists(directory + dir1 + c):
            os.mkdir(directory + dir1 + c)
        if not os.path.exists(directory + dir2 + c):
            os.mkdir(directory + dir2 + c)
        train_list = os.listdir(directory + dir1 + c)
        test_list = os.listdir(directory + dir2 + c)

        for element in test_list:
            train_list.append(test_list.pop(0))
        
        total = len(train_list)
        while len(test_list) < (total * (1 - train_test_ratio)):
            test_list.append(train_list.pop(0))
        for filename in train_list:
            if os.path.exists(directory + dir2 + c + '/' + filename):
                shutil.move(directory + dir2 + c + '/'  + filename, directory + dir1 + c + '/'  + filename)
        for filename in test_list:
            if os.path.exists(directory + dir1 + c + '/' + filename):
                shutil.move(directory + dir1 + c + '/'  + filename, directory + dir2 + c + '/'  + filename)

split_images_by_class(cwd + '/train', classes)
redistribute_images(cwd, classes)
redistribute_images(cwd, classes, dir2='/val/')

# Create Image Data Generator

Now that the data is organized in a scheme that ImageDataGenerator can read, ImageDataGenerators are made for train, validation, test and the actual submission data. The training data gen includes data augmentation features such as rotation, zoom, horizontal flip and more. This gives the network more training samples for free during each epoch. Shuffle is set to false on the submission generator so that the image number and the prediction can be matched. It is true by default.

In [3]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_datagen = ImageDataGenerator(
    rescale=1./255,
    #featurewise_std_normalization=True, 
    #featurewise_center=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    cwd + '/train',
    target_size=(224,224),
    batch_size=64,
    class_mode='categorical')

val_generator = test_datagen.flow_from_directory(
    cwd + '/val',
    target_size=(224,224),
    batch_size=64,
    class_mode='categorical')

test_generator = test_datagen.flow_from_directory(
    cwd + '/test',
    target_size=(224,224),
    batch_size=64,
    class_mode='categorical')

submission_generator = test_datagen.flow_from_directory(
    cwd + '/submission',
    shuffle=False,
    target_size=(224,224),
    batch_size=64)

Found 20250 images belonging to 2 classes.
Found 2250 images belonging to 2 classes.
Found 2500 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


# Build pretrained vgg

The pretrained convolution layers are used from vgg16, followed by two Dense layers. The first dense layer has 50% dropout. 

In [4]:
from tensorflow.keras.layers import AveragePooling2D, Flatten, Dropout, Dense
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import optimizers
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.applications import VGG16

def build_vgg_pretrained(freeze_layers=0):
    model = Sequential()
    vgg16 = VGG16(input_shape=(224,224,3), classes=num_classes, include_top=False)
    for layer in vgg16.layers[:freeze_layers]:
        layer.tranable = False
    model.add(vgg16)
    model.add(AveragePooling2D((3,3), strides=(2,2)))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(512, activation='relu', kernel_initializer=glorot_uniform(seed=0)))
    model.add(Dense(num_classes, activation='softmax', kernel_initializer=glorot_uniform(seed=0)))
    return model

In [8]:
vgg = build_vgg_pretrained(freeze_layers=10)
vgg.compile(loss='categorical_crossentropy',
             optimizer=optimizers.RMSprop(lr=learning_rate * 5),
             metrics=['acc'])
vgg.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 7, 7, 512)         14714688  
_________________________________________________________________
average_pooling2d_1 (Average (None, 3, 3, 512)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4608)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 4608)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               2359808   
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1026      
Total params: 17,075,522
Trainable params: 17,075,522
Non-trainable params: 0
__________________________________________

# Train and evaluate network

Train the network on the train generator and validation data.

In [9]:
history = vgg.fit(train_generator,
                steps_per_epoch=316,
                epochs=3,
                validation_data=val_generator,
                validation_steps=35)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Evaluate the network on new test data it has never seen. This test data is just a subset of the original data in the train folder.

In [10]:
test_loss = vgg.evaluate(test_generator)



Predict the class of the unlabeled data in the given 'test1' and group the file number with its prediction in a list then sort the list. The generator sorts the files by amount of zeros instead of the actual number. For instance, the output was 1, 10, 100, 1000, 10000, 100000, 100001, 100002, ... This was sorted correctly in the following code.

In [11]:
import re

def get_predictions(generator):
    predictions = vgg.predict(generator)
    names = generator.filenames

    output = []

    for i, prediction in enumerate(predictions):
        index = int(re.findall(r"[\d]+", names[i])[0])
        if prediction[0] > prediction[1]:
            output.append([index, 0])
        else:
            output.append([index, 1])

    output.sort(key = lambda a: a[0])
    return output

In [12]:
output = get_predictions(submission_generator)

In [13]:
with open('submission.txt', 'w') as submission_file:
    submission_file.write('id, label\n')
    submission_file.writelines("%s,%s\n" %(str(element[0]), str(element[1])) for element in output)