# DOGS VS CATS REDUX CODE

This notebook tries to cover the CNN concepts taught in Lessons 1 through 7. As configured, the code runs on the sample data.

## Initial Imports and Setup

In [None]:
# Try '% + Tab' and it gives you a list of commands that can be run in the notebook just as if it were in the terminal.
# Here we are checking the present working directory
%pwd

In [None]:
# This command helps us to make and see plots within the notebook [a python 2D plotting library] 
%matplotlib inline

#import modules that we will be using in the code
from utils import *
from vgg16 import Vgg16

# Enter the path to your data in the working directory
# path = "data/redux/" # Use this path if you are using the entire dataset
path = "data/redux/sample/" # Use this path if you are using sample dataset

In [None]:
# Other imports. Add the imports as and when you are using any particular library
import os,sys
from shutil import copyfile
import gc

## Setting up Data Folders
### This setup needs to be run only once, the very first time you run the notebook

In [None]:
# Create directories
%cd data/redux
%mkdir valid
%mkdir results
%mkdir pred
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p sample/pred
%mkdir -p test/unknown

In [None]:
%cd train

In [None]:
# Move images to valid data folders from train data folder
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(2000): os.rename(shuf[i], '../valid/' + shuf[i])

In [None]:
# Copy a few images to sample/train from train data folder
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(200): copyfile(shuf[i], '../sample/train/' + shuf[i])

In [None]:
%cd ../sample/train

In [None]:
# Copy a all the images in train folder to pred folder, for use during pseudo labeling
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(len(g)): copyfile(shuf[i], '../pred/' + shuf[i])

In [None]:
%cd ../../valid

In [None]:
# Copy a few images to sample/valid from valid data folder
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(50): copyfile(shuf[i], '../sample/valid/' + shuf[i])

In [None]:
%cd ../test

In [None]:
# Copy all (or a few) the images in test folder to sample/test folder
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(100): copyfile(shuf[i], '../sample/test/' + shuf[i])

In [None]:
# Rearrange the images into separate 'cats' and 'dogs' sub-directories
# in the sample/train, sample/valid, valid, train and sample/pred directories.
# Every %cd below is relative to the directory reached by the previous one,
# so these commands only work when the setup cells are run in order.

# sample/train
%cd ../sample/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

# sample/valid
%cd ../valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

# valid (full data set)
%cd ../../valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

# train (full data set)
%cd ../train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

# sample/pred (copy of the sample training images, used for pseudo labeling)
%cd ../sample/pred
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

In [None]:
# Create a single 'unknown' class for the test sets, as the code looks for sub-directories.
# NOTE(review): the 'unknown' folder must already exist inside sample/test and test for these moves to work.
# sample/test
%cd ../test
%mv *.jpg unknown/

# test (full data set)
%cd ../../test
%mv *.jpg unknown/

In [None]:
# Go back up three levels, i.e. out of data/redux/test to the directory the setup started from
%cd ../../..

## Using a Trained Model and Finetuning it (VGG16)

In [None]:
# Instantiate the Vgg16 helper class and take its underlying Keras model.
# NOTE(review): adam_model is presumably a reference to vgg.model, not a copy, so changes below also affect vgg.model.
vgg = Vgg16()
adam_model = vgg.model

In [None]:
# Use this command (you need to 'import gc') to free the RAM once in a while
gc.collect()

In [None]:
# Number of images per batch fed to the model
batch_size = 4

In [None]:
# Create batch iterators over the training and validation folders.
# Images are resized to 224x224 and labels are one-hot ('categorical'), taken from the sub-directory names.
gen=image.ImageDataGenerator()
batches = gen.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)
val_batches = gen.flow_from_directory(path+'valid', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# Finetune the original model to suit our needs:
# drop the last layer, freeze every remaining layer, then add a new softmax layer
# with one output per class found in the training folder and compile with Adam.
adam_model.pop()
for layer in adam_model.layers: layer.trainable=False
adam_model.add(Dense(batches.nb_class, activation='softmax'))
adam_model.compile(optimizer=Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

## Train the Finetuned Model

In [None]:
# We will train the new model with the changes which we have just made to it. Trains only the modified layers
adam_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# We can further train it using different learning rates and number of epochs
adam_model.optimizer.lr = 0.001
adam_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Find out how the model is structured
adam_model.summary()

In [None]:
adam_model.save_weights(path+'results/simple_vgg_Adam.h5')

## Optimizers

We can change the optimizers and check how well the model trains for each optimizer

In [None]:
%xdel adam_model
gc.collect()

In [None]:
sgd_model = vgg.model

In [None]:
sgd_model.pop()
for layer in sgd_model.layers: layer.trainable=False
sgd_model.add(Dense(batches.nb_class, activation='softmax'))
sgd_model.compile(optimizer=SGD(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
sgd_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
sgd_model.save_weights(path+'results/simple_vgg_SGD.h5')

In [None]:
%xdel sgd_model
gc.collect()

In [None]:
rms_model = vgg.model

In [None]:
rms_model.pop()
for layer in rms_model.layers: layer.trainable=False
rms_model.add(Dense(batches.nb_class, activation='softmax'))
rms_model.compile(optimizer=SGD(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
rms_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
rms_model.save_weights(path+'results/simple_vgg_RMSprop.h5')

## Removing all Dense Layers of VGG

There is little value in retraining the convolutional layers of a pre-trained model, since they only extract features from the images that pass through them. Instead, we can replace the dense layers to adapt the model to our specific classification problem.

In [None]:
%xdel rms_model
gc.collect()

In [None]:
model = vgg.model

In [None]:
model.pop()
model.pop()
model.pop()
model.pop()
model.pop()
model.pop()
model.pop()

I've added batch normalization after the last convolutional layer and after each dense layer. Dropout and L1/L2 regularization can also be added at this point. How much regularization is appropriate depends on the amount of data we have, so I keep it light for the sample dataset and would revisit it when training on the full dataset.

In [None]:
# New layers stacked on top of the remaining model, in order:
# batch normalization, pooling and flattening, two 256-unit dense blocks each
# followed by batch normalization, light dropout, and an L2-regularized 2-way softmax.
new_layers = [
    BatchNormalization(axis=1),
    MaxPooling2D(),
    Flatten(),
    # optional: Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    # optional: Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(2, activation='softmax', W_regularizer=l2(0.01)),
]
for new_layer in new_layers:
    model.add(new_layer)

In [None]:
model.compile(optimizer=Adam(lr=0.00001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
model.optimizer.lr = 0.001
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
model.save_weights(path+'results/vgg_dense_replace.h5')

## Visualizing Results

To visualize the results, i.e. to check how well the model is performing, we use the validation dataset. The confusion matrix shows the model's performance in matrix form. This part is not necessary for the rest of the code, but it helps us understand which cases the model has trouble classifying.

In [None]:
# Free up the RAM a bit
gc.collect()

In [None]:
val_preds = model.predict_generator(val_batches, val_samples = val_batches.nb_sample)

In [None]:
# These are the image names as we see in the folders
filenames = val_batches.filenames
# These are the labels for each image in the validation set (0 - cat, 1 - dog)
expected_labels = val_batches.classes

In [None]:
# The predictions we get are rounded of to 0 or 1
our_predictions = val_preds[:,0]
our_labels = np.round(1-our_predictions)

In [None]:
from keras.preprocessing import image

# Show validation images selected by their position in `filenames`.
# `plots` is the plotting helper that comes from utils.py.
def plots_idx(idx, titles=None):
    loaded = []
    for i in idx:
        loaded.append(image.load_img(path + 'valid/' + filenames[i]))
    plots(loaded, titles=titles)
    
# Number of images to view for each visualization task
n_view = 3

In [None]:
# 1. A few correctly labelled images, chosen at random.
# The plot titles are the matching entries of our_predictions.
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

In [None]:
# 2. A few incorrectly labelled images, chosen at random
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

In [None]:
# 3a. The images we were most confident were cats, and that are actually cats.
# Sorted by our_predictions in descending order, so the highest values come first.
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print "Found %d confident correct cats labels" % len(correct_cats)
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

In [None]:
# 3b. The images we were most confident were dogs, and that are actually dogs.
# Sorted by our_predictions in ascending order, so the lowest values come first.
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print "Found %d confident correct dogs labels" % len(correct_dogs)
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])

In [None]:
# 4a. The images we were most confident were cats, but that are actually dogs.
# Skipped when there are no such images.
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d incorrect cats" % len(incorrect_cats)
if len(incorrect_cats):
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

In [None]:
# 4b. The images we were most confident were dogs, but that are actually cats.
# Skipped when there are no such images.
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d incorrect dogs" % len(incorrect_dogs)
if len(incorrect_dogs):
    most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])

In [None]:
# 5. The most uncertain labels (i.e. those with probability closest to 0.5).
# most_uncertain orders every validation image by |prediction - 0.5|, smallest first.
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])

In [None]:
# Confusion matrix of the expected labels against our predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(expected_labels, our_labels)

In [None]:
# Plot the matrix, passing the generator's class-name mapping for the labels
plot_confusion_matrix(cm, val_batches.class_indices)

## Data Augmentation

In [None]:
gc.collect()

In [None]:
model.optimizer.lr = 0.001

In [None]:
# Generate a set of images from train data using variations like rotate, height shift, width shift, etc.
gen_t = image.ImageDataGenerator(rotation_range=15)

batches = gen_t.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# And train the model
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
gen_t = image.ImageDataGenerator(height_shift_range=0.05)

batches = gen_t.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# And train the model
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
gen_t = image.ImageDataGenerator(shear_range=0.1)

batches = gen_t.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# And train the model
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
gen_t = image.ImageDataGenerator(channel_shift_range=20)

batches = gen_t.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# And train the model
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
gen_t = image.ImageDataGenerator(width_shift_range=0.1)

batches = gen_t.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# And train the model
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
model.save_weights(path+'results/data_aug.h5')

In [None]:
# Generate a set of images from train data by combining all the variations like rotate, height shift, width shift, etc.
gen_t = image.ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.05, shear_range=0.1, channel_shift_range=20, 
                                 rotation_range=15)

batches = gen_t.flow_from_directory(path+'train', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=3,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
model.save_weights(path+'results/data_aug_all.h5')

## Pseudo Labeling

In [None]:
gc.collect()

In [None]:
# Load the weights of the model that has given you the best accuracy
model.load_weights(path+'results/data_aug_all.h5')

In [None]:
# Run the code for the test images and classify them
test_batches = gen.flow_from_directory(path+'test', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)
test_feat = model.predict_generator(test_batches, val_samples = test_batches.nb_sample)

In [None]:
test_filenames = test_batches.filenames

In [None]:
# Store the result into an excel sheet
result_name = path+'results/result.xlsx'

In [None]:
def do_clip(arr, mx, n_classes=10):
    """Clip probabilities in `arr` to the range [(1-mx)/(n_classes-1), mx].

    arr: array of probabilities.
    mx: upper bound for any single probability.
    n_classes: number of classes; the lower bound spreads the remaining
        (1-mx) over the other n_classes-1 classes. The default of 10
        reproduces the previous hard-coded divisor of 9.
    Returns the clipped array.
    Raises ValueError if n_classes is less than 2.
    """
    if n_classes < 2:
        raise ValueError("n_classes must be at least 2")
    return np.clip(arr, (1.0 - mx) / (n_classes - 1), mx)

In [None]:
subm = do_clip(test_feat,0.97)

In [None]:
classes = sorted(batches.class_indices, key=batches.class_indices.get)

In [None]:
submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'img', [a[4:] for a in test_filenames])

In [None]:
submission.to_excel(result_name, index=False)

In [None]:
gc.collect()

In [None]:
from openpyxl import *

In [None]:
wb = load_workbook(path+'results/result.xlsx')
sheet = wb.get_sheet_by_name('Sheet1')

In [None]:
for i in range(2, test_batches.nb_sample):
    if sheet.cell(row=i, column=2).value > sheet.cell(row=i, column=3).value:
        col = 2
    else: col = 3
    img = sheet.cell(row=i,column=1).value
    f = sheet.cell(row=1, column = col).value
    %cd data/state/sample/test/unknown
    g = glob('*.jpg')
    for i in range (test_batches.nb_sample):
        if g[i]==str(img):
            copyfile(g[i], '../../pred/' + str(f) + '/' + g[i])
    %cd ../../../../..

In [None]:
gc.collect()

In [None]:
# Generate the new set of training batches
pred_batches = gen.flow_from_directory(path+'pred', target_size=(224,224), 
                                  class_mode='categorical', shuffle=True, batch_size=batch_size)

In [None]:
# Train the model with the new set of data
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=5,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
model.save_weights(path+'results/pseudo_label.h5')

## Test the Model

In [None]:
gc.collect()

In [None]:
# We test the model using the test data made available. Note: Here we are using the original test folder with all data.
# shuffle=False keeps the prediction rows in the same order as test_batches.filenames,
# which the submission relies on to pair each prediction with its image id.
test_batches = gen.flow_from_directory(path+'../test', target_size=(224,224),
                                  class_mode='categorical', shuffle=False, batch_size=batch_size)
preds = model.predict_generator(test_batches, val_samples = test_batches.nb_sample)

## Submission

In [None]:
#Working on the dog prediction column
isdog = preds[:,1]

In [None]:
#We shall round our predictions on the edges
isdog = isdog.clip(min=0.05, max=0.95)

In [None]:
#Extract image ID of the filenames from the /test/unknown directory
filenames = test_batches.filenames
ids = np.array([int(f[8:f.find('.')]) for f in filenames])

In [None]:
#We join the two columns into an array for submission
subm = np.stack([ids,isdog], axis=1)

In [None]:
#Save it as a .csv file
%cd $path/results
submission_filename = 'submission1.csv'
np.savetxt(submission_filename, subm, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
from IPython.display import FileLink
FileLink(path+'results/'+submission_filename)